def write_to_bq(bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    twstring = ''
    tweet = None
    mtweet = None
    while True:
        while len(tweets) < CHUNK:
            # We'll use a blocking list pop -- it returns when there is
            # new data.
            res = r.brpop(REDIS_LIST)
            twstring = res[1]
            try:
                tweet = json.loads(res[1])
            except Exception, bqe:
                print bqe
                continue
            # First do some massaging of the raw data
            mtweet = utils.cleanup(tweet)
            # We only want to write tweets to BigQuery; we'll skip 'delete'
            # and 'limit' information.
            if 'delete' in mtweet:
                continue
            if 'limit' in mtweet:
                print mtweet
                continue
            tweets.append(mtweet)
        # try to insert the tweets into bigquery
        utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)
        tweets = []
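This Redis variant leans on module-level setup that isn't shown in the excerpt: a Redis client `r`, the list key `REDIS_LIST`, and `PROJECT_ID`. A minimal sketch of that assumed context, with illustrative host, port, and key values:

import json
import os

import redis  # redis-py

import utils  # the sample's helper module (cleanup, bq_data_insert, ...)

# Illustrative values; the real ones would come from the deployment config.
REDIS_LIST = 'tweet-queue'
PROJECT_ID = os.environ['PROJECT_ID']
r = redis.StrictRedis(host='redis-master', port=6379)

Note that `r.brpop(REDIS_LIST)` blocks until data arrives and returns a `(key, value)` tuple, which is why the function reads `res[1]`.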
def write_to_bq(pubsub, sub_name, bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    # If no data on the subscription, the time to sleep in seconds
    # before checking again.
    WAIT = 2
    tweet = None
    mtweet = None
    while True:
        while len(tweets) < CHUNK:
            twmessages = pull_messages(pubsub, PROJECT_ID, sub_name)
            if twmessages:
                for res in twmessages:
                    try:
                        tweet = json.loads(res)
                    except Exception, bqe:
                        print bqe
                        # Skip this message rather than reprocessing the
                        # previous (stale) tweet.
                        continue
                    # First do some massaging of the raw data
                    mtweet = utils.cleanup(tweet)
                    # We only want to write tweets to BigQuery; we'll skip
                    # 'delete' and 'limit' information.
                    if 'delete' in mtweet:
                        continue
                    if 'limit' in mtweet:
                        print mtweet
                        continue
                    tweets.append(mtweet)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)
        tweets = []
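The `pull_messages` helper is not shown in this section. A minimal sketch of what it might look like against the Pub/Sub v1 REST API via google-api-python-client, assuming it pulls a batch, acks it, and returns decoded payloads so the caller can pass them straight to `json.loads`; the batch size is illustrative:

import base64

def pull_messages(pubsub, project_name, sub_name):
    """Hypothetical sketch: pull a batch of messages from the
    subscription, ack them, and return the decoded payloads."""
    batch_size = 50  # illustrative
    subscription = 'projects/%s/subscriptions/%s' % (project_name, sub_name)
    body = {'returnImmediately': False, 'maxMessages': batch_size}
    resp = pubsub.projects().subscriptions().pull(
        subscription=subscription, body=body).execute()
    received = resp.get('receivedMessages', [])
    # Pub/Sub payloads arrive base64-encoded; decode here, since this
    # variant feeds the result straight to json.loads. (The later variant
    # that calls base64.urlsafe_b64decode in the caller presumably uses a
    # helper that returns the raw payload instead.)
    messages = [base64.urlsafe_b64decode(str(m['message']['data']))
                for m in received]
    ack_ids = [m['ackId'] for m in received]
    if ack_ids:
        pubsub.projects().subscriptions().acknowledge(
            subscription=subscription, body={'ackIds': ack_ids}).execute()
    return messages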
def write_to_bq(pubsub_sub, pubsub_pub, sub_name, bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    WAIT = 2  # Sleep time in seconds if no data
    while True:
        while len(tweets) < CHUNK:
            twmessages = pull_messages(pubsub_sub, PROJECT_ID, sub_name)
            if twmessages:
                for res in twmessages:
                    try:
                        tweet = json.loads(res)
                        if tweet.get('id') is None:
                            logging.error(f'Tweet Parse: Missing ID - {res}')
                            raise ValueError('Missing Tweet ID')
                        mtweet = utils.cleanup(tweet)
                        tweets.append(mtweet)
                    except Exception as bqe:
                        logging.error(f'Tweet Parse: Error - {bqe}')
            else:
                time.sleep(WAIT)
        utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)
        tweets = []
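Every variant calls `utils.cleanup`, which is also external to this section. A hypothetical stand-in consistent with how the callers use it: it takes the parsed tweet dict and returns a massaged dict that still exposes top-level keys such as 'delete' and 'limit', and may return None for unusable input:

import json

def cleanup(data):
    """Hypothetical stand-in for utils.cleanup; the real implementation
    is not shown in this section."""
    if not isinstance(data, dict):
        return None
    tweet = dict(data)
    # Illustrative massaging: serialize nested structures so each
    # column of the BigQuery row holds a scalar value.
    for key, value in tweet.items():
        if isinstance(value, (dict, list)):
            tweet[key] = json.dumps(value)
    return tweet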
def write_to_bq(bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    tweet = None
    mtweet = None
    count = 0
    count_max = 50000
    redis_errors = 0
    allowed_redis_errors = 3
    while count < count_max:
        while len(tweets) < CHUNK:
            # We'll use a blocking list pop -- it returns when there is
            # new data.
            res = None
            try:
                res = r.brpop(REDIS_LIST)
            except Exception:
                print 'Problem getting data from Redis.'
                redis_errors += 1
                if redis_errors > allowed_redis_errors:
                    print "Too many redis errors: exiting."
                    return
                continue
            try:
                tweet = json.loads(res[1])
            except Exception, e:
                print e
                redis_errors += 1
                if redis_errors > allowed_redis_errors:
                    print "Too many redis-related errors: exiting."
                    return
                continue
            # First do some massaging of the raw data
            mtweet = utils.cleanup(tweet)
            # We only want to write tweets to BigQuery; we'll skip 'delete'
            # and 'limit' information.
            if 'delete' in mtweet:
                continue
            if 'limit' in mtweet:
                continue
            tweets.append(mtweet)
        # try to insert the tweets into bigquery
        response = utils.bq_data_insert(bigquery, PROJECT_ID,
                                        os.environ['BQ_DATASET'],
                                        os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s: %s" % (
                count, count_max, datetime.datetime.now(), response))
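All of the variants delegate the actual insert to `utils.bq_data_insert`, which is likewise not shown. A minimal sketch of a streaming insert against the BigQuery v2 REST API via google-api-python-client, consistent with the `bigquery` client created in `__main__` below and the `response` value logged above; error handling is elided:

def bq_data_insert(bigquery, project_id, dataset, table, rows):
    """Hypothetical sketch of the helper: stream a batch of rows into
    BigQuery with tabledata.insertAll and return the API response."""
    body = {'rows': [{'json': row} for row in rows]}
    return bigquery.tabledata().insertAll(
        projectId=project_id, datasetId=dataset,
        tableId=table, body=body).execute()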
def write_to_bq(pubsub, sub_name, bigquery):
    """Write the data to BigQuery in small chunks."""
    tweets = []
    CHUNK = 50  # The size of the BigQuery insertion batch.
    # If no data on the subscription, the time to sleep in seconds
    # before checking again.
    WAIT = 2
    tweet = None
    mtweet = None
    count = 0
    count_max = 50000
    while count < count_max:
        while len(tweets) < CHUNK:
            twmessages = pull_messages(pubsub, PROJECT_ID, sub_name)
            if twmessages:
                for res in twmessages:
                    print(res)
                    # The Pub/Sub message payload is base64-encoded.
                    decoded_res = base64.urlsafe_b64decode(res)
                    print(decoded_res)
                    try:
                        tweet = json.loads(decoded_res)
                        print(tweet)
                    except Exception, bqe:
                        print bqe
                        # Skip this message rather than reprocessing the
                        # previous (stale) tweet.
                        continue
                    # First do some massaging of the raw data
                    mtweet = utils.cleanup(tweet)
                    # We only want to write tweets to BigQuery; we'll skip
                    # 'delete' and 'limit' information.
                    if not mtweet:
                        continue
                    if 'delete' in mtweet:
                        continue
                    if 'limit' in mtweet:
                        continue
                    tweets.append(mtweet)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        response = utils.bq_data_insert(bigquery, PROJECT_ID,
                                        os.environ['BQ_DATASET'],
                                        os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s: %s" % (
                count, count_max, datetime.datetime.now(), response))
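This variant base64-decodes the payload itself, which implies the matching publisher encoded it: Pub/Sub message data is base64-encoded on the wire. A sketch of the publish side under the same v1 REST API assumption (`publish_tweet` is a hypothetical name):

import base64
import json

def publish_tweet(pubsub, topic, tweet):
    """Hypothetical publish-side counterpart: encode the tweet before
    handing it to Pub/Sub."""
    data = base64.urlsafe_b64encode(json.dumps(tweet))
    body = {'messages': [{'data': data}]}
    return pubsub.projects().topics().publish(
        topic=topic, body=body).execute()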
print "NLP result - sentiment magnitude:", response[ 'documentSentiment']['magnitude'] mtweet['sentiment_score'] = response[ 'documentSentiment']['score'] mtweet['sentiment_magnitude'] = response[ 'documentSentiment']['magnitude'] except Exception, e: print e print "Unsupported language, skipping the tweet" tweets.append(mtweet) else: # pause before checking again print 'sleeping...' time.sleep(WAIT) response = utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'], os.environ['BQ_TABLE'], tweets) tweets = [] count += 1 if count % 25 == 0: print("processing count: %s of %s at %s: %s" % (count, count_max, datetime.datetime.now(), response)) if __name__ == '__main__': topic_info = PUBSUB_TOPIC.split('/') topic_name = topic_info[-1] sub_name = "tweets-%s" % topic_name print "starting write to BigQuery...." credentials = utils.get_credentials() bigquery = utils.create_bigquery_client(credentials)