class StdOutListener(StreamListener): """A listener handles tweets that are received from the stream. This listener dumps the tweets into a PubSub topic """ count = 0 twstring = '' tweets = [] batch_size = 50 total_tweets = 10000000 client = utils.create_pubsub_client(utils.get_credentials()) def write_to_pubsub(self, tw): publish(self.client, PUBSUB_TOPIC, tw) def on_data(self, data): """What to do when tweet data is received.""" self.tweets.append(data) if len(self.tweets) >= self.batch_size: self.write_to_pubsub(self.tweets) self.tweets = [] self.count += 1 # if we've grabbed more than total_tweets tweets, exit the script. # If this script is being run in the context of a kubernetes # replicationController, the pod will be restarted fresh when # that happens. if self.count > self.total_tweets: return False if (self.count % 1000) == 0: print 'count is: %s' % self.count return True def on_error(self, status): print status
class StdOutListener(StreamListener): """A listener handles tweets that are received from the stream. This listener dumps the tweets into a PubSub topic """ count = 0 twstring = '' tweets = [] batch_size = 50 total_tweets = 10000000 client = utils.create_pubsub_client(utils.get_credentials()) def write_to_pubsub(self, tw): publish(self.client, 'projects/assignment3-276800/topics/assignment3', tw) def on_data(self, data): """What to do when tweet data is received.""" self.tweets.append(data) if len(self.tweets) >= self.batch_size: self.write_to_pubsub(self.tweets) self.tweets = [] self.count += 1 # if we've grabbed more than total_tweets tweets, exit the script. if self.count > self.total_tweets: return False if (self.count % 1000) == 0: print 'count is: %s at %s' % (self.count, datetime.datetime.now()) return True def on_error(self, status): print status
class StdOutListener(StreamListener): """A listener handles tweets that are received from the stream. This listener dumps the tweets into a PubSub topic """ count = 0 twstring = '' tweets = [] batch_size = 50 total_tweets = 100000 client = utils.create_pubsub_client(utils.get_credentials()) def on_status(self, data): write_to_pubsub(reformat_tweet(data._json)) self.count += 1 # if we've grabbed more than total_tweets tweets, exit the script. if self.count > self.total_tweets: return False return True ''' def on_data(self, data): """What to do when tweet data is received.""" self.tweets.append(data) if len(self.tweets) >= self.batch_size: write_to_pubsub(reformat_tweet(data._json)) self.tweets = [] if self.count > self.total_tweets: return False if (self.count % 1000) == 0: print('count is: %s at %s' % (self.count, datetime.now())) return True ''' def on_error(self, status): print(status)
class StdOutListener(StreamListener):
    count = 0
    twstring = ''
    tweets = []
    batch_size = 50  # how many tweets to pull per request
    total_tweets = 10000000  # stop once total_tweets have been processed
    client = utils.create_pubsub_client(utils.get_credentials())

    def write_to_pubsub(self, tw):
        publish(self.client, 'projects/sd-3-241301/topics/twitter', tw)

    def on_data(self, data):
        self.tweets.append(data)
        if len(self.tweets) >= self.batch_size:
            self.write_to_pubsub(self.tweets)
            self.tweets = []
        self.count += 1
        if self.count > self.total_tweets:
            return False
        if (self.count % 1000) == 0:
            print 'count is: %s at %s' % (self.count, datetime.datetime.now())
        return True

    def on_error(self, status):
        print status

def publish(pubsub_topic, data_lines):
    """Publish to the given pubsub topic."""
    messages = []
    for line in data_lines:
        pub = base64.urlsafe_b64encode(str(line))
        messages.append({'data': pub})
    body = {'messages': messages}
    client = utils.create_pubsub_client(utils.get_credentials())
    resp = client.projects().topics().publish(
        topic=pubsub_topic, body=body).execute(num_retries=NUM_RETRIES)
    return resp

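# Illustrative usage sketch (not part of the original snippets): the listener
# classes above collect tweet strings into a list and hand that list to
# publish(). The topic path and sample data below are placeholders standing in
# for the project's own PUBSUB_TOPIC constant and real tweet JSON.
sample_batch = ['{"text": "first tweet"}', '{"text": "second tweet"}']
# Each element is base64-encoded inside publish() before being attached to the
# request body, as the REST-style Pub/Sub API expects.
publish('projects/my-project/topics/my-topic', sample_batch)
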
class StdOutListener(StreamListener): """A listener handles tweets that are received from the stream. This listener dumps the tweets into a PubSub topic """ count = 0 twstring = '' tweets = [] batch_size = 50 total_tweets = 10000000 client = utils.create_pubsub_client(utils.get_credentials()) print 'in stdoutlistener' def write_to_pubsub(self, tw): publish(self.client, PUBSUB_TOPIC, tw) def on_data(self, data): """What to do when tweet data is received.""" pub_data = {} all_data = json.loads(data) pub_data["tweet"] = all_data["text"] pub_data["username"] = all_data["user"]["screen_name"] pub_data["userlocation"] = all_data["user"]["location"] pub_data["retweetcount"] = all_data["retweet_count"] pub_data["favoritecount"] = all_data["favorite_count"] pass_data = json.dumps(pub_data) self.tweets.append(pass_data) if len(self.tweets) >= self.batch_size: self.write_to_pubsub(self.tweets) self.tweets = [] self.count += 1 # if we've grabbed more than total_tweets tweets, exit the script. # If this script is being run in the context of a kubernetes # replicationController, the pod will be restarted fresh when # that happens. if self.count > self.total_tweets: return False if (self.count % 1000) == 0: print 'count is: %s at %s' % (self.count, datetime.datetime.now()) return True def on_error(self, status): print status
            tweet_string = json.dumps(mtweet)
            tweets.append(tweet_string)
        else:
            # pause before checking again
            print 'sleeping...'
            time.sleep(WAIT)
        if len(tweets) >= CHUNK:
            write_to_pubsub(pubsub, tweets)
            tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s" %
                  (count, count_max, datetime.datetime.now()))


if __name__ == '__main__':
    ingest_topic_info = PUBSUB_TOPIC_INGEST.split('/')
    ingest_topic_name = ingest_topic_info[-1]
    ingest_sub_name = "tweets-%s" % ingest_topic_name
    print "starting modeling...."
    credentials = utils.get_credentials()
    pubsub = utils.create_pubsub_client(credentials)
    try:
        # TODO: check if subscription exists first
        subscription = utils.create_subscription(pubsub, PROJECT_ID,
                                                 ingest_sub_name,
                                                 PUBSUB_TOPIC_INGEST)
    except Exception, e:
        print e
    model_tweets(pubsub, ingest_sub_name)
    print 'exited write loop'

                    continue
                tweets.append(mtweet)
        else:
            # pause before checking again
            print 'sleeping...'
            time.sleep(WAIT)
        response = utils.bq_data_insert(bigquery, PROJECT_ID,
                                        os.environ['BQ_DATASET'],
                                        os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print ("processing count: %s of %s at %s: %s" %
                   (count, count_max, datetime.datetime.now(), response))


if __name__ == '__main__':
    topic_info = PUBSUB_TOPIC.split('/')
    topic_name = topic_info[-1]
    sub_name = "tweets-%s" % topic_name
    print "starting write to BigQuery...."
    credentials = utils.get_credentials()
    bigquery = utils.create_bigquery_client(credentials)
    pubsub = utils.create_pubsub_client(credentials)
    try:
        # TODO: check if subscription exists first
        subscription = create_subscription(pubsub, PROJECT_ID, sub_name)
    except Exception, e:
        print e
    write_to_bq(pubsub, sub_name, bigquery)
    print 'exited write loop'

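# Hedged sketch for the "TODO: check if subscription exists first" above, using
# the same discovery-based Pub/Sub client the snippets already rely on. The
# helper name subscription_exists and its parameters are assumptions introduced
# for illustration; they are not part of the original code.
from googleapiclient.errors import HttpError


def subscription_exists(pubsub, project_id, sub_name):
    """Return True if the named Pub/Sub subscription already exists."""
    full_name = 'projects/%s/subscriptions/%s' % (project_id, sub_name)
    try:
        # subscriptions().get() raises HttpError 404 when the name is unknown.
        pubsub.projects().subscriptions().get(subscription=full_name).execute()
        return True
    except HttpError, e:
        if e.resp.status == 404:
            return False
        raise
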
class StdOutListener(StreamListener): """A listener handles tweets that are received from the stream. This listener dumps the tweets into a PubSub topic """ count = 0 twstring = '' tweets = [] batch_size = 50 total_tweets = 10000000 client = utils.create_pubsub_client(utils.get_credentials()) print 'in stdoutlistener' def write_to_pubsub(self, tw): publish(self.client, PUBSUB_TOPIC, tw) def on_data(self, data): """What to do when tweet data is received.""" client = language.LanguageServiceClient() pub_data = {} all_data = json.loads(data) pub_data["created_at"] = all_data["created_at"] pub_data["tweet"] = all_data["text"] pub_data["username"] = all_data["user"]["screen_name"] pub_data["userlocation"] = all_data["user"]["location"] pub_data["retweetcount"] = all_data["retweet_count"] pub_data["favoritecount"] = all_data["favorite_count"] pub_data["profileimage_url"] = all_data["user"][ "profile_image_url_https"] pub_data["replycount"] = all_data["reply_count"] document = types.Document(content=pub_data["tweet"], type=enums.Document.Type.PLAIN_TEXT) try: sentiment = client.analyze_sentiment( document=document).document_sentiment pub_data["score"] = sentiment.score pub_data["magnitude"] = sentiment.magnitude entities = client.analyze_entities(document).entities mention_count = 1 for entity in entities: if mention_count == 1: pub_data["mention_1"] = entity.name elif mention_count == 2: pub_data["mention_2"] = entity.name elif mention_count == 3: pub_data["mention_3"] = entity.name elif mention_count == 4: pub_data["mention_4"] = entity.name elif mention_count == 5: pub_data["mention_5"] = entity.name else: pass mention_count = mention_count + 1 print(u'{:<16}: {}'.format('name', entity.name)) print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude)) except InvalidArgument as e: pub_data["score"] = 0 pub_data["magnitude"] = 0 print "NLP API Bypassed" print "Tweet", pub_data["tweet"] pass_data = json.dumps(pub_data) self.tweets.append(pass_data) if len(self.tweets) >= self.batch_size: self.write_to_pubsub(self.tweets) self.tweets = [] self.count += 1 # if we've grabbed more than total_tweets tweets, exit the script. if self.count > self.total_tweets: return False if (self.count % 1000) == 0: print 'count is: %s at %s' % (self.count, datetime.datetime.now()) return True def on_error(self, status): print status