Пример #1
0
class StdOutListener(StreamListener):
    """A listener handles tweets that are received from the stream.

    This listener buffers the raw stream messages and dumps them into a
    PubSub topic ``batch_size`` at a time. Streaming is stopped once more
    than ``total_tweets`` messages have been received.
    """

    count = 0  # messages received so far
    twstring = ''  # not referenced by the methods of this class
    tweets = []  # class-level default; every instance gets its own buffer
    batch_size = 50  # buffered messages that trigger a publish
    total_tweets = 10000000  # stop after more than this many messages
    # Created once, when the class body is executed.
    client = utils.create_pubsub_client(utils.get_credentials())

    def __init__(self, *args, **kwargs):
        """Set up the base listener and a per-instance message buffer.

        All arguments are forwarded unchanged to the base class.
        """
        super(StdOutListener, self).__init__(*args, **kwargs)
        # Without this, the first ``append`` would mutate the list stored
        # on the class, which is shared by every listener instance.
        self.tweets = []

    def write_to_pubsub(self, tw):
        """Publish the list of messages ``tw`` to ``PUBSUB_TOPIC``."""
        publish(self.client, PUBSUB_TOPIC, tw)

    def _flush(self):
        """Publish whatever is buffered (if anything) and reset the buffer."""
        if self.tweets:
            self.write_to_pubsub(self.tweets)
            self.tweets = []

    def on_data(self, data):
        """What to do when tweet data is received.

        Args:
            data: The raw message handed over by the stream.

        Returns:
            False once more than ``total_tweets`` messages have been
            counted, True otherwise.
        """
        self.tweets.append(data)
        if len(self.tweets) >= self.batch_size:
            self._flush()
        self.count += 1
        # if we've grabbed more than total_tweets tweets, exit the script.
        # If this script is being run in the context of a kubernetes
        # replicationController, the pod will be restarted fresh when
        # that happens.
        if self.count > self.total_tweets:
            # Publish the partial batch first so it is not lost on exit.
            self._flush()
            return False
        if self.count % 1000 == 0:
            print('count is: %s' % self.count)
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
Пример #2
0
class StdOutListener(StreamListener):
    """A listener handles tweets that are received from the stream.

    This listener buffers the raw stream messages and dumps them into a
    PubSub topic ``batch_size`` at a time. Streaming is stopped once more
    than ``total_tweets`` messages have been received.
    """
    count = 0  # messages received so far
    twstring = ''  # not referenced by the methods of this class
    tweets = []  # class-level default; every instance gets its own buffer
    batch_size = 50  # buffered messages that trigger a publish
    total_tweets = 10000000  # stop after more than this many messages
    # Created once, when the class body is executed.
    client = utils.create_pubsub_client(utils.get_credentials())

    def __init__(self, *args, **kwargs):
        """Set up the base listener and a per-instance message buffer.

        All arguments are forwarded unchanged to the base class.
        """
        super(StdOutListener, self).__init__(*args, **kwargs)
        # Without this, the first ``append`` would mutate the list stored
        # on the class, which is shared by every listener instance.
        self.tweets = []

    def write_to_pubsub(self, tw):
        """Publish the list of messages ``tw`` to the hard-coded topic."""
        publish(self.client, 'projects/assignment3-276800/topics/assignment3', tw)

    def _flush(self):
        """Publish whatever is buffered (if anything) and reset the buffer."""
        if self.tweets:
            self.write_to_pubsub(self.tweets)
            self.tweets = []

    def on_data(self, data):
        """What to do when tweet data is received.

        Args:
            data: The raw message handed over by the stream.

        Returns:
            False once more than ``total_tweets`` messages have been
            counted, True otherwise.
        """
        self.tweets.append(data)
        if len(self.tweets) >= self.batch_size:
            self._flush()
        self.count += 1
        # if we've grabbed more than total_tweets tweets, exit the script.
        if self.count > self.total_tweets:
            # Publish the partial batch first so it is not lost on exit.
            self._flush()
            return False
        if self.count % 1000 == 0:
            print('count is: %s at %s' % (self.count, datetime.datetime.now()))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
Пример #3
0
class StdOutListener(StreamListener):
    """A listener handles tweets that are received from the stream.

    Every status is reformatted and written to PubSub individually.
    Streaming is stopped once more than ``total_tweets`` statuses have
    been handled.
    """
    count = 0  # statuses handled so far
    twstring = ''  # not referenced by the methods of this class
    tweets = []  # not referenced by the methods of this class
    batch_size = 50  # not referenced by the methods of this class
    total_tweets = 100000  # stop after more than this many statuses
    # Created once, when the class body is executed.
    client = utils.create_pubsub_client(utils.get_credentials())

    def on_status(self, data):
        """Publish one status and say whether streaming should continue.

        Args:
            data: Status object; its ``_json`` attribute is passed to
                ``reformat_tweet`` and the result to ``write_to_pubsub``.

        Returns:
            False once more than ``total_tweets`` statuses have been
            counted, True otherwise.
        """
        write_to_pubsub(reformat_tweet(data._json))
        self.count += 1
        # if we've grabbed more than total_tweets tweets, exit the script.
        return self.count <= self.total_tweets

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
Пример #4
0
class StdOutListener(StreamListener):
    """Listener that forwards raw stream messages to a PubSub topic.

    Messages are buffered and published ``batch_size`` at a time.
    Streaming is stopped once more than ``total_tweets`` messages have
    been received.
    """
    count = 0  # messages received so far
    twstring = ''  # not referenced by the methods of this class
    tweets = []  # class-level default; every instance gets its own buffer
    batch_size = 50  # how many tweets to send per request
    total_tweets = 10000000  # stop upon reaching total_tweets
    # Created once, when the class body is executed.
    client = utils.create_pubsub_client(utils.get_credentials())

    def __init__(self, *args, **kwargs):
        """Set up the base listener and a per-instance message buffer.

        All arguments are forwarded unchanged to the base class.
        """
        super(StdOutListener, self).__init__(*args, **kwargs)
        # Without this, the first ``append`` would mutate the list stored
        # on the class, which is shared by every listener instance.
        self.tweets = []

    def write_to_pubsub(self, tw):
        """Publish the list of messages ``tw`` to the hard-coded topic."""
        publish(self.client, 'projects/sd-3-241301/topics/twitter', tw)

    def _flush(self):
        """Publish whatever is buffered (if anything) and reset the buffer."""
        if self.tweets:
            self.write_to_pubsub(self.tweets)
            self.tweets = []

    def on_data(self, data):
        """Buffer one raw stream message and publish full batches.

        Args:
            data: The raw message handed over by the stream.

        Returns:
            False once more than ``total_tweets`` messages have been
            counted, True otherwise.
        """
        self.tweets.append(data)
        if len(self.tweets) >= self.batch_size:
            self._flush()
        self.count += 1
        if self.count > self.total_tweets:
            # Publish the partial batch first so it is not lost on exit.
            self._flush()
            return False
        if self.count % 1000 == 0:
            print('count is: %s at %s' % (self.count, datetime.datetime.now()))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
Пример #5
0
def _to_bytes(line):
    """Return ``line`` as a byte string, UTF-8 encoding text when needed."""
    if isinstance(line, bytes):
        return line
    if not isinstance(line, type(u'')):
        # Same coercion the function always applied to non-string items.
        line = str(line)
        if isinstance(line, bytes):  # Python 2: str() already yields bytes
            return line
    # Encoding explicitly avoids the implicit ASCII encode that fails on
    # non-ASCII text.
    return line.encode('utf-8')


def publish(pubsub_topic, data_lines, client=None):
    """Publish to the given pubsub topic.

    Args:
        pubsub_topic: Topic name passed as ``topic`` to the publish call.
        data_lines: Iterable of items; each one becomes a message whose
            ``data`` is the URL-safe base64 encoding of the item.
        client: Optional PubSub client to reuse. When omitted, a new client
            is created from ``utils.get_credentials()`` on every call,
            which is the previous behaviour.

    Returns:
        The response of the executed publish request.
    """
    messages = [
        # ``decode`` keeps the payload a text value on Python 3, where
        # b64encode returns bytes; the serialized request is unchanged.
        {'data': base64.urlsafe_b64encode(_to_bytes(line)).decode('ascii')}
        for line in data_lines
    ]
    body = {'messages': messages}
    if client is None:
        client = utils.create_pubsub_client(utils.get_credentials())
    resp = client.projects().topics().publish(
        topic=pubsub_topic, body=body).execute(num_retries=NUM_RETRIES)
    return resp
class StdOutListener(StreamListener):
    """A listener handles tweets that are received from the stream.

    This listener extracts a few fields from every tweet, buffers the
    resulting JSON strings and dumps them into a PubSub topic
    ``batch_size`` at a time. Streaming is stopped once more than
    ``total_tweets`` tweets have been handled.
    """

    count = 0  # tweets handled so far
    twstring = ''  # not referenced by the methods of this class
    tweets = []  # class-level default; every instance gets its own buffer
    batch_size = 50  # buffered tweets that trigger a publish
    total_tweets = 10000000  # stop after more than this many tweets
    # Created once, when the class body is executed.
    client = utils.create_pubsub_client(utils.get_credentials())
    print('in stdoutlistener')

    def __init__(self, *args, **kwargs):
        """Set up the base listener and a per-instance tweet buffer.

        All arguments are forwarded unchanged to the base class.
        """
        super(StdOutListener, self).__init__(*args, **kwargs)
        # Without this, the first ``append`` would mutate the list stored
        # on the class, which is shared by every listener instance.
        self.tweets = []

    def write_to_pubsub(self, tw):
        """Publish the list of JSON strings ``tw`` to ``PUBSUB_TOPIC``."""
        # NOTE(review): a ``publish(pubsub_topic, data_lines)`` helper with
        # only two parameters appears elsewhere in this source - confirm
        # that the helper in scope here really accepts the client first.
        publish(self.client, PUBSUB_TOPIC, tw)

    def _flush(self):
        """Publish whatever is buffered (if anything) and reset the buffer."""
        if self.tweets:
            self.write_to_pubsub(self.tweets)
            self.tweets = []

    def on_data(self, data):
        """What to do when tweet data is received.

        Args:
            data: JSON text of one stream message.

        Returns:
            False once more than ``total_tweets`` tweets have been
            counted, True otherwise (including for skipped messages).
        """
        all_data = json.loads(data)
        # Messages without tweet text or user data cannot be reduced to
        # the fields below; skip them instead of dying with a KeyError.
        if (not isinstance(all_data, dict) or "text" not in all_data
                or "user" not in all_data):
            return True

        user = all_data["user"]
        pub_data = {
            "tweet": all_data["text"],
            "username": user["screen_name"],
            "userlocation": user["location"],
            "retweetcount": all_data["retweet_count"],
            "favoritecount": all_data["favorite_count"],
        }

        self.tweets.append(json.dumps(pub_data))
        if len(self.tweets) >= self.batch_size:
            self._flush()
        self.count += 1
        # if we've grabbed more than total_tweets tweets, exit the script.
        # If this script is being run in the context of a kubernetes
        # replicationController, the pod will be restarted fresh when
        # that happens.
        if self.count > self.total_tweets:
            # Publish the partial batch first so it is not lost on exit.
            self._flush()
            return False
        if self.count % 1000 == 0:
            print('count is: %s at %s' % (self.count, datetime.datetime.now()))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)
                    tweet_string = json.dumps(mtweet)
                    tweets.append(tweet_string)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        if len(tweets) >= CHUNK:
            write_to_pubsub(pubsub, tweets)
            tweets = []
        count += 1
        if count % 25 == 0:
            print("processing count: %s of %s at %s" %
                  (count, count_max, datetime.datetime.now()))


if __name__ == '__main__':
    # Script entry point: make sure a subscription on the ingest topic
    # exists, then hand control to the modeling loop.
    # The subscription is named after the last path component of the topic.
    ingest_topic_info = PUBSUB_TOPIC_INGEST.split('/')
    ingest_topic_name = ingest_topic_info[-1]
    ingest_sub_name = "tweets-%s" % ingest_topic_name
    print("starting modeling....")
    credentials = utils.get_credentials()
    pubsub = utils.create_pubsub_client(credentials)
    try:
        # TODO: check if subscription exists first
        subscription = utils.create_subscription(pubsub, PROJECT_ID,
                                                 ingest_sub_name,
                                                 PUBSUB_TOPIC_INGEST)
    except Exception as e:
        # Best effort: the failure is only reported and the script goes on
        # (presumably the subscription may already exist - see the TODO).
        print(e)
    model_tweets(pubsub, ingest_sub_name)
    print('exited write loop')
                        continue
                    tweets.append(mtweet)
            else:
                # pause before checking again
                print 'sleeping...'
                time.sleep(WAIT)
        response = utils.bq_data_insert(bigquery, PROJECT_ID, os.environ['BQ_DATASET'],
                             os.environ['BQ_TABLE'], tweets)
        tweets = []
        count += 1
        if count % 25 == 0:
            print ("processing count: %s of %s at %s: %s" %
                   (count, count_max, datetime.datetime.now(), response))


if __name__ == '__main__':
    # Script entry point: make sure a subscription on the topic exists,
    # then hand control to the loop that writes to BigQuery.
    # The subscription is named after the last path component of the topic.
    topic_info = PUBSUB_TOPIC.split('/')
    topic_name = topic_info[-1]
    sub_name = "tweets-%s" % topic_name
    print("starting write to BigQuery....")
    credentials = utils.get_credentials()
    bigquery = utils.create_bigquery_client(credentials)
    pubsub = utils.create_pubsub_client(credentials)
    try:
        # TODO: check if subscription exists first
        subscription = create_subscription(pubsub, PROJECT_ID, sub_name)
    except Exception as e:
        # Best effort: the failure is only reported and the script goes on
        # (presumably the subscription may already exist - see the TODO).
        print(e)
    write_to_bq(pubsub, sub_name, bigquery)
    print('exited write loop')
Пример #9
0
class StdOutListener(StreamListener):
    """A listener handles tweets that are received from the stream.

    This listener extracts a few fields from every tweet, adds sentiment
    and entity information from the natural-language client, buffers the
    resulting JSON strings and dumps them into a PubSub topic
    ``batch_size`` at a time. Streaming is stopped once more than
    ``total_tweets`` tweets have been handled.
    """

    count = 0  # tweets handled so far
    twstring = ''  # not referenced by the methods of this class
    tweets = []  # class-level default; every instance gets its own buffer
    batch_size = 50  # buffered tweets that trigger a publish
    total_tweets = 10000000  # stop after more than this many tweets
    # Created once, when the class body is executed.
    client = utils.create_pubsub_client(utils.get_credentials())
    print('in stdoutlistener')

    _MAX_MENTIONS = 5  # only the first entities are stored as mention_N
    _language_client = None  # created on first use, then reused

    def __init__(self, *args, **kwargs):
        """Set up the base listener and a per-instance tweet buffer.

        All arguments are forwarded unchanged to the base class.
        """
        super(StdOutListener, self).__init__(*args, **kwargs)
        # Without this, the first ``append`` would mutate the list stored
        # on the class, which is shared by every listener instance.
        self.tweets = []

    def write_to_pubsub(self, tw):
        """Publish the list of JSON strings ``tw`` to ``PUBSUB_TOPIC``."""
        publish(self.client, PUBSUB_TOPIC, tw)

    def _flush(self):
        """Publish whatever is buffered (if anything) and reset the buffer."""
        if self.tweets:
            self.write_to_pubsub(self.tweets)
            self.tweets = []

    def _get_language_client(self):
        """Return the language client, creating it the first time only."""
        if self._language_client is None:
            self._language_client = language.LanguageServiceClient()
        return self._language_client

    def _add_language_fields(self, pub_data):
        """Add score, magnitude and mention_N entries to ``pub_data``."""
        nlp_client = self._get_language_client()
        document = types.Document(content=pub_data["tweet"],
                                  type=enums.Document.Type.PLAIN_TEXT)
        try:
            sentiment = nlp_client.analyze_sentiment(
                document=document).document_sentiment
            pub_data["score"] = sentiment.score
            pub_data["magnitude"] = sentiment.magnitude
            entities = nlp_client.analyze_entities(document).entities
            for position, entity in enumerate(entities, 1):
                if position <= self._MAX_MENTIONS:
                    pub_data["mention_%d" % position] = entity.name
                # Every entity is printed, also those that are not stored.
                print(u'{:<16}: {}'.format('name', entity.name))
            print('Sentiment: {}, {}'.format(sentiment.score,
                                             sentiment.magnitude))
        except InvalidArgument:
            # The request was rejected; publish the tweet with neutral
            # values instead of dropping it.
            pub_data["score"] = 0
            pub_data["magnitude"] = 0
            print("NLP API Bypassed")

    def on_data(self, data):
        """What to do when tweet data is received.

        Args:
            data: JSON text of one stream message.

        Returns:
            False once more than ``total_tweets`` tweets have been
            counted, True otherwise (including for skipped messages).
        """
        all_data = json.loads(data)
        # Messages without tweet text or user data cannot be reduced to
        # the fields below; skip them instead of dying with a KeyError.
        if (not isinstance(all_data, dict) or "text" not in all_data
                or "user" not in all_data):
            return True

        user = all_data["user"]
        pub_data = {
            "created_at": all_data["created_at"],
            "tweet": all_data["text"],
            "username": user["screen_name"],
            "userlocation": user["location"],
            "retweetcount": all_data["retweet_count"],
            "favoritecount": all_data["favorite_count"],
            "profileimage_url": user["profile_image_url_https"],
            "replycount": all_data["reply_count"],
        }
        self._add_language_fields(pub_data)

        print("Tweet %s" % pub_data["tweet"])

        self.tweets.append(json.dumps(pub_data))
        if len(self.tweets) >= self.batch_size:
            self._flush()
        self.count += 1

        # if we've grabbed more than total_tweets tweets, exit the script.
        if self.count > self.total_tweets:
            # Publish the partial batch first so it is not lost on exit.
            self._flush()
            return False
        if self.count % 1000 == 0:
            print('count is: %s at %s' % (self.count, datetime.datetime.now()))
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)