class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that echoes every incoming status and stores it.

    Each status is printed to stdout and then handed to a ``TweetInterface``
    for persistence, whether or not it carries a geo tag
    (``must_have_geo_tag=False``).
    """

    def __init__(self, db=TwitterConfig.tweet_db, collection=TwitterConfig.tweet_collection):
        """Initialise the tweepy base class and open the storage interface.

        Args:
            db: Database name forwarded to ``TweetInterface``.
            collection: Collection name forwarded to ``TweetInterface``.
        """
        tweepy.StreamListener.__init__(self)
        self.mid_list = []
        self.ti = TweetInterface(db=db, collection=collection)

    def save_to_mongo(self, tweet):
        """Decode the status' JSON payload and store it keyed by the tweet id.

        Args:
            tweet: Status object exposing its raw payload as ``tweet.json``.
                NOTE(review): presumably a JSON string attached by the
                project's tweepy parser; confirm against the caller.
        """
        tweet = json.loads(tweet.json)
        # Reuse the tweet id as the document's ``_id``.
        tweet['_id'] = tweet['id']
        self.ti.saveDocument(tweet, must_have_geo_tag=False)

    def on_status(self, status):
        """Print a one-line summary of ``status`` and persist it.

        Any exception raised while formatting or saving is reported on stderr
        and swallowed, so one bad status does not propagate out of the
        callback.
        """
        print('get')
        try:
            print("%s\t%s\t%s\t%s" % (status.text,
                                      status.author.screen_name,
                                      status.created_at,
                                      status.source,
                                      ))
            self.save_to_mongo(status)
        except Exception as e:
            sys.stderr.write('Encountered Exception: %s\n' % (e,))

    # on_error / on_timeout used to be indented inside on_status, which made
    # them unused local functions instead of listener callbacks.
    def on_error(self, status_code):
        """Report a stream error code on stderr and ask to keep streaming."""
        sys.stderr.write('Encountered error with status code: %s\n' % (status_code,))
        return True  # Don't kill the stream

    def on_timeout(self):
        """Report a stream timeout on stderr and ask to keep streaming."""
        sys.stderr.write('Timeout...\n')
        return True  # Don't kill the stream
示例#2
0
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that echoes and stores only statuses with coordinates.

    Statuses whose ``coordinates`` field is ``None`` are ignored; all others
    are printed to stdout and saved through a ``TweetInterface``.
    """

    def __init__(self):
        """Initialise the tweepy base class and open the default storage interface."""
        tweepy.StreamListener.__init__(self)
        self.mid_list = []
        self.ti = TweetInterface()

    def save_to_mongo(self, tweet):
        """Decode the status' JSON payload and store it if it has coordinates.

        Args:
            tweet: Status object exposing its raw payload as ``tweet.json``.
                NOTE(review): presumably a JSON string attached by the
                project's tweepy parser; confirm against the caller.
        """
        tweet = json.loads(tweet.json)
        if tweet['coordinates'] is None:
            return

        # Reuse the tweet id as the document's ``_id``.
        tweet['_id'] = tweet['id']
        self.ti.saveDocument(tweet)

    def on_status(self, status):
        """Print a one-line summary of a status with coordinates and persist it.

        A status without coordinates is skipped up front: formatting it used
        to raise ``TypeError`` on ``None['coordinates']`` and emit an error
        line for a tweet that ``save_to_mongo`` would have dropped anyway.
        Any other exception is reported on stderr and swallowed.
        """
        print('get')
        try:
            geo = status.coordinates
            if geo is None:
                return
            print("%s\t%s\t%s\t%s\t%s" % (status.text,
                                          status.author.screen_name,
                                          status.created_at,
                                          status.source,
                                          geo['coordinates'],
                                          ))
            self.save_to_mongo(status)
        except Exception as e:
            sys.stderr.write('Encountered Exception: %s\n' % (e,))

    # on_error / on_timeout used to be indented inside on_status, which made
    # them unused local functions instead of listener callbacks.
    def on_error(self, status_code):
        """Report a stream error code on stderr and ask to keep streaming."""
        sys.stderr.write('Encountered error with status code: %s\n' % (status_code,))
        return True  # Don't kill the stream

    def on_timeout(self):
        """Report a stream timeout on stderr and ask to keep streaming."""
        sys.stderr.write('Timeout...\n')
        return True  # Don't kill the stream
 def _getFiftenMiniutesData(self):
     """Load the last fifteen minutes of records for ``self.region``.

     Queries the store selected by ``self.data_source`` over the window
     ``[self.cur_time - 900, self.cur_time]``, sorts the records by
     ``created_time``, passes them through ``processAsPeopleCount`` and
     stores the result in ``self.data`` and its length in
     ``self.current_value``.

     Raises:
         ValueError: if ``self.data_source`` is neither ``'twitter'`` nor
             ``'instagram'`` (previously this surfaced as an
             ``AttributeError`` on ``None``).
     """
     if self.data_source == 'twitter':
         data_interface = TweetInterface('citybeat_production', 'tweets')
     elif self.data_source == 'instagram':
         data_interface = PhotoInterface('citybeat_production', 'photos')
     else:
         raise ValueError('unsupported data_source: %r' % (self.data_source,))

     window_seconds = 15 * 60
     # The period bounds are passed to rangeQuery as strings.
     period = (str(self.cur_time - window_seconds), str(self.cur_time))
     cursor = data_interface.rangeQuery(self.region, period)

     records = sorted(cursor, key=lambda k: k['created_time'])
     records = processAsPeopleCount(records)
     self.current_value = len(records)
     self.data = records
示例#4
0
    def _extractMostPopularTweet(self):
        """Find the most repeated tweet text of the last hour and the tweet/retweet mix.

        Tweets are read from the extended tweet collection and grouped by
        exact text. A text seen once counts as a single tweet; a text seen
        several times contributes all of its occurrences to the retweet total.

        Returns:
            A two-item list ``[most_popular_tweet, tweets_count]``:
            ``most_popular_tweet`` has ``user_name``, ``text`` and ``count``
            (the user is the author of the occurrence that first reached the
            winning count); ``tweets_count`` has ``tweet_percentage`` and
            ``retweet_percentage`` as fractions of all tweets seen. With no
            tweets in the window the count and both fractions are zero
            instead of raising ``ZeroDivisionError``.
        """
        ti = TweetInterface(collection=TwitterConfig.extended_tweet_collection)

        # 60 minutes
        end_time = int(getCurrentStampUTC())
        begin_time = end_time - 60 * 60

        tweets = {}
        most_popular_tweet_text = ''
        max_retweet_count = 0
        user_name = ''
        for tweet in ti.rangeQuery(period=[begin_time, end_time], fields=['text', 'user.screen_name']):
            text = tweet['text']
            count = tweets.get(text, 0) + 1
            tweets[text] = count
            # Strictly greater: on ties the text that got there first stays.
            if count > max_retweet_count:
                max_retweet_count = count
                most_popular_tweet_text = text
                user_name = tweet['user']['screen_name']

        single_tweet_count = 0
        retweet_count = 0
        for value in tweets.values():
            if value == 1:
                single_tweet_count += 1
            else:
                retweet_count += value

        most_popular_tweet = {
            'user_name': user_name,
            'text': most_popular_tweet_text,
            'count': max_retweet_count,
        }

        total = single_tweet_count + retweet_count
        if total:
            tweet_percentage = 1.0 * single_tweet_count / total
            retweet_percentage = 1.0 * retweet_count / total
        else:
            # Empty window: nothing to divide by.
            tweet_percentage = retweet_percentage = 0.0
        tweets_count = {
            'tweet_percentage': tweet_percentage,
            'retweet_percentage': retweet_percentage,
        }

        return [most_popular_tweet, tweets_count]
示例#5
0
 def __init__(self):
     """Run the tweepy base initialiser, then set up this listener's state.

     Leaves an empty ``mid_list`` and a ``TweetInterface`` created with its
     default arguments on ``self.ti``.
     """
     tweepy.StreamListener.__init__(self)
     self.mid_list = list()
     storage = TweetInterface()
     self.ti = storage
 def __init__(self, db=TwitterConfig.tweet_db, collection=TwitterConfig.tweet_collection):
     """Run the tweepy base initialiser, then open storage for ``db``/``collection``.

     Both arguments default to the values configured on ``TwitterConfig`` and
     are forwarded by keyword to ``TweetInterface``; the resulting interface
     is kept on ``self.ti`` next to an empty ``mid_list``.
     """
     tweepy.StreamListener.__init__(self)
     self.mid_list = list()
     storage = TweetInterface(db=db, collection=collection)
     self.ti = storage
示例#7
0
 def __init__(self):
     """Create the tweet and photo storage interfaces with default arguments."""
     tweet_store = TweetInterface()
     self._tweet_interface = tweet_store
     photo_store = PhotoInterface()
     self._photo_interface = photo_store
示例#8
0
class Stats(object):
    """Builds activity statistics for tweets and photos from their storage interfaces."""

    def __init__(self):
        """Create the tweet and photo storage interfaces with default arguments."""
        self._tweet_interface = TweetInterface()
        self._photo_interface = PhotoInterface()

    def getTweetAndPhotoStats(self):
        """Assemble the full statistics document.

        Returns:
            A dict with ``photo_basic_count`` and ``tweet_basic_count`` (each
            holding ``last_minute`` and ``last_24_hour``), ``created_time``
            (current UTC stamp as a string), ``tweet_top_mentions``,
            ``most_popular_tweet`` and ``tweet_vs_retweet``.
        """
        photo_basic_count = {}
        photo_basic_count['last_minute'] = self._getCurrentCountStats('photos')
        photo_basic_count['last_24_hour'] = self._get24HoursCountStats('photos')

        tweet_basic_count = {}
        tweet_basic_count['last_minute'] = self._getCurrentCountStats('tweets')
        tweet_basic_count['last_24_hour'] = self._get24HoursCountStats('tweets')

        most_popular_tweet, tweet_vs_retweet = self._extractMostPopularTweet()

        stats = {}
        stats['photo_basic_count'] = photo_basic_count
        stats['tweet_basic_count'] = tweet_basic_count
        stats['created_time'] = str(getCurrentStampUTC())
        stats['tweet_top_mentions'] = self._extractTweetTopMentions()
        stats['most_popular_tweet'] = most_popular_tweet
        stats['tweet_vs_retweet'] = tweet_vs_retweet
        return stats

    def _getCurrentCountStats(self, type):
        """Return ``{'count': ..., 'delta': ...}`` for the latest one-minute window.

        Args:
            type: ``'photos'`` or ``'tweets'``.
        """
        assert type in ['photos', 'tweets']
        if type == 'photos':
            count, delta = self._extractPhotoCount()
        else:
            count, delta = self._extractTweetCount()
        return {'count': count, 'delta': delta}

    def _get24HoursCountStats(self, type):
        """Return hourly counts for the last 24 hours and the same hours a week earlier.

        Args:
            type: ``'photos'`` or ``'tweets'``.
        """
        assert type in ['photos', 'tweets']
        stats = {}
        stats['current_count'] = self._extract24HoursCountsStats(type=type)
        stats['last_week_count'] = self._extract24HoursCountsStats(past_week=True, type=type)
        return stats

    def _extractTweetCount(self):
        """Count last-minute tweets and compare them with the prior 20-minute average.

        Returns:
            ``[current_count, delta]`` where ``delta`` is the relative change
            against the per-minute baseline, or ``stats_config.NO_BASE_LINE``
            when the baseline window is empty.
        """
        now = int(getCurrentStampUTC())
        # 5 seconds as the latency
        current_count = self._tweet_interface.rangeQuery(period=[now - 65, now - 5]).count()
        # The 20 minutes before the current window, averaged per minute.
        baseline_count = self._tweet_interface.rangeQuery(
            period=[now - 65 - 60 * 20, now - 65]).count() / 20.0
        if baseline_count == 0.0:
            return [current_count, stats_config.NO_BASE_LINE]
        return [current_count, (current_count - baseline_count) / baseline_count]

    def _extractPhotoCount(self):
        """Count photos in a one-minute window and compare with the prior 20-minute average.

        The window ends four minutes before now. Returns ``[current_count,
        delta]`` with the same meaning as ``_extractTweetCount``.
        """
        now = int(getCurrentStampUTC())
        offset = 4 * 60
        current_count = self._photo_interface.rangeQuery(
            period=[now - offset - 60, now - offset]).count()
        # The 20 minutes before the current window, averaged per minute.
        baseline_count = self._photo_interface.rangeQuery(
            period=[now - 60 * 21 - offset, now - offset - 60]).count() / 20.0
        if baseline_count == 0.0:
            return [current_count, stats_config.NO_BASE_LINE]
        return [current_count, (current_count - baseline_count) / baseline_count]

    def _extract24HoursCountsStats(self, past_week=False, type='tweets'):
        """Return 24 hourly counts, most recent hour first.

        Args:
            past_week: When true, shift the whole 24-hour span back by seven days.
            type: ``'tweets'`` selects the tweet store; anything else the photo store.
        """
        now = int(getCurrentStampUTC())
        # Offset is expressed in hours.
        offset = 7 * 24 if past_week else 0
        if type == 'tweets':
            interface = self._tweet_interface
        else:
            interface = self._photo_interface

        count_during_past_24_hours = []
        for hour in range(24):
            end_time = now - 3600 * (hour + offset)
            begin_time = end_time - 3600
            count_during_past_24_hours.append(
                interface.rangeQuery(period=[begin_time, end_time]).count())
        return count_during_past_24_hours

    def _extractTweetTopMentions(self, k=10):
        """Return the ``k`` most mentioned user names in the last hour of tweets.

        Args:
            k: Maximum number of entries to return. The limit used to be
                hard-coded to 10 regardless of this argument.

        Returns:
            A list of ``{'user_name': ..., 'count': ...}`` dicts ordered by
            descending mention count.
        """
        # 60 minutes
        end_time = int(getCurrentStampUTC())
        begin_time = end_time - 60 * 60
        cur = self._tweet_interface.rangeQuery(period=[begin_time, end_time], fields=['text'])

        twitter_username_re = re.compile(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_-]+)')
        users = {}
        for tweet in cur:
            for mention in twitter_username_re.findall(tweet['text']):
                users[mention] = users.get(mention, 0) + 1

        ranked = sorted(users.items(), key=operator.itemgetter(1), reverse=True)
        return [{'user_name': name, 'count': count}
                for name, count in ranked[:max(k, 0)]]

    def _extractMostPopularTweet(self):
        """Find the most repeated tweet text of the last hour and the tweet/retweet mix.

        Tweets are read from the extended tweet collection and grouped by
        exact text. A text seen once counts as a single tweet; a text seen
        several times contributes all of its occurrences to the retweet total.

        Returns:
            A two-item list ``[most_popular_tweet, tweets_count]``:
            ``most_popular_tweet`` has ``user_name``, ``text`` and ``count``;
            ``tweets_count`` has ``tweet_percentage`` and
            ``retweet_percentage`` as fractions of all tweets seen. With no
            tweets in the window the count and both fractions are zero
            instead of raising ``ZeroDivisionError``.
        """
        ti = TweetInterface(collection=TwitterConfig.extended_tweet_collection)

        # 60 minutes
        end_time = int(getCurrentStampUTC())
        begin_time = end_time - 60 * 60

        tweets = {}
        most_popular_tweet_text = ''
        max_retweet_count = 0
        user_name = ''
        for tweet in ti.rangeQuery(period=[begin_time, end_time], fields=['text', 'user.screen_name']):
            text = tweet['text']
            count = tweets.get(text, 0) + 1
            tweets[text] = count
            # Strictly greater: on ties the text that got there first stays.
            if count > max_retweet_count:
                max_retweet_count = count
                most_popular_tweet_text = text
                user_name = tweet['user']['screen_name']

        single_tweet_count = 0
        retweet_count = 0
        for value in tweets.values():
            if value == 1:
                single_tweet_count += 1
            else:
                retweet_count += value

        most_popular_tweet = {
            'user_name': user_name,
            'text': most_popular_tweet_text,
            'count': max_retweet_count,
        }

        total = single_tweet_count + retweet_count
        if total:
            tweet_percentage = 1.0 * single_tweet_count / total
            retweet_percentage = 1.0 * retweet_count / total
        else:
            # Empty window: nothing to divide by.
            tweet_percentage = retweet_percentage = 0.0
        tweets_count = {
            'tweet_percentage': tweet_percentage,
            'retweet_percentage': retweet_percentage,
        }

        return [most_popular_tweet, tweets_count]