class GeoTweetDaemon(StreamDaemon):

    def __init__(self):
        super(GeoTweetDaemon, self).__init__()
        self._credentials = settings.Twitter['accounts'][0]
        # Bounding box covering the whole globe, so every geotagged tweet matches.
        self._payload = {'locations': ["-180,-90", "180,90"]}
        self._countstore = CountStore()

    def _get_streaming_args(self):
        return self._credentials, self._payload

    def _payload_is_empty(self):
        return False

    def _on_tweet_callback(self, tweet):
        country_code = tweet.country_code
        if country_code:
            for entity_type, entities in tweet.entities.items():
                for entity_id in entities:
                    # Use the "C"-prefixed segmentation so these counts line up with
                    # the keys read back by cache_top_tweets and KeywordsTweetDaemon.
                    self._countstore.put(entity_id, entity_type, "C" + country_code)
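# Read-back sketch (illustrative, not part of the daemon): with the "C"-prefixed
# segmentation used above, the most-mentioned hashtags for a single country can be
# fetched back out of the CountStore like this. The helper name and the num_to_get
# default are assumptions made purely for illustration; get_top's (entity_id, count)
# return shape follows the CountStore tests further down.
def top_hashtags_for_country(country_code, num_to_get=10):
    countstore = CountStore()
    return countstore.get_top("hashtag", "C" + country_code, num_to_get)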
def test_put_with_total_count(self):
    countstore = CountStore()
    countstore.put("entity1", "test_entity", "test_seg", total_count=10)

    docs = self.get_all_docs()
    self.assertEqual(9, len(docs))
    expected = self.build_timeslice_docs({
        "entity_id":    "entity1",
        "entity_type":  "test_entity",
        "segmentation": "test_seg",
        "base_count":   10,
        "count":        1,
    })
    self.assertEqual(expected, docs)

    fixture.setup_mock_time(fixture.jan_1st_2013_midday + 60 * 15)
    countstore.put("entity1", "test_entity", "test_seg", total_count=15)

    docs = self.get_all_docs()
    self.assertEqual(10, len(docs))
    expected = self.build_timeslice_docs({
        "entity_id":    "entity1",
        "entity_type":  "test_entity",
        "segmentation": "test_seg",
        "count":        6,
        "base_count":   10,
    })
    expected[0]["count"] = 1
    expected[0]["base_count"] = 10

    last_doc = copy(expected[-1])
    last_doc["count"] = 1
    last_doc["base_count"] = 15
    last_doc["timeslice"] = fixture.jan_1st_2013_midday + 60 * 15
    expected.append(last_doc)
    self.assertEqual(expected, docs)
def cache_top_tweets():
    # Initialize stores.
    ts = int(time.time())
    countstore = CountStore()
    tweetstore = TweetStore()
    cache = RedisCache(namespace=settings.TopTweetsCache["namespace"])
    countries = Geo.country_codes()

    # Memoize (entitytype, entity, ts) -> tweets lookups, since the same entity can
    # rank highly in several countries.
    top_tweets_cache = {}

    for country in countries:
        print "*************"
        print country
        print "*************"
        top_tweets = {}
        segmentation = "C" + country
        for entitytype in ["hashtag", "user_mention"]:
            top_tweets[entitytype] = []
            top_entities = countstore.get_top(
                entitytype, segmentation, settings.Aggregation['top_entities'], ts)
            for entity, count in top_entities:
                data = {"text": entity, "count": count, "tweets": []}
                tweets = top_tweets_cache.get((entitytype, entity, ts))
                if not tweets:
                    print "fetching tweets for " + str((entitytype, entity, ts))
                    # Use a separate variable here: reusing `segmentation` would
                    # clobber the country key used above and in cache.put below.
                    tweet_segmentation = ":".join([entitytype, entity])
                    tweets = countstore.get_top(
                        "tweet", tweet_segmentation, settings.Aggregation['top_tweets'], ts)
                    tweets = map(lambda x: (tweetstore.get(x[0]), x[1]), tweets)
                    top_tweets_cache[(entitytype, entity, ts)] = tweets
                for tweet, count in tweets:
                    data["tweets"].append({"tweet": tweet.data, "count": count})
                top_tweets[entitytype].append(data)
        cache.put(segmentation, top_tweets)
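# A minimal sketch of how cache_top_tweets() might be run as a standalone job.
# The 15-minute period is an assumption (it matches the timeslice granularity seen
# in the CountStore tests), not a documented requirement.
if __name__ == "__main__":
    while True:
        cache_top_tweets()
        time.sleep(60 * 15)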
class KeywordsTweetDaemon(StreamDaemon):

    def __init__(self, daemon_number):
        super(KeywordsTweetDaemon, self).__init__()
        if daemon_number < 1:
            raise ValueError("daemon_number must be 1 or higher")
        self._daemon_number = daemon_number
        self._countstore = CountStore()
        self._tweetstore = TweetStore()

    def _get_streaming_args(self):
        self._credentials = settings.Twitter['accounts'][self._daemon_number]
        segs = ["C" + code for code in Geo.country_codes()]
        # We're only allowed to track 400 keywords per connection, so each daemon
        # takes 200 hashtags and 200 user mentions, and coordinates with the other
        # daemons (via a file lock and the Redis trackword cache) to avoid overlap.
        lock = FileLock('/tmp/trackwords')
        with lock:
            all_hashtags = self._get_all_entities("hashtag", segs)
            all_usermentions = self._get_all_entities("user_mention", segs)
            used_hashtags, used_usermentions = map(set, self._get_used_trackwords())
            hashtags = [ht for ht in all_hashtags if ht not in used_hashtags][:200]
            usermentions = [um for um in all_usermentions if um not in used_usermentions][:200]
            self._set_used_trackwords(hashtags, usermentions)
        self._payload = {'track': hashtags + usermentions}
        return self._credentials, self._payload

    def _on_tweet_callback(self, tweet):
        self._tweetstore.put(tweet)
        tweet_id = tweet.original_id
        for entity_type, entities in tweet.entities.items():
            for entity in entities:
                segmentation = ":".join([entity_type, entity])
                self._countstore.put(tweet_id, "tweet", segmentation)

    def _get_all_entities(self, entity_type, segmentations):
        fetch = settings.Aggregation["top_entities"]
        # get_top returns (entity_id, count) pairs (see the CountStore tests), so
        # collect just the entity ids for each segmentation and union them.
        get_top_entities_for_seg = lambda s: set(
            entity for entity, _count in self._countstore.get_top(
                entity_type=entity_type, segmentation=s, num_to_get=fetch))
        top_entities = map(get_top_entities_for_seg, segmentations)
        return list(reduce(lambda x, y: x | y, top_entities))

    def _payload_is_empty(self):
        return not self._payload or not self._payload['track']

    def _get_used_trackwords(self):
        used_hashtags, used_usermentions = [], []
        last_index = len(settings.Twitter["accounts"])
        indices = range(1, last_index)
        cache = RedisCache(namespace=settings.TrackwordCache["namespace"])
        for idx in indices:
            if idx == self._daemon_number:
                continue
            hashtag_key = "streamer%s:hashtags" % idx
            usermention_key = "streamer%s:usermentions" % idx
            used_hashtags += cache.get(hashtag_key) or []
            used_usermentions += cache.get(usermention_key) or []
        return used_hashtags, used_usermentions

    def _set_used_trackwords(self, hashtags, usermentions):
        hashtag_key = "streamer%s:hashtags" % self._daemon_number
        usermention_key = "streamer%s:usermentions" % self._daemon_number
        cache = RedisCache(namespace=settings.TrackwordCache["namespace"])
        cache.put(hashtag_key, hashtags)
        cache.put(usermention_key, usermentions)

    def _reset_used_trackwords(self):
        last_index = len(settings.Twitter["accounts"])
        indices = range(1, last_index)
        cache = RedisCache(namespace=settings.TrackwordCache["namespace"])
        for idx in indices:
            hashtag_key = "streamer%s:hashtags" % idx
            usermention_key = "streamer%s:usermentions" % idx
            cache.delete(hashtag_key)
            cache.delete(usermention_key)

    def _filter_track(self, words):
        # Drop trackwords whose UTF-8 encoding exceeds Twitter's 60-byte limit.
        return [utf8(w[0]) for w in words if utf8(w[0])[:60] == utf8(w[0])]
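# Usage sketch (assumption): StreamDaemon presumably exposes an entry point that
# calls _get_streaming_args(), checks _payload_is_empty(), and feeds tweets to
# _on_tweet_callback(); it is not shown here, so run() below is a hypothetical name
# used only to illustrate how a keyword daemon might be started. Account 0 is left
# to GeoTweetDaemon, so daemon numbers start at 1.
if __name__ == "__main__":
    import sys
    daemon = KeywordsTweetDaemon(int(sys.argv[1]))  # e.g. 1 .. len(accounts) - 1
    daemon.run()  # hypothetical StreamDaemon entry point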
def test_get_top(self):
    countstore = CountStore()
    countstore.put("entity1", "test_entity", "test_seg")
    countstore.put("entity2", "test_entity", "test_seg")
    countstore.put("entity2", "test_entity", "test_seg")
    actual = countstore.get_top("test_entity", "test_seg", 10)
    expected = [("entity2", 2), ("entity1", 1)]
    self.assertEqual(expected, actual)

    # One hour later: counts accumulate across timeslices within the window.
    fixture.setup_mock_time(fixture.jan_1st_2013_midday + 3600)
    countstore.put("entity1", "test_entity", "test_seg")
    countstore.put("entity1", "test_entity", "test_seg")
    countstore.put("entity3", "test_entity", "test_seg")
    actual = countstore.get_top("test_entity", "test_seg", 10)
    expected = [("entity1", 3), ("entity2", 2), ("entity3", 1)]
    self.assertEqual(expected, actual)

    # Two hours later: the first hour's counts have fallen out of the window.
    fixture.setup_mock_time(fixture.jan_1st_2013_midday + 2 * 3600)
    countstore.put("entity3", "test_entity", "test_seg")
    countstore.put("entity3", "test_entity", "test_seg")
    actual = countstore.get_top("test_entity", "test_seg", 10)
    expected = [("entity3", 3), ("entity1", 2)]
    self.assertEqual(expected, actual)