def get_top_entities(self, requested_entities):
    """Count entity values across the collection and return the top ones.

    Every tweet from ``self.get_collection_iterators()`` is scanned once.
    For each requested entity type, one field of each entity is tallied:

    * ``'user_mentions'``        -> the ``'id_str'`` field
    * ``'hashtags'``/``'symbols'`` -> the ``'text'`` field, falling back to
      ``'full_text'`` if the ``'text'`` lookup raises
    * any other type             -> the ``'url'`` field

    Args:
        requested_entities: mapping of entity type to the number of top
            values wanted for that type. ``0`` means "return all values".

    Returns:
        A dict keyed by entity type. Each value is a dict mapping entity
        value to its count, inserted in descending count order. If at
        least one value was seen but fewer than requested, the dict is
        padded with integer keys ``0 .. missing - 1`` mapped to ``None``.
        An entity type for which nothing was seen maps to an empty dict
        (no padding).
    """
    tweet_parser = smappdragon.TweetParser()

    # One tally dict per requested entity type.
    counts = {entity_type: {} for entity_type in requested_entities}

    for tweet in self.get_collection_iterators():
        for entity_type in requested_entities:
            type_counts = counts[entity_type]
            for entity in tweet_parser.get_entity(entity_type, tweet):
                if entity_type == 'user_mentions':
                    entity_value = tweet_parser.get_entity_field(
                        'id_str', entity)
                elif entity_type in ('hashtags', 'symbols'):
                    try:
                        entity_value = tweet_parser.get_entity_field(
                            'text', entity)
                    except Exception:
                        # Best-effort fallback kept from the original, but
                        # narrowed from a bare ``except`` so that
                        # KeyboardInterrupt/SystemExit still propagate.
                        entity_value = tweet_parser.get_entity_field(
                            'full_text', entity)
                else:
                    entity_value = tweet_parser.get_entity_field(
                        'url', entity)

                type_counts[entity_value] = type_counts.get(entity_value, 0) + 1

    top_entities = {}
    for entity_type, type_counts in counts.items():
        top_entities[entity_type] = {}
        if not type_counts:
            # Nothing seen for this type: leave the empty dict, unpadded.
            continue

        # Stable sort: values with equal counts keep first-seen order.
        ranked = sorted(type_counts.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
        wanted = requested_entities[entity_type]

        if wanted == 0:
            # 0 means the caller wants every value.
            top = dict(ranked)
        elif len(ranked) < wanted:
            # Too few values: return them all and pad the shortfall with
            # integer keys mapped to None.
            top = dict(ranked)
            for i in range(wanted - len(ranked)):
                top[i] = None
        else:
            top = dict(ranked[:wanted])

        top_entities[entity_type] = top

    return top_entities
def new_get_iterator():
    """Yield a random sample of at most ``k`` tweets that pass the filters.

    This is a closure: ``self``, ``k`` and ``count`` are taken from the
    enclosing scope, which is not part of this block.

    The tweets from ``self.collection.get_iterator()`` are reduced to a
    sample of at most ``k`` items by reservoir sampling. Each sampled
    tweet is then yielded only if it passes both ``self.collection.filter``
    and ``self.collection.custom_filters``. Iteration stops as soon as
    ``self.collection.limit`` is non-zero and ``<= count``.

    NOTE(review): ``count`` is never modified inside this function;
    whether the enclosing scope updates it is not visible here - confirm.
    """
    parser = smappdragon.TweetParser()
    stream = iter(self.collection.get_iterator())

    # Fill the reservoir with the first k tweets, then randomize their order.
    reservoir = list(itertools.islice(stream, k))
    random.shuffle(reservoir)

    # Every later tweet (the ``seen``-th overall) overwrites a reservoir
    # slot only when its random draw lands inside the reservoir.
    for seen, candidate in enumerate(stream, start=k + 1):
        slot = random.randrange(seen)
        if slot < k:
            reservoir[slot] = candidate

    for tweet in reservoir:
        if self.collection.limit != 0 and self.collection.limit <= count:
            return
        if tweet_passes := parser.tweet_passes_filter(
                self.collection.filter, tweet):
            if parser.tweet_passes_custom_filter_list(
                    self.collection.custom_filters, tweet):
                yield tweet