예제 #1
0
    def get_top_entities(self, requested_entities):
        """Count entity occurrences over the collection and return the top ones.

        Args:
            requested_entities: mapping of entity type to the number of top
                entries wanted for that type. A value of ``0`` means "return
                every entry". ``'user_mentions'`` are keyed by their
                ``'id_str'`` field, ``'hashtags'`` and ``'symbols'`` by
                ``'text'`` (falling back to ``'full_text'``), and every other
                type by ``'url'``.

        Returns:
            A dict mapping each requested entity type to a dict of
            ``{entity_value: count}``, inserted in descending count order.
            When fewer distinct values were seen than requested (but at least
            one was seen), the missing slots are filled with integer keys
            ``0, 1, ...`` mapped to ``None``. A type with no occurrences at
            all maps to an empty dict.
        """
        tweet_parser = smappdragon.TweetParser()
        # One tally per requested type, created up front so that types which
        # never occur still show up in the result.
        tallies = {entity_type: {} for entity_type in requested_entities}

        for tweet in self.get_collection_iterators():
            for entity_type, tally in tallies.items():
                for entity in tweet_parser.get_entity(entity_type, tweet):
                    if entity_type == 'user_mentions':
                        entity_value = tweet_parser.get_entity_field(
                            'id_str', entity)
                    elif entity_type in ('hashtags', 'symbols'):
                        try:
                            entity_value = tweet_parser.get_entity_field(
                                'text', entity)
                        # Was a bare ``except:``, which also swallowed
                        # KeyboardInterrupt/SystemExit.
                        # NOTE(review): the exception get_entity_field raises
                        # is not visible here; narrow further once confirmed.
                        except Exception:
                            entity_value = tweet_parser.get_entity_field(
                                'full_text', entity)
                    else:
                        entity_value = tweet_parser.get_entity_field(
                            'url', entity)

                    tally[entity_value] = tally.get(entity_value, 0) + 1

        top_entities = {}
        for entity_type, tally in tallies.items():
            top_entities[entity_type] = {}
            if not tally:
                # Nothing seen for this type: leave it empty (no None padding).
                continue

            # sorted() is stable, so equal counts keep first-seen order.
            ranked = sorted(tally.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
            wanted = requested_entities[entity_type]
            if wanted == 0:
                # 0 means the caller wants every entity.
                top_entities[entity_type] = dict(ranked)
                continue

            top = dict(ranked[:wanted])
            # If fewer values exist than requested, pad with None placeholders
            # keyed by 0..(missing - 1); range() is empty otherwise.
            for i in range(wanted - len(ranked)):
                top[i] = None
            top_entities[entity_type] = top
        return top_entities
예제 #2
0
 def new_get_iterator():
     """Yield tweets from a random sample of at most ``k`` collection items.

     The sample is drawn by reservoir sampling over
     ``self.collection.get_iterator()``; the collection's filter and custom
     filters are applied afterwards, so fewer than ``k`` tweets may be yielded.

     NOTE(review): ``self``, ``k`` and ``count`` come from the enclosing
     scope. ``count`` is never updated in this function, so the limit check
     compares against whatever value the enclosing scope holds - confirm
     that is intended.
     """
     parser = smappdragon.TweetParser()
     source = iter(self.collection.get_iterator())

     # Seed the reservoir with the first k items, in random order.
     reservoir = list(itertools.islice(source, k))
     random.shuffle(reservoir)

     # Each later item (1-based position n) replaces a random slot when
     # randrange(n) lands inside the reservoir.
     for position, candidate in enumerate(source, start=k + 1):
         slot = random.randrange(position)
         if slot < k:
             reservoir[slot] = candidate

     for tweet in reservoir:
         limit_reached = (self.collection.limit != 0
                          and self.collection.limit <= count)
         if limit_reached:
             return
         if not parser.tweet_passes_filter(self.collection.filter, tweet):
             continue
         if parser.tweet_passes_custom_filter_list(
                 self.collection.custom_filters, tweet):
             yield tweet