Пример #1
0
    def extract_local_words_batch(self, params):
        """Extract local words from the tweets of every located user.

        For each user yielded by ``self.users.iter()`` whose
        ``'location_point'`` is not ``None``, the user's city is taken as the
        string form of the first prediction of ``self.model`` for that
        location.  Every distinct word found in the user's tweets then adds
        one to that word's count for the city, so a user contributes at most
        once per word regardless of how often they used it.

        Args:
            params: Mapping that must provide ``'cmin'``, the minimum total
                user count a word needs before its divergence is evaluated.
                It is also forwarded unchanged to ``self.calc_divergence``
                and ``self.check_divergence``.

        Returns:
            ``Words`` wrapping a dict that maps each accepted word to
            ``{'word': word, 'd': divergence, 'distribution': city_counts}``,
            where ``city_counts`` maps city label -> number of users.
        """
        lwords = {}
        word_counts = {}  # word -> {city label -> number of distinct users}

        for user in self.users.iter():
            location = user['location_point']
            # Identity test: an equality test against None misbehaves for
            # array-like locations whose comparison is element-wise.
            if location is None:
                continue
            city = str(self.model.predict([location])[0])

            # Collapse repeated words so each user counts once per word.
            user_words = set()
            for tweet in self.tweets.get(user['id']):
                user_words.update(Util.get_words(tweet['text']))

            for w in user_words:
                city_counts = word_counts.setdefault(w, {})
                city_counts[city] = city_counts.get(city, 0) + 1

        # Keep only words that are frequent enough and whose divergence
        # passes the check.
        for w, distribution in word_counts.items():
            N = float(sum(distribution.values()))
            if N >= params['cmin']:
                d = self.calc_divergence(N, distribution, params)
                if self.check_divergence(d, params):
                    lwords[w] = {'word': w, 'd': d, 'distribution': distribution}  # save as dict
        return Words(lwords)
Пример #2
0
    def extract_local_words(self, tweets, params):
        """Extract local words from a given collection of tweets.

        The tweets are first grouped into, for every word, the set of ids of
        the users who used it.  Each of those users is looked up through
        ``self.users.get``; users that are unknown (``None``) or have no
        ``'location_point'`` are ignored.  The remaining users add one to the
        word's count for their city, the city being the string form of the
        first prediction of ``self.model`` for the user's location.

        Args:
            tweets: Iterable of tweet mappings; ``'text'`` and ``'user_id'``
                are read from each one.
            params: Mapping that must provide ``'cmin'``, the minimum total
                user count a word needs before its divergence is evaluated.
                It is also forwarded unchanged to ``self.calc_divergence``
                and ``self.check_divergence``.

        Returns:
            ``Words`` wrapping a dict that maps each accepted word to
            ``{'word': word, 'd': divergence, 'distribution': city_counts}``,
            where ``city_counts`` maps city label -> number of users.
        """
        lwords = {}
        word_counts = {}  # word -> {city label -> number of distinct users}

        # Making user sets: word -> ids of the users who used that word.
        user_sets = {}
        for tweet in tweets:
            for w in Util.get_words(tweet['text']):
                user_sets.setdefault(w, set()).add(tweet['user_id'])

        # Making word distributions.  A user's city depends only on the
        # user, so it is resolved once and remembered instead of repeating
        # the user lookup and the model prediction for every word they used.
        # None marks users that are unknown or unlabeled (a real city label
        # is always a str, so the sentinel is unambiguous).
        city_by_user = {}
        for w, user_ids in user_sets.items():
            for user_id in user_ids:
                if user_id not in city_by_user:
                    user = self.users.get(user_id)
                    location = None if user is None else user['location_point']
                    if location is None:
                        city_by_user[user_id] = None
                    else:
                        city_by_user[user_id] = str(self.model.predict([location])[0])

                city = city_by_user[user_id]
                if city is None:
                    continue
                # Labeled user: count them once for this word in their city.
                city_counts = word_counts.setdefault(w, {})
                city_counts[city] = city_counts.get(city, 0) + 1

        # Calculating divergences: keep only words that are frequent enough
        # and whose divergence passes the check.
        for w, distribution in word_counts.items():
            N = float(sum(distribution.values()))
            if N >= params['cmin']:
                d = self.calc_divergence(N, distribution, params)
                if self.check_divergence(d, params):
                    lwords[w] = {'word': w, 'd': d, 'distribution': distribution}  # save as dict
        return Words(lwords)
Пример #3
0
 def update_user_distributions(self, tweets, tlwords, params):
     """Update the location distributions of unlabeled users from tweets.

     For every tweet, the author and the author's neighbors (as returned by
     ``self.get_neighbors``) are considered.  Each of them that is known to
     ``self.users`` and has no ``'location_point'`` gets an entry in
     ``self.user_distributions`` (created with
     ``self.init_user_distribution()`` if missing).  That entry is then
     combined, via ``self.add_distribution``, with the distribution of:

     * every word of the tweet contained in ``tlwords`` (temporally-local
       words), unconditionally;
     * every word of the tweet contained in ``self.lwords`` (local words)
       whose divergence ``'d'`` passes the ``params['dmin']`` threshold.

     Args:
         tweets: Iterable of tweet mappings; ``'text'`` and ``'user_id'``
             are read from each one.
         tlwords: Word collection exposing ``contain(word)`` and
             ``get(word)``; entries provide ``'distribution'``.
         params: Mapping providing ``'divergence'`` and ``'dmin'``; it is
             also forwarded to ``self.get_neighbors``.

     Returns:
         None.  ``self.user_distributions`` is updated in place.
     """
     for tweet in tweets:
         neighbors = self.get_neighbors(tweet['user_id'], params)
         users = neighbors | {tweet['user_id']}
         # The tweet's words are the same for every affected user, so they
         # are tokenized at most once per tweet (and not at all when no
         # unlabeled user is involved).
         words = None
         for user_id in users:
             user = self.users.get(user_id)
             # Only known, unlabeled users carry an estimated distribution.
             if user is None or user['location_point'] is not None:
                 continue
             uid = user['id']
             if uid not in self.user_distributions:
                 self.user_distributions[uid] = self.init_user_distribution()
             if words is None:
                 # Materialized so the words can be re-iterated per user.
                 words = list(Util.get_words(tweet['text']))
             for w in words:
                 if tlwords.contain(w):
                     # Update using temporally-local word.
                     tlword = tlwords.get(w)
                     self.user_distributions[uid] = self.add_distribution(self.user_distributions[uid], tlword['distribution'])
                 if self.lwords.contain(w):
                     # Update using local word, if it passes the threshold.
                     lword = self.lwords.get(w)
                     if params['divergence'] in ['l2', 'kl']:
                         # For 'l2' and 'kl' the word qualifies at or above dmin.
                         is_local = lword['d'] >= params['dmin']
                     else:
                         # Any other measure qualifies at or below dmin.
                         is_local = lword['d'] <= params['dmin']
                     if is_local:
                         self.user_distributions[uid] = self.add_distribution(self.user_distributions[uid], lword['distribution'])