def extract_local_words_batch(self, params):
    """Extract local words from the tweets of every location-labeled user.

    Every user yielded by ``self.users.iter()`` that has a non-``None``
    ``location_point`` is assigned a city label, ``str(self.model.predict(
    [location])[0])``.  Each distinct word found in that user's tweets is
    then counted once for that city, so a count is a number of users, not a
    number of tweets.  Words whose total count reaches ``params['cmin']``
    and whose divergence is accepted by ``self.check_divergence`` are kept.

    Args:
        params: mapping that must provide ``'cmin'``; it is also handed
            unchanged to ``self.calc_divergence`` and
            ``self.check_divergence``.

    Returns:
        ``Words(lwords)`` where ``lwords`` maps each kept word to
        ``{'word': w, 'd': divergence, 'distribution': {city: count}}``.
    """
    # word -> {city label: number of labeled users in that city using it}
    word_counts = {}
    for user in self.users.iter():
        location = user['location_point']
        if location is None:
            # Unlabeled users carry no location evidence.
            continue
        city = str(self.model.predict([location])[0])
        tweets = self.tweets.get(user['id'])
        if tweets is None:
            # Defensive: if the tweet store answers None for a user without
            # tweets (not visible from here), that user contributes nothing.
            continue
        # A set, so a word repeated by the same user is counted only once.
        user_words = set()
        for tweet in tweets:
            user_words.update(Util.get_words(tweet['text']))
        for w in user_words:
            by_city = word_counts.setdefault(w, {})
            by_city[city] = by_city.get(city, 0) + 1

    # Keep only frequent words whose city distribution passes the check.
    lwords = {}
    for w, distribution in word_counts.items():
        total = float(sum(distribution.values()))
        if total >= params['cmin']:
            d = self.calc_divergence(total, distribution, params)
            if self.check_divergence(d, params):
                lwords[w] = {'word': w, 'd': d, 'distribution': distribution}  # save as dict
    return Words(lwords)
def extract_local_words(self, tweets, params):
    """Extract local words from a given collection of tweets.

    The tweets are first grouped into, per word, the set of ids of the users
    who used it.  For every such user that ``self.users.get`` knows and that
    has a non-``None`` ``location_point``, the word is counted once for the
    user's city label, ``str(self.model.predict([location])[0])``.  Words
    whose total count reaches ``params['cmin']`` and whose divergence is
    accepted by ``self.check_divergence`` are kept.

    Args:
        tweets: iterable of mappings providing ``'text'`` and ``'user_id'``.
        params: mapping that must provide ``'cmin'``; it is also handed
            unchanged to ``self.calc_divergence`` and
            ``self.check_divergence``.

    Returns:
        ``Words(lwords)`` where ``lwords`` maps each kept word to
        ``{'word': w, 'd': divergence, 'distribution': {city: count}}``.
    """
    # word -> ids of the users who used it (a user counts once per word).
    user_sets = {}
    for tweet in tweets:
        for w in Util.get_words(tweet['text']):
            user_sets.setdefault(w, set()).add(tweet['user_id'])

    # user id -> city label, or None for unknown / unlabeled users.  The
    # label depends only on the user, so it is resolved once per user
    # instead of once per (word, user) pair.
    city_of = {}
    # word -> {city label: number of labeled users in that city using it}
    word_counts = {}
    for w, user_ids in user_sets.items():
        for user_id in user_ids:
            if user_id not in city_of:
                user = self.users.get(user_id)
                location = None if user is None else user['location_point']
                if location is None:
                    city_of[user_id] = None
                else:
                    city_of[user_id] = str(self.model.predict([location])[0])
            city = city_of[user_id]
            if city is None:
                continue
            by_city = word_counts.setdefault(w, {})
            by_city[city] = by_city.get(city, 0) + 1

    # Keep only frequent words whose city distribution passes the check.
    lwords = {}
    for w, distribution in word_counts.items():
        total = float(sum(distribution.values()))
        if total >= params['cmin']:
            d = self.calc_divergence(total, distribution, params)
            if self.check_divergence(d, params):
                lwords[w] = {'word': w, 'd': d, 'distribution': distribution}  # save as dict
    return Words(lwords)
def update_user_distributions(self, tweets, tlwords, params):
    """Fold word distributions from tweets into unlabeled users' distributions.

    For each tweet, the candidate users are the tweet's author plus the
    result of ``self.get_neighbors(author_id, params)``.  Candidates that
    ``self.users.get`` does not know, or that already have a
    ``location_point``, are skipped.  Every remaining (unlabeled) candidate
    gets an entry in ``self.user_distributions`` (created with
    ``self.init_user_distribution()`` when missing), which is then combined
    via ``self.add_distribution`` with, for each word of the tweet in order:

    * the distribution of the word in ``tlwords``, if it contains the word;
    * the distribution of the word in ``self.lwords``, if it contains the
      word and its ``'d'`` passes the ``params['dmin']`` threshold
      (``>=`` when ``params['divergence']`` is ``'l2'`` or ``'kl'``,
      ``<=`` otherwise).

    Args:
        tweets: iterable of mappings providing ``'user_id'`` and ``'text'``.
        tlwords: object offering ``contain(word)`` and ``get(word)``; the
            latter returns a mapping with a ``'distribution'`` entry.
        params: mapping providing ``'divergence'`` and ``'dmin'``; it is
            also handed unchanged to ``self.get_neighbors``.

    Returns:
        None.  ``self.user_distributions`` is updated in place.
    """
    for tweet in tweets:
        author_id = tweet['user_id']
        candidates = self.get_neighbors(author_id, params) | {author_id}
        # Distributions contributed by this tweet.  They depend only on the
        # tweet, so they are collected once, and only if some candidate
        # actually turns out to be unlabeled.
        contributions = None
        for user_id in candidates:
            user = self.users.get(user_id)
            if user is None or user['location_point'] is not None:
                # Unknown or already-labeled users are not updated.
                continue
            key = user['id']
            if key not in self.user_distributions:
                self.user_distributions[key] = self.init_user_distribution()
            if contributions is None:
                contributions = []
                for w in Util.get_words(tweet['text']):
                    if tlwords.contain(w):
                        # update using temporally-local word
                        contributions.append(tlwords.get(w)['distribution'])
                    if self.lwords.contain(w):
                        # update using local word
                        lword = self.lwords.get(w)
                        if params['divergence'] in ('l2', 'kl'):
                            passes = lword['d'] >= params['dmin']
                        else:
                            passes = lword['d'] <= params['dmin']
                        if passes:
                            contributions.append(lword['distribution'])
            for distribution in contributions:
                self.user_distributions[key] = self.add_distribution(
                    self.user_distributions[key], distribution)