class Wheres_Wally(GIMethod):
    def __init__(self):
        # Location is represented as a lat/lon geopy Point
        self.user_id_to_location = {}
        self.geocoder = None
        self.unique_locations = set()
        self.id_to_location = {}
        # Mappings from feature names to their corresponding indices in a
        # feature vector
        self.pop_bin_feature_indices = {}
        self.reciprocal_feature_indices = {}
        self.triad_feature_indices = {}
        self.total_num_features = 0
        # The SVM classifier and feature vector scaler
        self.location_classifier = None
        self.location_vector_scaler = None

    def train_model(self, settings, dataset, model_dir):
        # Initialize the geocoder, which we'll use to resolve location
        # strings.  We use the default name-to-location mapping unless the
        # user has specified otherwise.
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()

        # NOTE: The original paper used the directional friends/followers
        # network.  However, the paper was tested on a much smaller network
        # (9.8M edges), which doesn't scale when including the full network.
        # We opt for the bi-directional network as it (1) provides a stronger
        # signal of social relationships and (2) significantly reduces the
        # memory requirement.
        LOGGER.debug('Loading mention network')
        mention_network = dataset.bi_mention_network()

        # This dict will contain a mapping from user ID to an associated home
        # location, which is derived either from the location field (as in
        # the original paper), from GPS-tagged tweets, or from both
        user_to_home_loc = {}

        # For each of the users that we have in the network, see if we can
        # associate that user with a home location.
        all_users = set(mention_network.nodes_iter())

        LOGGER.debug('Calculating users with recognizable home location')
        num_users_processed = 0

        # Keep track of how many times each location occurred.  We'll filter
        # this down to only the most common locations
        location_counts = collections.Counter()

        for user_id, home_loc in dataset.user_home_location_iter():
            if not user_id in all_users:
                continue

            # home_loc is a (lat,lon) tuple.  While this is accurate, we want
            # to coarsen the location data to decrease sparsity (i.e., more
            # people located in the same city location, despite slightly
            # different underlying lat/lon values).  Here, use the Geocoder
            # to map the lat/lon to a name and then back to a canonical
            # lat/lon for that name
            canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1])

            location_counts[canonical_lat_lon] += 1
            user_to_home_loc[user_id] = canonical_lat_lon
            num_users_processed += 1
            if num_users_processed % 500000 == 0:
                LOGGER.debug('Processed %s of the %s users, associated %s with a known location (%s)'
                             % (num_users_processed, len(all_users), len(user_to_home_loc),
                                len(user_to_home_loc) / float(num_users_processed)))

        # Iterate through the locations, pruning out those that do not occur
        # more than some threshold number of times
        num_locs_removed = 0
        for lat_lon, count in location_counts.iteritems():
            if count >= 20:
                self.unique_locations.add(lat_lon)
            else:
                num_locs_removed += 1
        LOGGER.debug('Saw %d locations, %d with at least 20 users, %d to be pruned'
                     % (len(location_counts), len(self.unique_locations), num_locs_removed))

        # Remove the home locations of users whose locations aren't in the
        # pruned list of minimum-frequency locations
        num_user_home_locs_removed = 0
        for user_id, loc in user_to_home_loc.items():
            if not loc in self.unique_locations:
                del user_to_home_loc[user_id]
                num_user_home_locs_removed += 1
        LOGGER.debug('After pruning removed home locations of %d users, %d still have homes'
                     % (num_user_home_locs_removed, len(user_to_home_loc)))

        # Create a bi-directional mapping from locations to unique numeric
        # identifiers.  This mapping will be used when representing locations
        # in the classifier feature space and when converting classifier
        # output to specific locations
        location_to_id = {}
        for loc in self.unique_locations:
            id_ = len(location_to_id)
            location_to_id[loc] = id_
            self.id_to_location[id_] = loc

        # Associate each location with its set of features
        n = len(self.unique_locations)

        # Each location has 7 features associated with it for classifying a
        # user's location.  The seven features per location are arranged next
        # to each other in the feature space.
        feature_offset = 0
        for loc in self.unique_locations:
            # Feat1: its population bin (size approx.)
            self.pop_bin_feature_indices[loc] = feature_offset
            # Feat2: the number of reciprocal friends
            self.reciprocal_feature_indices[loc] = feature_offset + 1
            # Feat3-7: the bins indicating how many friends were in
            # reciprocal triads in that city
            for bin_num in range(0, 5):
                feat = "%s,%s:%s" % (loc[0], loc[1], bin_num)
                self.triad_feature_indices[feat] = feature_offset + bin_num + 2
            # Increment the feature offset so the next city's features don't
            # collide with this city's indices
            feature_offset += 7

        # Set the total number of features seen
        self.total_num_features = feature_offset
        LOGGER.debug('Saw %d unique locations, %d total features'
                     % (len(self.unique_locations), feature_offset))

        LOGGER.debug('Associated %s of the %s users with a known location (%s unique)'
                     % (len(user_to_home_loc), len(all_users), len(self.unique_locations)))

        # The list of locations for each corresponding user in X
        B = []

        # Train the classifier based on users with known home locations
        LOGGER.debug("Generating feature vectors for training")
        X = scipy.sparse.lil_matrix((len(user_to_home_loc),
                                     self.total_num_features), dtype=numpy.float64)
        row = 0
        total_nz = 0
        for user_id, location in user_to_home_loc.iteritems():
            # Skip users whose locations were omitted due to frequency
            # filtering or who have home locations but are not in the mention
            # network
            # if not location in self.unique_locations or not user_id in all_users:
            #     continue

            # Fill the row in the matrix corresponding to this user's features
            nz = self.fill_user_vector(user_id, mention_network,
                                       user_to_home_loc, X, row)
            total_nz += nz

            # Get the index of this user's location
            location_id = location_to_id[location]
            B.append(location_id)
            row += 1

        X = X.tocsr()
        # X = X.toarray()

        LOGGER.debug("Generated training data for %d users, %d nz features, %f on average"
                     % (row, total_nz, float(total_nz) / row))

        # Convert the location list into a numpy array for use with scikit
        Y = numpy.asarray(B)

        if len(X.nonzero()[0]) == 0:
            LOGGER.warning("Too little training data seen and no user had non-zero feature " +
                           "values. Cowardly aborting classification")
        else:
            # Use an SVM classifier with a linear kernel.
            #
            # NOTE: The original paper uses an RBF kernel with their SVM.
            # However, this proved impossibly slow during testing, so a
            # linear kernel is used instead.
            #
            # slow: self.location_classifier = svm.SVC(kernel='rbf')
            # self.location_classifier = svm.LinearSVC(dual=False)
            # self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000)
            # self.location_classifier = naive_bayes.BernoulliNB()
            self.location_classifier = svm.LinearSVC(dual=False, loss='l2',
                                                     penalty='l2', tol=1e-2)

            # Note: we expect the vector representations to be sparse, so
            # avoid mean scaling since it would create dense vectors, which
            # would blow up the memory consumption of the model
            self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False)

            # Learn the scaling parameters and then rescale the input
            LOGGER.debug("Scaling feature vectors for training")
            X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64))

            LOGGER.debug("Training classifier")
            self.location_classifier.fit(X_scaled, Y)
            LOGGER.debug("Finished training classifier")

        # Assign all the users some location, if we can figure it out
        users_assigned = 0
        users_seen = 0
        for user_id in all_users:
            users_seen += 1
            # If we know where to place this user, assign them to their home
            # location
            if user_id in user_to_home_loc:
                self.user_id_to_location[user_id] = user_to_home_loc[user_id]
            # Otherwise try to infer the location
            else:
                location = self.infer_location(user_id, mention_network,
                                               user_to_home_loc)
                if not location is None:
                    self.user_id_to_location[user_id] = location
                    users_assigned += 1
            if users_seen % 100000 == 0:
                LOGGER.debug("Saw %d/%d users, knew location of %d, inferred the location of %d (total: %d)"
                             % (users_seen, len(all_users),
                                len(self.user_id_to_location) - users_assigned,
                                users_assigned, len(self.user_id_to_location)))

        LOGGER.debug("Ultimately saw %d/%d users, knew location of %d, inferred the location of %d (total: %d)"
                     % (users_seen, len(all_users),
                        len(self.user_id_to_location) - users_assigned,
                        users_assigned, len(self.user_id_to_location)))

        # Short circuit early if the caller has specified that the model is
        # not to be saved into a directory
        if model_dir is None:
            return Wheres_Wally_Model(self.user_id_to_location)

        if not os.path.exists(model_dir):
            os.mkdir(model_dir)

        # Write the .tsv for human debuggability too
        fh = gzip.open(os.path.join(model_dir, 'user-to-lat-lon.tsv.gz'), 'w')
        for user_id, loc in self.user_id_to_location.iteritems():
            fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
        fh.close()

        return Wheres_Wally_Model(self.user_id_to_location)

    def infer_location(self, user_id, mention_network, user_to_home_loc):
        """
        Infers and returns the location of the provided user based on their
        features in the network
        """
        # Ensure that the model has been trained; otherwise, report an empty
        # classification
        if self.location_vector_scaler is None or self.location_classifier is None:
            return None

        # Convert the user's network-based features into a numeric vector
        X = scipy.sparse.lil_matrix((1, self.total_num_features), dtype=numpy.float64)
        self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, 0)
        X = X.tocsr()

        # Rescale the vector according to the training data's scaling
        user_vector_scaled = self.location_vector_scaler.transform(X)

        # Classify the results
        location_id = self.location_classifier.predict(user_vector_scaled)[0]

        # Convert the index into a location
        return self.id_to_location[location_id]

    def fill_user_vector(self, user_id, mention_network, user_to_home_loc,
                         csr_matrix, row_to_fill):
        """
        Creates a vector for the user and fills their data into the specified
        row in the provided matrix
        """
        feat_dict = self.create_user_vector(user_id, mention_network,
                                            user_to_home_loc)
        nz = 0
        for col, val in feat_dict.iteritems():
            csr_matrix[row_to_fill, col] = val
            nz += 1
        return nz

    def create_user_vector(self, user_id, mention_network, user_to_home_loc):
        """
        Creates a vector to use with SciPy that represents this user's
        features
        """
        # The binned location features look at all the locations of this
        # user's neighbors and then provide a weight for each location
        # according to how many of the user's friends are in that location
        # multiplied by how large the city is, which is represented as one of
        # five bins
        location_to_friends = defaultdict(list)
        location_to_followers = defaultdict(list)
        num_friends = mention_network.degree(user_id)

        # Record which friends appear in each city
        for neighbor_id in mention_network.neighbors_iter(user_id):
            if neighbor_id in user_to_home_loc:
                location_name = user_to_home_loc[neighbor_id]
                location_to_friends[location_name].append(neighbor_id)
                location_to_followers[location_name].append(neighbor_id)

        # Since the vector is expected to be very sparse, create it as a dict
        # for the indices with non-zero feature values.
        classifier_input_vector = {}
        num_non_zero_features = 0

        # Each city/location generates 7 unique features in the best
        # performing system
        for city, followers_in_city in location_to_followers.iteritems():
            n = len(followers_in_city)
            # Feature 1: the city's bin multiplied by the number of users in
            # the city
            city_bin = self.get_city_bin(n)
            pop_bin_feature_index = self.pop_bin_feature_indices[city]
            classifier_input_vector[pop_bin_feature_index] = city_bin

        for city, friends_in_city in location_to_friends.iteritems():
            n = len(friends_in_city)
            # Feature 2: the fraction of friends with reciprocal edges at
            # that location (use float division so the fraction is not
            # truncated under Python 2)
            num_reciprocal_friends = 0
            for n1 in friends_in_city:
                if mention_network.has_edge(n1, user_id):
                    num_reciprocal_friends += 1
                    num_non_zero_features += 1
            reciprocal_feature_index = self.reciprocal_feature_indices[city]
            classifier_input_vector[reciprocal_feature_index] = float(num_reciprocal_friends) / n
            if num_reciprocal_friends > 0:
                num_non_zero_features += 1

            # Features 3-7: the number of triads in the city
            triad_counter = collections.Counter()
            for n1 in friends_in_city:
                num_triads = 0
                for n2 in friends_in_city:
                    if mention_network.has_edge(n1, n2):
                        num_triads += 1
                # Decide which bin this user is in
                triad_counter[self.get_triad_bin(num_triads)] += 1

            for bin_num, count in triad_counter.iteritems():
                feat = "%s,%s:%s" % (city[0], city[1], bin_num)
                triad_bin_feature_index = self.triad_feature_indices[feat]
                classifier_input_vector[triad_bin_feature_index] = float(count) / num_friends
                if count > 0:
                    num_non_zero_features += 1

        return classifier_input_vector

    def get_triad_bin(self, num_triads):
        """
        Returns which bin this count of the number of triads should be in
        """
        # Bins in the paper: [0, 5, 10, 20, 40]
        if num_triads < 5:
            return 0
        elif num_triads < 10:
            return 1
        elif num_triads < 20:
            return 2
        elif num_triads < 40:
            return 3
        else:
            return 4

    def get_city_bin(self, city_size):
        """
        Returns which bin this city's user count should be in
        """
        # Bins in the paper: [1, 2, 4, 12, 57054]
        if city_size <= 1:
            return 0
        elif city_size <= 2:
            return 1
        elif city_size <= 4:
            return 2
        elif city_size <= 12:
            return 3
        # This should be 57054, but we use any value larger than 12 to avoid
        # the edge case where a city has more than 57k users
        else:
            return 4

    def load_model(self, model_dir, settings):
        """
        Reads in the Where's Wally model from a gzipped .tsv
        """
        user_id_to_location = {}
        model_file = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), 'r')
        for line in model_file:
            cols = line.split("\t")
            user_id = cols[0]
            lat = float(cols[1])
            lon = float(cols[2])
            user_id_to_location[user_id] = (lat, lon)
        model_file.close()
        return Wheres_Wally_Model(user_id_to_location)
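# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the model): the per-city feature
# layout used by train_model/create_user_vector can be easy to misread, so
# the self-contained snippet below shows how each pruned location owns a
# contiguous block of seven columns (population bin, reciprocal-friend
# fraction, five triad-count bins).  The city keys, friend counts, and triad
# count are hypothetical values chosen for the example.
def _example_city_bin(city_size):
    # same bins as get_city_bin: [1, 2, 4, 12, 57054]
    for i, bound in enumerate([1, 2, 4, 12]):
        if city_size <= bound:
            return i
    return 4

def _example_triad_bin(num_triads):
    # same bins as get_triad_bin: [0, 5, 10, 20, 40]
    for i, bound in enumerate([5, 10, 20, 40]):
        if num_triads < bound:
            return i
    return 4

def _example_feature_layout():
    cities = [(40.71, -74.0), (41.88, -87.63)]   # hypothetical canonical lat/lons
    pop_bin_idx = dict((c, 7 * i) for i, c in enumerate(cities))
    reciprocal_idx = dict((c, 7 * i + 1) for i, c in enumerate(cities))
    triad_idx = dict(("%s,%s:%s" % (c[0], c[1], b), 7 * i + 2 + b)
                     for i, c in enumerate(cities) for b in range(5))

    # A user with 3 friends in the first city, 2 of whom reciprocate, and 7
    # triads among them, produces this sparse feature dict:
    vector = {pop_bin_idx[cities[0]]: _example_city_bin(3),
              reciprocal_idx[cities[0]]: 2 / 3.0,
              triad_idx["40.71,-74.0:%d" % _example_triad_bin(7)]: 3 / 3.0}
    return vector   # e.g. {0: 2, 1: 0.666..., 3: 1.0}
# ---------------------------------------------------------------------------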
class MultiLocation(object):
    """
    MultiLocation is the implemented method from "Multiple Location Profiling
    for Users and Relationships from Social Network and Content" by Rui Li,
    Shengjie Wang and Kevin Chen-Chuan Chang.
    """

    def __init__(self, settings):
        """
        Initializes class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        # self.u_n and self.u_star are the sets of users with unknown and
        # known locations, respectively
        self.u_n = set()
        self.u_star = set()

        # the set of all known venues
        self.venues = set()

        # psi counts, for each venue, its co-occurrences with users
        # (normalized by K in run_model)
        self.psi = Counter()

        # alpha and beta are the coefficients for eq. 1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045

        # K is the total number of tweeting relationships
        self.K = 0

        # N_squared is the total number of user pairs
        self.N_squared = 0

        # S is the number of following relationships
        self.S = 0

        # geocoder is a forward/reverse geocoder for location -> lat/lon and
        # lat/lon -> location
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()

        # F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        # T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        # mu and nu are the model selectors according to a Bernoulli
        # distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        # the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        # runs the model, populates all the variables and generates
        # user_multi_locations
        self.run_model()

    def store_location_data(self):
        """
        Sets the node_data field with the relevant gold-standard location
        data from the bidirectional dataset.
        """
        num_users_seen = 0
        for user_id, loc in MultiLocationMethod.dataset.user_home_location_iter():
            if loc[0] == 0 and loc[1] == 0:
                continue
            try:
                self.mention_network.set_node_data(user_id, loc)
                self.u_star.add(user_id)
                num_users_seen += 1
                if num_users_seen % 100000 == 0:
                    logger.debug('Multilocation saw %d users' % num_users_seen)
            except KeyError:
                pass

    def find_locations(self):
        users_seen = 1
        for possible_posts in MultiLocationMethod.dataset.user_iter():
            users_seen += 1
            if users_seen % 1000000 == 0:
                logger.debug("Seen %d users" % users_seen)
            user_id = possible_posts['user_id']
            posts = possible_posts['posts']
            if len(posts) > 600:
                posts = posts[-600:]
            for post in posts:
                # twokenizer may be too computationally expensive here...
                # text = tokenizer(post['text'])
                text = post['text'].split()
                lc_text = []
                is_upper = []
                for s in text:
                    isup = s[0].isupper()
                    is_upper.append(isup)
                    if isup:
                        lc_text.append(s.lower())
                    else:
                        lc_text.append(s)

                # Scan for capitalized one-, two-, and three-word spans and
                # try to geocode them as venue names
                i = 0
                n = len(text)
                while True:
                    if i >= n:
                        break
                    if not is_upper[i]:
                        i += 1
                        continue
                    is_up1 = i + 1 < n and is_upper[i + 1]
                    first_two_with_space = None
                    first_two_with_tab = None
                    if i + 2 < n and is_upper[i + 2] and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i + 1]
                        w3 = lc_text[i + 2]
                        first_two_with_space = w1 + " " + w2

                        s2 = first_two_with_space + " " + w3
                        location = self.geocoder.geocode(s2)
                        if not location is None:
                            self.record_user_location(s2, location, user_id)
                            i += 3
                            continue

                        s3 = first_two_with_space + "\t" + w3
                        location = self.geocoder.geocode(s3)
                        if not location is None:
                            self.record_user_location(s3, location, user_id)
                            i += 3
                            continue

                        first_two_with_tab = w1 + "\t" + w2
                        s4 = first_two_with_tab + "\t" + w3
                        location = self.geocoder.geocode(s4)
                        if not location is None:
                            self.record_user_location(s4, location, user_id)
                            i += 3
                            continue

                        s5 = first_two_with_tab + " " + w3
                        location = self.geocoder.geocode(s5)
                        if not location is None:
                            self.record_user_location(s5, location, user_id)
                            i += 3
                            continue
                    elif i + 1 < n and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i + 1]
                        if first_two_with_tab is None:
                            first_two_with_tab = w1 + "\t" + w2
                        location = self.geocoder.geocode(first_two_with_tab)
                        if not location is None:
                            self.record_user_location(first_two_with_tab, location, user_id)
                            i += 2
                            continue
                        if first_two_with_space is None:
                            first_two_with_space = w1 + " " + w2
                        location = self.geocoder.geocode(first_two_with_space)
                        if not location is None:
                            self.record_user_location(first_two_with_space, location, user_id)
                            i += 2
                            continue
                    else:
                        w1 = lc_text[i]
                        location = self.geocoder.geocode(w1)
                        if not location is None:
                            self.record_user_location(w1, location, user_id)
                    i += 1

    def record_user_location(self, location_name, location, user_id):
        try:
            self.mention_network.add_edge(user_id, location_name)
            self.mention_network.set_node_data(location_name, location)
        except:
            return
        self.venues.add(location_name)
        # key psi by the venue name so that tweeting_model's lookups over
        # self.venues find the recorded counts
        self.psi[location_name] += 1
        self.T_r[user_id] += 1
        self.K += 1
        return

    def compute_coefficients(self):
        """
        Computes the coefficients for equation (1) from the paper,
        P(f<i,j> | alpha, beta, x_i, y_i) = beta * distance(x_i, y_i) ** alpha
        """
        def func_to_fit(x, a, b):
            return b * x ** a

        mentions_per_distance = Counter()
        following_relationship = Counter()

        # our networks are too large to generate these coefficients over all
        # pairs on each call, so sample roughly the same number of
        # combinations as shown in the paper
        n = 10000000
        random_sample = list(self.u_star)
        number_of_users = len(self.u_star)

        for i in range(0, n):
            node_u, node_v = (random_sample[random.randint(0, number_of_users - 1)],
                              random_sample[random.randint(0, number_of_users - 1)])
            if node_u == node_v:
                continue
            l_u = self.mention_network.node_data(node_u)
            l_v = self.mention_network.node_data(node_v)
            distance = round(haversine(l_u, l_v, miles=True), 0)
            if distance > 10000:
                continue
            mentions_per_distance[distance] += 1.0
            self.N_squared += 1.0
            if self.mention_network.has_edge(node_u, node_v):
                following_relationship[distance] += 1.0
                self.S += 1.0

        # "ratio of the number of pairs that have a following relationship to
        # the total number of pairs in the d_th bucket"; iterate the sorted
        # buckets so that x and y stay aligned
        x = sorted(mentions_per_distance.keys())
        y = []
        for key in x:
            mentions = mentions_per_distance[key]
            following = following_relationship[key]
            y.append(following / mentions)
        # avoid raising zero to a negative power during the fit
        x[0] += 1e-8

        solutions = curve_fit(func_to_fit, x, y, p0=[-0.55, 0.0045], maxfev=100000)[0]
        self.alpha = solutions[0]
        self.beta = solutions[1]
        return

    def generate_model_selector(self):
        for user in self.u_n:
            # generate a model selector, mu, according to a Bernoulli
            # distribution
            if np.random.binomial(1, self.F_r) == 1:
                self.mu[user] = True
            else:
                self.mu[user] = False
            # normalizing by K so the parameter is a valid probability
            if np.random.binomial(1, (self.T_r[user] / float(self.K))) == 1:
                self.nu[user] = True
            else:
                self.nu[user] = False

    def random_following_model(self, user):
        """
        If mu = 1, we choose the random following model, using
        p(f<i,j> == 1 | F_r) to decide if the location of a neighbor of the
        user is a possible location.
        """
        for neighbor in self.mention_network.neighbors_iter(user):
            if neighbor not in self.u_star:
                continue
            elif np.random.binomial(1, self.F_r):
                self.user_multi_locations[user].append(
                    self.mention_network.node_data(neighbor))
        return

    def following_model(self, user):
        """
        If mu = 0, we decide whether there is f<i,j> based on the
        location-based following model as shown in eq. 1.
        """
        # (note: this is almost the same as the Backstrom paper, thus we skip
        # generating the theta values and just calculate the maximum
        # probability)
        def calculate_probability(l_u, l_v):
            """
            Calculates the probability P(f<i,j> | alpha, beta, location_1,
            location_2)
            """
            try:
                return self.beta * (abs(haversine(l_u, l_v))) ** self.alpha
            except:
                # fall back to a very small distance value
                return self.beta * (0.00000001) ** self.alpha

        best_log_probability = float('-inf')
        best_location = None
        for neighbor_u in self.mention_network.neighbors_iter(user):
            log_probability = 0
            if neighbor_u not in self.u_star:
                continue
            for neighbor_v in self.mention_network.neighbors_iter(neighbor_u):
                if neighbor_v not in self.u_star:
                    continue
                else:
                    l_u = self.mention_network.node_data(neighbor_u)
                    l_v = self.mention_network.node_data(neighbor_v)
                    plu_lv = calculate_probability(l_u, l_v)
                    try:
                        log_gamma_lu = math.log(plu_lv / (1 - plu_lv))
                    except ValueError:
                        # in the case where l_u == l_v, plu_lv --> 0 and
                        # log(1) = 0, thus this exception should be valid
                        log_gamma_lu = 0
                    log_probability += log_gamma_lu
            if log_probability > best_log_probability:
                best_log_probability = log_probability
                best_location = self.mention_network.node_data(neighbor_u)
        if best_location:
            self.user_multi_locations[user].append(best_location)
        return

    def random_tweeting_model(self, user):
        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            # normalize the tweet count by K so it is a valid Bernoulli
            # parameter
            elif np.random.binomial(1, self.T_r[user] / float(self.K)):
                self.user_multi_locations[user].append(
                    self.mention_network.node_data(venue))
        return

    def tweeting_model(self, user):
        best_probability = float("-inf")
        best_venue = None
        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            probability = self.psi[venue]
            if best_probability < probability:
                best_probability = probability
                best_venue = venue
        if best_venue:
            self.user_multi_locations[user].append(
                self.mention_network.node_data(best_venue))
        return

    def run_model(self):
        """
        run_model generates the values for all the initialized class
        variables, and follows the MLP algorithm described in the paper to
        infer locations for users.
        """
        # NOTE: K is not normalized to save computations, and is normalized
        # on the fly in generate_model_selector
        logger.debug("Variables have been initialized. Starting the model.")

        logger.debug("Storing location data...")
        self.store_location_data()
        self.u_n = self.nodes.difference(self.u_star)
        logger.debug("Location data stored!")

        logger.debug("Starting to compute the coefficients for the model...")
        # calculates the coefficients to be used in eq. 1, alpha and beta
        self.compute_coefficients()
        logger.debug("Coefficients have been calculated. Alpha: %f and beta: %f."
                     % (self.alpha, self.beta))

        logger.debug("Finding venue data...")
        self.find_locations()
        for venue in self.psi:
            self.psi[venue] /= float(self.K)
        logger.debug("Finished finding venue data! %d venues found!" % len(self.venues))

        # p(f<i,j> = 1 | F_r) = S / N^2
        self.F_r = (self.S / self.N_squared)

        # Section 4.4: generate model selectors based on Bernoulli
        # distributions using T_r and F_r
        logger.debug("Generating model selectors...")
        self.generate_model_selector()
        logger.debug("Model selectors have been generated!")

        logger.debug("Starting to find user locations...")
        for user in self.u_n:
            if self.mu[user]:
                self.random_following_model(user)
            else:
                self.following_model(user)
            if self.nu[user]:
                self.random_tweeting_model(user)
            else:
                self.tweeting_model(user)
        logger.debug("Finished finding user locations...")

        for user in self.user_multi_locations:
            location_list = self.user_multi_locations[user]
            location = self.get_geometric_mean(location_list)
            self.mention_network.set_node_data(user, location)

    def return_network(self):
        return self.mention_network

    def get_geometric_mean(self, locations):
        """
        Locates the geometric median of a list of locations (taken from David
        Jurgens' implementation).  With fewer than two locations the single
        location is returned; otherwise the geometric median is constructed.
        """
        n = len(locations)

        # The geometric median is only defined for n > 2 points, so just
        # return an arbitrary point if we have fewer
        if n < 2:
            return locations[np.random.randint(0, n)]

        min_distance_sum = 10000000
        median = None  # Point type

        # Loop through all the points, finding the point that minimizes the
        # geodetic distance to all other points.  By construction median will
        # always be assigned to some non-None value by the end of the loop.
        for i in range(0, n):
            p1 = locations[i]
            dist_sum = 0
            for j in range(0, n):
                p2 = locations[j]
                # Skip self-comparison
                if i == j:
                    continue
                dist = haversine(p1, p2)
                dist_sum += dist
                # Short-circuit early if it's clear that this point cannot be
                # the median since it does not minimize the distance sum
                if dist_sum > min_distance_sum:
                    break
            if dist_sum < min_distance_sum:
                min_distance_sum = dist_sum
                median = p1
        return median
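# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the model): compute_coefficients fits
# the eq. 1 edge probability, P(f<i,j>) = beta * distance ** alpha, to the
# per-distance-bucket following ratios via scipy's curve_fit.  The toy
# example below uses synthetic distance buckets and ratios generated from the
# paper's default parameters, simply to show the shape of that fit.
import numpy as _np_example
from scipy.optimize import curve_fit as _curve_fit_example

def _example_eq1_fit():
    def func_to_fit(x, a, b):
        # P(f<i,j> = 1) = beta * distance ** alpha
        return b * x ** a

    distances = _np_example.array([1.0, 5.0, 10.0, 50.0, 100.0, 500.0])
    ratios = 0.0045 * distances ** -0.55      # synthetic "following ratios"

    (alpha, beta), _ = _curve_fit_example(func_to_fit, distances, ratios,
                                          p0=[-0.55, 0.0045], maxfev=100000)
    return alpha, beta   # recovers approximately -0.55 and 0.0045
# ---------------------------------------------------------------------------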
class Wheres_Wally(GIMethod): def __init__(self): # Location is represented as a lat/lon geopy Point self.user_id_to_location = {} self.geocoder = None self.unique_locations = set() self.id_to_location = {} # Mappings from feature names to their corresponding indices in a # feature vector self.pop_bin_feature_indices = {} self.reciprocal_feature_indices = {} self.triad_feature_indices = {} self.total_num_features = 0 # The SVM classifier and feature vector scaler self.location_classifier = None self.location_vector_scaler = None def train_model(self, settings, dataset, model_dir): # Initialize the geocoder, which we'll use to resolve location strings. # We use the default name-to-location mapping unless the user has # specified otherwise. if "location_source" in settings: self.geocoder = Geocoder(dataset=settings["location_source"]) else: self.geocoder = Geocoder() # NOTE: The original paper used the directional friends/followers # network. However, the paper was tested on a much smaller network # (9.8M edges), which doesn't scale when including the full network. We # opt for using the bi-directional networks as these (1) provide a # stronger signal of social relationships and (2) significantly reduce # the memory requirement. LOGGER.debug("Loading mention network") mention_network = dataset.bi_mention_network() # This dict will contain a mapping from user ID to an associated home # location, which is derived either from the location field (as in the # original paper), from GPS-tagged tweets, or from both user_to_home_loc = {} # For each of the users that we have in the network, see if we can # associate that user with a home location. all_users = set(mention_network.nodes_iter()) LOGGER.debug("Calculating users with recognizable home location") num_users_processed = 0 # Keep track of how many times each location occurred. We'll filter # this down to only the most common locations location_counts = collections.Counter() for user_id, home_loc in dataset.user_home_location_iter(): if not user_id in all_users: continue # home_loc is a (lat,lon) tuple. While this is accurate, we want to # coarsen the location data to decrease sparsity (i.e., more people # located in the same city location, despite slightly different # underlying lat/lon values). 
Here, use the Geocoder to map the # lat/lon to a name and then back to a canonical lat/lon for that # name canonical_lat_lon = self.geocoder.canonicalize(home_loc[0], home_loc[1]) location_counts[canonical_lat_lon] += 1 user_to_home_loc[user_id] = canonical_lat_lon num_users_processed += 1 if num_users_processed % 500000 == 0: LOGGER.debug( "Processed %s of the %s users, associated %s a known location (%s)" % ( num_users_processed, len(all_users), len(user_to_home_loc), len(user_to_home_loc) / float(num_users_processed), ) ) # Iterate through the locations pruning out those that do not occur more # than some threshold number of times num_locs_removed = 0 for lat_lon, count in location_counts.iteritems(): if count >= 20: self.unique_locations.add(lat_lon) else: num_locs_removed += 1 LOGGER.debug( "Saw %d locations, %d with at least 5 users, %d to be pruned" % (len(location_counts), len(self.unique_locations), num_locs_removed) ) # Remove the home locations of users whose locations aren't in the # pruned list of minimum-frequency locations num_user_home_locs_removed = 0 for user_id, loc in user_to_home_loc.items(): if not loc in self.unique_locations: del user_to_home_loc[user_id] num_user_home_locs_removed += 1 LOGGER.debug( "After pruning removed home locations of %d users, %d still have homes" % (num_user_home_locs_removed, len(user_to_home_loc)) ) # Create a bi-directional mapping from locations to unique # numeric identifiers. This mapping will be used when # representing locations in the classifier feature space and # when converting classifier output to specific locations location_to_id = {} for loc in self.unique_locations: id_ = len(location_to_id) location_to_id[loc] = id_ self.id_to_location[id_] = loc # Associate each location with its set of features n = len(self.unique_locations) # Each location has 7 features associated with it for classifying a # user's location. The seven features per location are arranged next to # each other in the feature space. feature_offset = 0 for loc in self.unique_locations: # Feat1: it's population bin (size approx.) 
self.pop_bin_feature_indices[loc] = feature_offset # Feat2: the number of reciprocal friends self.reciprocal_feature_indices[loc] = feature_offset + 1 # Feat3-7: the bins indicating how many friends were in reciprocal # triads in that city for bin_num in range(0, 5): feat = "%s,%s:%s" % (loc[0], loc[1], bin_num) self.triad_feature_indices[feat] = feature_offset + bin_num + 2 # Increment the feature offset so the next city's features don't # collide with this city's indices feature_offset += 7 # Set the total number of features seen self.total_num_features = feature_offset LOGGER.debug("Saw %d unique locations, %d total featurs" % (len(self.unique_locations), feature_offset)) LOGGER.debug( "Associated %s of the %s users with a known location (%s unique)" % (len(user_to_home_loc), len(all_users), len(self.unique_locations)) ) # The list of locations for each corresponding user in X B = [] # Train the classifier based on users with known home locations LOGGER.debug("Generating feature vectors for training") X = scipy.sparse.lil_matrix((len(user_to_home_loc), self.total_num_features), dtype=numpy.float64) print X row = 0 total_nz = 0 for user_id, location in user_to_home_loc.iteritems(): # Skip users whose locations were omitted due to frequency filtering # or who have home locations but are not in the mention network # if not location in self.unique_locations or not user_id in all_users: # continue # Fill the row in the matrix corresponding to this user's features nz = self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, row) total_nz += nz # Get the index of this user's location location_id = location_to_id[location] B.append(location_id) row += 1 X = X.tocsr() # X = X.toarray() LOGGER.debug( "Generated training data for %d users, %d nz features, %f on average" % (row, total_nz, float(total_nz) / row) ) # Convert the location list into a numpy array for use with scikit Y = numpy.asarray(B) if len(X.nonzero()[0]) == 0: LOGGER.warning( "Too little training data seen and no user had non-zero feature " + "values. Cowardly aborting classification" ) else: # Use SVM classifier with a linear kernel. # # NOTE NOTE NOTE NOTE # # The original paper uses an RBF kernel with their SVM. However, # this proved impossibly slow during testing, so a linear kernel was # used instead. 
# # NOTE NOTE NOTE NOTE # # slow: self.location_classifier = svm.SVC(kernel='rbf') # self.location_classifier = svm.LinearSVC(dual=False) # self.location_classifier = svm.NuSVC(kernel='rbf', verbose=True, max_iter=1000) # self.location_classifier = naive_bayes.BernoulliNB() self.location_classifier = svm.LinearSVC(dual=False, loss="l2", penalty="l2", tol=1e-2) # Note: we expect the vector representations to be sparse, so avoid mean # scaling since it would create dense vectors, which would blow up the # memory consumption of the model self.location_vector_scaler = preprocessing.StandardScaler(with_mean=False) # Learn the scaling parameters and then rescale the input LOGGER.debug("Scaling feature vectors for training") X_scaled = self.location_vector_scaler.fit_transform(X.astype(numpy.float64)) LOGGER.debug("Training classifier") self.location_classifier.fit(X_scaled, Y) LOGGER.debug("Finished training classifier") # Assign all the users some location, if we can figure it out users_assigned = 0 users_seen = 0 for user_id in all_users: users_seen += 1 # If we know where to place this user, assign it to their home location if user_id in user_to_home_loc: self.user_id_to_location[user_id] = user_to_home_loc[user_id] # Otherwise try to infer the location else: location = self.infer_location(user_id, mention_network, user_to_home_loc) if not location is None: self.user_id_to_location[user_id] = location users_assigned += 1 if users_seen % 100000 == 0: LOGGER.debug( ( ("Saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)") % ( users_seen, len(all_users), len(self.user_id_to_location) - users_assigned, users_assigned, len(self.user_id_to_location), ) ) ) LOGGER.debug( ( ("Ultimately saw %d/%d users, knew location of %d, " + "inferred the location of %d (total: %d)") % ( users_seen, len(all_users), len(self.user_id_to_location) - users_assigned, users_assigned, len(self.user_id_to_location), ) ) ) # Short circuit early if the caller has specified that the model is not # to be saved into a directory if model_dir is None: return Wheres_Wally_Model(self.user_id_to_location) if not os.path.exists(model_dir): os.mkdir(model_dir) # Write the .tsv for human debugability too fh = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "w") for user_id, loc in self.user_id_to_location.iteritems(): fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1])) fh.close() return Wheres_Wally_Model(self.user_id_to_location) def infer_location(self, user_id, mention_network, user_to_home_loc): """ Infers and returns the location of the provided users based on their features in the network """ # Ensure that the model has been trained; otherwise, report an # empty classification if self.location_vector_scaler is None or self.location_classifier is None: return None # Convert the user's network-based features into a numeric vector X = scipy.sparse.lil_matrix((1, self.total_num_features), dtype=numpy.float64) self.fill_user_vector(user_id, mention_network, user_to_home_loc, X, 0) X = X.tocsr() # Rescale the vector according to the training data's scaling user_vector_scaled = self.location_vector_scaler.transform(X) # Classify the results location_id = self.location_classifier.predict(user_vector_scaled)[0] # Convert the index into a location return self.id_to_location[location_id] def fill_user_vector(self, user_id, mention_network, user_to_home_loc, csr_matrix, row_to_fill): """ Creates a vector for the user and fills their data into the specified row in the provided matrix """ feat_dict 
= self.create_user_vector(user_id, mention_network, user_to_home_loc) nz = 0 for col, val in feat_dict.iteritems(): csr_matrix[row_to_fill, col] = val nz += 1 return nz def create_user_vector(self, user_id, mention_network, user_to_home_loc): """ Creates a vector to use with SciPy that represents this user's features """ # The binned location features look at all the locations of this user's # neighbors and then provide a weight for each location according to how # many of the user's friends are in that location multiplied by how # large the city is, which is represented as one of five bins location_to_friends = defaultdict(list) location_to_followers = defaultdict(list) num_friends = mention_network.degree(user_id) # Record which friend appear in each city for neighbor_id in mention_network.neighbors_iter(user_id): if neighbor_id in user_to_home_loc: location_name = user_to_home_loc[neighbor_id] location_to_friends[location_name].append(neighbor_id) location_to_followers[location_name].append(neighbor_id) # Since the vector is expected to be very sparse, create it as a dict # for the indices with non-zero feature values. classifier_input_vector = {} num_non_zero_features = 0 # Each city/location generates 7 unique features in the best performing # system for city, followers_in_city in location_to_followers.iteritems(): n = len(followers_in_city) # Feature 1: the city's bin multiplied by the number of users in the # city city_bin = self.get_city_bin(n) pop_bin_feature_index = self.pop_bin_feature_indices[city] classifier_input_vector[pop_bin_feature_index] = city_bin for city, friends_in_city in location_to_friends.iteritems(): n = len(friends_in_city) # Feature 2: the percentage of friends with reciprocal edges at that # location num_reciprocal_friends = 0 for n1 in friends_in_city: if mention_network.has_edge(n1, user_id): num_reciprocal_friends += 1 num_non_zero_features += 1 reciprocal_feature_index = self.reciprocal_feature_indices[city] classifier_input_vector[reciprocal_feature_index] = num_reciprocal_friends / n if num_reciprocal_friends > 0: num_non_zero_features += 1 # Features 3-7: the number of triads in the city triad_counter = collections.Counter() for n1 in friends_in_city: num_triads = 0 for n2 in friends_in_city: if mention_network.has_edge(n1, n2): num_triads += 1 # Decide which bin this user is in triad_counter[self.get_triad_bin(num_triads)] += 1 for bin_num, count in triad_counter.iteritems(): feat = "%s,%s:%s" % (city[0], city[1], bin_num) triad_bin_feature_index = self.triad_feature_indices[feat] classifier_input_vector[triad_bin_feature_index] = count / num_friends if count > 0: num_non_zero_features += 1 return classifier_input_vector def get_triad_bin(self, num_triads): """ Returns which bin this count of the number of triads should be in """ # Bins in the paper [0,5,10,20,40] if num_triads < 5: return 0 elif num_triads < 10: return 1 elif num_triads < 20: return 2 elif num_triads < 40: return 3 else: return 4 def get_city_bin(self, city_size): """ Returns which bin this count of the number of triads should be in """ # Bins in the paper [1,2,4,12,57054] if city_size <= 1: return 0 elif city_size <= 2: return 1 elif city_size <= 4: return 2 elif city_size <= 12: return 3 # This sould be 57054, but we use any value larger than 12 to # avoid the edge case where a city has more than 57k users else: return 4 def load_model(self, model_dir, settings): """ Reads in the Where's Wally model from a gzipped .tsv """ user_id_to_location = {} model_file = 
        model_file = gzip.open(os.path.join(model_dir, "user-to-lat-lon.tsv.gz"), "r")
        for line in model_file:
            cols = line.split("\t")
            user_id = cols[0]
            lat = float(cols[1])
            lon = float(cols[2])
            user_id_to_location[user_id] = (lat, lon)
        model_file.close()
        return Wheres_Wally_Model(user_id_to_location)
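    # --- Illustrative sketch (not from the original code) ---
    # The training code above scales sparse feature vectors with
    # StandardScaler(with_mean=False) (so the matrix stays sparse) and fits a
    # LinearSVC on the scaled rows. A tiny end-to-end version of that pattern,
    # on made-up data, might look like the commented sketch below; the toy
    # matrix and labels are hypothetical.
    #
    #   import numpy
    #   import scipy.sparse
    #   from sklearn import preprocessing, svm
    #
    #   X = scipy.sparse.csr_matrix(numpy.array([[0., 1., 0.],
    #                                            [2., 0., 0.],
    #                                            [0., 0., 3.]]))
    #   Y = numpy.array([0, 1, 2])
    #   scaler = preprocessing.StandardScaler(with_mean=False)
    #   X_scaled = scaler.fit_transform(X.astype(numpy.float64))
    #   clf = svm.LinearSVC(dual=False, tol=1e-2)
    #   clf.fit(X_scaled, Y)
    #   print(clf.predict(scaler.transform(X[:1])))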
    def train_model(self, settings, dataset, model_dir=None):
        # settings in the form
        '''{
            'LCR_min_dist' : the cutoff distance used to distinguish between
                             local and non-local contacts
                             (default = 40 km, ~25 miles)
            'qntl_num' : the number of quantiles (default = 10)
            'min_geotag' : the minimum number of geotags that makes a user a
                           target (default = 3)
            'min_samples_leaf' : the minimum number of samples a leaf of the
                                 regression tree may contain, i.e. the regressor
                                 will not split a leaf with fewer samples
                                 (default = 1000)
        }'''
        self.min_dist = settings.pop('LCR_min_dist', 40)
        self.m = settings.pop('qntl_num', 10)
        self.min_geotag = settings.pop('min_geotag', 3)
        min_samp_leaf = settings.pop('min_samples_leaf', 1000)

        LOGGER.debug('tree')
        # the regression tree used as the classifier
        self.tree = DecisionTreeRegressor(min_samples_leaf=min_samp_leaf)

        LOGGER.debug('geocoder')
        #LocRes = Geocoder()
        if 'location_source' in settings:
            LocRes = Geocoder(dataset=settings['location_source'])
        else:
            LocRes = Geocoder()

        LOGGER.debug('loading mention network')
        self.X = dataset.mention_network(bidirectional=True, directed=True,
                                         weighted=True)
        #print len(self.X)
        #counter = set_counter('has at least %d geotags' % self.min_geotag)  ### counter

        # adding users
        self.user_to_home_loc = {user: loc for (user, loc)
                                 in dataset.user_home_location_iter()}
        user_loc_list = self.user_to_home_loc.items()
        random.shuffle(user_loc_list)
        #print len(user_loc_list)

        # Take a sample from user home locations to estimate stgrEdges and actEdges
        start = time.time()
        user_loc_list = user_loc_list[:50000]
        #print 'home loc time:'
        #print len(user_loc_list)
        #fstgr = open('stgr_edges.tsv', 'w')
        c = 0
        LOGGER.debug('sampling stranger edges and actual edges')
        for uid1, loc1 in user_loc_list:
            #if c % 100 == 0:
            #    print c
            c2 = 0
            for uid2, loc2 in user_loc_list:
                if not c2 == c:
                    if self.X.has_edge(uid1, uid2):
                        self.actEdgesTuples.append((uid1, uid2))
                    distance = round(utils.distance(loc1, loc2), 1)
                    self.stgrEdges[distance] += 1
                c2 += 1
            c += 1
        #for distance in self.stgrEdges:
        #    fstgr.write(str(distance) + '\t' + str(self.stgrEdges[distance]) + '\n')
        #fstgr.close()
        #print len(self.actEdgesTuples)

        LOGGER.debug('filling network')
        for _id, loc in dataset.user_home_location_iter():
            #_id = user['user_id']
            #loc = UserProfilingMethod.dataset.user_home_location_iter()
            #loc, pd = utils.get_post_data(user['posts'])
            #l_a = utils.is_geocoded(user, self.min_geotag)
            #counter.update(loc)  ### counter
            #if not self.X.__contains__(_id):
            #    self.X.add_node(_id)
            if loc[0] == 0 and loc[1] == 0:
                continue
            else:
                try:
                    self.X.add_node(_id)
                except:
                    pass
            l_a = loc
            #if not l_a: continue
            self.add_user_data(_id, l_a, {})
            le = utils.location_error(l_a, loc, LocRes)
            self.set_loc_err(_id, le)
            # remove self-mentions
            if self.X.has_edge(_id, _id):
                self.X.rm_edge(_id, _id)
        LOGGER.debug(str(self.X.__len__()) + ' users')
        LOGGER.debug(str(self.X.size()) + ' edges')

        self.set_d_a_for_all()

        tempx = []
        tempy = []
        for u, x in self.iter_contacts():
            tempx.append(self.get_contact_vector(u, x))
            tempy.append(self.get_d_a(u, x))
        X = np.array(tempx)
        Y = np.array(tempy)
        #X = np.array([self.get_contact_vector(u, x)
        #              for u, x in self.iter_contacts()])
        #Y = np.array([self.get_d_a(u, x)
        #              for u, x in self.iter_contacts()])
        LOGGER.debug('number of relationships ' + str(len(X)))

        LOGGER.debug("fitting")
        start = timeit.default_timer()
        #try:
        self.fit(X, Y)
        #except:
        #    raise RuntimeError, 'No connections to train on.'
        LOGGER.debug('done fitting tree - ' + str(timeit.default_timer() - start) + ' sec')

        start = timeit.default_timer()
        self.quantile_boundaries(X)
        LOGGER.debug('done setting quantile boundaries - ' + str(timeit.default_timer() - start) + ' sec')

        start = timeit.default_timer()
        self.fit_curves(Y)
        LOGGER.debug('done fitting curves - ' + str(timeit.default_timer() - start) + ' sec')

        #self.model.allActEdges = self.allActEdges
        #self.model.stgrEdges = self.stgrEdges

        self.user_to_loc = self.infer_locs()

        if model_dir is not None:
            LOGGER.debug('saving model')
            filename = os.path.join(model_dir, "user-to-lat-lon.tsv.gz")
            fh = gzip.open(filename, 'w')
            for user_id, loc in self.user_to_loc.iteritems():
                if not loc is None:
                    fh.write("%s\t%s\t%s\n" % (user_id, loc[0], loc[1]))
            fh.close()

        self.model = FriendlyLocation_Model(self.user_to_loc)
        return self.model
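# --- Illustrative sketch (not part of the original pipeline) ---
# train_model() above fits a DecisionTreeRegressor whose min_samples_leaf
# setting keeps the tree from splitting leaves below a sample threshold; the
# tree maps per-contact feature vectors to the distance between the pair. The
# helper below shows just that fit/predict pattern on made-up contact vectors
# and distances; the function name and data are hypothetical.
def _sketch_contact_distance_tree(min_samples_leaf=2):
    """Toy fit/predict example for the contact-distance regression tree."""
    import numpy
    from sklearn.tree import DecisionTreeRegressor

    # fake (feature vector, distance-in-km) pairs for a handful of contacts
    features = numpy.array([[1.0, 0.0], [0.9, 0.1], [0.1, 0.8],
                            [0.2, 0.9], [0.5, 0.5], [0.6, 0.4]])
    distances = numpy.array([5.0, 8.0, 400.0, 450.0, 120.0, 100.0])

    tree = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf)
    tree.fit(features, distances)
    # predict a distance for a previously unseen contact vector
    return tree.predict(numpy.array([[0.15, 0.85]]))[0]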
class MultiLocation(object):
    """
    MultiLocation is the implemented method from Multiple Location Profiling
    for Users and Relationships from Social Network and Content, by Rui Li,
    Shengjie Wang and Kevin Chen-Chuan Chang.
    """

    def __init__(self, settings):
        """
        Initializes class variables.
        """
        # the mention network that will store inferred locations in node_data
        self.mention_network = MultiLocationMethod.dataset.bi_mention_network()
        self.nodes = set(self.mention_network.nodes())

        # self.u_n and self.u_star are the sets of users with unknown and known
        # locations, respectively
        self.u_n = set()
        self.u_star = set()

        # the set of all known venues
        self.venues = set()

        # counts of all locations and their co-occurrences with a user
        self.psi = Counter()

        # alpha and beta are the coefficients for eq. 1 as per the paper
        self.alpha = -0.55
        self.beta = 0.0045

        # K is the total number of tweeting relationships
        self.K = 0

        # N_squared is the total number of user pairs
        self.N_squared = 0

        # S is the number of following relationships
        self.S = 0

        # geocoder is a forward/reverse geocoder for location -> lat/lon and
        # lat/lon -> location
        if 'location_source' in settings:
            self.geocoder = Geocoder(dataset=settings['location_source'])
        else:
            self.geocoder = Geocoder()

        # F_r is the random following model Bernoulli distribution parameter
        self.F_r = None

        # T_r is the random tweeting model Bernoulli distribution parameter
        self.T_r = Counter()

        # mu and nu are the model selectors according to a Bernoulli distribution
        self.mu = defaultdict(bool)
        self.nu = defaultdict(bool)

        # the multi-location list generated by the MLP
        self.user_multi_locations = defaultdict(list)

        # runs the model, populates all the variables and generates
        # user_multi_locations
        self.run_model()

    def store_location_data(self):
        """
        Sets the node_data field with the relevant gold-standard location data
        from the bidirectional dataset.
        """
        num_users_seen = 0
        for user_id, loc in MultiLocationMethod.dataset.user_home_location_iter():
            if loc[0] == 0 and loc[1] == 0:
                continue
            try:
                self.mention_network.set_node_data(user_id, loc)
                self.u_star.add(user_id)
                num_users_seen += 1
                if num_users_seen % 100000 == 0:
                    logger.debug('Multilocation saw %d users' % num_users_seen)
            except KeyError:
                pass

    def find_locations(self):
        users_seen = 1
        for possible_posts in MultiLocationMethod.dataset.user_iter():
            users_seen += 1
            if users_seen % 1000000 == 0:
                logger.debug("Seen %d users" % users_seen)
            user_id = possible_posts['user_id']
            posts = possible_posts['posts']
            if len(posts) > 600:
                posts = posts[-600:]
            for post in posts:
                # twokenizer may be too computationally expensive here...
                #text = tokenizer(post['text'])
                text = post['text'].split()
                lc_text = []
                is_upper = []
                for s in text:
                    isup = s[0].isupper()
                    is_upper.append(isup)
                    if isup:
                        lc_text.append(s.lower())
                    else:
                        lc_text.append(s)
                i = 0
                n = len(text)
                while True:
                    if i >= n:
                        break
                    if not is_upper[i]:
                        i += 1
                        continue
                    is_up1 = i + 1 < n and is_upper[i + 1]
                    first_two_with_space = None
                    first_two_with_tab = None
                    if i + 2 < n and is_upper[i + 2] and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i + 1]
                        w3 = lc_text[i + 2]
                        first_two_with_space = w1 + " " + w2
                        s2 = first_two_with_space + " " + w3
                        location = self.geocoder.geocode(s2)
                        if not location is None:
                            self.record_user_location(s2, location, user_id)
                            i += 3
                            continue
                        s3 = first_two_with_space + "\t" + w3
                        location = self.geocoder.geocode(s3)
                        if not location is None:
                            self.record_user_location(s3, location, user_id)
                            i += 3
                            continue
                        first_two_with_tab = w1 + "\t" + w2
                        s4 = first_two_with_tab + "\t" + w3
                        location = self.geocoder.geocode(s4)
                        if not location is None:
                            self.record_user_location(s4, location, user_id)
                            i += 3
                            continue
                        s5 = first_two_with_tab + " " + w3
                        location = self.geocoder.geocode(s5)
                        if not location is None:
                            self.record_user_location(s5, location, user_id)
                            i += 3
                            continue
                    elif i + 1 < n and is_up1:
                        w1 = lc_text[i]
                        w2 = lc_text[i + 1]
                        if first_two_with_tab is None:
                            first_two_with_tab = w1 + "\t" + w2
                        location = self.geocoder.geocode(first_two_with_tab)
                        if not location is None:
                            self.record_user_location(first_two_with_tab,
                                                      location, user_id)
                            i += 2
                            continue
                        if first_two_with_space is None:
                            first_two_with_space = w1 + " " + w2
                        location = self.geocoder.geocode(first_two_with_space)
                        if not location is None:
                            self.record_user_location(first_two_with_space,
                                                      location, user_id)
                            i += 2
                            continue
                    else:
                        w1 = lc_text[i]
                        location = self.geocoder.geocode(w1)
                        if not location is None:
                            self.record_user_location(w1, location, user_id)
                    i += 1

    def record_user_location(self, location_name, location, user_id):
        try:
            self.mention_network.add_edge(user_id, location_name)
            self.mention_network.set_node_data(location_name, location)
        except:
            return
        self.venues.add(location_name)
        # psi is keyed by the venue name so that tweeting_model can look
        # venues up directly
        self.psi[location_name] += 1
        self.T_r[user_id] += 1
        self.K += 1
        return

    def compute_coefficients(self):
        """
        Computes the coefficients for equation (1) from the paper,
        P(f<i,j> | alpha, beta, x_i, y_i) = beta * distance(x_i, y_i)^alpha
        """

        def func_to_fit(x, a, b):
            return b * x**a

        mentions_per_distance = Counter()
        following_relationship = Counter()

        # our networks are too large to generate these coefficients on each call...
        # this is about the same number of combinations as shown in the paper...
        n = 10000000
        #random_sample = random.sample(list(self.u_star), n)
        random_sample = list(self.u_star)
        number_of_users = len(self.u_star)
        # processed_combinations = 0
        # start_time = time.time()
        #for node_u, node_v in combinations(random_sample, 2):
        for i in range(0, n):
            node_u, node_v = (random_sample[random.randint(0, number_of_users - 1)],
                              random_sample[random.randint(0, number_of_users - 1)])
            if node_u == node_v:
                continue
            # if processed_combinations % 1000000 == 0:
            #     logger.debug("Took %f to process %d combinations..."
            #                  % ((time.time() - start_time), processed_combinations))
            # processed_combinations += 1
            l_u = self.mention_network.node_data(node_u)
            l_v = self.mention_network.node_data(node_v)
            distance = round(haversine(l_u, l_v, miles=True), 0)
            if distance > 10000:
                continue
            mentions_per_distance[distance] += 1.0
            self.N_squared += 1.0
            if self.mention_network.has_edge(node_u, node_v):
                following_relationship[distance] += 1.0
                self.S += 1.0

        # "ratio of the number of pairs that have a following relationship to
        # the total number of pairs in the d_th bucket"; x and y are built from
        # the same sorted key order so distances and ratios stay aligned
        x = []
        y = []
        for key in sorted(mentions_per_distance):
            mentions = mentions_per_distance[key]
            if mentions == 0:
                continue
            following = following_relationship[key]
            x.append(key)
            y.append(following / mentions)
        # nudge the first bucket away from zero so the power law is defined
        if x:
            x[0] += 1e-8

        solutions = curve_fit(func_to_fit, x, y, p0=[-0.55, 0.0045],
                              maxfev=100000)[0]
        self.alpha = solutions[0]
        self.beta = solutions[1]
        return

    def generate_model_selector(self):
        for user in self.u_n:
            # generate a model selector, mu, according to a Bernoulli distribution
            if np.random.binomial(1, self.F_r) == 1:
                self.mu[user] = True
            else:
                self.mu[user] = False
            # normalizing by K on the fly (use float division so the
            # probability is not truncated to zero)
            if np.random.binomial(1, (self.T_r[user] / float(self.K))) == 1:
                self.nu[user] = True
            else:
                self.nu[user] = False

    def random_following_model(self, user):
        """
        If mu = 1, we choose the random following model, using
        p(f<i,j> == 1 | F_r) to decide whether the location of a neighbor of
        the user is a possible location.
        """
        for neighbor in self.mention_network.neighbors_iter(user):
            if neighbor not in self.u_star:
                continue
            elif np.random.binomial(1, self.F_r):
                self.user_multi_locations[user].append(
                    self.mention_network.node_data(neighbor))
        return

    def following_model(self, user):
        """
        If mu = 0, we decide whether there is f<i,j> based on the
        location-based following model as shown in eq. 1.
        """
        # (note: this is almost the same as the Backstrom paper, so we skip
        # generating the theta values and just calculate the maximum probability)

        def calculate_probability(l_u, l_v):
            """
            Calculates the probability P(f<i,j> | alpha, beta, location_1, location_2)
            """
            try:
                return self.beta * (abs(haversine(l_u, l_v))) ** self.alpha
            except:
                # fall back to a very small distance to avoid dividing by zero
                return self.beta * (0.00000001) ** self.alpha

        best_log_probability = float('-inf')
        best_location = None
        for neighbor_u in self.mention_network.neighbors_iter(user):
            log_probability = 0
            if neighbor_u not in self.u_star:
                continue
            for neighbor_v in self.mention_network.neighbors_iter(neighbor_u):
                if neighbor_v not in self.u_star:
                    continue
                else:
                    l_u = self.mention_network.node_data(neighbor_u)
                    l_v = self.mention_network.node_data(neighbor_v)
                    plu_lv = calculate_probability(l_u, l_v)
                    try:
                        log_gamma_lu = math.log(plu_lv / (1 - plu_lv))
                    except ValueError:
                        # in the case where l_u == l_v, plu_lv --> 0 and
                        # log(1) = 0, thus this exception should be valid
                        log_gamma_lu = 0
                    log_probability += log_gamma_lu
            if log_probability > best_log_probability:
                best_log_probability = log_probability
                best_location = self.mention_network.node_data(neighbor_u)
        if best_location:
            self.user_multi_locations[user].append(best_location)
        return

    def random_tweeting_model(self, user):
        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            # T_r is kept unnormalized (see run_model), so normalize by K here
            # to obtain a valid Bernoulli parameter
            elif np.random.binomial(1, self.T_r[user] / float(self.K)):
                self.user_multi_locations[user].append(
                    self.mention_network.node_data(venue))
        return

    def tweeting_model(self, user):
        best_probability = float("-inf")
        best_venue = None
        for venue in self.mention_network.neighbors_iter(user):
            if venue not in self.venues:
                continue
            probability = self.psi[venue]
            if best_probability < probability:
                best_probability = probability
                best_venue = venue
        if best_venue:
            self.user_multi_locations[user].append(
                self.mention_network.node_data(best_venue))
        return

    def run_model(self):
        """
        run_model generates the values for all the initialized class variables,
        and follows the MLP algorithm described in the paper to infer locations
        for users.
        """
        # NOTE: K is not normalized to save computations, and is normalized on
        # the fly in generate_model_selector
        #self.populate_mention_network()
        logger.debug("Variables have been initialized. Starting the model.")

        logger.debug("Storing location data...")
        self.store_location_data()
        self.u_n = self.nodes.difference(self.u_star)
        logger.debug("Location data stored!")

        logger.debug("Starting to compute the coefficients for the model...")
        # calculates the coefficients to be used in eq. 1, alpha and beta
        self.compute_coefficients()
        logger.debug("Coefficients have been calculated. Alpha: %f and beta: %f."
                     % (self.alpha, self.beta))

        logger.debug("Finding venue data...")
        self.find_locations()
        for venue in self.psi:
            self.psi[venue] /= float(self.K)
        logger.debug("Finished finding venue data! %d venues found!"
                     % len(self.venues))

        #self.N_squared = len(self.mention_network.edges_())
        # p(f<i,j> = 1 | F_r) = S / N^2
        self.F_r = (self.S / self.N_squared)

        # Section 4.4: generate model selectors based on Bernoulli distributions
        # using T_r and F_r
        logger.debug("Generating model selectors...")
        self.generate_model_selector()
        logger.debug("Model selectors have been generated!")

        logger.debug("Starting to find user locations...")
        for user in self.u_n:
            if self.mu[user]:
                self.random_following_model(user)
            else:
                self.following_model(user)
            if self.nu[user]:
                self.random_tweeting_model(user)
            else:
                self.tweeting_model(user)
        logger.debug("Finished finding user locations...")

        for user in self.user_multi_locations:
            location_list = self.user_multi_locations[user]
            location = self.get_geometric_mean(location_list)
            self.mention_network.set_node_data(user, location)

    def return_network(self):
        return self.mention_network

    def get_geometric_mean(self, locations):
        """
        Returns the geometric median of a list of locations (taken from David
        Jurgens's implementation). With fewer than two locations the single
        location is returned; otherwise the point minimizing the summed
        distance to all other points is selected.
        """
        n = len(locations)

        # The geometric median is only defined for more than two points, so
        # just return an arbitrary point if we have fewer
        if n < 2:
            return locations[np.random.randint(0, n)]

        min_distance_sum = 10000000
        median = None  # Point type

        # Loop through all the points, finding the point that minimizes the
        # geodetic distance to all other points. By construction median will
        # always be assigned to some non-None value by the end of the loop.
        for i in range(0, n):
            p1 = locations[i]
            dist_sum = 0
            for j in range(0, n):
                p2 = locations[j]
                # Skip self-comparison
                if i == j:
                    continue
                dist = haversine(p1, p2)
                dist_sum += dist
                # Short-circuit early if it's clear that this point cannot be
                # the median since it does not minimize the distance sum
                if dist_sum > min_distance_sum:
                    break
            if dist_sum < min_distance_sum:
                min_distance_sum = dist_sum
                median = p1
        return median
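# --- Illustrative sketch (not part of the original pipeline) ---
# compute_coefficients() above fits eq. 1, P(f<i,j> | alpha, beta, d) =
# beta * d**alpha, to the observed ratio of following pairs per distance
# bucket. The helper below reproduces that curve_fit call on synthetic
# (distance, ratio) points generated from known alpha/beta values, so the
# recovered coefficients can be sanity-checked; the function name and the
# synthetic data are hypothetical.
def _sketch_fit_distance_curve(alpha=-0.55, beta=0.0045):
    """Fits beta * d**alpha to synthetic distance/ratio data."""
    import numpy
    from scipy.optimize import curve_fit

    def func_to_fit(x, a, b):
        return b * x ** a

    # synthetic distance buckets (km) and the ratios eq. 1 would produce
    distances = numpy.linspace(1.0, 1000.0, 50)
    ratios = beta * distances ** alpha

    solutions = curve_fit(func_to_fit, distances, ratios,
                          p0=[-0.5, 0.005], maxfev=100000)[0]
    return solutions[0], solutions[1]  # recovered (alpha, beta)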