def predict_probability_area(self, upper_bound, lower_bound, center, med_error, std_dev):
    # For now this function will return the minimum and maximum probability
    # using the circle prediction algorithm
    (lat, lon) = center
    (max_lat, max_lon) = upper_bound
    (min_lat, min_lon) = lower_bound
    top_dist = haversine(lon, lat, lon, max_lat)
    bottom_dist = haversine(lon, lat, lon, min_lat)
    r_dist = haversine(lon, lat, max_lon, lat)
    l_dist = haversine(lon, lat, min_lon, lat)
    min_dist = min([top_dist, bottom_dist, r_dist, l_dist])
    max_dist = max([top_dist, bottom_dist, r_dist, l_dist])
    min_prob = SLP.predict_probability_radius(min_dist, med_error, std_dev)
    max_prob = SLP.predict_probability_radius(max_dist, med_error, std_dev)
    return (min_prob, max_prob)
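# A minimal usage sketch of predict_probability_area, not part of the library.
# It assumes an SLP estimator instance (`slp_estimator`) exposing the method above,
# and hypothetical median-error / standard-deviation values; every number below is
# made up for illustration.
def example_predict_probability_area(slp_estimator):
    center = (38.9, -77.0)           # estimated (lat, lon) for a user
    upper_bound = (39.0, -76.9)      # (max_lat, max_lon) corner of the query box
    lower_bound = (38.8, -77.1)      # (min_lat, min_lon) corner of the query box
    med_error, std_dev = 25.0, 60.0  # km, illustrative error-curve parameters
    min_prob, max_prob = slp_estimator.predict_probability_area(
        upper_bound, lower_bound, center, med_error, std_dev)
    print('P(user in box) is between %0.3f and %0.3f' % (min_prob, max_prob))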
def get_errors(model, points):
    """Computes the median error for a GMM and a set of training points"""
    (best_lat, best_lon) = model.means_[np.argmax(model.weights_)]
    errors = []
    for point in points:
        (lat, lon) = point
        error = haversine(best_lon, best_lat, lon, lat)
        errors.append(error)
    return np.median(errors)
def run_gmm_test(sc, sqlCtx, table_name, fields, model, where_clause=''):
    """ Test a pretrained model on a table of test data

    Args:
        sc (pyspark.SparkContext): Spark Context to use for execution
        sqlCtx (pyspark.sql.SQLContext): Spark SQL Context to use for sql queries
        table_name (str): Table name to query for test data
        fields (list): List of field names to extract and then use for GMM prediction
        model (dict): Dictionary of {word: (mixture.GMM, error)}
        where_clause (str): A where clause that can be applied to the query

    Returns:
        final_results (dict): A description of the performance of the GMM algorithm
    """
    tweets_w_geo = sqlCtx.sql('select geo, entities, extended_entities, %s from %s where geo.coordinates is not null %s'
                              % (','.join(fields), table_name, where_clause))

    # for each tweet calculate the most likely position
    model_bcast = sc.broadcast(model)
    errors_rdd = tweets_w_geo.rdd.keyBy(lambda row: get_location_from_tweet(row))\
        .flatMapValues(lambda row: get_most_likely_point(tokenize_tweet(row, fields), model_bcast))\
        .map(lambda (true_geo_coord, est_loc): haversine(true_geo_coord, est_loc.geo_coord))

    errors = np.array(errors_rdd.collect())
    num_vals = tweets_w_geo.count()
    errors = errors[np.isnan(errors) == False]

    median_error = np.median(errors)
    mean_error = np.mean(errors)
    print('Median Error', median_error)
    print('Mean Error: ', mean_error)

    # calculate coverage
    try:
        coverage = len(errors) / float(num_vals)
    except ZeroDivisionError:
        coverage = np.nan

    # gather errors
    final_results = {'median': median_error, 'mean': mean_error,
                     'coverage': coverage, 'num_locs': len(errors), 'fields': fields}
    return final_results
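# A hedged usage sketch of run_gmm_test. It assumes an existing SparkContext and
# SQLContext, a registered table of tweets named 'tweets_test', and a `model` dict of
# {word: (mixture.GMM, error)} produced by a prior training step; the table name,
# field list, and where clause are illustrative only.
def example_run_gmm_test(sc, sqlCtx, model):
    fields = ['text', 'user.location']   # tweet fields to tokenize for prediction
    results = run_gmm_test(sc, sqlCtx, 'tweets_test', fields, model,
                           where_clause="and lang = 'en'")
    print('GMM test results: %s' % results)
    return results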
def test(self, all_tweets, skip_load=False):
    # Push config to all nodes
    options = self.sc.broadcast(self.options)
    all_tweets.registerTempTable(self.options['temp_table_name'])

    if skip_load and self.all_user_locations is not None:
        # If we've just trained then there is no need to go back to the original data
        original_user_locations = self.all_user_locations
    else:
        # Find known user locations
        # First map turns Row(id_str, coordinates) -> (id_str, coordinates)
        # Group by key turns (id_str, coordinates) -> (id_str, [coordinates1, coordinates2, ...])
        # Filter removes entries without at least num_points_req locations
        # Calculate the median point of the locations (id_str, [coordinates1, ...]) -> (id_str, median_location)
        # coalesce then reduces the number of partitions
        def median_point_w_options_generator(num_points_req, dispersion_threshold):
            return (lambda x: median_point(x, num_points_req=num_points_req, return_dispersion=False,
                                           dispersion_treshold=dispersion_threshold))

        f = median_point_w_options_generator(self.options['num_points_req_for_known'],
                                             self.options['dispersion_threshold'])
        original_user_locations = self.sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null' %
                                                  self.options['temp_table_name'])\
            .map(lambda a: (a.id_str, a.coordinates))\
            .groupByKey().flatMapValues(lambda input_locations: f(input_locations)).coalesce(300)

    # Keep only hold-out users (filter out users that might have been in the training set)
    filter_function = lambda (a, b): a[-1] in options.value['hold_out']
    original_user_locations = original_user_locations.filter(filter_function)
    number_locations = original_user_locations.count()

    found_locations = original_user_locations.join(self.updated_locations.map(lambda (a, b): (a, b[0])))
    found_locations_local = found_locations.collect()
    print 'Number of Found Locations: ', len(found_locations_local)

    errors = []
    for (id_str, ll_tuple) in found_locations_local:
        (ll_1, ll_2) = ll_tuple
        errors.append(haversine(ll_1[1], ll_1[0], ll_2[1], ll_2[0]))

    median_error = np.median(errors)
    mean_error = np.mean(errors)
    print('Median Error', median_error)
    print('Mean Error: ', mean_error)

    # gather errors
    final_results = {'median': median_error,
                     'mean': mean_error,
                     'coverage': len(errors) / float(number_locations),
                     'num_locs': number_locations,
                     'iterations_completed': self.iterations_completed,
                     'options': self.options}
    return final_results
def compute_error_using_model(input_val, model=None):
    """ Given a model that maps tokens -> GMMs this will compute the most likely point
    and return the distance from the most likely point to the true location"""
    (location, tokens) = input_val
    true_lat, true_lon = location
    models = []
    for token in tokens:
        if token in model:
            models.append(model[token])

    if len(models) > 1:
        combined_gmm = GMM.combine_gmms(models)
        (best_lat, best_lon) = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
    elif len(models) == 1:
        (best_lat, best_lon) = models[0][0].means_[np.argmax(models[0][0].weights_)]
    else:
        return np.nan

    distance = haversine(best_lon, best_lat, true_lon, true_lat)
    return distance
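# Sketch of how compute_error_using_model might be applied across an RDD of
# ((lat, lon), [tokens]) pairs using a broadcast token -> (GMM, error) dictionary.
# `tokenized_rdd` and `word_models` are assumed to exist already; this is
# illustrative, not library code.
def example_median_error(sc, tokenized_rdd, word_models):
    model_bcast = sc.broadcast(word_models)
    errors = tokenized_rdd.map(lambda input_val: compute_error_using_model(input_val, model=model_bcast.value))\
                          .filter(lambda d: not np.isnan(d))\
                          .collect()
    return np.median(errors)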
def get_errors(model, points):
    """ Computes the median error for a GMM model and a set of training points

    Args:
        model (mixture.GMM): A GMM model for a word
        points (list): A list of (lat, lon) tuples

    Returns:
        median (float): The median distance to the training points from the most likely point
    """
    (best_lat, best_lon) = model.means_[np.argmax(model.weights_)]
    best_point = GeoCoord(lat=best_lat, lon=best_lon)
    errors = []
    for (lat, lon) in points:
        point = GeoCoord(lat, lon)
        error = haversine(best_point, point)
        errors.append(error)
    median = np.median(errors)
    return median
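# Small sketch showing how get_errors could be used after fitting a per-word GMM.
# It assumes the legacy sklearn mixture.GMM interface used by this module and a
# hypothetical list of (lat, lon) training points for a single word.
def example_get_errors(points):
    from sklearn import mixture
    gmm = mixture.GMM(n_components=2)
    gmm.fit(np.array(points))            # points: [(lat, lon), ...]
    median_error = get_errors(gmm, points)
    return (gmm, median_error)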
def evaluate(locs_known, edges, holdout_func, slp_closure):
    ''' Assesses various stats regarding how well SLP is running.

    Given all locs that are known and all edges that are known, this function will first apply
    the holdout to locs_known, allowing for a ground truth comparison to be used. Then, it
    applies the non-holdout set to the training function, which should yield the locations
    of the holdout for comparison.

    For example::

        holdout = lambda (src_id): src_id[-1] == '6'
        trainer = lambda l, e: slp.train_slp(l, e, 3)
        results = evaluate(locs_known, edges, holdout, trainer)

    Args:
        locs_known (rdd of LocEstimate objects): The complete list of locations

        edges (rdd of (src_id, (dest_id, weight))): all available edge information

        holdout_func (function): function responsible for filtering a holdout data set. For example::

            lambda (src_id): src_id[-1] == '6'

        can be used to get approximately 10% of the data since the src_id's are evenly
        distributed numeric values

        slp_closure (function closure): a closure over the slp train function. For example::

            lambda locs, edges: slp.train_slp(locs, edges, 4, neighbor_threshold=4, dispersion_threshold=150)

        can be used for training with specific threshold parameters

    Returns:
        (errors, results):

            `errors` (rdd): (src_id, (distance, LocEstimate)) for each holdout vertex that was predicted

            `results` (dict): stats of the results from the SLP algorithm

                `median:` median difference of predicted versus actual

                `mean:` mean difference of predicted versus actual

                `coverage:` ratio of number of predicted locations to number of held-out locations

                `reserved_locs:` number of known locations used to train

                `total_locs:` number of known locations input into this function

                `found_locs:` number of predicted locations

                `holdout_ratio:` ratio of the holdout set to the entire set
    '''
    reserved_locs = locs_known.filter(lambda (src_id, loc): not holdout_func(src_id))
    num_locs = reserved_locs.count()
    total_locs = locs_known.count()
    print('Total Locations %s' % total_locs)

    results = slp_closure(reserved_locs, edges)

    errors = results\
        .filter(lambda (src_id, loc): holdout_func(src_id))\
        .join(locs_known)\
        .map(lambda (src_id, (vtx_found, vtx_actual)):
             (src_id, (haversine(vtx_found.geo_coord, vtx_actual.geo_coord), vtx_found)))

    errors_local = errors.map(lambda (src_id, (dist, est_loc)): dist).collect()

    # Because we cannot easily calculate a median on RDDs we bring the deltas local for stats calculations.
    # With larger datasets, we may need to do this in the cluster, but for now will leave.
    return (errors, {
        'median': np.median(errors_local),
        'mean': np.mean(errors_local),
        'coverage': len(errors_local) / float(total_locs - num_locs),
        'reserved_locs': num_locs,
        'total_locs': total_locs,
        'found_locs': len(errors_local),
        'holdout_ratio': 1 - num_locs / float(total_locs)
    })
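# Illustrative driver for evaluate(). The holdout and trainer closures mirror the
# docstring examples above; `locs_known` and `edges` are assumed to be RDDs loaded
# elsewhere, and the `slp` module providing train_slp is assumed to be imported.
def example_evaluate(locs_known, edges):
    holdout = lambda src_id: src_id[-1] == '6'
    trainer = lambda locs, e: slp.train_slp(locs, e, 4, neighbor_threshold=4, dispersion_threshold=150)
    errors_rdd, stats = evaluate(locs_known, edges, holdout, trainer)
    print('SLP median error: %s km over %s found locations' % (stats['median'], stats['found_locs']))
    return stats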
def train(self, all_tweets, predictions_curve=None):
    options = self.sc.broadcast(self.options)
    all_tweets.registerTempTable(self.options['temp_table_name'])

    # Helper function exploits a python closure to pass options to map tasks
    def median_point_w_options_generator(num_points_req_for_known, home_radius_for_known):
        return (lambda x: median_point(x, num_points_req=num_points_req_for_known,
                                       return_dispersion=True,
                                       dispersion_treshold=home_radius_for_known))

    print 'Building edge list'
    # Build full_edge_list as a bi-directional graph
    # The first flatMap turns (src, [dsts]) -> [(canonical order, (src, dst)), ...]
    # Group by key turns that into [(canonical order, [(src, dst), (src, dst), ...]), ...]
    # The 2nd flatMap filters out non-bidirectional edges and transforms
    #   [(canonical order, [(src, dst), (src, dst), ...]), ...] -> [(src1, dst1), (src1, dst2), ...]
    # coalesce then reduces the number of partitions in the edge list
    full_edge_list = self.sqlCtx.sql('select user.id_str, entities.user_mentions from %s where size(entities.user_mentions) > 0' %
                                     self.options['temp_table_name'])\
        .flatMap(SLP.get_at_mentions).groupByKey()\
        .flatMap(lambda (a, b): SLP.filter_non_bidirectional(b)).coalesce(300)
    full_edge_list.cache()
    self.full_edge_list = full_edge_list

    print 'Finding known user locations'
    # Find known user locations
    # First map turns Row(id_str, coordinates) -> (id_str, coordinates)
    # Group by key turns (id_str, coordinates) -> (id_str, [coordinates1, coordinates2, ...])
    # Calculate the median point of the locations (id_str, [coordinates1, ...]) -> (id_str, median_location)
    # coalesce then reduces the number of partitions
    median_point_w_options = median_point_w_options_generator(self.options['num_points_req_for_known'],
                                                              self.options['home_radius_for_known'])
    original_user_locations = self.sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null' %
                                              self.options['temp_table_name'])\
        .map(lambda a: (a.id_str, a.coordinates))\
        .groupByKey().flatMapValues(lambda input_locations: median_point_w_options(input_locations)).coalesce(300)

    # Save a reference to all locations if we are going to test immediately afterwards
    self.all_user_locations = original_user_locations

    print 'Filtering out user locations that end in:', ','.join(list(self.options['hold_out']))
    filter_function = lambda (a, b): a[-1] not in options.value['hold_out']
    original_user_locations = original_user_locations.filter(filter_function)
    original_user_locations.cache()

    # Propagate locations
    updated_locations = original_user_locations

    if predictions_curve is None:
        print 'Building the error estimation curve'
        # For the users in the full edge list, determine the median point of their located neighbors
        # Define a new median point generator which now returns the neighbor dispersion and the
        # standard deviation of the dispersion
        def median_point_w_options_generator(num_located_neighbors_req, dispersion_threshold):
            return (lambda x: median_point(x, num_points_req=num_located_neighbors_req,
                                           return_dispersion=True,
                                           dispersion_treshold=dispersion_threshold,
                                           use_usr_ids=True))

        median_point_w_options = median_point_w_options_generator(self.options['num_points_req_for_known'],
                                                                  self.options['home_radius_for_known'])
        user_location_only = original_user_locations.map(lambda (a, b): (a, b[0]))
        adj_list_w_locations = full_edge_list.join(user_location_only)\
            .map(lambda (a, b): (b[0], (b[1], a))).groupByKey()
        neighbor_locations = adj_list_w_locations.flatMapValues(lambda input_locations: median_point_w_options(input_locations))
        network_info = user_location_only.join(neighbor_locations)
        std_mults = network_info.map(
            lambda (id_str, (lat0, (lat1, disp, mean_dis, std_dev))):
            (haversine(lat0[1], lat0[0], lat1[1], lat1[0]) - disp) / std_dev)
        std_mults_loc = std_mults.collect()
        sorted_vals = np.sort(std_mults_loc)
        yvals = np.arange(len(sorted_vals)) / float(len(sorted_vals))
        self.predictions_curve = pd.DataFrame(np.column_stack((sorted_vals, yvals)),
                                              columns=["std_range", "pct_within_med"])
    else:
        self.predictions_curve = predictions_curve

    print 'Building a filtered edge list'
    # Build a filtered edge list so we don't ever try to approximate the known user locations
    filtered_edge_list = full_edge_list.keyBy(lambda (a, b): b).leftOuterJoin(updated_locations)\
        .flatMap(lambda (a, b): [b[0]] if b[1] is None else [])
    filtered_edge_list.cache()

    self.updated_locations = updated_locations
    self.original_user_locations = original_user_locations
    self.filtered_edge_list = filtered_edge_list

    print 'Beginning iterations'
    # Perform iterations
    start_time = time.time()
    for i in range(self.options['num_iters']):
        if i + 1 == self.options['num_iters']:
            self.do_iteration(True)
        else:
            self.do_iteration(False)
    print 'Completed training', time.time() - start_time
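# End-to-end sketch of driving the SLP class above: train on one DataFrame of tweets,
# then score against the held-out users. The constructor signature and the option
# values shown here are assumptions inferred from how the options dict is used in
# train()/test(); treat this as illustrative only, not the real configuration.
def example_train_and_test(sc, sqlCtx, tweets_df):
    options = {'temp_table_name': 'tweets',
               'num_points_req_for_known': 3,
               'home_radius_for_known': 30,
               'dispersion_threshold': 100,
               'hold_out': set('6'),
               'num_iters': 5}
    estimator = SLP(sc, sqlCtx, options)   # hypothetical constructor
    estimator.train(tweets_df)
    results = estimator.test(tweets_df, skip_load=True)
    return results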