Example #1
 def predict_probability_area(self, upper_bound, lower_bound, center, med_error, std_dev):
     #For now this function will return the minimum and maximum probability using the circle prediction algorithm
     (lat, lon) = center
     (max_lat, max_lon) = upper_bound
     (min_lat, min_lon) = lower_bound
     top_dist = haversine(lon, lat, lon, max_lat)
     bottom_dist = haversine(lon, lat, lon, min_lat)
     r_dist = haversine(lon, lat, max_lon, lat)
     l_dist = haversine(lon, lat, min_lon, lat)
     min_dist = min([top_dist, bottom_dist, r_dist, l_dist])
     max_dist = max([top_dist, bottom_dist, r_dist, l_dist])
     min_prob = SLP.predict_probability_radius(min_dist, med_error, std_dev)
     max_prob = SLP.predict_probability_radius(max_dist, med_error, std_dev)
     return (min_prob, max_prob)
Example #2
 def predict_probability_area(self, upper_bound, lower_bound, center,
                              med_error, std_dev):
     #For now this function will return the minimum and maximum probability using the circle prediction algorithm
     (lat, lon) = center
     (max_lat, max_lon) = upper_bound
     (min_lat, min_lon) = lower_bound
     top_dist = haversine(lon, lat, lon, max_lat)
     bottom_dist = haversine(lon, lat, lon, min_lat)
     r_dist = haversine(lon, lat, max_lon, lat)
     l_dist = haversine(lon, lat, min_lon, lat)
     min_dist = min([top_dist, bottom_dist, r_dist, l_dist])
     max_dist = max([top_dist, bottom_dist, r_dist, l_dist])
     min_prob = SLP.predict_probability_radius(min_dist, med_error, std_dev)
     max_prob = SLP.predict_probability_radius(max_dist, med_error, std_dev)
     return (min_prob, max_prob)
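The two examples above call a four-argument haversine(lon1, lat1, lon2, lat2) helper that is not shown on this page. A minimal sketch of a compatible implementation, assuming coordinates in decimal degrees and distances in kilometres (both inferred from the call sites, not taken from the original project), might look like this:

import math

def haversine(lon1, lat1, lon2, lat2, radius_km=6371.0):
    """Great-circle distance in kilometres between two (lon, lat) points.

    Sketch only: the argument order and units are assumptions based on how
    the examples above call the helper.
    """
    lon1, lat1, lon2, lat2 = map(math.radians, (lon1, lat1, lon2, lat2))
    h = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    return 2 * radius_km * math.asin(math.sqrt(h))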
Example #3
 def get_errors(model, points):
     """Computes the median error for a GMM and a set of training points"""
     (best_lat, best_lon) = model.means_[np.argmax(model.weights_)]
     errors = []
     for point in points:
         (lat, lon) = point
         error = haversine(best_lon, best_lat, lon, lat)
         errors.append(error)
     return np.median(errors)
Example #4
 def get_errors(model, points):
     """Computes the median error for a GMM and a set of training points"""
     (best_lat, best_lon) = model.means_[np.argmax(model.weights_)]
     errors = []
     for point in points:
         (lat, lon) = point
         error = haversine(best_lon, best_lat, lon, lat)
         errors.append(error)
     return np.median(errors)
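get_errors only needs a fitted mixture exposing means_ and weights_. The docstrings elsewhere on this page name sklearn's mixture.GMM; in current scikit-learn the equivalent class is GaussianMixture, which exposes the same attributes, so a hedged usage sketch (the training points are made up, and the four-argument haversine helper sketched above is assumed to be in scope) could be:

import numpy as np
from sklearn.mixture import GaussianMixture

# Hypothetical (lat, lon) training points for a single token
points = [(38.90, -77.04), (38.92, -77.02), (40.71, -74.01)]

model = GaussianMixture(n_components=2).fit(np.array(points))
# Median haversine distance from the most likely component mean to the points
print(get_errors(model, points))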
Example #5
def run_gmm_test(sc, sqlCtx, table_name, fields, model, where_clause=''):
    """
    Test a pretrained model on a table of test data

    Args:
        sc (pyspark.SparkContext): Spark Context to use for execution
        sqlCtx (pyspark.sql.SQLContext): Spark SQL Context to use for sql queries
        table_name (str): Table name to query for test data
        fields (list): List of field names to extract and then use for GMM prediction
        model (dict): Dictionary of {word:(mixture.GMM, error)}
        where_clause (str): A where clause that can be applied to the query

    Returns:
        final_results (dict): A description of the performance of the GMM Algorithm
    """
    tweets_w_geo = sqlCtx.sql(
        'select geo, entities,  extended_entities, %s from %s where geo.coordinates is not null %s'
        % (','.join(fields), table_name, where_clause))

    # for each tweet calculate most likely position
    model_bcast = sc.broadcast(model)

    errors_rdd = tweets_w_geo.rdd.keyBy(lambda row: get_location_from_tweet(row))\
                                .flatMapValues(lambda row: get_most_likely_point(tokenize_tweet(row, fields), model_bcast))\
                                .map(lambda (true_geo_coord, est_loc): haversine(true_geo_coord, est_loc.geo_coord))

    errors = np.array(errors_rdd.collect())
    num_vals = tweets_w_geo.count()
    errors = errors[np.isnan(errors) == False]

    median_error = np.median(errors)
    mean_error = np.mean(errors)
    print('Median Error', median_error)
    print('Mean Error: ', mean_error)

    # calculate coverage
    try:
        coverage = len(errors) / float(num_vals)
    except ZeroDivisionError:
        coverage = np.nan

    # gather errors
    final_results = {
        'median': median_error,
        'mean': mean_error,
        'coverage': coverage,
        'num_locs': len(errors),
        'fields': fields
    }
    return final_results
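A hedged usage sketch of run_gmm_test follows; the table name, field list, and trained model dict are hypothetical, and a live SparkContext/SQLContext with the tweet table already registered is assumed:

# model maps word -> (fitted mixture, median training error), as described in the docstring
results = run_gmm_test(sc, sqlCtx, 'tweets_test',
                       fields=['text', 'user.location'],
                       model=trained_gmm_model,
                       where_clause="and lang = 'en'")
print(results['median'])
print(results['coverage'])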
Example #6
    def test(self, all_tweets, skip_load=False):
        # Push config to all nodes
        options = self.sc.broadcast(self.options)
        all_tweets.registerTempTable(self.options['temp_table_name'])

        if skip_load and  self.all_user_locations is not None:
            # If we've just trained then there is no need to go back to original data
            original_user_locations = self.all_user_locations
        else:

            # Find Known user locations
            # First map turns Row(id_str, coordinates) -> (id_str, coordinates)
            # Group by key turns (id_str, coordinates) -> (id_str, [coordinates1, coordinates2, ..])
            # Filter removes entries without at least 3 locations
            # Calculate the median point of the locations (id_str, [coordinates1,..]) -> (id_str, median_location)
            # coalesce then reduces the number of partitions
            def median_point_w_options_generator(num_points_req, dispersion_threshold):
                return (lambda x: median_point(x, num_points_req=num_points_req, return_dispersion=False, dispersion_treshold=dispersion_threshold))

            f = median_point_w_options_generator(self.options['num_points_req_for_known'],self.options['dispersion_threshold'])
            original_user_locations = self.sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null'%\
                self.options['temp_table_name'])\
                .map(lambda a: (a.id_str, a.coordinates))\
                .groupByKey().flatMapValues(lambda input_locations:f(input_locations)).coalesce(300)

        # Filter users that might have been in training set
        filter_function = lambda (a,b): a[-1] in options.value['hold_out']
        original_user_locations = original_user_locations.filter(filter_function)
        number_locations = original_user_locations.count()

        found_locations = original_user_locations.join(self.updated_locations.map(lambda (a,b): (a, b[0])))
        found_locations_local = found_locations.collect()
        print 'Number of Found Locations: ', len(found_locations_local)
        errors = []
        for (id_str, ll_tuple) in found_locations_local:
            (ll_1,ll_2) = ll_tuple
            errors.append(haversine(ll_1[1], ll_1[0], ll_2[1], ll_2[0]))

        median_error = np.median(errors)
        mean_error = np.mean(errors)
        print('Median Error', median_error)
        print('Mean Error: ', mean_error)
        # gather errors
        final_results = {'median': median_error, 'mean': mean_error, 'coverage': len(errors)/float(number_locations),
                         'num_locs': number_locations,
                         'iterations_completed': self.iterations_completed, 'options': self.options}

        return final_results
Example #7
 def compute_error_using_model(input_val, model=None):
     """ Given a model that maps tokens -> GMMs this will compute the most likely point and return the distance
         from the most likely point to the true location"""
     (location, tokens) = input_val
     true_lat, true_lon = location
     models = []
     for token in tokens:
         if token in model:
             models.append(model[token])
     if len(models) > 1:
         combined_gmm = GMM.combine_gmms(models)
         (best_lat, best_lon) = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
     elif len(models) == 1:
         (best_lat, best_lon) = models[0][0].means_[np.argmax(models[0][0].weights_)]
     else:
         return np.nan
     distance = haversine(best_lon, best_lat, true_lon, true_lat)
     return distance
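compute_error_using_model expects a single (location, tokens) pair, so in a Spark job it is typically mapped over an RDD of such records with the token-to-GMM model broadcast to the workers. A minimal sketch, assuming a hypothetical tokenized_rdd of ((lat, lon), [token, ...]) records and a trained model dict:

model_bcast = sc.broadcast(trained_gmm_model)
errors_rdd = tokenized_rdd.map(
    lambda val: compute_error_using_model(val, model=model_bcast.value))
# Drop the np.nan entries produced for records with no known tokens
errors = [e for e in errors_rdd.collect() if not np.isnan(e)]
median_error = np.median(errors)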
Example #8
def run_gmm_test(sc, sqlCtx, table_name, fields, model, where_clause=''):
    """
    Test a pretrained model on a table of test data

    Args:
        sc (pyspark.SparkContext): Spark Context to use for execution
        sqlCtx (pyspark.sql.SQLContext): Spark SQL Context to use for sql queries
        table_name (str): Table name to query for test data
        fields (list): List of field names to extract and then use for GMM prediction
        model (dict): Dictionary of {word:(mixture.GMM, error)}
        where_clause (str): A where clause that can be applied to the query

    Returns:
        final_results (dict): A description of the performance of the GMM Algorithm
    """
    tweets_w_geo = sqlCtx.sql('select geo, entities,  extended_entities, %s from %s where geo.coordinates is not null %s'
                                   % (','.join(fields), table_name, where_clause))

    # for each tweet calculate most likely position
    model_bcast = sc.broadcast(model)

    errors_rdd = tweets_w_geo.rdd.keyBy(lambda row: get_location_from_tweet(row))\
                                .flatMapValues(lambda row: get_most_likely_point(tokenize_tweet(row, fields), model_bcast))\
                                .map(lambda (true_geo_coord, est_loc): haversine(true_geo_coord, est_loc.geo_coord))

    errors = np.array(errors_rdd.collect())
    num_vals = tweets_w_geo.count()
    errors = errors[np.isnan(errors) == False]

    median_error = np.median(errors)
    mean_error = np.mean(errors)
    print('Median Error', median_error)
    print('Mean Error: ', mean_error)

    # calculate coverage
    try:
        coverage = len(errors)/float(num_vals)
    except ZeroDivisionError:
        coverage = np.nan

    # gather errors
    final_results = {'median': median_error, 'mean': mean_error, 'coverage': coverage,
                     'num_locs': len(errors), 'fields': fields}
    return final_results
Example #9
def get_errors(model, points):
    """
    Computes the median error for a GMM model and a set of training points

    Args:
        model (mixture.GMM): A GMM model for a word
        points (list): A list of (lat, lon) tuples

    Returns:
        median (float): The median distance to the training points from the most likely point
    """
    (best_lat, best_lon) = model.means_[np.argmax(model.weights_)]
    best_point = GeoCoord(lat=best_lat, lon=best_lon)
    errors = []
    for (lat, lon) in points:
        point = GeoCoord(lat, lon)
        error = haversine(best_point, point)
        errors.append(error)
    median = np.median(errors)
    return median
Example #10
def get_errors(model, points):
    """
    Computes the median error for a GMM model and a set of training points

    Args:
        model (mixture.GMM): A GMM model for a word
        points (list): A list of (lat, lon) tuples

    Returns:
        median (float): The median distance to the training points from the most likely point
    """
    (best_lat, best_lon) = model.means_[np.argmax(model.weights_)]
    best_point = GeoCoord(lat=best_lat, lon=best_lon)
    errors = []
    for (lat, lon) in points:
        point = GeoCoord(lat, lon)
        error = haversine(best_point, point)
        errors.append(error)
    median = np.median(errors)
    return median
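Examples #9 and #10 (like #5, #8 and #12) instead pass GeoCoord objects to a two-argument haversine(point_a, point_b); neither the type nor the function is defined on this page. One minimal sketch under that assumption, with GeoCoord modelled as a plain namedtuple:

import math
from collections import namedtuple

# Assumption: GeoCoord is just a (lat, lon) record.
GeoCoord = namedtuple('GeoCoord', ['lat', 'lon'])

def haversine(a, b, radius_km=6371.0):
    """Great-circle distance in kilometres between two GeoCoord points (sketch only)."""
    lat1, lon1, lat2, lon2 = map(math.radians, (a.lat, a.lon, b.lat, b.lon))
    h = math.sin((lat2 - lat1) / 2) ** 2 + \
        math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2
    return 2 * radius_km * math.asin(math.sqrt(h))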
Example #11
 def compute_error_using_model(input_val, model=None):
     """ Given a model that maps tokens -> GMMs this will compute the most likely point and return the distance
         from the most likely point to the true location"""
     (location, tokens) = input_val
     true_lat, true_lon = location
     models = []
     for token in tokens:
         if token in model:
             models.append(model[token])
     if len(models) > 1:
         combined_gmm = GMM.combine_gmms(models)
         (best_lat,
          best_lon) = combined_gmm.means_[np.argmax(combined_gmm.weights_)]
     elif len(models) == 1:
         (best_lat,
          best_lon) = models[0][0].means_[np.argmax(models[0][0].weights_)]
     else:
         return np.nan
     distance = haversine(best_lon, best_lat, true_lon, true_lat)
     return distance
Example #12
def evaluate(locs_known, edges, holdout_func, slp_closure):
    '''
    This function is used to assess various stats regarding how well SLP is running.
    Given all locs that are known and all edges that are known, this function will first
    apply the holdout to the locs_known, allowing for a ground truth comparison to be used.
    Then, it applies the non-holdout set to the training function, which should yield the
    locations of the holdout for comparison.

        For example::

            holdout = lambda (src_id) : src_id[-1] == '6'
            trainer = lambda l, e : slp.train_slp(l, e, 3)
            results = evaluate(locs_known, edges, holdout, trainer)

    Args:
        locs_known (rdd of LocEstimate objects) : The complete list of locations

        edges (rdd of (src_id, (dest_id, weight))): all available edge information

        holdout_func (function) : function responsible for filtering a holdout data set. For example::

                lambda (src_id) : src_id[-1] == '6'

            can be used to get approximately 10% of the data since the src_id's are evenly distributed numeric values

        slp_closure (function closure): a closure over the slp train function. For example::

                lambda locs, edges :\n
                        slp.train_slp(locs, edges, 4, neighbor_threshold=4, dispersion_threshold=150)

            can be used for training with specific threshold parameters


    Returns:
        results (dict) : stats of the results from the SLP algorithm

            `median:` median difference of predicted versus actual

            `mean:` mean difference of predicted versus actual

            `coverage:` ratio of number of predicted locations to number of  original unknown locations

            `reserved_locs:` number of known locations used to train

            `total_locs:` number of known locations input into this function

            `found_locs:` number of predicted locations

            `holdout_ratio:` ratio of the holdout set to the entire set
        '''

    reserved_locs = locs_known.filter(lambda (src_id, loc): not holdout_func(src_id))
    num_locs = reserved_locs.count()
    total_locs = locs_known.count()

    print('Total Locations %s' % total_locs)

    results = slp_closure(reserved_locs, edges)

    errors = results\
        .filter(lambda (src_id, loc): holdout_func(src_id))\
        .join(locs_known)\
        .map(lambda (src_id, (vtx_found, vtx_actual)) :\
             (src_id, (haversine(vtx_found.geo_coord, vtx_actual.geo_coord), vtx_found)))

    errors_local = errors.map(lambda (src_id, (dist, est_loc)) : dist).collect()

    # Because we cannot easily calculate a median on RDDs, we bring the deltas local for the stats calculations.
    # With larger datasets we may need to do this in the cluster, but for now we keep it local.
    return (errors, {
        'median': np.median(errors_local),
        'mean': np.mean(errors_local),
        'coverage':len(errors_local)/float(total_locs - num_locs),
        'reserved_locs': num_locs,
        'total_locs':total_locs,
        'found_locs': len(errors_local),
        'holdout_ratio' : 1 - num_locs/float(total_locs)
    })
Example #13
    def train(self, all_tweets, predictions_curve=None):
        options = self.sc.broadcast(self.options)
        all_tweets.registerTempTable(self.options['temp_table_name'])

        # Helper function exploits python closure to pass options to map tasks
        def median_point_w_options_generator(num_points_req_for_known, home_radius_for_known):
            return (lambda x: median_point(x, num_points_req=num_points_req_for_known, return_dispersion=True,
                                           dispersion_treshold=home_radius_for_known))

        print 'Building edge list'
        # Build full_edge_list
        # Build Bi-directional graph
        # The first flatMap turns (src, [dsts]) -> [(canonical order, (src, dst)), ...]
        # Group by key turns that into [(canonical order, [(src, dst), (src, dst), ..]), ...]
        # The 2nd flatMap filters out non-bidirectional edges and
        #    transforms [(canonical order, [(src, dst), (src, dst), ..]), ...] -> [(src1, dst1), (src1, dst2)]
        # coalesce then reduces the number of partitions in the edge list
        full_edge_list = self.sqlCtx.sql('select user.id_str, entities.user_mentions from %s where size(entities.user_mentions) > 0'%\
            self.options['temp_table_name'])\
            .flatMap(SLP.get_at_mentions).groupByKey()\
            .flatMap(lambda (a,b): SLP.filter_non_bidirectional(b)).coalesce(300)
        full_edge_list.cache()
        self.full_edge_list = full_edge_list

        print 'Finding known user locations'
        # Find Known user locations
        # First map turns Row(id_str, coordinates) -> (id_str, coordinates)
        # Group by key turns (id_str, coordinates) -> (id_str, [coordinates1, coordinates2, ..])
        # Calculate the median point of the locations (id_str, [coordinates1,..]) -> (id_str, median_location)
        # coalesce then reduces the number of partitions
        median_point_w_options = median_point_w_options_generator(self.options['num_points_req_for_known'],\
                                                                  self.options['home_radius_for_known'])
        original_user_locations = self.sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null'%\
            self.options['temp_table_name'])\
            .map(lambda a: (a.id_str, a.coordinates))\
            .groupByKey().flatMapValues(lambda input_locations:
                                            median_point_w_options(input_locations)).coalesce(300)

        # Save a reference to all locations if we are going to test immediately afterwards
        self.all_user_locations = original_user_locations
        print 'Filtering out user locations that end in:', ','.join(list(self.options['hold_out']))
        filter_function = lambda (a,b): a[-1] not in options.value['hold_out']
        original_user_locations = original_user_locations.filter(filter_function)
        original_user_locations.cache()
        # Propagate locations
        updated_locations = original_user_locations

        if predictions_curve is None:
            print 'Building the error estimation curve'
            # For the users in the full edge list, determine all neighbors median point of the neighbors
            # Define a new median points generator which now returns the neighbor dispersion and standard dev of the dispersion
            def median_point_w_options_generator(num_located_neighbors_req, dispersion_threshold):
                return (lambda x: median_point(x, num_points_req=num_located_neighbors_req, return_dispersion=True,
                                               dispersion_treshold=dispersion_threshold, use_usr_ids=True))
            median_point_w_options = median_point_w_options_generator(self.options['num_points_req_for_known'],\
                                                                      self.options['home_radius_for_known'])

            user_location_only = original_user_locations.map(lambda (a,b): (a, b[0]))
            adj_list_w_locations = full_edge_list.join(user_location_only).map(lambda (a,b): (b[0], (b[1],a))).groupByKey()
            neighbor_locations = adj_list_w_locations.flatMapValues(lambda input_locations:median_point_w_options(input_locations))
            network_info = user_location_only.join(neighbor_locations)

            std_mults = network_info.map\
                (lambda (id_str,(lat0,(lat1, disp, mean_dis, std_dev))) : (haversine(lat0[1], lat0[0], lat1[1], lat1[0]) - disp)/std_dev)

            std_mults_loc = std_mults.collect()
            sorted_vals = np.sort(std_mults_loc)
            yvals=np.arange(len(sorted_vals))/float(len(sorted_vals))
            self.predictions_curve = pd.DataFrame(np.column_stack((sorted_vals, yvals)), columns=["std_range", "pct_within_med"])
        else:
            self.predictions_curve = predictions_curve

        print 'Building a filtered edge list'
        # Build a filtered edge list so we don't ever try to approximate the known user locations
        filtered_edge_list = full_edge_list.keyBy(lambda (a, b): b).leftOuterJoin(updated_locations)\
                .flatMap(lambda (a,b): [b[0]] if b[1] is None else [])
        filtered_edge_list.cache()

        self.updated_locations = updated_locations
        self.original_user_locations = original_user_locations
        self.filtered_edge_list = filtered_edge_list

        print 'Beginning iterations'
        # Perform iterations
        start_time = time.time()
        for i in range(self.options['num_iters']):
            if i + 1 == self.options['num_iters']:
                self.do_iteration(True)
            else:
                self.do_iteration(False)

        print 'Completed training', time.time() - start_time
Example #14
def evaluate(locs_known, edges, holdout_func, slp_closure):
    '''
    This function is used to assess various stats regarding how well SLP is running.
    Given all locs that are known and all edges that are known, this function will first
    apply the holdout to the locs_known, allowing for a ground truth comparison to be used.
    Then, it applies the non-holdout set to the training function, which should yield the
    locations of the holdout for comparison.

        For example::

            holdout = lambda (src_id) : src_id[-1] == '6'
            trainer = lambda l, e : slp.train_slp(l, e, 3)
            results = evaluate(locs_known, edges, holdout, trainer)

    Args:
        locs_known (rdd of LocEstimate objects) : The complete list of locations

        edges (rdd of (src_id, (dest_id, weight))): all available edge information

        holdout_func (function) : function responsible for filtering a holdout data set. For example::

                lambda (src_id) : src_id[-1] == '6'

            can be used to get approximately 10% of the data since the src_id's are evenly distributed numeric values

        slp_closure (function closure): a closure over the slp train function. For example::

                lambda locs, edges :\n
                        slp.train_slp(locs, edges, 4, neighbor_threshold=4, dispersion_threshold=150)

            can be used for training with specific threshold parameters


    Returns:
        results (dict) : stats of the results from the SLP algorithm

            `median:` median difference of predicted versus actual

            `mean:` mean difference of predicted versus actual

            `coverage:` ratio of number of predicted locations to number of  original unknown locations

            `reserved_locs:` number of known locations used to train

            `total_locs:` number of known locations input into this function

            `found_locs:` number of predicted locations

            `holdout_ratio:` ratio of the holdout set to the entire set
        '''

    reserved_locs = locs_known.filter(lambda
                                      (src_id, loc): not holdout_func(src_id))
    num_locs = reserved_locs.count()
    total_locs = locs_known.count()

    print('Total Locations %s' % total_locs)

    results = slp_closure(reserved_locs, edges)

    errors = results\
        .filter(lambda (src_id, loc): holdout_func(src_id))\
        .join(locs_known)\
        .map(lambda (src_id, (vtx_found, vtx_actual)) :\
             (src_id, (haversine(vtx_found.geo_coord, vtx_actual.geo_coord), vtx_found)))

    errors_local = errors.map(lambda (src_id, (dist, est_loc)): dist).collect()

    # Because we cannot easily calculate a median on RDDs, we bring the deltas local for the stats calculations.
    # With larger datasets we may need to do this in the cluster, but for now we keep it local.
    return (errors, {
        'median': np.median(errors_local),
        'mean': np.mean(errors_local),
        'coverage': len(errors_local) / float(total_locs - num_locs),
        'reserved_locs': num_locs,
        'total_locs': total_locs,
        'found_locs': len(errors_local),
        'holdout_ratio': 1 - num_locs / float(total_locs)
    })
Example #15
    def test(self, all_tweets, skip_load=False):
        # Push config to all nodes
        options = self.sc.broadcast(self.options)
        all_tweets.registerTempTable(self.options['temp_table_name'])

        if skip_load and self.all_user_locations is not None:
            # If we've just trained then there is no need to go back to original data
            original_user_locations = self.all_user_locations
        else:

            # Find Known user locations
            # First map turns Row(id_str, coordinates) -> (id_str, coordinates)
            # Group by key turns (id_str, coordinates) -> (id_str, [coordinates1, coordinates2, ..])
            # Filter removes entries without at least 3 locations
            # Calculate the median point of the locations (id_str, [coordinates1,..]) -> (id_str, median_location)
            # coalesce then reduces the number of partitions
            def median_point_w_options_generator(num_points_req,
                                                 dispersion_threshold):
                return (lambda x: median_point(x,
                                               num_points_req=num_points_req,
                                               return_dispersion=False,
                                               dispersion_treshold=
                                               dispersion_threshold))

            f = median_point_w_options_generator(
                self.options['num_points_req_for_known'],
                self.options['dispersion_threshold'])
            original_user_locations = self.sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null'%\
                self.options['temp_table_name'])\
                .map(lambda a: (a.id_str, a.coordinates))\
                .groupByKey().flatMapValues(lambda input_locations:f(input_locations)).coalesce(300)

        # Filter users that might have been in training set
        filter_function = lambda (a, b): a[-1] in options.value['hold_out']
        original_user_locations = original_user_locations.filter(
            filter_function)
        number_locations = original_user_locations.count()

        found_locations = original_user_locations.join(
            self.updated_locations.map(lambda (a, b): (a, b[0])))
        found_locations_local = found_locations.collect()
        print 'Number of Found Locations: ', len(found_locations_local)
        errors = []
        for (id_str, ll_tuple) in found_locations_local:
            (ll_1, ll_2) = ll_tuple
            errors.append(haversine(ll_1[1], ll_1[0], ll_2[1], ll_2[0]))

        median_error = np.median(errors)
        mean_error = np.mean(errors)
        print('Median Error', median_error)
        print('Mean Error: ', mean_error)
        # gather errors
        final_results = {
            'median': median_error,
            'mean': mean_error,
            'coverage': len(errors) / float(number_locations),
            'num_locs': number_locations,
            'iterations_completed': self.iterations_completed,
            'options': self.options
        }

        return final_results
Example #16
    def train(self, all_tweets, predictions_curve=None):
        options = self.sc.broadcast(self.options)
        all_tweets.registerTempTable(self.options['temp_table_name'])

        # Helper function exploits python closure to pass options to map tasks
        def median_point_w_options_generator(num_points_req_for_known,
                                             home_radius_for_known):
            return (lambda x: median_point(
                x,
                num_points_req=num_points_req_for_known,
                return_dispersion=True,
                dispersion_treshold=home_radius_for_known))

        print 'Building edge list'
        # Build full_edge_list
        # Build Bi-directional graph
        # The first flatMap turns (src, [dsts]) -> [(canonical order, (src, dst)), ...]
        # Group by key turns that into [(canonical order, [(src, dst), (src, dst), ..]), ...]
        # The 2nd flatMap filters out non-bidirectional edges and
        #    transforms [(canonical order, [(src, dst), (src, dst), ..]), ...] -> [(src1, dst1), (src1, dst2)]
        # coalesce then reduces the number of partitions in the edge list
        full_edge_list = self.sqlCtx.sql('select user.id_str, entities.user_mentions from %s where size(entities.user_mentions) > 0'%\
            self.options['temp_table_name'])\
            .flatMap(SLP.get_at_mentions).groupByKey()\
            .flatMap(lambda (a,b): SLP.filter_non_bidirectional(b)).coalesce(300)
        full_edge_list.cache()
        self.full_edge_list = full_edge_list

        print 'Finding known user locations'
        # Find Known user locations
        # First map turns Row(id_str, coordinates) -> (id_str, coordinates)
        # Group by key turns (id_str, coordinates) -> (id_str, [coordinates1, coordinates2, ..])
        # Calculate the median point of the locations (id_str, [coordinates1,..]) -> (id_str, median_location)
        # coalesce then reduces the number of partitions
        median_point_w_options = median_point_w_options_generator(self.options['num_points_req_for_known'],\
                                                                  self.options['home_radius_for_known'])
        original_user_locations = self.sqlCtx.sql('select user.id_str, geo.coordinates from %s where geo.coordinates is not null'%\
            self.options['temp_table_name'])\
            .map(lambda a: (a.id_str, a.coordinates))\
            .groupByKey().flatMapValues(lambda input_locations:
                                            median_point_w_options(input_locations)).coalesce(300)

        # Save a reference to all locations if we are going to test immediately afterwards
        self.all_user_locations = original_user_locations
        print 'Filtering out user locations that end in:', ','.join(
            list(self.options['hold_out']))
        filter_function = lambda (a, b): a[-1] not in options.value['hold_out']
        original_user_locations = original_user_locations.filter(
            filter_function)
        original_user_locations.cache()
        # Propagate locations
        updated_locations = original_user_locations

        if predictions_curve is None:
            print 'Building the error estimation curve'

            # For the users in the full edge list, determine all neighbors median point of the neighbors
            # Define a new median points generator which now returns the neighbor dispersion and standard dev of the dispersion
            def median_point_w_options_generator(num_located_neighbors_req,
                                                 dispersion_threshold):
                return (lambda x: median_point(
                    x,
                    num_points_req=num_located_neighbors_req,
                    return_dispersion=True,
                    dispersion_treshold=dispersion_threshold,
                    use_usr_ids=True))
            median_point_w_options = median_point_w_options_generator(self.options['num_points_req_for_known'],\
                                                                      self.options['home_radius_for_known'])

            user_location_only = original_user_locations.map(lambda (a, b):
                                                             (a, b[0]))
            adj_list_w_locations = full_edge_list.join(user_location_only).map(
                lambda (a, b): (b[0], (b[1], a))).groupByKey()
            neighbor_locations = adj_list_w_locations.flatMapValues(
                lambda input_locations: median_point_w_options(input_locations
                                                               ))
            network_info = user_location_only.join(neighbor_locations)

            std_mults = network_info.map\
                (lambda (id_str,(lat0,(lat1, disp, mean_dis, std_dev))) : (haversine(lat0[1], lat0[0], lat1[1], lat1[0]) - disp)/std_dev)

            std_mults_loc = std_mults.collect()
            sorted_vals = np.sort(std_mults_loc)
            yvals = np.arange(len(sorted_vals)) / float(len(sorted_vals))
            self.predictions_curve = pd.DataFrame(
                np.column_stack((sorted_vals, yvals)),
                columns=["std_range", "pct_within_med"])
        else:
            self.predictions_curve = predictions_curve

        print 'Building a filtered edge list'
        # Build a filtered edge list so we don't ever try to approximate the known user locations
        filtered_edge_list = full_edge_list.keyBy(lambda (a, b): b).leftOuterJoin(updated_locations)\
                .flatMap(lambda (a,b): [b[0]] if b[1] is None else [])
        filtered_edge_list.cache()

        self.updated_locations = updated_locations
        self.original_user_locations = original_user_locations
        self.filtered_edge_list = filtered_edge_list

        print 'Beginning iterations'
        # Perform iterations
        start_time = time.time()
        for i in range(self.options['num_iters']):
            if i + 1 == self.options['num_iters']:
                self.do_iteration(True)
            else:
                self.do_iteration(False)

        print 'Completed training', time.time() - start_time