Example #1
def _linearly_interpolate_infill_times(
        stop_times_orig_df: pd.DataFrame,
        use_multiprocessing: bool) -> pd.DataFrame:
    # Work on a copy so the caller's DataFrame is not mutated
    stops_times_df = stop_times_orig_df.copy()

    # Extract a list of all unique trip ids attached to the stop times
    target_trip_ids = stops_times_df['trip_id'].unique().tolist()

    # Monitor run time performance
    start_time = time.time()
    if use_multiprocessing:
        cpu_count = mp.cpu_count()
        log('Running parallelized trip times interpolation on '
            '{} processes'.format(cpu_count))

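        # The manager process holds a single shared TripTimesInterpolator
        # and hands each pool worker a proxy to it, so the full stop
        # times table is not copied into every subprocess (assumed
        # behavior of make_trip_time_interpolator_manager)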
        manager = make_trip_time_interpolator_manager()
        trip_times_interpolator = manager.TripTimesInterpolator(stops_times_df)

        with mp.Pool(processes=cpu_count) as pool:
            results = pool.starmap(_trip_times_interpolator_pool_map,
                                   [(trip_times_interpolator, trip_id)
                                    for trip_id in target_trip_ids])
    else:
        log('Running serialized trip times interpolation (no parallelization)')
        trip_times_interpolator = TripTimesInterpolator(stops_times_df)
        results = [trip_times_interpolator.generate_infilled_times(trip_id)
                   for trip_id in target_trip_ids]
    elapsed = round(time.time() - start_time, 2)
    log('Trip times interpolation complete. Execution time: {}s'.format(
        elapsed))

    # Take all the resulting dataframes and stack them together
    cleaned = []
    for times_sub in results:
        # Note: Extract values as lists to avoid otherwise-expensive
        #       df-to-df append operations
        cleaned.extend(times_sub.values.tolist())

    # Prepare for new df creation by getting the list of columns and
    # moving trip_id to the end, matching the row layout produced by
    # the interpolator
    cols = stops_times_df.columns.values.tolist()
    cols.remove('trip_id')
    cols.append('trip_id')

    # Convert matrices to a pandas DataFrame again
    cleaned_new_df = pd.DataFrame(cleaned, columns=cols)
    cleaned_new_df = cleaned_new_df.reset_index(drop=True)

    return cleaned_new_df
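
The parallel branch above assumes make_trip_time_interpolator_manager
returns a multiprocessing.managers.BaseManager subclass that serves a
single shared interpolator to the pool workers via proxies. A minimal,
self-contained sketch of that pattern follows; the names
SharedInterpolator, InterpolatorManager, and _pool_map are illustrative
stand-ins, not part of peartree.

import multiprocessing as mp
from multiprocessing.managers import BaseManager


class SharedInterpolator:
    # Stand-in for TripTimesInterpolator: the data lives once, in the
    # manager process, instead of being copied into every worker
    def __init__(self, data):
        self._data = data

    def generate_infilled_times(self, trip_id):
        return [trip_id, self._data.get(trip_id)]


class InterpolatorManager(BaseManager):
    pass


# Register the class so manager.SharedInterpolator(...) returns a proxy
InterpolatorManager.register('SharedInterpolator', SharedInterpolator)


def _pool_map(proxy, trip_id):
    # Module-level helper so it can be pickled and sent to pool workers
    return proxy.generate_infilled_times(trip_id)


if __name__ == '__main__':
    manager = InterpolatorManager()
    manager.start()
    proxy = manager.SharedInterpolator({'t1': 10, 't2': 20})
    with mp.Pool(processes=2) as pool:
        results = pool.starmap(_pool_map,
                               [(proxy, t) for t in ['t1', 't2']])
    print(results)  # [['t1', 10], ['t2', 20]]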
Example #2
def _generate_route_processing_results(
        target_route_ids: List,
        target_time_start: int,
        target_time_end: int,
        ftrips: pd.DataFrame,
        stop_times: pd.DataFrame,
        feed_stops: pd.DataFrame,
        stop_cost_method: Any,
        use_multiprocessing: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Track the runtime of this method
    start_time = time.time()

    if use_multiprocessing:
        cpu_count = mp.cpu_count()
        log('Running parallelized route costing on '
            '{} processes'.format(cpu_count))

        manager = make_route_processor_manager()
        route_analyzer = manager.RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed_stops,
            stop_cost_method)

        with mp.Pool(processes=cpu_count) as pool:
            results = pool.starmap(_route_analyzer_pool_map,
                                   [(route_analyzer, route_id)
                                    for route_id in target_route_ids])
    else:
        log('Running serialized route costing (no parallelization)')
        route_analyzer = RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed_stops,
            stop_cost_method)
        results = [route_analyzer.generate_route_costs(rid)
                   for rid in target_route_ids]
    elapsed = round(time.time() - start_time, 2)
    log('Route costing complete. Execution time: {}s'.format(elapsed))

    # First, create a 2-dimensional matrix for each of the output series
    all_edge_costs = []
    all_wait_times = []

    for tst_sub, edge_costs in results:
        # Skip any result that is empty or not a DataFrame
        if isinstance(edge_costs, pd.DataFrame) and not edge_costs.empty:
            # Add each row of this result to the edge costs matrix
            all_edge_costs.extend(edge_costs.values.tolist())

        # And again, for the wait times dataframe
        if isinstance(tst_sub, pd.DataFrame) and not tst_sub.empty:
            all_wait_times.extend(tst_sub.values.tolist())

    # Convert matrices to a pandas DataFrame again
    all_edge_costs_columns = ['edge_cost', 'from_stop_id', 'to_stop_id']
    all_edge_costs_new_df = pd.DataFrame(all_edge_costs,
                                         columns=all_edge_costs_columns)

    all_wait_times_columns = ['stop_id', 'wait_dir_0', 'wait_dir_1']
    all_wait_times_new_df = pd.DataFrame(all_wait_times,
                                         columns=all_wait_times_columns)

    return (all_edge_costs_new_df, all_wait_times_new_df)
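
Both branches above funnel per-route DataFrames into plain Python lists
before a single DataFrame construction. A small sketch (toy data only)
of why that pattern is used: extending a list of rows is cheap, while
appending frame-to-frame copies the accumulated data on every iteration.

import pandas as pd

cols = ['edge_cost', 'from_stop_id', 'to_stop_id']

# Toy per-route results standing in for the tuples produced above
frames = [pd.DataFrame([[30.0, 'A', 'B']], columns=cols),
          pd.DataFrame([[45.0, 'B', 'C']], columns=cols)]

rows = []
for frame in frames:
    # Amortized O(1) list extension instead of a DataFrame copy per step
    rows.extend(frame.values.tolist())

combined = pd.DataFrame(rows, columns=cols)
print(combined)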
Example #3
def generate_summary_wait_times(
        df: pd.DataFrame,
        fallback_stop_cost: float) -> pd.DataFrame:
    """
    Calculates average wait time at this stop, given all observed

    Parameters
    ----------
    df : pd.DataFrame
        A DataFrame wait times in both directions (0 and 1) for a specific \
        stop id
    fallback_stop_cost : float
        A fallback wait time (in seconds) if there is not enough information \
        (e.g. discrete arrival times scheduled) to calcualte a headway for a \
        specific transit stop

    Returns
    -------
    summary_wait_times : pd.DataFrame
        Returns a DataFrame of the estimated wait time (boarding cost) for \
        each stop given the wait times associated with that stop in the \
        schedule timeframe
    """
    df_sub = df[['stop_id',
                 'wait_dir_0',
                 'wait_dir_1']].reset_index(drop=True)
    init_of_stop_ids = df_sub['stop_id'].unique()

    # Default values for average waits with not enough data should be
    # NaN types, but let's make sure all null types are NaNs to be safe
    for col in ['wait_dir_0', 'wait_dir_1']:
        mask = df_sub[col].isnull()
        df_sub.loc[mask, col] = np.nan

        # Convert anything that is 0 or less seconds to a NaN as well
        # to remove negative or 0 second waits in the system
        over_zero_mask = df_sub[col] > 0
        df_sub.loc[~over_zero_mask, col] = np.nan

        # With all null types converted to NaN, we can cast col as float
        df_sub[col] = df_sub[col].astype(float)

    # Build masks of the rows holding valid (non-NaN) waits per direction
    dir_0_mask = ~np.isnan(df_sub.wait_dir_0)
    dir_1_mask = ~np.isnan(df_sub.wait_dir_1)

    # Drop stop ids whose waits are NaN in both directions
    # across all of their rows
    d0_ids = df_sub[dir_0_mask].stop_id.unique()
    d1_ids = df_sub[dir_1_mask].stop_id.unique()
    keep_ids = list(d0_ids) + list(d1_ids)
    df_sub_clean = df_sub[df_sub.stop_id.isin(keep_ids)]

    orig_len = len(df_sub)
    new_len = len(df_sub_clean)
    if new_len != orig_len:
        log(('Cleaned out bi-directional NaN values from '
             'stop IDs. From {} to {}.'.format(orig_len, new_len)))
        # And now replace df_sub
        df_sub = df_sub_clean

    # Recheck for NaNs and backfill each direction from the other, so a
    # stop served in only one direction still gets a wait estimate
    dir_0_mask_2 = np.isnan(df_sub.wait_dir_0)
    dir_1_mask_2 = np.isnan(df_sub.wait_dir_1)

    df_sub.loc[dir_0_mask_2, 'wait_dir_0'] = df_sub.wait_dir_1
    df_sub.loc[dir_1_mask_2, 'wait_dir_1'] = df_sub.wait_dir_0

    # TODO: All this pruning is a mess, needs to be
    #       organized much better
    # One more time to drop out the subset that are NaN
    # from a given stop id
    dir_0_mask_3 = ~np.isnan(df_sub.wait_dir_0)
    df_sub = df_sub[dir_0_mask_3]

    dir_1_mask_3 = ~np.isnan(df_sub.wait_dir_1)
    df_sub = df_sub[dir_1_mask_3]

    # Make sure that there are no NaN values left
    dir_0_check_2 = df_sub[np.isnan(df_sub.wait_dir_0)]
    dir_1_check_2 = df_sub[np.isnan(df_sub.wait_dir_1)]

    dir_0_trigger = len(dir_0_check_2) > 0
    dir_1_trigger = len(dir_1_check_2) > 0
    if dir_0_trigger or dir_1_trigger:
        raise InvalidParsedWaitTimes(
            'NaN values for both directions on some stop IDs.')

    # At this point, we should make sure that there are still values
    # in the DataFrame - otherwise we are in a situation where there are
    # no valid times to evaluate. This is okay; we just need to skip straight
    # to the application of the fallback value
    if df_sub.empty:
        # So just make a fallback empty dataframe for now
        summed_reset = pd.DataFrame({'stop_id': [], 'avg_cost': []})

    # Only attempt the groupby summary if there is at least one row
    else:
        grouped = df_sub.groupby('stop_id')
        summarized = grouped.apply(summarize_waits_at_one_stop)

        # Clean up summary results, reformat pandas DataFrame result
        summed_reset = _format_summarized_outputs(summarized)

    end_of_stop_ids = summed_reset.stop_id.unique()
    log('Original stop id count: {}'.format(len(init_of_stop_ids)))
    log('After cleaning stop id count: {}'.format(len(end_of_stop_ids)))

    # Check for the presence of any unresolved stop ids and
    # assign them some value boarding cost
    if len(init_of_stop_ids) > len(end_of_stop_ids):
        a = set(list(init_of_stop_ids))
        b = set(list(end_of_stop_ids))
        unresolved_ids = list(a - b)
        log('Some unaccounted for stop ids. '
            'Resolving {}...'.format(len(unresolved_ids)))

        # TODO: Perhaps these are start/end stops and should adopt
        #       a cost that is "average" for that route? We do not
        #       have enough data to work that out yet, so for now
        #       just assign a default high-cost connection value
        #       to these stops
        sids = list(summed_reset.stop_id)
        acst = list(summed_reset.avg_cost)
        for i in unresolved_ids:
            sids.append(i)
            acst.append(fallback_stop_cost)

        # Rebuild the dataframe
        summed_reset = pd.DataFrame({'stop_id': sids, 'avg_cost': acst})

    return summed_reset
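
The cross-direction backfill in the middle of this function is easier
to see on a toy frame. This sketch repeats just that step with made-up
stop ids:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'stop_id': ['S1', 'S2'],
    'wait_dir_0': [120.0, np.nan],  # S2 has no direction-0 service
    'wait_dir_1': [np.nan, 300.0],  # S1 has no direction-1 service
})

# Backfill each direction with the other so a stop served in only one
# direction still carries a usable wait estimate
dir_0_nan = np.isnan(df.wait_dir_0)
dir_1_nan = np.isnan(df.wait_dir_1)
df.loc[dir_0_nan, 'wait_dir_0'] = df.wait_dir_1
df.loc[dir_1_nan, 'wait_dir_1'] = df.wait_dir_0

print(df)  # S1 gets 120.0 in both directions, S2 gets 300.0 in both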
Example #4
    def generate_route_costs(self, route_id: str):
        # Get all the subset of trips that are related to this route
        trips = self.trips.loc[route_id].copy()

        # Pandas will return a Series rather than a DataFrame if there
        # is only one matching row - prevent this from happening
        if isinstance(trips, pd.Series):
            trips = trips.to_frame().T

        # Get just the stop times related to this trip
        st_trip_id_mask = self.stop_times.trip_id.isin(trips.trip_id)
        stimes_init = self.stop_times[st_trip_id_mask].copy()

        # Then subset further by just the time period that we care about
        start_time_mask = (stimes_init.arrival_time >= self.target_time_start)
        end_time_mask = (stimes_init.arrival_time <= self.target_time_end)
        stimes = stimes_init[start_time_mask & end_time_mask]

        # Report on progress
        a = len(stimes_init.trip_id.unique())
        b = len(stimes.trip_id.unique())
        log('\tReduced selected trips on route {} from {} to {}.'.format(
            route_id, a, b))

        trips_and_stop_times = pd.merge(trips,
                                        stimes,
                                        how='inner',
                                        on='trip_id')

        trips_and_stop_times = pd.merge(trips_and_stop_times,
                                        self.all_stops.copy(),
                                        how='inner',
                                        on='stop_id')

        sort_list = ['stop_sequence', 'arrival_time', 'departure_time']
        trips_and_stop_times = trips_and_stop_times.sort_values(sort_list)

        # Check the direction_id column before using
        # trips_and_stop_times to generate wait and edge costs
        # Note: Handling this at the route level means peartree avoids
        #       dropping direction_id for a route whose rows are all
        #       filled in just because another route has gaps
        if 'direction_id' in trips_and_stop_times:
            # If the column exists, check whether it contains any NaNs
            has_nan = trips_and_stop_times['direction_id'].isnull()
            if has_nan.any():
                # Coverage is incomplete, so drop the column entirely
                trips_and_stop_times.drop('direction_id', axis=1, inplace=True)

        wait_times = generate_wait_times(trips_and_stop_times)

        # Look up wait time for each stop in wait_times for each direction
        wait_zero = trips_and_stop_times['stop_id'].apply(
            lambda x: wait_times[0][x])
        trips_and_stop_times['wait_dir_0'] = wait_zero

        wait_one = trips_and_stop_times['stop_id'].apply(
            lambda x: wait_times[1][x])
        trips_and_stop_times['wait_dir_1'] = wait_one

        tst_sub = trips_and_stop_times[['stop_id', 'wait_dir_0', 'wait_dir_1']]

        # Get all edge costs for this route and add to the running total
        edge_costs = generate_all_observed_edge_costs(trips_and_stop_times)

        return (tst_sub, edge_costs)
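
One possible simplification of the wait time lookups above (a sketch,
not peartree's actual code): when the per-direction tables returned by
generate_wait_times behave like dicts keyed by stop id, Series.map
performs the same lookup as the apply-lambda, provided every stop id is
present in the table (map yields NaN for missing keys, where the lambda
would raise a KeyError).

import pandas as pd

# Hypothetical lookup tables keyed by stop id
wait_times = {0: {'S1': 120.0, 'S2': 300.0},
              1: {'S1': 150.0, 'S2': 240.0}}
stop_ids = pd.Series(['S1', 'S2', 'S1'])

# Equivalent to stop_ids.apply(lambda x: wait_times[0][x]) here
wait_dir_0 = stop_ids.map(wait_times[0])
wait_dir_1 = stop_ids.map(wait_times[1])
print(wait_dir_0.tolist())  # [120.0, 300.0, 120.0]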
Example #5
def test_log():
    log('foo')
Example #6
def generate_summary_wait_times(
        df: pd.DataFrame,
        fallback_stop_cost: float) -> pd.DataFrame:
    df_sub = df[['stop_id',
                 'wait_dir_0',
                 'wait_dir_1']].reset_index(drop=True)
    init_of_stop_ids = df_sub.stop_id.unique()

    # Default values for average waits with not enough data should be
    # NaN types, but let's make sure all null types are NaNs to be safe
    for col in ['wait_dir_0', 'wait_dir_1']:
        mask = df_sub[col].isnull()
        df_sub.loc[mask, col] = np.nan

        # Convert anything that is 0 or less seconds to a NaN as well
        # to remove negative or 0 second waits in the system
        df_sub.loc[~(df_sub[col] > 0), col] = np.nan

        # With all null types converted to NaN, we can cast col as float
        df_sub[col] = df_sub[col].astype(float)

    # Build masks of the rows holding valid (non-NaN) waits per direction
    dir_0_mask = ~np.isnan(df_sub.wait_dir_0)
    dir_1_mask = ~np.isnan(df_sub.wait_dir_1)

    # Drop stop ids whose waits are NaN in both directions
    # across all of their rows
    d0_ids = df_sub[dir_0_mask].stop_id.unique()
    d1_ids = df_sub[dir_1_mask].stop_id.unique()
    keep_ids = list(d0_ids) + list(d1_ids)
    df_sub_clean = df_sub[df_sub.stop_id.isin(keep_ids)]

    orig_len = len(df_sub)
    new_len = len(df_sub_clean)
    if new_len != orig_len:
        log(('Cleaned out bi-directional NaN values from '
             'stop IDs. From {} to {}.'.format(orig_len, new_len)))
        # And now replace df_sub
        df_sub = df_sub_clean

    # Recheck for NaNs and backfill each direction from the other, so a
    # stop served in only one direction still gets a wait estimate
    dir_0_mask_2 = np.isnan(df_sub.wait_dir_0)
    dir_1_mask_2 = np.isnan(df_sub.wait_dir_1)

    df_sub.loc[dir_0_mask_2, 'wait_dir_0'] = df_sub.wait_dir_1
    df_sub.loc[dir_1_mask_2, 'wait_dir_1'] = df_sub.wait_dir_0

    # TODO: All this pruning is a mess, needs to be
    #       organized much better
    # One more time to drop out the subset that are NaN
    # from a given stop id
    dir_0_mask_3 = ~np.isnan(df_sub.wait_dir_0)
    df_sub = df_sub[dir_0_mask_3]

    dir_1_mask_3 = ~np.isnan(df_sub.wait_dir_1)
    df_sub = df_sub[dir_1_mask_3]

    # Make sure that there are no NaN values left
    dir_0_check_2 = df_sub[np.isnan(df_sub.wait_dir_0)]
    dir_1_check_2 = df_sub[np.isnan(df_sub.wait_dir_1)]

    if (len(dir_0_check_2) > 0) or (len(dir_1_check_2) > 0):
        raise Exception('NaN values for both directions on some stop IDs.')

    grouped = df_sub.groupby('stop_id')
    summarized = grouped.apply(summarize_waits_at_one_stop)

    summed_reset = summarized.reset_index(drop=False)
    summed_reset.columns = ['stop_id', 'avg_cost']

    end_of_stop_ids = summed_reset.stop_id.unique()
    log('Original stop id count: {}'.format(len(init_of_stop_ids)))
    log('After cleaning stop id count: {}'.format(len(end_of_stop_ids)))

    # Check for the presence of any unresolved stop ids and
    # assign them some value boarding cost
    if len(init_of_stop_ids) > len(end_of_stop_ids):
        a = set(list(init_of_stop_ids))
        b = set(list(end_of_stop_ids))
        unresolved_ids = list(a - b)
        log('Some unaccounted for stop ids. '
            'Resolving {}...'.format(len(unresolved_ids)))

        # TODO: Perhaps these are start/end stops and should adopt
        #       a cost that is "average" for that route? We do not
        #       have enough data to work that out yet, so for now
        #       just assign a default high-cost connection value
        #       to these stops
        sids = list(summed_reset.stop_id)
        acst = list(summed_reset.avg_cost)
        for i in unresolved_ids:
            sids.append(i)
            acst.append(fallback_stop_cost)

        # Rebuild the dataframe
        summed_reset = pd.DataFrame({'stop_id': sids, 'avg_cost': acst})

    return summed_reset
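
The groupby step near the end assumes summarize_waits_at_one_stop
reduces each stop's rows to a single scalar cost. A minimal stand-in
sketch follows; the averaging rule here is illustrative only, not
peartree's exact formula.

import pandas as pd

df_sub = pd.DataFrame({
    'stop_id': ['S1', 'S1', 'S2'],
    'wait_dir_0': [120.0, 100.0, 300.0],
    'wait_dir_1': [150.0, 130.0, 240.0],
})


def summarize_stop(stop_df: pd.DataFrame) -> float:
    # Illustrative reduction only: average the two directional means
    return (stop_df.wait_dir_0.mean() + stop_df.wait_dir_1.mean()) / 2


summarized = df_sub.groupby('stop_id').apply(summarize_stop)

# Reformat the resulting Series the same way the function above does
summed_reset = summarized.reset_index(drop=False)
summed_reset.columns = ['stop_id', 'avg_cost']
print(summed_reset)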
Example #7
def generate_edge_and_wait_values(
        feed: ptg.gtfs.feed,
        target_time_start: int,
        target_time_end: int,
        interpolate_times: bool,
        use_multiprocessing: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Initialize the trips dataframe to be worked with
    ftrips = feed.trips.copy()
    ftrips = ftrips[~ftrips['route_id'].isnull()]

    # Flags whether we interpolate intermediary stops or not
    if interpolate_times:
        # Prepare the stops times dataframe by also infilling
        # all stop times that are NaN with their linearly interpolated
        # values based on their nearest numerically valid neighbors
        stop_times = linearly_interpolate_infill_times(
            feed.stop_times,
            use_multiprocessing)
    else:
        stop_times = feed.stop_times.copy()

    # TODO: Just like linearly_interpolate_infill_times contains all these
    #       operations neatly in an abstracted method, do the same for
    #       this parallelized route processing step
    start_time = time.time()
    target_route_ids = feed.routes.route_id
    if use_multiprocessing:
        cpu_count = mp.cpu_count()
        log('Running parallelized route costing on '
            '{} processes'.format(cpu_count))

        manager = make_route_processor_manager()
        route_analyzer = manager.RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed.stops.copy())

        with mp.Pool(processes=cpu_count) as pool:
            results = pool.starmap(_route_analyzer_pool_map,
                                   [(route_analyzer, route_id)
                                    for route_id in target_route_ids])
    else:
        log('Running serialized route costing (no parallelization)')
        route_analyzer = RouteProcessor(
            target_time_start,
            target_time_end,
            ftrips,
            stop_times,
            feed.stops.copy())
        results = [route_analyzer.generate_route_costs(rid)
                   for rid in target_route_ids]
    elapsed = round(time.time() - start_time, 2)
    log('Route costing complete. Execution time: {}s'.format(elapsed))

    all_edge_costs = None
    all_wait_times = None
    for tst_sub, edge_costs in results:
        # Add to the running total for wait times in this feed subset
        # Note: use pd.concat rather than DataFrame.append, which was
        #       removed in pandas 2.0
        if all_wait_times is None:
            all_wait_times = tst_sub
        else:
            all_wait_times = pd.concat([all_wait_times, tst_sub])

        # Add to the running total of edge costs in this feed subset
        if all_edge_costs is None:
            all_edge_costs = edge_costs
        else:
            all_edge_costs = pd.concat([all_edge_costs, edge_costs])

    return (all_edge_costs, all_wait_times)
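
A variant of the accumulation loop above (a sketch, not peartree's
code): collecting the parts and concatenating once avoids the repeated
copying that pairwise concatenation incurs when results is long.

import pandas as pd


def stack_results(results):
    # results is an iterable of (tst_sub, edge_costs) DataFrame pairs,
    # as produced by RouteProcessor.generate_route_costs
    wait_parts = [tst_sub for tst_sub, _ in results]
    edge_parts = [edge_costs for _, edge_costs in results]

    # One concat per output instead of one per loop iteration
    all_wait_times = pd.concat(wait_parts, ignore_index=True)
    all_edge_costs = pd.concat(edge_parts, ignore_index=True)
    return all_edge_costs, all_wait_times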