def _stops_in_edge_table_selector(input_stops_df, input_stop_times_df):
    """
    Keep only the stops whose unique stop id appears in the stop times table

    Parameters
    ----------
    input_stops_df : pandas.DataFrame
        stops DataFrame. A ``unique_stop_id`` column is added to it in place
    input_stop_times_df : pandas.DataFrame
        stop_times DataFrame holding a ``unique_stop_id`` column

    Returns
    -------
    selected_stops_df : pandas.DataFrame
    """
    start_time = time.time()

    # unique stop id is '<stop_id>_<unique_agency_id>'
    agency_as_str = input_stops_df['unique_agency_id'].astype('str')
    input_stops_df['unique_stop_id'] = input_stops_df['stop_id'].str.cat(
        agency_as_str, sep='_')

    # a stop is kept when its unique id is present in the stop times subset
    is_in_stop_times = input_stops_df['unique_stop_id'].isin(
        input_stop_times_df['unique_stop_id'])
    selected_stops_df = input_stops_df.loc[is_in_stop_times]

    log('{:,} of {:,} records selected from stops. '
        'Took {:,.2f} seconds'.format(len(selected_stops_df),
                                      len(input_stops_df),
                                      time.time() - start_time))

    return selected_stops_df
def _adjust_outliers(df_ref, col, treshhold):
    """
    Pull values of a column that lie beyond +/- a threshold back just
    inside that threshold

    Parameters
    ----------
    df_ref : pandas.DataFrame
        DataFrame to adjust; it is copied, never modified
    col : str
        name of the column to adjust
    treshhold : int or float
        threshold magnitude; its absolute value is the positive limit and
        the negated absolute value is the negative limit

    Returns
    -------
    df : pandas.DataFrame
        adjusted copy of ``df_ref``
    """
    # work on a copy so nothing is mutated outside of this function
    df = df_ref.copy()

    upper_limit = abs(treshhold)
    lower_limit = -1 * upper_limit

    # values above the positive limit are set just below it
    above = df[col] > upper_limit
    count_above = len(df[above])
    if count_above:
        log(('{} rows in transit stops dataset exceeded positive threshold '
             'of {} for {} column.').format(count_above, upper_limit, col))
        df.loc[above, col] = upper_limit - 0.0001

    # values below the negative limit are set just above it
    below = df[col] < lower_limit
    count_below = len(df[below])
    if count_below:
        log(('{} rows in transit stops dataset exceeded negative threshold '
             'of {} for {} column.').format(count_below, lower_limit, col))
        df.loc[below, col] = lower_limit + 0.0001

    return df
def _read_gtfs_calendar_dates(textfile_path, textfile):
    """
    Load a GTFS calendar_dates.txt file into a pandas DataFrame

    Parameters
    ----------
    textfile_path : str
        directory of the text file
    textfile : str
        name of the text file; must be 'calendar_dates.txt'

    Returns
    -------
    df : pandas.DataFrame
        file contents with ``service_id`` read as object dtype and with
        leading/trailing whitespace stripped from the column names

    Raises
    ------
    ValueError
        if ``textfile`` is not 'calendar_dates.txt'
    """
    if textfile != 'calendar_dates.txt':
        raise ValueError('{} is not a proper GTFS file name'.format(textfile))

    file_path = os.path.join(textfile_path, textfile)
    df = pd.read_csv(file_path,
                     dtype={'service_id': object},
                     low_memory=False)

    # an empty table is allowed but worth flagging to the user
    if len(df) == 0:
        warning_msg = ('{} has no records. This could indicate that this feed '
                       'is using calendar.txt for service_ids.')
        log(warning_msg.format(file_path), level=lg.WARNING)

    # remove any extra whitespace in column names
    df.rename(columns=lambda name: name.strip(), inplace=True)

    return df
def _format_transit_net_edge(stop_times_df):
    """
    Format transit network data table to match the format required for
    edges in Pandana graph networks edges

    Each trip is turned into a chain of edges joining consecutive stops
    (ordered by ``stop_sequence``); the edge weight is the ``timediff``
    value of the stop the edge arrives at.

    Parameters
    ----------
    stop_times_df : pandas.DataFrame
        interpolated stop times with travel time between stops for the subset
        time and day. Must hold the columns ``unique_trip_id``,
        ``stop_sequence``, ``unique_stop_id``, ``timediff`` and
        ``unique_agency_id``. NOTE: it is sorted in place by trip and stop
        sequence

    Returns
    -------
    merged_edge_df : pandas.DataFrame
        one row per edge with columns ``node_id_from``, ``node_id_to``,
        ``weight``, ``unique_agency_id``, ``unique_trip_id``, ``sequence``
        (1-based edge order within the trip) and ``id``
        ('<unique_trip_id>_<sequence>')
    """
    start_time = time.time()

    log('Starting transformation process for {:,} '
        'total trips...'.format(len(stop_times_df['unique_trip_id'].unique())))

    # in-place sort is kept from the original implementation so that any
    # caller relying on the sorted input keeps working
    stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'],
                              inplace=True)

    merged_edge = []
    # group on the bare column name, not a one-element list: with a list,
    # newer pandas versions yield 1-tuples as group keys which would end up
    # in the unique_trip_id column and break the string concatenation below
    for trip, tmp_trip_df in stop_times_df.groupby('unique_trip_id'):
        stop_ids = tmp_trip_df['unique_stop_id']
        edge_df = pd.DataFrame({
            "node_id_from": stop_ids.iloc[:-1].values,
            "node_id_to": stop_ids.iloc[1:].values,
            "weight": tmp_trip_df['timediff'].iloc[1:].values,
            "unique_agency_id":
                tmp_trip_df['unique_agency_id'].iloc[1:].values,
            # set unique trip id without edge order to join other data later
            "unique_trip_id": trip
        })

        # 1-based position of the edge within its trip
        edge_df['sequence'] = (edge_df.index + 1).astype(int)

        # append completed formatted edge table to master edge table
        merged_edge.append(edge_df)

    if merged_edge:
        merged_edge_df = pd.concat(merged_edge, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty edge table with
        # the expected columns instead
        merged_edge_df = pd.DataFrame(
            columns=['node_id_from', 'node_id_to', 'weight',
                     'unique_agency_id', 'unique_trip_id', 'sequence'])

    merged_edge_df['sequence'] = merged_edge_df['sequence'].astype(int)

    # edge id is the trip id with the edge order appended
    merged_edge_df['id'] = (merged_edge_df['unique_trip_id'].str.cat(
        merged_edge_df['sequence'].astype('str'), sep='_'))

    log('stop time table transformation to '
        'Pandana format edge table completed. '
        'Took {:,.2f} seconds'.format(time.time() - start_time))

    return merged_edge_df
def _convert_imp_time_units(df, time_col='weight', convert_to='minutes'):
    """
    Rescale the travel time impedance column to another time unit

    Parameters
    ----------
    df : pandas.DataFrame
        edge DataFrame with weight column; the column is overwritten in place
    time_col : str
        name of column that holds the travel impedance
    convert_to : {'seconds', 'minutes'}
        unit to convert travel time to. should always be set to 'minutes'.
        'seconds' multiplies the column by 60, 'minutes' divides it by 60

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    ValueError
        if ``convert_to`` is not one of the valid string values
    """
    valid_convert_to = ['seconds', 'minutes']
    is_valid = convert_to in valid_convert_to and isinstance(convert_to, str)
    if not is_valid:
        raise ValueError(
            '{} not a valid value or not a string'.format(convert_to))

    if convert_to == 'seconds':
        df[time_col] = df[time_col].astype('float') * 60
        log('Time conversion completed: minutes converted to seconds.')
    elif convert_to == 'minutes':
        df[time_col] = df[time_col].astype('float') / 60.0
        log('Time conversion completed: seconds converted to minutes.')

    return df
def remove_feed(self, del_key=None, remove_all=False):
    """
    Remove GTFS feeds from the existing urbanaccess_gtfsfeeds instance

    Parameters
    ----------
    del_key : str or list
        dict keys as a single string or list of strings to remove from
        existing
    remove_all : bool
        if true, remove all keys from existing urbanaccess_gtfsfeeds
        instance. Must be False when ``del_key`` is given

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        if the arguments are of the wrong type, if ``remove_all`` is True
        together with a ``del_key``, or if a key is not in ``gtfs_feeds``.
        Keys that precede a missing key in the list have already been
        removed when the error is raised
    """
    assert isinstance(remove_all, bool)

    if del_key is None and remove_all:
        self.gtfs_feeds = {}
        log('Removed all feeds from gtfs_feeds')
        return

    assert isinstance(del_key, (list, str)), \
        'del_key must be a string or list of strings'
    assert not remove_all, \
        'remove_all must be False in order to ' \
        'remove individual records: {}'.format(del_key)

    # only a bare string is wrapped; wrapping a list as well would nest it
    # ([[...]]) and make the key lookup below fail on an unhashable list
    keys_to_remove = [del_key] if isinstance(del_key, str) else del_key

    for key in keys_to_remove:
        assert key in self.gtfs_feeds, \
            '{} key to delete was not found in gtfs_feeds'.format(key)
        del self.gtfs_feeds[key]
        log('Removed {} feed from gtfs_feeds'.format(key))
def _time_difference(stop_times_df=None):
    """
    Add a ``timediff`` column holding, for every record, the difference in
    ``departure_time_sec_interpolate`` to the previous record of the same
    trip

    Parameters
    ----------
    stop_times_df : pandas.DataFrame
        interpolated stop times dataframe; modified in place

    Returns
    -------
    stop_times_df : pandas.DataFrame
    """
    start_time = time.time()

    # difference between consecutive records, computed within each trip
    departures_by_trip = stop_times_df.groupby('unique_trip_id')[
        'departure_time_sec_interpolate']
    stop_times_df['timediff'] = departures_by_trip.diff()

    elapsed = time.time() - start_time
    log('Difference between stop times has been successfully calculated. '
        'Took {:,.2f} seconds'.format(elapsed))

    return stop_times_df
def _time_selector(df=None, starttime=None, endtime=None):
    """
    Select stop times within a time range by delegating to
    ``time_selector`` and log how many records were kept

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times dataframe; must not be empty
    starttime : str
        24 hour clock formatted time 1
    endtime : str
        24 hour clock formatted time 2

    Returns
    -------
    selected_stop_timesdf : pandas.DataFrame
    """
    assert len(df) > 0

    # TODO: Deprecated, should not be referenced anymore
    start_time = time.time()

    selected_stop_timesdf = time_selector(df, starttime, endtime)

    selected_count = len(selected_stop_timesdf)
    total_count = len(df)
    percent_selected = (selected_count / total_count) * 100

    log('Stop times from {} to {} successfully selected {:,} records out of '
        '{:,} total records ({:.2f} percent of total). '
        'Took {:,.2f} seconds'.format(starttime, endtime, selected_count,
                                      total_count, percent_selected,
                                      time.time() - start_time))

    return selected_stop_timesdf
def add_feed(self, add_dict, replace=False):
    """
    Add a dictionary to the urbanaccess_gtfsfeeds instance.

    Parameters
    ----------
    add_dict : dict
        Dictionary to add to existing urbanaccess_gtfsfeeds with the name
        of the transit service or agency GTFS feed as the key and the
        GTFS feed URL as the value to pass to the GTFS downloader as:
        {unique name of GTFS feed or transit service/agency : URL of feed}
    replace : bool, optional
        If key of dict is already in the UrbanAccess replace the existing
        dict value with the value passed

    Returns
    -------
    None
        ``self.gtfs_feeds`` is updated in place

    Raises
    ------
    ValueError
        if an argument has the wrong type, if a key or a URL is not a
        string, or - when ``replace`` is False - if a key or a URL already
        exists in ``gtfs_feeds``
    """
    if not isinstance(add_dict, dict):
        raise ValueError('add_dict is not a dict')
    if not isinstance(replace, bool):
        raise ValueError('replace is not bool')

    for key, value in add_dict.items():
        if key in self.gtfs_feeds:
            if not replace:
                raise ValueError(
                    '{} passed in add_dict already exists in gtfs_feeds. '
                    'Only unique keys are allowed to be added.'.format(
                        key))
            log('{} passed in add_dict will replace existing {} feed '
                'in gtfs_feeds.'.format(key, key))
        if not isinstance(key, str):
            raise ValueError('{} must be a string'.format(key))
        # check the URL itself; looping over it would only inspect its
        # individual characters and never reject a non-string value cleanly
        if not isinstance(value, str):
            raise ValueError('{} must be a string'.format(value))

    if not replace:
        existing_urls = list(self.gtfs_feeds.values())
        for value in add_dict.values():
            if value in existing_urls:
                raise ValueError('duplicate values were found when the '
                                 'passed add_dict dictionary was added to '
                                 'the existing dictionary. Feed URL '
                                 'values must be unique.')

    self.gtfs_feeds.update(add_dict)

    log('Added {} feeds to gtfs_feeds: {}'.format(len(add_dict), add_dict))

    # dict.update() returns None, which is what this method has always
    # returned; kept explicit for backward compatibility
    return None
def _stops_in_edge_table_selector(input_stops_df=None,
                                  input_stop_times_df=None):
    """
    Select stops that are active during the day and time period specified

    Parameters
    ----------
    input_stops_df : pandas.DataFrame
        stops dataframe. A ``unique_stop_id`` column is added to it in place
    input_stop_times_df : pandas.DataFrame
        stop_times dataframe holding a ``unique_stop_id`` column

    Returns
    -------
    selected_stops_df : pandas.DataFrame
        rows of ``input_stops_df`` whose unique stop id is present in
        ``input_stop_times_df``
    """
    start_time = time.time()

    # add unique stop id as '<stop_id>_<unique_agency_id>'. Built column-wise
    # rather than with a row-wise apply: positional x[0]/x[1] access on a
    # label-indexed row is not supported by newer pandas versions, and the
    # row-wise apply also breaks on an empty table
    input_stops_df['unique_stop_id'] = [
        '{}_{}'.format(stop_id, agency_id)
        for stop_id, agency_id in zip(input_stops_df['stop_id'],
                                      input_stops_df['unique_agency_id'])]

    # Select stop ids that match stop ids in the subset stop time data that
    # match day and time selection
    selected_stops_df = input_stops_df.loc[
        input_stops_df['unique_stop_id'].isin(
            input_stop_times_df['unique_stop_id'])]

    log('{:,} of {:,} records selected from stops. '
        'Took {:,.2f} seconds'.format(len(selected_stops_df),
                                      len(input_stops_df),
                                      time.time() - start_time))

    return selected_stops_df
def _convert_imp_time_units(df=None, time_col='weight',
                            convert_to='minutes'):
    """
    Rescale the travel time impedance column to another time unit

    Parameters
    ----------
    df : pandas.DataFrame
        edge dataframe with weight column; the column is overwritten in place
    time_col : str
        name of column that holds the travel impedance
    convert_to : {'seconds','minutes'}
        unit to convert travel time to. should always be set to 'minutes'.
        'seconds' multiplies the column by 60, 'minutes' divides it by 60

    Returns
    -------
    df : pandas.DataFrame
    """
    valid_convert_to = ['seconds', 'minutes']
    assert convert_to in valid_convert_to and isinstance(convert_to, str)

    impedance_as_float = df[time_col].astype('float')

    if convert_to == 'seconds':
        df[time_col] = impedance_as_float * 60
        log('Time conversion completed: minutes converted to seconds.')
    else:
        # the assertion above leaves 'minutes' as the only other option
        df[time_col] = impedance_as_float / 60.0
        log('Time conversion completed: seconds converted to minutes.')

    return df
def _add_txt_definitions(stops_df, routes_df, stop_times_df, trips_df):
    """
    Run each GTFS table through its matching ``_*_definitions`` helper and
    return the four results

    Parameters
    ----------
    stops_df : pandas:DataFrame
        stops dataframe
    routes_df : pandas:DataFrame
        routes dataframe
    stop_times_df : pandas:DataFrame
        stop times dataframe
    trips_df : pandas:DataFrame
        trip dataframe

    Returns
    -------
    stops_df, routes_df, stop_times_df, trips_df : pandas.DataFrame
    """
    tables_with_definitions = (
        _stops_definitions(df=stops_df),
        _routes_definitions(df=routes_df),
        _stop_times_definitions(df=stop_times_df),
        _trips_definitions(df=trips_df),
    )

    log('Added descriptive definitions to stops, routes, stop_times, '
        'and trips tables')

    return tables_with_definitions
def _connector_edges(osm_nodes, transit_nodes, travel_speed_mph=3):
    """
    Build the edges that connect every transit node to its nearest osm
    node, in both directions, weighted by travel time

    Parameters
    ----------
    osm_nodes : pandas.DataFrame
        osm nodes DataFrame with ``x`` and ``y`` columns, indexed by node id
    transit_nodes : pandas.DataFrame
        transit nodes DataFrame with ``x`` and ``y`` columns. A
        ``nearest_osm_node`` column is added to it in place
    travel_speed_mph : int, optional
        travel speed to use to calculate travel time across a distance on
        a edge. units are in miles per hour (MPH)
        for pedestrian travel this is assumed to be 3 MPH

    Returns
    -------
    net_connector_edges : pandas.DataFrame
        columns ``from``, ``to``, ``weight`` and ``net_type``; two rows per
        transit node
    """
    start_time = time.time()

    transit_nodes['nearest_osm_node'] = _nearest_neighbor(
        osm_nodes[['x', 'y']],
        transit_nodes[['x', 'y']])

    edge_records = []
    for transit_node_id, transit_row in transit_nodes.iterrows():
        osm_node_id = int(transit_row['nearest_osm_node'])
        osm_row = osm_nodes.loc[osm_node_id]

        transit_point = (transit_row['y'], transit_row['x'])
        osm_point = (osm_row['y'], osm_row['x'])
        distance = dist_calc(transit_point, osm_point).miles

        # miles / (miles per hour) * 60; the same value is used for both
        # directions of the connection
        time_transit_to_ped = distance / travel_speed_mph * 60
        time_ped_to_transit = distance / travel_speed_mph * 60

        # one record per direction so the connection is bi-directional
        edge_records.extend([
            (transit_node_id, osm_node_id, time_transit_to_ped,
             'transit to osm'),
            (osm_node_id, transit_node_id, time_ped_to_transit,
             'osm to transit'),
        ])

    net_connector_edges = pd.DataFrame(
        edge_records, columns=["from", "to", "weight", "net_type"])

    log('Connector edges between the OSM and transit network nodes '
        'successfully completed. Took {:,.2f} seconds'.format(
            time.time() - start_time))

    return net_connector_edges
def tripschedualselector(input_trips_df=None,
                         input_calendar_df=None,
                         day=None):
    """
    Select trips that run on a specific day

    Parameters
    ----------
    input_trips_df : pandas.DataFrame
        trips dataframe. A ``unique_service_id`` column is added to it in
        place
    input_calendar_df : pandas.DataFrame
        calendar dataframe. A ``unique_service_id`` column is added to it in
        place
    day : {'friday','monday','saturday','sunday','thursday','tuesday','wednesday'}
        day of the week to extract transit schedule from that corresponds
        to the day in the GTFS calendar

    Returns
    -------
    calendar_selected_trips_df : pandas.DataFrame
        trips whose service runs on ``day``, sorted by route, trip and (when
        present) direction, with a fresh index and without the
        ``unique_service_id`` helper column

    Raises
    ------
    AssertionError
        if ``day`` is not one of the valid lowercase day names
    """
    start_time = time.time()

    valid_days = [
        'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday',
        'wednesday'
    ]
    assert day in valid_days and isinstance(day, str), \
        'Incorrect day specified. Must be lowercase string: ' \
        'friday, monday, saturday, sunday, thursday, tuesday, wednesday.'

    def _unique_service_ids(frame):
        # '<service_id>_<unique_agency_id>' per row, built column-wise;
        # positional x[0]/x[1] access inside a row-wise apply is not
        # supported by newer pandas versions
        return ['{}_{}'.format(service_id, agency_id)
                for service_id, agency_id in zip(frame['service_id'],
                                                 frame['unique_agency_id'])]

    # create unique service ids
    input_trips_df['unique_service_id'] = _unique_service_ids(input_trips_df)
    input_calendar_df['unique_service_id'] = _unique_service_ids(
        input_calendar_df)

    # service ids where the requested day has a 1 = service runs on that day
    running_service_ids = input_calendar_df.loc[
        input_calendar_df[day] == 1, 'unique_service_id']

    # trips that match the service ids for the day of the week specified
    runs_on_day = input_trips_df['unique_service_id'].isin(
        running_service_ids)

    sort_columns = ['route_id', 'trip_id']
    if 'direction_id' in input_trips_df.columns:
        sort_columns.append('direction_id')

    # non in-place operations: the selection is a slice of input_trips_df and
    # modifying it in place triggers pandas' chained-assignment handling
    calendar_selected_trips_df = (
        input_trips_df.loc[runs_on_day]
        .sort_values(by=sort_columns)
        .reset_index(drop=True)
        .drop('unique_service_id', axis=1))

    log('{:,} of {:,} total trips were extracted representing calendar day: {}. Took {:,.2f} seconds'
        .format(len(calendar_selected_trips_df), len(input_trips_df), day,
                time.time() - start_time))

    return calendar_selected_trips_df
def _calc_headways_by_route_stop(df):
    """
    Calculate headways by route stop

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times dataframe for stop times within the time
        range with appended trip and route information. Must hold the
        columns ``unique_stop_id``, ``unique_route_id`` and
        ``departure_time_sec_interpolate``. A ``unique_stop_route`` column
        ('<unique_stop_id>,<unique_route_id>') is added to it in place

    Returns
    -------
    dataframe : pandas.DataFrame
        dataframe of statistics of route stop headways in units of minutes:
        one row per ``unique_stop_route`` holding the ``describe()`` output
        of the gaps between consecutive sorted departure times
    """
    # TODO: Optimize for speed

    start_time = time.time()
    delimiter = ','

    # built column-wise; positional x[0]/x[1] access inside a row-wise apply
    # is not supported by newer pandas versions
    df['unique_stop_route'] = [
        '{}{}{}'.format(stop_id, delimiter, route_id)
        for stop_id, route_id in zip(df['unique_stop_id'],
                                     df['unique_route_id'])]

    stop_route_groups = df.groupby('unique_stop_route')
    log('Starting route stop headway calculation for {:,} route stops...'.format(len(stop_route_groups)))

    results = {}
    for unique_stop_route, stop_route_group in stop_route_groups:
        # DataFrame.sort() no longer exists in pandas; only the departure
        # times are needed, so sort that column directly
        departures = stop_route_group[
            'departure_time_sec_interpolate'].sort_values(
                ascending=True).values
        next_bus_time = departures[1:]
        prev_bus_time = departures[:-1]
        # 60.0 keeps this a true division even for integer seconds
        stop_route_group_headways = (next_bus_time - prev_bus_time) / 60.0
        results[unique_stop_route] = pd.Series(
            stop_route_group_headways).describe()

    log('Route stop headway calculation complete. Took {:,.2f} seconds'.format(time.time()-start_time))

    return pd.DataFrame(results).T
def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder,
                                            'gtfsfeeds'),
              yamlname='gtfsfeeds.yaml'):
    """
    Create a urbanaccess_gtfsfeeds instance from a saved YAML.

    Parameters
    ----------
    gtfsfeeddir : str, optional
        Directory to load a YAML file.
    yamlname : str or file like, optional
        File name from which to load a YAML file.

    Returns
    -------
    urbanaccess_gtfsfeeds

    Raises
    ------
    ValueError
        if an argument has the wrong type, the directory does not exist,
        the YAML content is not a dict with a ``gtfs_feeds`` mapping of
        string keys to string URLs, or the URLs are not unique
    """
    if not isinstance(gtfsfeeddir, str):
        raise ValueError('gtfsfeeddir must be a string')
    if not os.path.exists(gtfsfeeddir):
        raise ValueError(
            '{} does not exist or was not found'.format(gtfsfeeddir))
    if not isinstance(yamlname, str):
        raise ValueError('yaml must be a string')

    yaml_file = os.path.join(gtfsfeeddir, yamlname)

    with open(yaml_file, 'r') as f:
        # safe_load only builds plain Python objects. yaml.load without an
        # explicit Loader can construct arbitrary objects and is rejected by
        # recent PyYAML releases
        yaml_config = yaml.safe_load(f)

    if not isinstance(yaml_config, dict):
        raise ValueError('{} yamlname is not a dict'.format(yamlname))

    validkey = 'gtfs_feeds'
    if validkey not in yaml_config.keys():
        raise ValueError('key gtfs_feeds was not found in YAML file')

    feeds = yaml_config[validkey]
    if not isinstance(feeds, dict):
        raise ValueError('{} must be a dict'.format(validkey))

    for key, value in feeds.items():
        if not isinstance(key, str):
            raise ValueError('{} must be a string'.format(key))
        # check the URL itself; looping over it would only inspect its
        # individual characters
        if not isinstance(value, str):
            raise ValueError('{} must be a string'.format(value))

    # every feed URL may appear only once (also correct for zero feeds)
    if len(set(feeds.values())) != len(feeds):
        raise ValueError(
            'duplicate values were found when the passed add_dict '
            'dictionary was added to the existing dictionary. Feed URL '
            'values must be unique.')

    gtfsfeeds = cls(gtfs_feeds=feeds)
    log('{} YAML successfully loaded with {} feeds.'.format(
        yaml_file, len(feeds)))

    return gtfsfeeds
def headways(gtfsfeeds_df, headway_timerange):
    """
    Calculate headways by route stop for a specific time range

    Parameters
    ----------
    gtfsfeeds_df : object
        gtfsfeeds_dfs object with all processed GTFS data tables
    headway_timerange : list
        time range for which to calculate headways between as a list of
        time 1 and time 2 where times are 24 hour clock strings such as:
        ['07:00:00','10:00:00']

    Returns
    -------
    gtfsfeeds_dfs.headways : pandas.DataFrame
        gtfsfeeds_dfs object for the headways dataframe with statistics of
        route stop headways in units of minutes
        with relevant route and stop information

    Raises
    ------
    AssertionError
        if ``headway_timerange`` is not a list of two 8 character strings
        with the first smaller than the second, or if ``gtfsfeeds_df`` is
        None
    ValueError
        if the stop_times_int, trips or routes table is empty
    """
    # TODO: Change Assertion to errors/exceptions
    time_error_statement = (
        '{} starttime and endtime are not in the correct format. '
        'Format should be 24 hour clock in following format: 08:00:00 or 17:00:00'
        .format(headway_timerange))

    assert isinstance(headway_timerange, list) \
        and len(headway_timerange) == 2, time_error_statement

    # validate each entry before comparing them: comparing e.g. a string
    # with an int would raise a TypeError instead of the format error
    for t in headway_timerange:
        assert isinstance(t, str), time_error_statement
        assert len(t) == 8, time_error_statement
    assert headway_timerange[0] < headway_timerange[1], time_error_statement

    # only the hour components are compared for the long-period warning
    period_hours = (int(headway_timerange[1][0:2]) -
                    int(headway_timerange[0][0:2]))
    if period_hours > 3:
        log('WARNING: Time range passed: {} is a {} hour period. Long periods over 3 hours may take a '
            'significant amount of time to process.'.format(
                headway_timerange, period_hours),
            level=lg.WARNING)

    assert gtfsfeeds_df is not None
    if gtfsfeeds_df.stop_times_int.empty or gtfsfeeds_df.trips.empty \
            or gtfsfeeds_df.routes.empty:
        raise ValueError(
            'one of the gtfsfeeds_dfs objects: stop_times_int, trips, or routes were found to be empty.'
        )

    headways_df = _headway_handler(
        interpolated_stop_times_df=gtfsfeeds_df.stop_times_int,
        trips_df=gtfsfeeds_df.trips,
        routes_df=gtfsfeeds_df.routes,
        headway_timerange=headway_timerange)

    gtfsfeeds_df.headways = headways_df

    return gtfsfeeds_df
def _format_pandana_edges_nodes(edge_df, node_df):
    """
    Give nodes an integer id and edges integer from/to ids so the tables can
    be used in Pandana.

    Parameters
    ----------
    edge_df : pandas.DataFrame
        integrated transit and osm edge DataFrame with ``from`` and ``to``
        columns. Its ``id`` column, if any, is renamed to ``edge_id`` in
        place
    node_df : pandas.DataFrame
        integrated transit and osm node DataFrame with an ``id`` column.
        Modified in place: gets ``id_int`` as index, ``id`` cast to str and
        ``nearest_osm_node`` dropped when present

    Returns
    -------
    edge_df_wnumericid, node_df : pandas.DataFrame
    """
    start_time = time.time()

    # pandana requires ids that are integer: for nodes - make it the index,
    # for edges make it the from and to columns
    node_df['id_int'] = range(1, len(node_df) + 1)

    edge_df.rename(columns={'id': 'edge_id'}, inplace=True)

    def _attach_int_id(frame, endpoint_col, int_col):
        # look up the integer id of the node named in endpoint_col and keep
        # it as int_col; the helper join columns are dropped again
        joined = pd.merge(frame, node_df[['id', 'id_int']],
                          left_on=endpoint_col, right_on='id',
                          sort=False, copy=False, how='left')
        joined[int_col] = joined['id_int']
        joined.drop(['id_int', 'id'], axis=1, inplace=True)
        return joined

    tmp = _attach_int_id(edge_df, 'from', 'from_int')
    edge_df_wnumericid = _attach_int_id(tmp, 'to', 'to_int')

    # turn mixed dtype cols into all same format
    object_cols = edge_df_wnumericid.select_dtypes(include=['object']).columns
    for col in object_cols:
        try:
            edge_df_wnumericid[col] = edge_df_wnumericid[col].astype(str)
        # deal with edge cases where typically the name of a street is not
        # in a uniform string encoding such as names with accents
        except UnicodeEncodeError:
            log('Fixed unicode error in {} column'.format(col))
            edge_df_wnumericid[col] = edge_df_wnumericid[col].str.encode(
                'utf-8')

    node_df.set_index('id_int', drop=True, inplace=True)
    # turn mixed dtype col into all same format
    node_df['id'] = node_df['id'].astype(str)
    if 'nearest_osm_node' in node_df.columns:
        node_df.drop(['nearest_osm_node'], axis=1, inplace=True)

    elapsed = time.time() - start_time
    log('Edge and node tables formatted for Pandana with integer node ids: '
        'id_int, to_int, and from_int. Took {:,.2f} seconds'.format(elapsed))

    return edge_df_wnumericid, node_df
def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder,
                                            'gtfsfeeds'),
              yamlname='gtfsfeeds.yaml'):
    """
    Create a urbanaccess_gtfsfeeds instance from a saved YAML.

    Parameters
    ----------
    gtfsfeeddir : str, optional
        Directory to load a YAML file.
    yamlname : str or file like, optional
        File name from which to load a YAML file.

    Returns
    -------
    urbanaccess_gtfsfeeds

    Raises
    ------
    AssertionError
        if an argument has the wrong type, the directory does not exist,
        the file name does not contain '.yaml', the YAML content is not a
        dict with a ``gtfs_feeds`` mapping of string keys to string URLs, or
        the URLs are not unique
    """
    assert isinstance(gtfsfeeddir, str), 'gtfsfeeddir must be a string'
    assert os.path.exists(gtfsfeeddir), \
        '{} does not exist or was not found'.format(gtfsfeeddir)
    assert isinstance(yamlname, str) and '.yaml' in yamlname, \
        'yaml must be a string and have file extension .yaml'

    yaml_file = os.path.join(gtfsfeeddir, yamlname)

    with open(yaml_file, 'r') as f:
        # safe_load only builds plain Python objects. yaml.load without an
        # explicit Loader can construct arbitrary objects and is rejected by
        # recent PyYAML releases
        yaml_config = yaml.safe_load(f)

    # the placeholder was missing, so the file name never made it into the
    # message
    assert isinstance(yaml_config, dict), \
        '{} yamlname is not a dict'.format(yamlname)

    validkey = 'gtfs_feeds'
    assert validkey in yaml_config.keys(), \
        'key gtfs_feeds was not found in YAML file'

    feeds = yaml_config[validkey]
    assert isinstance(feeds, dict), '{} must be a dict'.format(validkey)

    for key, value in feeds.items():
        assert isinstance(key, str), '{} must be a string'.format(key)
        # check the URL itself; looping over it would only inspect its
        # individual characters
        assert isinstance(value, str), '{} must be a string'.format(value)

    # every feed URL may appear only once
    all_feeds_equal_one = len(set(feeds.values())) == len(feeds)
    assert all_feeds_equal_one, ('Duplicate values were found '
                                 'when the passed add_dict '
                                 'dictionary was added to '
                                 'the existing dictionary. '
                                 'Feed URL values '
                                 'must be unique.')

    gtfsfeeds = cls(gtfs_feeds=feeds)
    yaml_len = len(feeds)
    log('{} YAML successfully loaded with {} feeds.'.format(yaml_file,
                                                            yaml_len))

    return gtfsfeeds
def create_osm_net(osm_edges, osm_nodes,
                   travel_speed_mph=3, network_type='walk'):
    """
    Weight the openstreetmap edges by travel time in minutes and store the
    edges and nodes on the ``ua_network`` object

    Parameters
    ----------
    osm_edges : pandas.DataFrame
        osm edge dataframe with a ``distance`` column. ``weight`` and
        ``net_type`` columns are added to it in place
    osm_nodes : pandas.DataFrame
        osm node dataframe. A ``net_type`` column is added to it in place
    travel_speed_mph : int, optional
        travel speed to use to calculate travel time across a
        distance on a edge. units are in miles per hour (MPH)
        for pedestrian travel this is assumed to be 3 MPH. Must be greater
        than zero
    network_type : str, optional
        default is 'walk' for the osm pedestrian network.
        this string is used to label the osm network once it is integrated
        with the transit network. Only 'walk' is accepted

    Returns
    -------
    ua_network : object
        urbanaccess_network object with osm_edges and osm_nodes dataframes
    ua_network.osm_edges : pandas.DataFrame
    ua_network.osm_nodes : pandas.DataFrame
    """
    start_time = time.time()

    assert network_type == 'walk'
    # don't divide by zero!
    assert travel_speed_mph > 0

    # impedance in minutes: the distance is divided by 1609.34 to get miles
    # (presumably from meters - confirm against the edge source), then by
    # the speed to get hours, then scaled to minutes
    osm_edges['weight'] = (
        ((osm_edges['distance'] / 1609.34) / travel_speed_mph) * 60)

    # label both tables with the network type
    for table in (osm_edges, osm_nodes):
        table['net_type'] = network_type

    ua_network.osm_nodes = osm_nodes
    ua_network.osm_edges = osm_edges

    elapsed = time.time() - start_time
    log(('Created OSM network with travel time impedance '
         'using a travel speed of {} MPH. Took {:,.2f} '
         'seconds').format(travel_speed_mph, elapsed))

    return ua_network
def _add_unique_gtfsfeed_id(stops_df, routes_df, trips_df,
                            stop_times_df, calendar_df, calendar_dates_df,
                            feed_folder, feed_number):
    """
    Stamp every GTFS table of a feed with the same ``unique_feed_id`` so the
    feed can be tracked

    Parameters
    ----------
    stops_df : pandas:DataFrame
        stops dataframe
    routes_df : pandas:DataFrame
        routes dataframe
    trips_df : pandas:DataFrame
        trips dataframe
    stop_times_df : pandas:DataFrame
        stop times dataframe
    calendar_df : pandas:DataFrame
        calendar dataframe
    calendar_dates_df : pandas:DataFrame
        calendar dates dataframe; left untouched when it is empty
    feed_folder : str
        name of gtfs feed folder
    feed_number : int
        current number iteration of gtfs feed being read in root directory

    Returns
    -------
    stops_df, routes_df, trips_df, stop_times_df, calendar_df,
    calendar_dates_df : pandas.DataFrame
        returned as a list in this order; the tables are modified in place
    """
    start_time = time.time()

    tables = [stops_df, routes_df, trips_df, stop_times_df, calendar_df]
    # calendar dates only takes part in the stamping when it has records
    if not calendar_dates_df.empty:
        tables.append(calendar_dates_df)

    # standardize feed_folder name
    feed_folder = _generate_unique_feed_id(feed_folder)
    unique_feed_id = '_'.join([feed_folder, str(feed_number)])

    for table in tables:
        table['unique_feed_id'] = unique_feed_id

    # an empty calendar dates table is passed through as it came in
    if calendar_dates_df.empty:
        tables.append(calendar_dates_df)

    log('Unique GTFS feed id operation complete. Took {:,.2f} seconds'.format(
        time.time() - start_time))

    return tables
def _txt_header_whitespace_check(gtfsfiles_to_use,
                                 csv_rootpath=os.path.join(
                                     config.settings.data_folder,
                                     'gtfsfeed_text')):
    """
    Standardize all text files inside a GTFS feed to remove whitespace
    in headers

    The first line of every matching file is rewritten with all whitespace
    removed; the remaining lines are written back unchanged.

    Parameters
    ----------
    gtfsfiles_to_use : list
        list of gtfs feed txt files to utilize
    csv_rootpath : str, optional
        root path where all gtfs feeds that make up a contiguous metropolitan
        area are stored. When it holds no sub directories the root path
        itself is treated as the feed folder

    Returns
    -------
    None
    """
    start_time = time.time()

    folder_paths = [os.path.join(csv_rootpath, foldername)
                    for foldername in os.listdir(csv_rootpath)
                    if os.path.isdir(os.path.join(csv_rootpath, foldername))]

    # full paths are kept so that the root fallback is not joined onto
    # itself, which points to the wrong place for a relative csv_rootpath
    if not folder_paths:
        folder_paths = [csv_rootpath]

    for folder_path in folder_paths:
        for textfile in os.listdir(folder_path):
            if not textfile.endswith(".txt"):
                continue
            if textfile not in gtfsfiles_to_use:
                continue

            file_path = os.path.join(folder_path, textfile)

            # Read from file
            with open(file_path) as f:
                lines = f.readlines()

            # an empty file has no header line to clean
            if not lines:
                continue

            lines[0] = re.sub(r'\s+', '', lines[0]) + '\n'

            # Write to file; best effort - a failure is logged, not raised
            try:
                with open(file_path, 'w') as f:
                    f.writelines(lines)
            except Exception:
                log('Unable to write {}. Check that file is not currently '
                    'being read or is not already in memory as this is '
                    'likely the cause of the error.'
                    ''.format(file_path))

    log('GTFS text file header whitespace check completed. '
        'Took {:,.2f} seconds'.format(time.time() - start_time))
def _append_route_type(stops_df, stop_times_df, routes_df, trips_df,
                       info_to_append):
    """
    Attach the ``route_type`` column of the routes table to either the
    stops or the stop times table

    Parameters
    ----------
    stops_df : pandas:DataFrame
        stops dataframe
    stop_times_df : pandas:DataFrame
        stop times dataframe
    routes_df : pandas:DataFrame
        routes dataframe
    trips_df : pandas:DataFrame
        trip dataframe
    info_to_append : {'route_type_to_stops', 'route_type_to_stop_times'}
        the type of information to append

    Returns
    -------
    stops_df or stop_times_df : pandas.DataFrame

    Raises
    ------
    ValueError
        if ``info_to_append`` is not one of the valid values
    """
    valid_info_to_append = ['route_type_to_stops',
                            'route_type_to_stop_times']
    if info_to_append not in valid_info_to_append:
        raise ValueError('{} is not a valid parameter'.format(info_to_append))

    # both options start from trips joined with their route
    trips_with_routes = pd.merge(trips_df, routes_df, how='left',
                                 on='route_id', sort=False, copy=False)

    if info_to_append == 'route_type_to_stops':
        stop_times_with_routes = pd.merge(stop_times_df, trips_with_routes,
                                          how='left', on='trip_id',
                                          sort=False, copy=False)
        # the first record found for a stop decides its route type
        stop_times_with_routes.drop_duplicates(subset='stop_id',
                                               keep='first',
                                               inplace=True)
        route_type_by_stop = stop_times_with_routes[['route_type', 'stop_id']]
        stops_df = pd.merge(stops_df, route_type_by_stop,
                            how='left', on='stop_id',
                            sort=False, copy=False)
        log('Appended route type to stops')
        return stops_df

    if info_to_append == 'route_type_to_stop_times':
        trips_with_routes.drop_duplicates(subset='trip_id',
                                          keep='first',
                                          inplace=True)
        route_type_by_trip = trips_with_routes[['route_type', 'trip_id']]
        stop_times_df = pd.merge(stop_times_df, route_type_by_trip,
                                 how='left', on='trip_id',
                                 sort=False, copy=False)
        log('Appended route type to stop_times')
        return stop_times_df
def _txt_encoder_check(gtfsfiles_to_use,
                       csv_rootpath=os.path.join(config.settings.data_folder,
                                                 'gtfsfeed_text')):
    """
    Standardize all text files inside a GTFS feed for encoding problems.

    A leading UTF-8 byte order mark is stripped from every matching file.
    Files are handled as bytes so the check behaves the same on Python 2
    and Python 3.

    Parameters
    ----------
    gtfsfiles_to_use : list
        list of gtfs feed txt files to utilize
    csv_rootpath : str, optional
        root path where all gtfs feeds that make up a contiguous metropolitan
        area are stored. When it holds no sub directories the root path
        itself is treated as the feed folder

    Returns
    -------
    None
    """
    start_time = time.time()

    folder_paths = [
        os.path.join(csv_rootpath, foldername)
        for foldername in os.listdir(csv_rootpath)
        if os.path.isdir(os.path.join(csv_rootpath, foldername))
    ]

    # full paths are kept so that the root fallback is not joined onto
    # itself, which points to the wrong place for a relative csv_rootpath
    if not folder_paths:
        folder_paths = [csv_rootpath]

    bom = codecs.BOM_UTF8

    for folder_path in folder_paths:
        for textfile in os.listdir(folder_path):
            if not textfile.endswith(".txt"):
                continue
            if textfile not in gtfsfiles_to_use:
                continue

            file_path = os.path.join(folder_path, textfile)

            # Read as bytes: codecs.BOM_UTF8 is a byte string, so comparing
            # it against text read in text mode fails on Python 3
            with open(file_path, 'rb') as file_open:
                raw = file_open.read()

            # only files that start with the mark need to be rewritten
            if raw.startswith(bom):
                with open(file_path, 'wb') as file_open:
                    file_open.write(raw[len(bom):])

    log('GTFS text file encoding check completed. Took {:,.2f} seconds'.format(
        time.time() - start_time))
def create_osm_net(osm_edges, osm_nodes,
                   travel_speed_mph=3, network_type='walk'):
    """
    Build the OSM portion of the network: weight every edge by travel
    time in minutes and label nodes and edges with a network type

    Parameters
    ----------
    osm_edges : pandas.DataFrame
        osm edge dataframe; must contain a ``distance`` column. The
        ``weight`` and ``net_type`` columns are written onto this
        dataframe in place
    osm_nodes : pandas.DataFrame
        osm node dataframe; the ``net_type`` column is written onto this
        dataframe in place
    travel_speed_mph : int, optional
        travel speed, in miles per hour (MPH), used to convert edge
        distance into travel time. 3 MPH is the assumed pedestrian speed
    network_type : str, optional
        label applied to the osm network once it is integrated with the
        transit network. default is 'walk' for the osm pedestrian network

    Returns
    -------
    ua_network : object
        urbanaccess_network object with osm_edges and osm_nodes dataframes
    ua_network.osm_edges : pandas.DataFrame
    ua_network.osm_nodes : pandas.DataFrame

    Raises
    ------
    ValueError
        if ``network_type`` is not a string (this includes None)
    """
    start_time = time.time()

    # None is not a str instance, so this single check covers both cases
    if not isinstance(network_type, str):
        raise ValueError('{!s} network_type passed is either not a '
                         'string or is None'.format(network_type))

    # distance / 1609.34 -> miles (presumably distance is in meters),
    # miles / MPH -> hours, hours * 60 -> minutes
    distance_miles = osm_edges['distance'] / 1609.34
    osm_edges['weight'] = distance_miles / travel_speed_mph * 60

    # tag both tables with the network type label
    for frame in (osm_edges, osm_nodes):
        frame['net_type'] = network_type

    ua_network.osm_nodes = osm_nodes
    ua_network.osm_edges = osm_edges

    elapsed = time.time() - start_time
    log(
        'Created OSM network with travel time impedance using a travel speed '
        'of {} MPH. Took {:,.2f} seconds'.format(travel_speed_mph, elapsed))

    return ua_network
def _time_selector(df, starttime, endtime):
    """
    Select stop times that fall within a specified time range

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times DataFrame; must contain a
        ``departure_time_sec_interpolate`` column
    starttime : str
        24 hour clock formatted time 1, read positionally as 'HH:MM:SS'
    endtime : str
        24 hour clock formatted time 2, read positionally as 'HH:MM:SS'

    Returns
    -------
    selected_stop_timesdf : pandas.DataFrame
        rows whose departure time is strictly between ``starttime`` and
        ``endtime`` (both bounds are exclusive)
    """
    start_time = time.time()

    def _to_seconds_past_midnight(timestr):
        # characters 0-1 are hours, 3-4 minutes and 6-7 seconds
        hours, minutes, seconds = (
            int(timestr[pos:pos + 2]) for pos in (0, 3, 6))
        return (hours * 60 * 60) + (minutes * 60) + seconds

    # Work in seconds past midnight so the comparison is a plain numeric
    # range test against the interpolated departure column
    starttime_sec = _to_seconds_past_midnight(starttime)
    endtime_sec = _to_seconds_past_midnight(endtime)

    departure_sec = df['departure_time_sec_interpolate']
    selected_stop_timesdf = df[
        (starttime_sec < departure_sec) & (departure_sec < endtime_sec)]

    selected_ct = len(selected_stop_timesdf)
    total_ct = len(df)
    # guard against an empty input table, and force float division so the
    # reported share is correct on Python 2 as well
    if total_ct:
        pct_selected = (float(selected_ct) / total_ct) * 100
    else:
        pct_selected = 0.0

    log(
        'Stop times from {} to {} successfully selected {:,} records out of '
        '{:,} total records ({:.2f} percent of total). Took {:,'
        '.2f} seconds'.format(
            starttime, endtime, selected_ct, total_ct, pct_selected,
            time.time() - start_time))

    return selected_stop_timesdf
def _checkcoordinates(df=None, feed_folder=None):
    """
    Check and print the hemisphere that stop coordinates are in

    A quadrant is reported only when at least one stop actually lies in
    it: the latitude and longitude tests are combined per row, so a feed
    with stops in, for example, only the northeast and southwest is not
    also reported as northwest and southeast. Stops lying exactly on the
    equator or the prime meridian (value of 0) match no quadrant.

    Parameters
    ----------
    df : pandas.DataFrame
        stops dataframe with ``stop_lat`` and ``stop_lon`` columns
    feed_folder : str
        name of originating gtfs feed folder

    Returns
    -------
    None
    """
    is_north = df['stop_lat'] > 0
    is_south = df['stop_lat'] < 0
    is_west = df['stop_lon'] < 0
    is_east = df['stop_lon'] > 0

    # (row mask, log message) pairs, checked in a fixed order
    quadrant_checks = [
        (is_north & is_west,
         '{} GTFS feed stops: coordinates are in northwest hemisphere. '
         'Latitude = North (90); Longitude = West (-90).'),
        (is_south & is_west,
         '{} GTFS feed stops: coordinates are in southwest hemisphere. '
         'Latitude = South (-90); Longitude = West (-90).'),
        (is_north & is_east,
         '{} GTFS feed stops: coordinates are in northeast hemisphere. '
         'Latitude = North (90); Longitude = East (90).'),
        (is_south & is_east,
         '{} GTFS feed stops: coordinates are in southeast hemisphere. '
         'Latitude = South (-90); Longitude = East (90).'),
    ]

    for quadrant_mask, msg in quadrant_checks:
        if quadrant_mask.values.any():
            # the feed folder name is only resolved when there is
            # something to report
            log(msg.format(os.path.split(feed_folder)[1]))
def _nearest_neighbor(df1, df2, use_4326_constraints=True):
    """
    For each row of the transit nodes dataframe find the nearest row in
    the osm nodes dataframe and return that osm row's index label

    Parameters
    ----------
    df1 : pandas.DataFrame
        osm nodes dataframe with ``x`` and ``y`` columns; its index values
        are what gets returned. All columns are used as coordinates for
        the search, so this is presumably expected to hold only x and y
    df2 : pandas.DataFrame
        transit nodes dataframe with ``x`` and ``y`` columns, laid out
        like ``df1``
    use_4326_constraints : bool, optional
        if True, clamp x (longitude) to just inside +/-180 and y
        (latitude) to just inside +/-90 before searching

    Returns
    -------
    nearest_ids : numpy.ndarray
        index labels from ``df1`` of the nearest osm node for each row of
        ``df2``, in the array shape returned by ``KDTree.query``
    """
    # work on copies so the callers' dataframes are never mutated
    df1_new = df1.copy()  # osm nodes
    df2_new = df2.copy()  # transit nodes

    # be aggressive about ensuring float limits (no vals over than 4 decimals)
    for col in ['x', 'y']:
        df1_new[col] = np.around(df1_new[col], decimals=4)
        df2_new[col] = np.around(df2_new[col], decimals=4)

    # drop out any invalid x, y columns from the left
    invalid_osm_rows = (df1_new['x'].isnull() |
                        df1_new['y'].isnull() |
                        (~np.isfinite(df1_new['x'])) |
                        (~np.isfinite(df1_new['y'])))
    orig_df1_len = len(df1_new)
    df1_new = df1_new[~invalid_osm_rows]

    # log if any rows were removed from osm nodes dataset
    df1_cleaned_diff = orig_df1_len - len(df1_new)
    if df1_cleaned_diff > 0:
        log(('{} OSM node rows ommitted during nearest neighbor calculations '
             'due to being invalid numeric values.').format(df1_cleaned_diff))

    # let's make sure that latitudes and longitudes are not in excess of
    # their geographic limits
    if use_4326_constraints:
        # x is longitude -180 - +180
        # y is latitude   -90 -  +90
        df1_new = _adjust_outliers(df1_new, 'x', 180)
        df2_new = _adjust_outliers(df2_new, 'x', 180)
        df1_new = _adjust_outliers(df1_new, 'y', 90)
        df2_new = _adjust_outliers(df2_new, 'y', 90)

    # identify problem rows on the right
    invalid_trans_rows = (df2_new['x'].isnull() | df2_new['y'].isnull())
    invalid_trans_rows_ct = len(df2_new[invalid_trans_rows])
    if invalid_trans_rows_ct > 0:
        log(('{} out of {} invalid rows identified for the transit nodes '
             'dataframe, but not removed. These may cause operation to '
             'fail.').format(invalid_trans_rows_ct, len(df2_new)))

    # for xy coordinates df find the nearest in a subsequent dataframe
    # (.values / builtin float replace the removed DataFrame.as_matrix()
    # and np.float)
    kdt = KDTree(df1_new.values.astype(float))
    df2_mtx = df2_new.values.astype(float)
    indexes = kdt.query(df2_mtx, k=1, return_distance=False)

    # the tree was built from the cleaned osm dataframe, so positions must
    # be mapped back through that dataframe's index, not the original's
    return df1_new.index.values[indexes]
def _add_headway_impedance(ped_to_transit_edges_df, headways_df,
                           headway_statistic='mean'):
    """
    Add route stop level headways to the osm to transit connector travel
    time weight column

    Half of the chosen headway statistic is added to each connector
    edge's existing weight. Edges without a matching headway record keep
    their original weight.

    Parameters
    ----------
    ped_to_transit_edges_df : pandas.DataFrame
        DataFrame of the osm to transit connectors; must contain ``to``
        and ``weight`` columns
    headways_df : pandas.DataFrame
        headways DataFrame; must contain ``node_id_route`` and the column
        named by ``headway_statistic``
    headway_statistic : {'mean', 'std', 'min', 'max'}, optional
        required if headways is true; route stop headway statistic to
        apply to the osm to transit connector edges: mean, std, min, max.
        Default is mean.

    Returns
    -------
    osm_to_transit_wheadway : pandas.DataFrame
        the connector edges merged with the headway columns, with the
        updated ``weight`` column placed last
    """
    start_time = time.time()

    log('{} route stop headway will be used for pedestrian to transit edge '
        'impedance.'.format(headway_statistic))

    osm_to_transit_wheadway = pd.merge(
        ped_to_transit_edges_df,
        headways_df[[headway_statistic, 'node_id_route']],
        how='left', left_on=['to'], right_on=['node_id_route'],
        sort=False, copy=False)

    base_weight = osm_to_transit_wheadway['weight']
    weight_with_headway = base_weight + (
        osm_to_transit_wheadway[headway_statistic] / 2.0)

    # Fall back to the original weight where no headway was matched.
    # Assign the result rather than calling fillna(inplace=True) on a
    # column selection, which does not reliably update the parent frame.
    osm_to_transit_wheadway['weight_tmp'] = weight_with_headway.fillna(
        base_weight)

    # swap the temporary column in for the original weight column
    osm_to_transit_wheadway = osm_to_transit_wheadway.drop('weight', axis=1)
    osm_to_transit_wheadway = osm_to_transit_wheadway.rename(
        columns={'weight_tmp': 'weight'})

    log('Headway impedance calculation completed. Took {:,.2f} seconds'.format(
        time.time() - start_time))

    return osm_to_transit_wheadway
def _calc_headways_by_route_stop(df):
    """
    Calculate headways by route stop

    A ``unique_stop_route`` column (``unique_stop_id`` and
    ``unique_route_id`` joined by a comma) is added to ``df`` in place and
    used as the grouping key. Within each group the departure times are
    sorted and the gaps between consecutive departures are summarized.

    Parameters
    ----------
    df : pandas.DataFrame
        interpolated stop times dataframe for stop times within the time
        range with appended trip and route information; must contain
        ``unique_stop_id``, ``unique_route_id`` and
        ``departure_time_sec_interpolate`` columns

    Returns
    -------
    dataframe : pandas.DataFrame
        dataframe of statistics of route stop headways in units of
        minutes: one row per ``unique_stop_route`` value, with the columns
        produced by ``pandas.Series.describe``
    """
    # TODO: Optimize for speed

    start_time = time.time()

    df['unique_stop_route'] = (df['unique_stop_id'].str.cat(
        df['unique_route_id'].astype('str'), sep=','))

    stop_route_groups = df.groupby('unique_stop_route')
    log('Starting route stop headway calculation for {:,} route '
        'stops...'.format(len(stop_route_groups)))

    results = {}

    # suppress RuntimeWarning: Mean of empty slice. for this code block
    with warnings.catch_warnings():
        # category must be the warning class itself, not its name
        warnings.simplefilter("ignore", category=RuntimeWarning)

        for unique_stop_route, stop_route_group in stop_route_groups:
            # sort a copy of the departure column instead of sorting the
            # group dataframe in place
            departures = (stop_route_group['departure_time_sec_interpolate']
                          .sort_values(ascending=True).values)

            # gap between each departure and the one before it, converted
            # from seconds to (float) minutes
            stop_route_group_headways = (
                departures[1:] - departures[:-1]) / 60.0

            results[unique_stop_route] = (
                pd.Series(stop_route_group_headways).describe())

    log('Route stop headway calculation complete. Took {:,.2f} seconds'.format(
        time.time() - start_time))

    return pd.DataFrame(results).T