def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True,
                   bbox=None, remove_stops_outsidebbox=None,
                   append_definitions=False):
    """
    Read all GTFS feed components as a dataframe in a gtfsfeeds_dfs object
    and merge all individual GTFS feeds into a regional metropolitan data
    table. Optionally, data can also be validated before its use.

    Parameters
    ----------
    gtfsfeed_path : str, optional
        root path where all gtfs feeds that make up a contiguous
        metropolitan area are stored
    validation : bool
        if true, the validation check on stops checking for stops outside
        of a bounding box and stop coordinate hemisphere will be run. this
        is required to remove stops outside of a bbox
    verbose : bool
        if true and stops are found outside of the bbox, the stops that
        are outside will be printed for your reference
    bbox : tuple
        Bounding box formatted as a 4 element tuple:
        (lng_max, lat_min, lng_min, lat_max)
        example: (-122.304611, 37.798933, -122.263412, 37.822802)
        a bbox can be extracted for an area using the CSV format bbox
        from http://boundingbox.klokantech.com/
    remove_stops_outsidebbox : bool
        if true stops that are outside the bbox will be removed
    append_definitions : bool
        if true, columns that use the GTFS data schema for their attribute
        codes will have the corresponding GTFS definition information of
        that code appended to the resulting dataframes for reference

    Returns
    -------
    gtfsfeeds_dfs : object
        processed dataframes of corresponding GTFS feed text files
    gtfsfeeds_dfs.stops : pandas.DataFrame
    gtfsfeeds_dfs.routes : pandas.DataFrame
    gtfsfeeds_dfs.trips : pandas.DataFrame
    gtfsfeeds_dfs.stop_times : pandas.DataFrame
    gtfsfeeds_dfs.calendar : pandas.DataFrame
    gtfsfeeds_dfs.calendar_dates : pandas.DataFrame
    """
    merged_stops_df = pd.DataFrame()
    merged_routes_df = pd.DataFrame()
    merged_trips_df = pd.DataFrame()
    merged_stop_times_df = pd.DataFrame()
    merged_calendar_df = pd.DataFrame()
    merged_calendar_dates_df = pd.DataFrame()

    start_time = time.time()

    if gtfsfeed_path is None:
        gtfsfeed_path = os.path.join(config.settings.data_folder,
                                     'gtfsfeed_text')
        assert os.path.exists(gtfsfeed_path), '{} does not exist'.format(
            gtfsfeed_path)
    else:
        assert os.path.exists(gtfsfeed_path), '{} does not exist'.format(
            gtfsfeed_path)
        assert isinstance(gtfsfeed_path, str)

    if validation:
        assert (bbox is not None and remove_stops_outsidebbox is not None
                and verbose is not None), (
            'Attempted to run validation but bbox, verbose, and/or '
            'remove_stops_outsidebbox were set to None. These parameters '
            'must be specified for validation.')

    _standardize_txt(csv_rootpath=gtfsfeed_path)

    folderlist = [foldername for foldername in os.listdir(gtfsfeed_path)
                  if os.path.isdir(os.path.join(gtfsfeed_path, foldername))]
    if not folderlist:
        folderlist = [gtfsfeed_path]

    for folder in folderlist:
        textfilelist = [textfilename for textfilename
                        in os.listdir(os.path.join(gtfsfeed_path, folder))
                        if textfilename.endswith(".txt")]
        required_gtfsfiles = ['stops.txt', 'routes.txt', 'trips.txt',
                              'stop_times.txt', 'calendar.txt',
                              'calendar_dates.txt']
        for required_file in required_gtfsfiles:
            assert required_file in textfilelist, (
                '{} is a required GTFS text file and was not found in '
                'folder {}'.format(
                    required_file, os.path.join(gtfsfeed_path, folder)))

        for textfile in textfilelist:
            if textfile == 'agency.txt':
                agency_df = utils_format._read_gtfs_agency(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'stops.txt':
                stops_df = utils_format._read_gtfs_stops(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'routes.txt':
                routes_df = utils_format._read_gtfs_routes(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'trips.txt':
                trips_df = utils_format._read_gtfs_trips(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'stop_times.txt':
                stop_times_df = utils_format._read_gtfs_stop_times(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'calendar.txt':
                calendar_df = utils_format._read_gtfs_calendar(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'calendar_dates.txt':
                calendar_dates_df = utils_format._read_gtfs_calendar_dates(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)

        (stops_df, routes_df, trips_df, stop_times_df, calendar_df,
         calendar_dates_df) = utils_format._add_unique_agencyid(
            agency_df=agency_df,
            stops_df=stops_df,
            routes_df=routes_df,
            trips_df=trips_df,
            stop_times_df=stop_times_df,
            calendar_df=calendar_df,
            calendar_dates_df=calendar_dates_df,
            nulls_as_folder=True,
            feed_folder=os.path.join(gtfsfeed_path, folder))

        if validation:
            stops_df = utils_validation._validate_gtfs(
                stop_times_df=stop_times_df,
                stops_df=stops_df,
                feed_folder=os.path.join(gtfsfeed_path, folder),
                verbose=verbose,
                bbox=bbox,
                remove_stops_outsidebbox=remove_stops_outsidebbox)
            if remove_stops_outsidebbox:
                stops_inside_bbox = list(stops_df['stop_id'])
                stop_times_df = stop_times_df[
                    stop_times_df['stop_id'].isin(stops_inside_bbox)]

        stops_df = utils_format._append_route_type(
            stops_df=stops_df,
            stop_times_df=stop_times_df,
            routes_df=routes_df[['route_id', 'route_type']],
            trips_df=trips_df[['trip_id', 'route_id']],
            info_to_append='route_type_to_stops')

        stop_times_df = utils_format._append_route_type(
            stops_df=stops_df,
            stop_times_df=stop_times_df,
            routes_df=routes_df[['route_id', 'route_type']],
            trips_df=trips_df[['trip_id', 'route_id']],
            info_to_append='route_type_to_stop_times')

        merged_stops_df = merged_stops_df.append(stops_df,
                                                 ignore_index=True)
        merged_routes_df = merged_routes_df.append(routes_df,
                                                   ignore_index=True)
        merged_trips_df = merged_trips_df.append(trips_df,
                                                 ignore_index=True)
        merged_stop_times_df = merged_stop_times_df.append(
            stop_times_df, ignore_index=True)
        merged_calendar_df = merged_calendar_df.append(calendar_df,
                                                       ignore_index=True)
        merged_calendar_dates_df = merged_calendar_dates_df.append(
            calendar_dates_df, ignore_index=True)

        # print break to visually separate each gtfs feed log
        log('--------------------------------')

    if append_definitions:
        (merged_stops_df, merged_routes_df, merged_stop_times_df,
         merged_trips_df) = utils_format._add_txt_definitions(
            stops_df=merged_stops_df,
            routes_df=merged_routes_df,
            stop_times_df=merged_stop_times_df,
            trips_df=merged_trips_df)

    merged_stop_times_df = utils_format._timetoseconds(
        df=merged_stop_times_df,
        time_cols=['departure_time', 'arrival_time'])

    # set gtfsfeeds_dfs object to merged GTFS dfs
    gtfsfeeds_dfs.stops = merged_stops_df
    gtfsfeeds_dfs.routes = merged_routes_df
    gtfsfeeds_dfs.trips = merged_trips_df
    gtfsfeeds_dfs.stop_times = merged_stop_times_df
    gtfsfeeds_dfs.calendar = merged_calendar_df
    gtfsfeeds_dfs.calendar_dates = merged_calendar_dates_df

    log('{} GTFS feed files successfully read as dataframes: {}. '
        'Took {:,.2f} seconds'.format(
            len(folderlist), folderlist, time.time() - start_time))

    return gtfsfeeds_dfs
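# A minimal usage sketch for the function above. The feed directory path and
# the bbox below are hypothetical example values, and the sketch assumes this
# module's own imports (pandas, os, time, config, utils_format,
# utils_validation, gtfsfeeds_dfs, log) are already in place; it is not part
# of the library API.
def _example_gtfsfeed_to_df_usage():
    # bbox follows the 4-element tuple format documented in the docstring
    example_bbox = (-122.304611, 37.798933, -122.263412, 37.822802)
    feeds = gtfsfeed_to_df(
        gtfsfeed_path='data/gtfsfeed_text',  # hypothetical feed root folder
        validation=True,
        verbose=True,
        bbox=example_bbox,
        remove_stops_outsidebbox=True,
        append_definitions=True)
    # each GTFS component is exposed as a DataFrame on the returned object
    print(len(feeds.stops), 'stops and', len(feeds.routes), 'routes loaded')
    return feeds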
def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True,
                   bbox=None, remove_stops_outsidebbox=True,
                   append_definitions=False):
    """
    Read all GTFS feed components as a dataframe in a gtfsfeeds_dfs object
    and merge all individual GTFS feeds into a regional metropolitan data
    table. Optionally, data can also be validated before its use.

    Parameters
    ----------
    gtfsfeed_path : str, optional
        root path where all gtfs feeds that make up a contiguous
        metropolitan area are stored
    validation : bool
        if true, the validation check on stops checking for stops outside
        of a bounding box and stop coordinate hemisphere will be run. this
        is required to remove stops outside of a bbox
    verbose : bool
        if true and stops are found outside of the bbox, the stops that
        are outside will be printed for your reference
    bbox : tuple
        Bounding box formatted as a 4 element tuple:
        (lng_max, lat_min, lng_min, lat_max)
        example: (-122.304611, 37.798933, -122.263412, 37.822802)
        a bbox can be extracted for an area using the CSV format bbox
        from http://boundingbox.klokantech.com/
    remove_stops_outsidebbox : bool
        if true stops that are outside the bbox will be removed
    append_definitions : bool
        if true, columns that use the GTFS data schema for their attribute
        codes will have the corresponding GTFS definition information of
        that code appended to the resulting dataframes for reference

    Returns
    -------
    gtfsfeeds_dfs : object
        processed dataframes of corresponding GTFS feed text files
    gtfsfeeds_dfs.stops : pandas.DataFrame
    gtfsfeeds_dfs.routes : pandas.DataFrame
    gtfsfeeds_dfs.trips : pandas.DataFrame
    gtfsfeeds_dfs.stop_times : pandas.DataFrame
    gtfsfeeds_dfs.calendar : pandas.DataFrame
    gtfsfeeds_dfs.calendar_dates : pandas.DataFrame
    """
    merged_stops_df = pd.DataFrame()
    merged_routes_df = pd.DataFrame()
    merged_trips_df = pd.DataFrame()
    merged_stop_times_df = pd.DataFrame()
    merged_calendar_df = pd.DataFrame()
    merged_calendar_dates_df = pd.DataFrame()

    start_time = time.time()

    # assertion check to make sure we have a valid path to gtfs
    if gtfsfeed_path is None:
        d_folder = config.settings.data_folder
        gtfsfeed_path = os.path.join(d_folder, 'gtfsfeed_text')
    assert_err_msg = '{} does not exist'.format(gtfsfeed_path)
    assert os.path.exists(gtfsfeed_path), assert_err_msg

    if validation:
        validation_note = ('Attempted to run validation but bbox, '
                           'verbose, and/or remove_stops_outsidebbox '
                           'were set to None. These parameters must be '
                           'specified for validation.')
        bb_good = bbox is not None
        rm_good = remove_stops_outsidebbox is not None
        vb_good = verbose is not None
        all_good = (bb_good and rm_good and vb_good)
        assert all_good, validation_note

    # this step updates and rewrites the cleaned up txt to file
    # TODO: seems dangerous to write to file, seems like UA should only
    #       ever read in content and modify, not write out changes to
    #       data that is used as reference content!
    _standardize_txt(gtfsfeed_path)

    folder_list = [foldername for foldername in os.listdir(gtfsfeed_path)
                   if os.path.isdir(os.path.join(gtfsfeed_path, foldername))]
    if not folder_list:
        folder_list = [gtfsfeed_path]

    for folder in folder_list:
        folder_path = os.path.join(gtfsfeed_path, folder)
        dir_list = os.listdir(folder_path)
        tflist = [t_fname for t_fname in dir_list
                  if t_fname.endswith(".txt")]
        required_gtfsfiles = ['stops.txt', 'routes.txt', 'trips.txt',
                              'stop_times.txt', 'calendar.txt',
                              'calendar_dates.txt']

        # make sure all required files are present
        for req_file in required_gtfsfiles:
            assert_msg = ('{} is a required GTFS text file and was not '
                          'found in folder {}').format(req_file, folder_path)
            assert req_file in tflist, assert_msg

        # TODO: this is now handled by the consolidated steps below
        #       so we can probably safely remove
        # handle read in of each required file
        for textfile in tflist:
            if textfile == 'agency.txt':
                agency_df = utils_format._read_gtfs_agency(
                    textfile_path=folder_path, textfile=textfile)
            if textfile == 'stops.txt':
                stops_df = utils_format._read_gtfs_stops(
                    textfile_path=folder_path, textfile=textfile)
            if textfile == 'routes.txt':
                routes_df = utils_format._read_gtfs_routes(
                    textfile_path=folder_path, textfile=textfile)
            if textfile == 'trips.txt':
                trips_df = utils_format._read_gtfs_trips(
                    textfile_path=folder_path, textfile=textfile)
            if textfile == 'stop_times.txt':
                stop_times_df = utils_format._read_gtfs_stop_times(
                    textfile_path=folder_path, textfile=textfile)
            if textfile == 'calendar.txt':
                calendar_df = utils_format._read_gtfs_calendar(
                    textfile_path=folder_path, textfile=textfile)
            if textfile == 'calendar_dates.txt':
                calendar_dates_df = utils_format._read_gtfs_calendar_dates(
                    textfile_path=folder_path, textfile=textfile)

        # TODO: can eventually replace the cleaning steps in the above
        #       for loop
        consolidated_unclean = _consolidate_gtfs_dfs(agency_df, stops_df,
                                                     routes_df, trips_df,
                                                     stop_times_df,
                                                     calendar_df,
                                                     calendar_dates_df)

        # should be able to work with an array, which would require
        # this to be moved around in this function
        cleaned_gtfs_tables = utils_format.clean_gtfs_tables(
            [consolidated_unclean])

        (agency_df, stops_df, routes_df, trips_df, stop_times_df,
         calendar_df, calendar_dates_df) = _explode_gtfs_dfs(
            cleaned_gtfs_tables)

        with_unique_ids = utils_format._add_unique_agencyid(
            agency_df=agency_df,
            stops_df=stops_df,
            routes_df=routes_df,
            trips_df=trips_df,
            stop_times_df=stop_times_df,
            calendar_df=calendar_df,
            calendar_dates_df=calendar_dates_df,
            nulls_as_folder=True,
            feed_folder=folder_path)
        (stops_df, routes_df, trips_df, stop_times_df, calendar_df,
         calendar_dates_df) = with_unique_ids

        if validation:
            trim_bool = remove_stops_outsidebbox
            stops_df = utils_format.validate_and_trim_stops(
                stops_df, stop_times_df, bbox, dont_trim=trim_bool)
            stop_times_df = utils_format.trim_stop_times(
                stops_df, stop_times_df)

        # common sub dataframes for both processes
        routes_sub = routes_df[['route_id', 'route_type']]
        trips_sub = trips_df[['trip_id', 'route_id']]
        routes_trips_df = pd.merge(routes_sub, trips_sub, how='left',
                                   on='route_id', sort=False, copy=False)

        # encode types of stops
        stops_df = utils_format.route_type_to_stops(stops_df, stop_times_df,
                                                    routes_trips_df)

        # now use those stops to generate times for each stop
        stop_times_by_route = utils_format.route_type_to_stop_times(
            stops_df, stop_times_df, routes_trips_df)

        # reset stop_times with new df that has route type col
        stop_times_df = stop_times_by_route

        merged_stops_df = merged_stops_df.append(stops_df,
                                                 ignore_index=True)
        merged_routes_df = merged_routes_df.append(routes_df,
                                                   ignore_index=True)
        merged_trips_df = merged_trips_df.append(trips_df,
                                                 ignore_index=True)
        merged_stop_times_df = merged_stop_times_df.append(
            stop_times_df, ignore_index=True)
        merged_calendar_df = merged_calendar_df.append(calendar_df,
                                                       ignore_index=True)
        merged_calendar_dates_df = merged_calendar_dates_df.append(
            calendar_dates_df, ignore_index=True)

        # print break to visually separate each gtfs feed log
        log('completed one gtfs download loop...')

    merged_stop_times_df = utils_format.time_to_seconds(merged_stop_times_df)

    # set gtfsfeeds_dfs object to merged GTFS dfs
    gtfsfeeds_dfs.stops = merged_stops_df
    gtfsfeeds_dfs.routes = merged_routes_df
    gtfsfeeds_dfs.trips = merged_trips_df
    gtfsfeeds_dfs.stop_times = merged_stop_times_df
    gtfsfeeds_dfs.calendar = merged_calendar_df
    gtfsfeeds_dfs.calendar_dates = merged_calendar_dates_df

    fl_len = len(folder_list)
    time_diff = time.time() - start_time
    completed_msg = ('{} GTFS feed files successfully read as '
                     'dataframes: {}. Took {:,.2f} '
                     'seconds').format(fl_len, folder_list, time_diff)
    log(completed_msg)

    return gtfsfeeds_dfs
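# The time_to_seconds step above converts GTFS HH:MM:SS time strings in the
# stop_times columns into integer seconds past midnight so they can be
# compared arithmetically. The helper below is only an illustrative sketch of
# that conversion, not the utils_format implementation; note that GTFS allows
# hour values of 24 or greater for trips running past midnight, so no modulo
# is applied.
def _example_gtfs_time_to_seconds(time_str):
    # e.g. '08:05:30' -> 29130 and '25:10:00' -> 90600 (1:10 the next day)
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    return hours * 3600 + minutes * 60 + seconds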
def gtfsfeed_to_df(gtfsfeed_path=None, validation=False, verbose=True,
                   bbox=None, remove_stops_outsidebbox=None):
    """
    Read all GTFS feed components as a dataframe in a gtfsfeeds_dfs object
    and merge all individual GTFS feeds into a regional metropolitan data
    table. Optionally, data can also be validated before its use.

    Parameters
    ----------
    gtfsfeed_path : str, optional
        root path where all gtfs feeds that make up a contiguous
        metropolitan area are stored
    validation : bool
        if true, the validation check on stops checking for stops outside
        of a bounding box and stop coordinate hemisphere will be run. this
        is required to remove stops outside of a bbox
    verbose : bool
        if true and stops are found outside of the bbox, the stops that
        are outside will be printed for your reference
    bbox : tuple
        Bounding box formatted as a 4 element tuple:
        (lng_max, lat_min, lng_min, lat_max)
        example: (-122.304611, 37.798933, -122.263412, 37.822802)
        a bbox can be extracted for an area using the CSV format bbox
        from http://boundingbox.klokantech.com/
    remove_stops_outsidebbox : bool
        if true stops that are outside the bbox will be removed

    Returns
    -------
    gtfsfeeds_dfs : object
        processed dataframes of corresponding GTFS feed text files
    gtfsfeeds_dfs.stops : pandas.DataFrame
    gtfsfeeds_dfs.routes : pandas.DataFrame
    gtfsfeeds_dfs.trips : pandas.DataFrame
    gtfsfeeds_dfs.stop_times : pandas.DataFrame
    gtfsfeeds_dfs.calendar : pandas.DataFrame
    gtfsfeeds_dfs.calendar_dates : pandas.DataFrame
    """
    start_time = time.time()

    if gtfsfeed_path is None:
        gtfsfeed_path = os.path.join(config.settings.data_folder,
                                     'gtfsfeed_text')
        if not os.path.exists(gtfsfeed_path):
            raise ValueError('{} does not exist'.format(gtfsfeed_path))
    else:
        if not os.path.exists(gtfsfeed_path):
            raise ValueError('{} does not exist'.format(gtfsfeed_path))
        if not isinstance(gtfsfeed_path, str):
            raise ValueError('gtfsfeed_path must be a string')

    if validation:
        if bbox is None or remove_stops_outsidebbox is None \
                or verbose is None:
            raise ValueError(
                'Attempted to run validation but bbox, verbose, and/or '
                'remove_stops_outsidebbox were set to None. These '
                'parameters must be specified for validation.')

    _standardize_txt(csv_rootpath=gtfsfeed_path)

    folderlist = [foldername for foldername in os.listdir(gtfsfeed_path)
                  if os.path.isdir(os.path.join(gtfsfeed_path, foldername))]
    if not folderlist:
        folderlist = [gtfsfeed_path]

    for index, folder in enumerate(folderlist):
        # print break to visually separate each gtfs feed log
        log('--------------------------------')
        log('Processing GTFS feed: {!s}'.format(os.path.split(folder)[1]))

        textfilelist = [textfilename for textfilename
                        in os.listdir(os.path.join(gtfsfeed_path, folder))
                        if textfilename.endswith(".txt")]
        required_gtfsfiles = ['stops.txt', 'routes.txt', 'trips.txt',
                              'stop_times.txt', 'calendar.txt']
        optional_gtfsfiles = ['agency.txt', 'calendar_dates.txt']

        for required_file in required_gtfsfiles:
            if required_file not in textfilelist:
                raise ValueError(
                    '{} is a required GTFS text file and was not found in '
                    'folder {}'.format(
                        required_file, os.path.join(gtfsfeed_path, folder)))

        for textfile in required_gtfsfiles:
            if textfile == 'stops.txt':
                stops_df = utils_format._read_gtfs_stops(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'routes.txt':
                routes_df = utils_format._read_gtfs_routes(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'trips.txt':
                trips_df = utils_format._read_gtfs_trips(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'stop_times.txt':
                stop_times_df = utils_format._read_gtfs_stop_times(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)
            if textfile == 'calendar.txt':
                calendar_df = utils_format._read_gtfs_calendar(
                    textfile_path=os.path.join(gtfsfeed_path, folder),
                    textfile=textfile)

        for textfile in optional_gtfsfiles:
            if textfile == 'agency.txt':
                if textfile in textfilelist:
                    agency_df = utils_format._read_gtfs_agency(
                        textfile_path=os.path.join(gtfsfeed_path, folder),
                        textfile=textfile)
                else:
                    agency_df = pd.DataFrame()
            if textfile == 'calendar_dates.txt':
                if textfile in textfilelist:
                    calendar_dates_df = utils_format._read_gtfs_calendar_dates(
                        textfile_path=os.path.join(gtfsfeed_path, folder),
                        textfile=textfile)
                else:
                    calendar_dates_df = pd.DataFrame()

        if validation:
            stops_df = (utils_validation._validate_gtfs(
                stops_df=stops_df,
                feed_folder=os.path.join(gtfsfeed_path, folder),
                verbose=verbose,
                bbox=bbox,
                remove_stops_outsidebbox=remove_stops_outsidebbox))
            if remove_stops_outsidebbox:
                stops_inside_bbox = list(stops_df['stop_id'])
                stop_times_df = stop_times_df[
                    stop_times_df['stop_id'].isin(stops_inside_bbox)]

    stop_times_df = utils_format._timetoseconds(
        df=stop_times_df, time_cols=['departure_time'])

    # set gtfsfeeds_dfs object to merged GTFS dfs
    gtfsfeeds_dfs.stops = stops_df
    gtfsfeeds_dfs.routes = routes_df
    gtfsfeeds_dfs.trips = trips_df
    gtfsfeeds_dfs.stop_times = stop_times_df
    gtfsfeeds_dfs.calendar = calendar_df
    gtfsfeeds_dfs.calendar_dates = calendar_dates_df

    # TODO: add to print the list of gtfs feed txt files read in for each feed
    log('{:,} GTFS feed file(s) successfully read as dataframes:'.format(
        len(folderlist)))
    for folder in folderlist:
        log(' {}'.format(os.path.split(folder)[1]))
    log(' Took {:,.2f} seconds'.format(time.time() - start_time))

    return gtfsfeeds_dfs
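# The remove_stops_outsidebbox path above drops stops whose coordinates fall
# outside the supplied bounding box. The helper below is only an illustrative
# sketch of that kind of spatial filter, not the utils_validation
# implementation. It assumes the standard GTFS 'stop_lat' and 'stop_lon'
# columns and unpacks the bbox so the smaller longitude and latitude come
# first, matching the example coordinates in the docstring; the library may
# unpack the tuple differently.
def _example_stops_within_bbox(stops_df, bbox):
    lng_min, lat_min, lng_max, lat_max = bbox
    inside = ((stops_df['stop_lon'] >= lng_min) &
              (stops_df['stop_lon'] <= lng_max) &
              (stops_df['stop_lat'] >= lat_min) &
              (stops_df['stop_lat'] <= lat_max))
    return stops_df[inside]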