def batch_stats(folder=GTFS_FEEDS_PATH, output_folder=OUTPUT_DIR):
    """Compute and save trip stats and route stats for every GTFS feed zip in `folder`."""
    for file in os.listdir(folder):
        date_str = file.split('.')[0]
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        # read the feed zip from the folder that was listed above
        feed = gu.get_partridge_feed_by_date(folder + file, date)
        zones = gu.get_zones_df(LOCAL_TARIFF_PATH)
        ts = compute_trip_stats_partridge(feed, zones)
        ts.to_pickle(output_folder + date_str + '_trip_stats.pkl.gz', compression='gzip')
        rs = compute_route_stats_base_partridge(ts)
        rs.to_pickle(output_folder + date_str + '_route_stats.pkl.gz', compression='gzip')
def batch_stats(folder=GTFS_FEEDS_PATH, output_folder=OUTPUT_DIR):
    """Compute and save trip stats and route stats for every GTFS feed zip in `folder`."""
    for file in os.listdir(folder):
        date_str = file.split('.')[0]
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
        # read the feed zip from the folder that was listed above
        feed = gu.get_partridge_feed_by_date(join(folder, file), date)
        zones = gu.get_zones_df(LOCAL_TARIFF_PATH)
        ts = compute_trip_stats_partridge(feed, zones)
        ts.to_pickle(join(output_folder, date_str + '_trip_stats.pkl.gz'), compression='gzip')
        rs = compute_route_stats_base_partridge(ts)
        rs.to_pickle(join(output_folder, date_str + '_route_stats.pkl.gz'), compression='gzip')
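# Usage sketch (an illustration, not part of the original flow): batch_stats names its
# outputs '<date>_trip_stats.pkl.gz' and '<date>_route_stats.pkl.gz' under output_folder,
# so one day's results can be read back as below. The helper name and the example date
# are assumptions made here for illustration.
def _load_day_stats_example(date_str='2018-05-01', output_folder=OUTPUT_DIR):
    # read back the gzipped pickles written by batch_stats for a single date
    ts = pd.read_pickle(join(output_folder, date_str + '_trip_stats.pkl.gz'), compression='gzip')
    rs = pd.read_pickle(join(output_folder, date_str + '_route_stats.pkl.gz'), compression='gzip')
    return ts, rs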
def handle_gtfs_date(date_str, file, bucket, output_folder=OUTPUT_DIR, gtfs_folder=GTFS_FEEDS_PATH, logger=None):
    """
    Handle a single date for a single GTFS file. Download if necessary, compute and save stats files
    (currently trip_stats and route_stats).

    :param date_str: %Y-%m-%d
    :type date_str: str
    :param file: gtfs file name (currently only YYYY-mm-dd.zip)
    :type file: str
    :param bucket: s3 boto bucket object
    :type bucket: boto3.resources.factory.s3.Bucket
    :param output_folder: local path to write output files to
    :type output_folder: str
    :param gtfs_folder: local path containing GTFS feeds
    :type gtfs_folder: str
    :param logger: logger to write to
    :type logger: logging.Logger
    """
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
    downloaded = False
    trip_stats_output_path = output_folder + date_str + '_trip_stats.pkl.gz'
    if os.path.exists(trip_stats_output_path):
        logger.info(f'found trip stats result DF gzipped pickle "{trip_stats_output_path}"')
        ts = pd.read_pickle(trip_stats_output_path, compression='gzip')
    else:
        downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger)
        if WRITE_FILTERED_FEED:
            filtered_out_path = FILTERED_FEEDS_PATH + date_str + '.zip'
            logger.info(f'writing filtered gtfs feed for file "{gtfs_folder+file}" with date "{date}" in path '
                        f'{filtered_out_path}')
            gu.write_filtered_feed_by_date(gtfs_folder + file, date, filtered_out_path)
            logger.info(f'reading filtered feed for file from path {filtered_out_path}')
            feed = ptg_feed(filtered_out_path)
        else:
            logger.info(f'creating daily partridge feed for file "{gtfs_folder+file}" with date "{date}"')
            try:
                feed = gu.get_partridge_feed_by_date(gtfs_folder + file, date)
            except BadZipFile:
                logger.error('Bad local zip file', exc_info=True)
                downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger, force=True)
                feed = gu.get_partridge_feed_by_date(gtfs_folder + file, date)
            logger.debug(f'finished creating daily partridge feed for file "{gtfs_folder+file}" with date "{date}"')
        # TODO: add changing zones from archive
        logger.info(f'creating zones DF from "{LOCAL_TARIFF_PATH}"')
        zones = gu.get_zones_df(LOCAL_TARIFF_PATH)
        logger.info(
            f'starting compute_trip_stats_partridge for file "{gtfs_folder+file}" with date "{date}" and zones '
            f'"{LOCAL_TARIFF_PATH}"')
        ts = compute_trip_stats_partridge(feed, zones)
        logger.debug(
            f'finished compute_trip_stats_partridge for file "{gtfs_folder+file}" with date "{date}" and zones '
            f'"{LOCAL_TARIFF_PATH}"')
        # TODO: log this
        ts['date'] = date_str
        ts['date'] = pd.Categorical(ts.date)
        logger.info(f'saving trip stats result DF to gzipped pickle "{trip_stats_output_path}"')
        ts.to_pickle(trip_stats_output_path, compression='gzip')
    # TODO: log more stats
    logger.debug(
        f'ts.shape={ts.shape}, dc_trip_id={ts.trip_id.nunique()}, dc_route_id={ts.route_id.nunique()}, '
        f'num_start_zones={ts.start_zone.nunique()}, num_agency={ts.agency_name.nunique()}')
    logger.info('starting compute_route_stats_base_partridge from trip stats result')
    rs = compute_route_stats_base_partridge(ts)
    logger.debug('finished compute_route_stats_base_partridge from trip stats result')
    # TODO: log this
    rs['date'] = date_str
    rs['date'] = pd.Categorical(rs.date)
    # TODO: log more stats
    logger.debug(
        f'rs.shape={rs.shape}, num_trips_sum={rs.num_trips.sum()}, dc_route_id={rs.route_id.nunique()}, '
        f'num_start_zones={rs.start_zone.nunique()}, num_agency={rs.agency_name.nunique()}')
    route_stats_output_path = output_folder + date_str + '_route_stats.pkl.gz'
    logger.info(f'saving route stats result DF to gzipped pickle "{route_stats_output_path}"')
    rs.to_pickle(route_stats_output_path, compression='gzip')
    return downloaded
def handle_gtfs_date(date_str, file, bucket, output_folder=OUTPUT_DIR, gtfs_folder=GTFS_FEEDS_PATH, logger=None):
    """
    Handle a single date for a single GTFS file. Download if necessary, compute and save stats files
    (currently trip_stats and route_stats).

    :param date_str: %Y-%m-%d
    :type date_str: str
    :param file: gtfs file name (currently only YYYY-mm-dd.zip)
    :type file: str
    :param bucket: s3 boto bucket object
    :type bucket: boto3.resources.factory.s3.Bucket
    :param output_folder: local path to write output files to
    :type output_folder: str
    :param gtfs_folder: local path containing GTFS feeds
    :type gtfs_folder: str
    :param logger: logger to write to
    :type logger: logging.Logger
    """
    date = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
    downloaded = False
    trip_stats_output_path = join(output_folder, date_str + '_trip_stats.pkl.gz')
    if os.path.exists(trip_stats_output_path):
        logger.info(f'found trip stats result DF gzipped pickle "{trip_stats_output_path}"')
        ts = pd.read_pickle(trip_stats_output_path, compression='gzip')
    else:
        downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger)
        if WRITE_FILTERED_FEED:
            filtered_out_path = FILTERED_FEEDS_PATH + date_str + '.zip'
            logger.info(f'writing filtered gtfs feed for file "{gtfs_folder+file}" with date "{date}" in path '
                        f'{filtered_out_path}')
            gu.write_filtered_feed_by_date(gtfs_folder + file, date, filtered_out_path)
            logger.info(f'reading filtered feed for file from path {filtered_out_path}')
            feed = ptg_feed(filtered_out_path)
        else:
            logger.info(f'creating daily partridge feed for file "{join(gtfs_folder, file)}" with date "{date}"')
            try:
                feed = gu.get_partridge_feed_by_date(join(gtfs_folder, file), date)
            except BadZipFile:
                logger.error('Bad local zip file', exc_info=True)
                downloaded = get_gtfs_file(file, gtfs_folder, bucket, logger, force=True)
                feed = gu.get_partridge_feed_by_date(join(gtfs_folder, file), date)
            logger.debug(f'finished creating daily partridge feed for file "{join(gtfs_folder, file)}" '
                         f'with date "{date}"')
        # TODO: use Tariff.zip from s3
        tariff_path_to_use = get_closest_archive_path(date, 'Tariff.zip')
        logger.info(f'creating zones DF from "{tariff_path_to_use}"')
        zones = gu.get_zones_df(tariff_path_to_use)
        logger.info(
            f'starting compute_trip_stats_partridge for file "{join(gtfs_folder, file)}" with date "{date}" '
            f'and zones "{tariff_path_to_use}"')
        ts = compute_trip_stats_partridge(feed, zones)
        logger.debug(
            f'finished compute_trip_stats_partridge for file "{join(gtfs_folder, file)}" with date "{date}" '
            f'and zones "{tariff_path_to_use}"')
        # TODO: log this
        ts['date'] = date_str
        ts['date'] = pd.Categorical(ts.date)
        logger.info(f'saving trip stats result DF to gzipped pickle "{trip_stats_output_path}"')
        ts.to_pickle(trip_stats_output_path, compression='gzip')
    # TODO: log more stats
    logger.debug(
        f'ts.shape={ts.shape}, dc_trip_id={ts.trip_id.nunique()}, dc_route_id={ts.route_id.nunique()}, '
        f'num_start_zones={ts.start_zone.nunique()}, num_agency={ts.agency_name.nunique()}')
    logger.info('starting compute_route_stats_base_partridge from trip stats result')
    rs = compute_route_stats_base_partridge(ts)
    logger.debug('finished compute_route_stats_base_partridge from trip stats result')
    # TODO: log this
    rs['date'] = date_str
    rs['date'] = pd.Categorical(rs.date)
    # TODO: log more stats
    logger.debug(
        f'rs.shape={rs.shape}, num_trips_sum={rs.num_trips.sum()}, dc_route_id={rs.route_id.nunique()}, '
        f'num_start_zones={rs.start_zone.nunique()}, num_agency={rs.agency_name.nunique()}')
    route_stats_output_path = join(output_folder, date_str + '_route_stats.pkl.gz')
    logger.info(f'saving route stats result DF to gzipped pickle "{route_stats_output_path}"')
    rs.to_pickle(route_stats_output_path, compression='gzip')
    return downloaded
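# A minimal driver sketch, not part of the original module: it runs handle_gtfs_date
# for a list of dates, assuming the feed zips are named '<date>.zip' both locally and
# in the given S3 bucket. The function name, the bucket_name parameter and the boto3
# calls below are assumptions made here for illustration.
def _example_batch_run(dates, bucket_name):
    import logging
    import boto3

    logger = logging.getLogger(__name__)
    # handle_gtfs_date expects a boto3 Bucket object and a logger
    bucket = boto3.resource('s3').Bucket(bucket_name)
    for date_str in dates:
        # skips recomputation when the trip stats pickle for this date already exists,
        # otherwise builds the daily feed and writes trip and route stats pickles
        handle_gtfs_date(date_str, date_str + '.zip', bucket,
                         output_folder=OUTPUT_DIR, gtfs_folder=GTFS_FEEDS_PATH,
                         logger=logger)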