def write_coefficient_template(model_settings):
    coefficients = simulate.read_model_coefficients(model_settings)

    coefficients = coefficients.transpose()
    coefficients.columns.name = None

    template = coefficients.copy()

    coef_names = []
    coef_values = []

    for c in coefficients.columns:
        values = coefficients[c]
        unique_values = values.unique()

        for uv in unique_values:
            if len(unique_values) == 1:
                uv_coef_name = c + '_all'
            else:
                uv_coef_name = c + '_' + '_'.join(values[values == uv].index.values)

            coef_names.append(uv_coef_name)
            coef_values.append(uv)

            template[c] = template[c].where(values != uv, uv_coef_name)

    refactored_coefficients = pd.DataFrame({'coefficient_name': coef_names, 'value': coef_values})
    refactored_coefficients.value = refactored_coefficients.value.astype(np.float32)
    print(refactored_coefficients)

    template = template.transpose()
    template.to_csv(
        config.output_file_path('tour_mode_choice_coefficients_template.csv'),
        mode='w', index=True, header=True)

    refactored_coefficients.to_csv(
        config.output_file_path('tour_mode_choice_refactored_coefficients.csv'),
        mode='w', index=False, header=True)
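# Hedged, standalone illustration (not part of the model) of the naming rule used by
# write_coefficient_template() above. Segment and coefficient names are invented toy data.
def _coefficient_template_example():
    import pandas as pd

    # rows are segments, columns are coefficient names (i.e. the transposed coefficients table)
    coefficients = pd.DataFrame(
        {'coef_ivt': [-0.028, -0.028], 'coef_cost': [-0.6, -1.2]},
        index=['work', 'school'])

    template = coefficients.copy()
    for c in coefficients.columns:
        values = coefficients[c]
        for uv in values.unique():
            if values.nunique() == 1:
                name = c + '_all'
            else:
                name = c + '_' + '_'.join(values[values == uv].index.values)
            # replace cells holding this value with the generated coefficient name
            template[c] = template[c].where(values != uv, name)

    # template now holds 'coef_ivt_all' in both rows (shared value) and
    # 'coef_cost_work' / 'coef_cost_school' in the cost column (segment-specific values)
    return template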
def build_network(settings):
    """
    Build a Pandana network from CSV files
    """
    logger.info('building pandana network')

    network_settings_file = settings['network_settings_file']
    if not network_settings_file:
        logger.error("Please specify 'network_settings_file' in settings")
        return

    network_settings = config.read_model_settings(network_settings_file)
    logger.debug('using settings %s' % network_settings)

    nodes = pd.read_csv(config.data_file_path(network_settings['nodes']))
    links = pd.read_csv(config.data_file_path(network_settings['links']))

    nodes.index = nodes[network_settings['nodes-id']]

    network = pdna.Network(
        nodes[network_settings['nodes-x']],
        nodes[network_settings['nodes-y']],
        links[network_settings['links-a']],
        links[network_settings['links-b']],
        links[[network_settings['links-impedance']]],
        twoway=network_settings['twoway'])

    network.save_hdf5(config.output_file_path('pandana_network.h5'))

    return network
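# Hedged sketch of the network settings file read by build_network() above. The keys are the
# ones the function looks up; the file and column names are hypothetical placeholders.
#
#   nodes: nodes.csv
#   nodes-id: node_id
#   nodes-x: x
#   nodes-y: y
#   links: links.csv
#   links-a: from_node
#   links-b: to_node
#   links-impedance: distance
#   twoway: True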
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims
    FIXME - if resume_after, this will only reflect skims used after resume

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    skim_dict = inject.get_injectable('skim_dict')

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.get_skim_usage():
            print(key, file=output_file)

        unused = set(k for k in skim_dict.skim_info.base_keys) - set(k for k in skim_dict.get_skim_usage())
        for key in unused:
            print(key, file=output_file)
def write_summaries(output_dir):

    summary_settings_name = 'output_summaries'
    summary_file_name = 'summaries.txt'

    summary_settings = setting(summary_settings_name)

    if summary_settings is None:
        logger.info(f"No {summary_settings_name} specified in settings file. Nothing to write.")
        return

    summary_dict = summary_settings

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path(summary_file_name), mode) as output_file:

        for table_name, column_names in summary_dict.items():

            df = pipeline.get_table(table_name)

            for c in column_names:
                n = 100
                empty = (df[c] == '') | df[c].isnull()

                print(f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} ({empty.sum()} empty)\n\n",
                      file=output_file)
                print(df[c].value_counts().nlargest(n), file=output_file)
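# Hedged sketch of an 'output_summaries' setting consumed by write_summaries() above:
# a mapping of pipeline table name to the columns to summarize. Table and column names
# below are hypothetical.
#
#   output_summaries:
#     households:
#       - income_segment
#       - auto_ownership
#     persons:
#       - cdap_activity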
def previous_write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str
    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if txt_format:

        output_file_path = config.output_file_path(txt_format)

        pd.options.display.max_columns = 500
        pd.options.display.max_rows = 100

        output_tables = pipeline.checkpointed_tables()

        # write data dictionary for all checkpointed_tables
        with open(output_file_path, 'w') as output_file:
            for table_name in output_tables:
                df = inject.get_table(table_name, None).to_frame()

                print("\n### %s %s" % (table_name, df.shape), file=output_file)
                print('index:', df.index.name, df.index.dtype, file=output_file)
                print(df.dtypes, file=output_file)
def test_full_run2():

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs2')
    inject.add_injectable("configs_dir", configs_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data2')
    inject.add_injectable("data_dir", data_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    inject.add_injectable("output_dir", output_dir)

    inject.clear_cache()

    tracing.config_logger()

    tracing.delete_output_files('csv')
    tracing.delete_output_files('txt')
    tracing.delete_output_files('yaml')

    _MODELS = [
        'input_pre_processor',
        'setup_data_structures',
        'initial_seed_balancing',
        'meta_control_factoring',
        'final_seed_balancing',
        'integerize_final_seed_weights',
        'sub_balancing.geography=DISTRICT',
        'sub_balancing.geography=TRACT',
        'sub_balancing.geography=TAZ',
        'expand_households',
        'summarize',
        'write_tables'
    ]

    pipeline.run(models=_MODELS, resume_after=None)

    assert isinstance(pipeline.get_table('expanded_household_ids'), pd.DataFrame)

    # output tables list action: include
    assert os.path.exists(config.output_file_path('expanded_household_ids.csv'))
    assert os.path.exists(config.output_file_path('summary_DISTRICT.csv'))
    assert not os.path.exists(config.output_file_path('summary_TAZ.csv'))

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()

    inject.clear_cache()
def regress():

    persons_df = pipeline.get_table('persons')
    persons_df = persons_df[persons_df.household_id == HH_ID]
    print("persons_df\n%s" % persons_df[['value_of_time', 'distance_to_work']])

    """
    persons_df
                 value_of_time  distance_to_work
    person_id
    3249922          23.349532              0.62
    3249923          23.349532              0.62
    """

    tours_df = pipeline.get_table('tours')

    regress_tour_modes(tours_df)

    assert tours_df.shape[0] > 0
    assert not tours_df.tour_mode.isnull().any()

    # optional logsum column was added to all tours except mandatory
    assert 'destination_logsum' in tours_df
    if (tours_df.destination_logsum.isnull() != (tours_df.tour_category == 'mandatory')).any():
        print(tours_df[(tours_df.destination_logsum.isnull() != (tours_df.tour_category == 'mandatory'))])
    assert (tours_df.destination_logsum.isnull() == (tours_df.tour_category == 'mandatory')).all()

    # mode choice logsum calculated for all tours
    assert 'mode_choice_logsum' in tours_df
    assert not tours_df.mode_choice_logsum.isnull().any()

    trips_df = pipeline.get_table('trips')
    assert trips_df.shape[0] > 0
    assert not trips_df.purpose.isnull().any()
    assert not trips_df.depart.isnull().any()
    assert not trips_df.trip_mode.isnull().any()

    # mode_choice_logsum calculated for all trips
    assert not trips_df.mode_choice_logsum.isnull().any()

    # there should be at least two trips per tour
    assert trips_df.shape[0] >= 2 * tours_df.shape[0]

    # write_trip_matrices
    trip_matrices_file = config.output_file_path('trips_md.omx')
    assert os.path.exists(trip_matrices_file)
    trip_matrices = omx.open_file(trip_matrices_file)
    assert trip_matrices.shape() == (25, 25)

    assert 'WALK_MD' in trip_matrices.list_matrices()
    walk_trips = np.array(trip_matrices['WALK_MD'])
    assert walk_trips.dtype == np.dtype('float64')

    trip_matrices.close()
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpoints = pipeline.get_checkpoints()
    tables = OrderedDict()

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.usage:
            print(key, file=output_file)

        if skim_stack is None:

            unused_keys = {k for k in skim_dict.skim_info['omx_keys']} - \
                          {k for k in skim_dict.usage}

            print("\n### unused skim keys", file=output_file)
            for key in unused_keys:
                print(key, file=output_file)

        else:

            print("\n### skim_stack usage", file=output_file)
            for key in skim_stack.usage:
                print(key, file=output_file)

            unused = {k for k in skim_dict.skim_info['omx_keys'] if not isinstance(k, tuple)} - \
                     {k for k in skim_dict.usage if not isinstance(k, tuple)}
            print("\n### unused skim str keys", file=output_file)
            for key in unused:
                print(key, file=output_file)

            unused = {k[0] for k in skim_dict.skim_info['omx_keys'] if isinstance(k, tuple)} - \
                     {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                     {k for k in skim_stack.usage}
            print("\n### unused skim dim3 keys", file=output_file)
            for key in unused:
                print(key, file=output_file)
def data_directory(self):

    # shouldn't be asking for this if not estimating
    assert self.estimating
    assert self.settings_name is not None

    parent_dir = config.output_file_path('estimation_data_bundle')

    if self.settings_name != self.model_name:
        parent_dir = os.path.join(parent_dir, self.settings_name)

    return os.path.join(parent_dir, self.model_name)
def write_matrices(aggregate_trips, zone_index, orig_index, dest_index, model_settings):
    """
    Write aggregated trips to OMX format.

    The MATRICES setting lists the new OMX files to write.
    Each file can contain any number of 'tables', each specified by a table key ('name')
    and a trips table column ('data_field') to use for aggregated counts.

    Any data type may be used for columns added in the annotation phase, but the table
    'data_field's must be summable types: ints, floats, bools.
    """

    matrix_settings = model_settings.get('MATRICES')

    if not matrix_settings:
        logger.error('Missing MATRICES setting in write_trip_matrices.yaml')
        return

    for matrix in matrix_settings:
        filename = matrix.get('file_name')
        filepath = config.output_file_path(filename)
        logger.info('opening %s' % filepath)
        file = omx.open_file(filepath, 'w')  # possibly overwrite existing file
        table_settings = matrix.get('tables')

        for table in table_settings:
            table_name = table.get('name')
            col = table.get('data_field')

            if col not in aggregate_trips:
                logger.error(f'missing {col} column in aggregate_trips DataFrame')
                return

            hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL')
            if hh_weight_col:
                aggregate_trips[col] = aggregate_trips[col] / aggregate_trips[hh_weight_col]

            data = np.zeros((len(zone_index), len(zone_index)))
            data[orig_index, dest_index] = aggregate_trips[col]
            logger.info('writing %s' % table_name)
            file[table_name] = data  # write to file

        # include the index-to-zone map in the file
        logger.info('adding %s mapping for %s zones to %s' %
                    (zone_index.name, zone_index.size, filename))
        file.create_mapping(zone_index.name, zone_index.to_numpy())

        logger.info('closing %s' % filepath)
        file.close()
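# Hedged sketch of a write_trip_matrices.yaml MATRICES setting consumed by write_matrices()
# above. The key names match what the function reads; the file, table, and column names
# (and the expansion weight column) are hypothetical placeholders.
#
#   HH_EXPANSION_WEIGHT_COL: sample_rate
#   MATRICES:
#     - file_name: trips_md.omx
#       tables:
#         - name: WALK_MD
#           data_field: walk_md_trips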
def get_cached_spec(hhsize):

    spec_name = cached_spec_name(hhsize)

    spec = inject.get_injectable(spec_name, None)
    if spec is not None:
        logger.info("build_cdap_spec returning cached injectable spec %s", spec_name)
        return spec

    # # try configs dir
    # spec_path = config.config_file_path(spec_name, mandatory=False)
    # if spec_path:
    #     logger.info("build_cdap_spec reading cached spec %s from %s", spec_name, spec_path)
    #     return pd.read_csv(spec_path, index_col='Expression')

    # try data dir
    if os.path.exists(config.output_file_path(spec_name)):
        spec_path = config.output_file_path(spec_name)
        logger.info("build_cdap_spec reading cached spec %s from %s", spec_name, spec_path)
        return pd.read_csv(spec_path, index_col='Expression')

    return None
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning(
            "No 'input_table_list' found in settings. This will be a "
            "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

    # FIXME undocumented feature
    if config.setting('write_raw_tables'):

        # write raw input tables as csv (before annotation)
        csv_dir = config.output_file_path('raw_tables')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed

        table_names = [t['tablename'] for t in table_list]
        for t in table_names:
            df = inject.get_table(t).to_frame()
            if t == 'households':
                df.drop(columns='chunk_id', inplace=True)
            df.to_csv(os.path.join(csv_dir, '%s.csv' % t), index=True)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True
def output_directory(self, bundle_directory=False):

    # shouldn't be asking for this if not estimating
    assert self.estimating
    assert self.model_name is not None

    dir = os.path.join(config.output_file_path('estimation_data_bundle'), self.bundle_name)

    if bundle_directory:
        # shouldn't be asking - probably confused
        assert self.bundle_name != self.model_name

    if self.bundle_name != self.model_name and not bundle_directory:
        dir = os.path.join(dir, self.model_name)

    return dir
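# For reference, output_directory() above resolves to a path of the form
#
#   <output_dir>/estimation_data_bundle/<bundle_name>/<model_name>/
#
# and omits the trailing <model_name> component when bundle_name == model_name or when
# bundle_directory=True is requested.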
def get_trace_csv(file_name):

    file_name = config.output_file_path(file_name)
    df = pd.read_csv(file_name)

    #        label    value_1    value_2    value_3    value_4
    # 0    tour_id         38        201         39         40
    # 1       mode  DRIVE_LOC  DRIVE_COM  DRIVE_LOC  DRIVE_LOC
    # 2  person_id    1888694    1888695    1888695    1888696
    # 3  tour_type       work   othmaint       work     school
    # 4   tour_num          1          1          1          1

    # transpose df and rename columns
    labels = df.label.values
    df = df.transpose()[1:]
    df.columns = labels

    return df
def read_network_file(settings):
    """
    Read network from saved HDF5 file
    """
    network_fname = settings['saved_network']
    if not network_fname:
        logger.error("Please specify 'saved_network' file in settings")
        return

    network_fpath = config.data_file_path(network_fname, mandatory=False) or \
        config.output_file_path(network_fname)

    if not os.path.exists(network_fpath):
        logger.error('No network file %s found' % network_fname)
        return

    logger.info('Reading network from %s' % network_fpath)
    network = pdna.Network.from_hdf5(network_fpath)

    return network
def get_osm_network(zone_data, settings):
    """
    Retrieve Pandana network from Open Street Maps
    """
    logger.info('getting osm network')

    zones_df = zone_data.to_frame()
    miles = settings.get('distance_units') == 'miles'

    # distance to degrees: 1 degree of latitude (y) is roughly 111 km, or 69 miles
    conversion = 69 if miles else 111 * 1000
    buffer = settings.get('max_dist') / conversion
    xmin = min(zones_df[settings['zones_lon']]) - buffer
    xmax = max(zones_df[settings['zones_lon']]) + buffer
    ymin = min(zones_df[settings['zones_lat']]) - buffer
    ymax = max(zones_df[settings['zones_lat']]) + buffer

    logger.debug('bounding box: %s, %s, %s, %s' % (str(ymin), str(xmin), str(ymax), str(xmax)))

    # default type=walk, which excludes freeways
    nodes, edges = osm.network_from_bbox(
        lat_min=ymin, lng_min=xmin, lat_max=ymax, lng_max=xmax,
        two_way=True, network_type='walk')

    if miles:
        logger.info('converting network distance units to miles...')
        edges[['distance']] = edges[['distance']] / 1609.34

    network = pdna.Network(
        nodes['x'], nodes['y'],
        edges['from'], edges['to'], edges[['distance']])

    print(edges.head())
    print(edges[['distance']])

    network.save_hdf5(config.output_file_path('pandana_network.h5'))

    return network
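# Worked example of the buffer conversion used in get_osm_network() above: with
# distance_units = 'miles' and a hypothetical max_dist of 3 miles, the bounding-box buffer
# is 3 / 69 ≈ 0.043 degrees; with meters and, say, max_dist = 5000, it is
# 5000 / (111 * 1000) ≈ 0.045 degrees.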
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables
    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('data_dict.txt'), mode) as output_file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            print("\n### %s %s" % (table_name, df.shape), file=output_file)
            print('index:', df.index.name, df.index.dtype, file=output_file)
            print(df.dtypes, file=output_file)
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+
    """

    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn(
            "table_info option 'column_map' has been renamed 'rename_columns'. "
            "Support for 'column_map' will be removed in future versions.",
            FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.info("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    if keep_columns:
        logger.info("keeping columns: %s" % keep_columns)
        df = df[keep_columns]

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df
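# Hedged sketch of an input_table_list entry handled by read_from_table_info() above
# (keys as documented in the docstring; file and column names are hypothetical):
#
#   input_table_list:
#     - tablename: households
#       filename: households.csv
#       index_col: household_id
#       rename_columns:
#         HHID: household_id
#       keep_columns:
#         - income
#         - auto_ownership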
def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose. This module
    simply applies those utilities using the simulation framework.
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    cdap_indiv_spec = simulate.read_model_spec(file_name=model_settings['INDIV_AND_HHSIZE1_SPEC'])

    # Rules and coefficients for generating interaction specs for different household sizes
    cdap_interaction_coefficients = \
        pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#')

    """
    spec to compute/specify the relative proportions of each activity (M, N, H)
    that should be used to choose activities for additional household members
    not handled by CDAP

    This spec is handled much like an activitysim logit utility spec,
    EXCEPT that the values computed are relative proportions, not utilities
    (i.e. values are not exponentiated before being normalized to probabilities summing to 1.0)
    """
    cdap_fixed_relative_proportions = \
        simulate.read_model_spec(file_name=model_settings['FIXED_RELATIVE_PROPORTIONS_SPEC'])

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding here allows us to write them to the output directory
    # (also when multiprocessing locutor might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True)
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True)

    estimator = estimation.manager.begin_estimation('cdap')
    if estimator:
        estimator.write_model_settings(model_settings, 'cdap.yaml')
        estimator.write_spec(model_settings, tag='INDIV_AND_HHSIZE1_SPEC')
        estimator.write_spec(model_settings=model_settings, tag='FIXED_RELATIVE_PROPORTIONS_SPEC')
        estimator.write_table(cdap_interaction_coefficients, 'interaction_coefficients',
                              index=False, append=False)
        estimator.write_choosers(persons_merged)
        for hhsize in range(2, cdap.MAX_HHSIZE + 1):
            spec = cdap.get_cached_spec(hhsize)
            estimator.write_table(spec, 'spec_%s' % hhsize, append=False)

    logger.info("Running cdap_simulate with %d persons", len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'persons', 'cdap_activity')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True)
    logger.info("cdap crosstabs:\n%s" %
                pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))
def cache_spec(hhsize, spec):
    spec_name = cached_spec_name(hhsize)

    # cache as injectable
    inject.add_injectable(spec_name, spec)

    # cache as csv in output_dir
    spec.to_csv(config.output_file_path(spec_name), index=True)
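# Hedged usage sketch of the spec cache helpers above: a caller can short-circuit via
# get_cached_spec() and fall back to cache_spec() once a household-size spec has been built.
# (cached_spec_name() and the spec-building logic live elsewhere in this module; 'spec' here
# is assumed to be an already-built spec DataFrame.)
def _spec_cache_example(hhsize, spec):
    cached = get_cached_spec(hhsize)
    if cached is None:
        cache_spec(hhsize, spec)  # registers injectable and writes csv to the output dir
        cached = spec
    return cached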
def cdap_simulate(persons_merged, persons, households,
                  cdap_indiv_spec,
                  cdap_interaction_coefficients,
                  cdap_fixed_relative_proportions,
                  chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose. This module
    simply applies those utilities using the simulation framework.
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding here allows us to write them to the output directory
    # (also when multiprocessing locutor might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True)
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True)

    logger.info("Running cdap_simulate with %d persons", len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices.cdap_activity
    persons['cdap_rank'] = choices.cdap_rank

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True)
    logger.info("cdap crosstabs:\n%s" %
                pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))

    if trace_hh_id:
        tracing.trace_df(inject.get_table('persons_merged').to_frame(),
                         label="cdap",
                         columns=['ptype', 'cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+
    """

    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.info('%s table columns: %s' % (tablename, df.columns.values))
    logger.info('%s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

    if drop_columns:
        for c in drop_columns:
            logger.info("dropping column '%s'" % c)
            del df[c]

    if column_map:
        df.rename(columns=column_map, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+
    """

    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    csv_dtypes = table_info.get('dtypes', {})

    # don't require a redundant index_col directive for canonical tables
    # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably,
    # the canonical index will be assigned in a subsequent initialization step (e.g. initialize_tours)
    canonical_index_col = canonical_table_index_name(tablename)

    # if there is an explicit index_col entry in table_info
    if 'index_col' in table_info:

        # honor explicit index_col unless it conflicts with canonical name
        index_col = table_info['index_col']

        if canonical_index_col:
            if index_col:
                # if there is a non-empty index_col directive, it should be for canonical_table_index_name
                assert index_col == canonical_index_col, \
                    f"{tablename} index_col {table_info.get('index_col')} should be {canonical_index_col}"
            else:
                logger.info(f"Not assigning canonical index_col {tablename}.{canonical_index_col} "
                            f"because settings file index_col directive is explicitly None.")

        # if there is an index_col directive for a canonical table, it should be for canonical_table_index_name

    else:
        # otherwise default is to use canonical index name for known tables, and no index for unknown tables
        index_col = canonical_index_col

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes)

    # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn(
            "table_info option 'column_map' has been renamed 'rename_columns'. "
            "Support for 'column_map' will be removed in future versions.",
            FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.debug("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            if canonical_index_col:
                # we expect canonical indexes to be integer-valued
                assert (df[index_col] == df[index_col].astype(int)).all(), \
                    f"Index col '{index_col}' has non-integer values"
                df[index_col] = df[index_col].astype(int)
            df.set_index(index_col, inplace=True)
        else:
            # FIXME not sure we want to do this. More likely they omitted index col than that they want to name it?
            # df.index.names = [index_col]
            logger.error(f"index_col '{index_col}' specified in configs but not in {tablename} table!")
            logger.error(f"{tablename} columns are: {list(df.columns)}")
            raise RuntimeError(f"index_col '{index_col}' not in {tablename} table!")

    if keep_columns:
        logger.debug("keeping columns: %s" % keep_columns)
        if not set(keep_columns).issubset(set(df.columns)):
            logger.error(f"Required columns missing from {tablename} table: "
                         f"{list(set(keep_columns).difference(set(df.columns)))}")
            logger.error(f"{tablename} table has columns: {list(df.columns)}")
            raise RuntimeError(f"Required columns missing from {tablename} table")

        df = df[keep_columns]

    if df.columns.duplicated().any():
        duplicate_column_names = df.columns[df.columns.duplicated(keep=False)].unique().to_list()
        assert not df.columns.duplicated().any(), \
            f"duplicate columns names in {tablename}: {duplicate_column_names}"

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.debug('%s index name: %s' % (tablename, df.index.name))

    return df
def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_segment,
                       want_choices, trace_label, filter_targets=None, trace=False,
                       override_choices=None):

    trace_label = tracing.extend_trace_label(trace_label, 'build_virtual_path')

    # Tracing is implemented as a separate, second call that operates ONLY on filter_targets
    assert not (trace and filter_targets is None)
    if filter_targets is not None:
        assert filter_targets.any()

        # slice orig and dest
        orig = orig[filter_targets]
        dest = dest[filter_targets]
        assert len(orig) > 0
        assert len(dest) > 0

        # slice tod and demographic_segment if not scalar
        if not isinstance(tod, str):
            tod = tod[filter_targets]
        if demographic_segment is not None:
            demographic_segment = demographic_segment[filter_targets]
            assert len(demographic_segment) > 0

        # slice choices
        # (requires actual choices from the previous call lest rands change on second call)
        assert want_choices == (override_choices is not None)
        if want_choices:
            override_choices = override_choices[filter_targets]

    units = self.units_for_recipe(recipe)
    assert units == 'utility' or not want_choices, "'want_choices' only supported if units is utility"

    access_mode = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.access')
    egress_mode = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.egress')
    path_types_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}')
    attributes_as_columns = \
        self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings.attributes_as_columns', [])

    path_info = {'path_type': path_type, 'access_mode': access_mode, 'egress_mode': egress_mode}

    # maz od pairs requested
    with memo("#TVPB build_virtual_path maz_od_df"):
        maz_od_df = pd.DataFrame({
            'idx': orig.index.values,
            'omaz': orig.values,
            'dmaz': dest.values,
            'seq': range(len(orig))
        })
        chunk.log_df(trace_label, "maz_od_df", maz_od_df)
        self.trace_maz_tap(maz_od_df, access_mode, egress_mode)

    # for location choice, there will be multiple alt dest rows per chooser and duplicate orig.index values
    # but tod and demographic_segment should be the same for all chooser rows (unique orig index values)
    # knowing this allows us to eliminate redundant computations (e.g. utilities of maz_tap pairs)
    duplicated = orig.index.duplicated(keep='first')
    chooser_attributes = pd.DataFrame(index=orig.index[~duplicated])
    if not isinstance(tod, str):
        chooser_attributes['tod'] = tod.loc[~duplicated]
    elif 'tod' in attributes_as_columns:
        chooser_attributes['tod'] = tod
    else:
        path_info['tod'] = tod
    if demographic_segment is not None:
        chooser_attributes['demographic_segment'] = demographic_segment.loc[~duplicated]

    with memo("#TVPB build_virtual_path access_df"):
        access_df = self.compute_maz_tap_utilities(
            recipe, maz_od_df, chooser_attributes,
            leg='access', mode=access_mode,
            trace_label=trace_label, trace=trace)
    chunk.log_df(trace_label, "access_df", access_df)

    with memo("#TVPB build_virtual_path egress_df"):
        egress_df = self.compute_maz_tap_utilities(
            recipe, maz_od_df, chooser_attributes,
            leg='egress', mode=egress_mode,
            trace_label=trace_label, trace=trace)
    chunk.log_df(trace_label, "egress_df", egress_df)

    # path_info for use by expressions (e.g. penalty for drive access if no parking at access tap)
    with memo("#TVPB build_virtual_path compute_tap_tap"):
        transit_df = self.compute_tap_tap(
            recipe, maz_od_df, access_df, egress_df, chooser_attributes,
            path_info=path_info, trace_label=trace_label, trace=trace)
    chunk.log_df(trace_label, "transit_df", transit_df)

    with memo("#TVPB build_virtual_path best_paths"):
        path_df = self.best_paths(recipe, path_type, maz_od_df, access_df, egress_df, transit_df,
                                  trace_label, trace)
    chunk.log_df(trace_label, "path_df", path_df)

    # now that we have created path_df, we are done with the dataframes for the separate legs
    del access_df
    chunk.log_df(trace_label, "access_df", None)
    del egress_df
    chunk.log_df(trace_label, "egress_df", None)
    del transit_df
    chunk.log_df(trace_label, "transit_df", None)

    if units == 'utility':

        # logsums
        with memo("#TVPB build_virtual_path logsums"):
            # one row per seq with utilities in columns
            # path_num 0-based to align with logit.make_choices 0-based choice indexes
            path_df['path_num'] = path_df.groupby('seq').cumcount()
            chunk.log_df(trace_label, "path_df", path_df)

            utilities_df = path_df[['seq', 'path_num', units]].set_index(['seq', 'path_num']).unstack()
            utilities_df.columns = utilities_df.columns.droplevel()  # for legibility

            # add rows missing because no access or egress availability
            utilities_df = pd.concat([pd.DataFrame(index=maz_od_df.seq), utilities_df], axis=1)
            utilities_df = utilities_df.fillna(UNAVAILABLE)  # set utilities for missing paths to UNAVAILABLE

            chunk.log_df(trace_label, "utilities_df", utilities_df)

            with warnings.catch_warnings(record=True) as w:
                # Cause all warnings to always be triggered.
                # most likely "divide by zero encountered in log" caused by all transit sets non-viable
                warnings.simplefilter("always")

                paths_nest_nesting_coefficient = path_types_settings.get('paths_nest_nesting_coefficient', 1)
                exp_utilities = np.exp(utilities_df.values / paths_nest_nesting_coefficient)
                logsums = np.maximum(np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE)

                if len(w) > 0:
                    for wrn in w:
                        logger.warning(f"{trace_label} - {type(wrn).__name__} ({wrn.message})")

                    DUMP = False
                    if DUMP:
                        zero_utilities_df = utilities_df[np.nansum(np.exp(utilities_df.values), axis=1) == 0]
                        zero_utilities_df.to_csv(config.output_file_path('warning_utilities_df.csv'), index=True)
                        raise RuntimeError("zero-utility paths dumped to warning_utilities_df.csv")

        if want_choices:

            # orig index to identify appropriate random number channel to use making choices
            utilities_df.index = orig.index

            with memo("#TVPB build_virtual_path make_choices"):

                probs = logit.utils_to_probs(utilities_df, allow_zero_probs=True, trace_label=trace_label)
                chunk.log_df(trace_label, "probs", probs)

                if trace:
                    choices = override_choices

                    utilities_df['choices'] = choices
                    self.trace_df(utilities_df, trace_label, 'utilities_df')

                    probs['choices'] = choices
                    self.trace_df(probs, trace_label, 'probs')
                else:
                    choices, rands = logit.make_choices(probs, allow_bad_probs=True, trace_label=trace_label)

                    chunk.log_df(trace_label, "rands", rands)
                    del rands
                    chunk.log_df(trace_label, "rands", None)

                del probs
                chunk.log_df(trace_label, "probs", None)

            # we need to get path_set, btap, atap from path_df row with same seq and path_num
            # drop seq join column, but keep path_num of choice to override_choices when tracing
            columns_to_cache = ['btap', 'atap', 'path_set', 'path_num']
            logsum_df = \
                pd.merge(pd.DataFrame({'seq': range(len(orig)), 'path_num': choices.values}),
                         path_df[['seq'] + columns_to_cache],
                         on=['seq', 'path_num'], how='left')\
                .drop(columns=['seq'])\
                .set_index(orig.index)

            logsum_df['logsum'] = logsums

        else:

            assert len(logsums) == len(orig)
            logsum_df = pd.DataFrame({'logsum': logsums}, index=orig.index)

        chunk.log_df(trace_label, "logsum_df", logsum_df)

        del utilities_df
        chunk.log_df(trace_label, "utilities_df", None)

        if trace:
            self.trace_df(logsum_df, trace_label, 'logsum_df')

        chunk.log_df(trace_label, "logsum_df", logsum_df)

        results = logsum_df

    else:
        assert units == 'time'

        # return a series
        results = pd.Series(path_df[units].values, index=path_df['idx'])

        # zero-fill rows for O-D pairs where no best path exists because there was no tap-tap transit availability
        results = reindex(results, maz_od_df.idx).fillna(0.0)

        chunk.log_df(trace_label, "results", results)

    assert len(results) == len(orig)

    del path_df
    chunk.log_df(trace_label, "path_df", None)

    # diagnostic
    # maz_od_df['DIST'] = self.network_los.get_default_skim_dict().get('DIST').get(maz_od_df.omaz, maz_od_df.dmaz)
    # maz_od_df[units] = results.logsum if units == 'utility' else results.values
    # print(f"maz_od_df\n{maz_od_df}")

    return results
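# Hedged, standalone illustration of the path-nest logsum computed in build_virtual_path()
# above: utilities are scaled by the nesting coefficient, exponentiated, summed across path
# alternatives, and floored at the unavailable sentinel. The coefficient, the sentinel value
# (-999), and the toy utilities below are assumptions for demonstration only.
def _path_logsum_example():
    import numpy as np

    UNAVAILABLE = -999                   # assumed sentinel for unavailable paths
    nesting_coefficient = 0.72           # hypothetical paths_nest_nesting_coefficient

    # one row per od pair, one column per candidate path
    utilities = np.array([[-2.0, -3.5, UNAVAILABLE],
                          [UNAVAILABLE, UNAVAILABLE, UNAVAILABLE]])

    exp_utilities = np.exp(utilities / nesting_coefficient)
    with np.errstate(divide='ignore'):   # an all-unavailable row yields log(0) -> -inf
        logsums = np.maximum(np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE)

    # first row gets a finite logsum; the all-unavailable row is floored at UNAVAILABLE
    return logsums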
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)
    sort = output_tables_settings.get('sort', False)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]
    else:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

            if sort:
                traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {})

                if df.index.name in traceable_table_indexes:
                    df = df.sort_index()
                    logger.debug(f"write_tables sorting {table_name} on index {df.index.name}")
                else:
                    # find all registered columns we can use to sort this table
                    # (they are ordered appropriately in traceable_table_indexes)
                    sort_columns = [c for c in traceable_table_indexes if c in df.columns]
                    if len(sort_columns) > 0:
                        df = df.sort_values(by=sort_columns)
                        logger.debug(f"write_tables sorting {table_name} on columns {sort_columns}")
                    else:
                        logger.debug(f"write_tables sorting {table_name} on unrecognized index {df.index.name}")
                        df = df.sort_index()

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
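# Hedged sketch of an output_tables setting exercising the optional knobs read by the
# write_tables() variant above (h5_store and sort); table names are illustrative.
#
#   output_tables:
#     h5_store: False
#     sort: True
#     prefix: final_
#     action: include
#     tables:
#       - households
#       - trips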
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
def write_data_dictionary(output_dir):
    """
    Write table schema for all tables

    model settings
        txt_format: output text file name (default data_dict.txt) or empty to suppress txt output
        csv_format: output csv file name (default data_dict.csv) or empty to suppress csv output
        schema_tables: list of tables to include in output (defaults to all checkpointed tables)

    for each table, write column names, dtype, and checkpoint added

    text format writes individual table schemas to a single text file
    csv format writes all tables together with an additional table_name column

    Parameters
    ----------
    output_dir: str
    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if not (csv_format or txt_format):
        logger.warning("write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified")
        return

    table_names = pipeline.checkpointed_tables()

    # use table_names list from model_settings, if provided
    schema_tables = model_settings.get('tables', None)
    if schema_tables:
        table_names = [c for c in schema_tables if c in table_names]

    # initialize schema as dict of dataframe[table_name, column_name, dtype, checkpoint]
    schema = dict()
    final_shapes = dict()
    for table_name in table_names:
        df = pipeline.get_table(table_name)

        final_shapes[table_name] = df.shape

        if df.index.name and df.index.name not in df.columns:
            df = df.reset_index()
        info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(columns={'index': 'column_name'})
        info['checkpoint'] = ''

        info.insert(loc=0, column='table_name', value=table_name)
        schema[table_name] = info

    # annotate schema.info with name of checkpoint columns were first seen
    for _, row in pipeline.get_checkpoints().iterrows():

        checkpoint_name = row[pipeline.CHECKPOINT_NAME]

        for table_name in table_names:

            # no change to table in this checkpoint
            if row[table_name] != checkpoint_name:
                continue

            # get the checkpointed version of the table
            df = pipeline.get_table(table_name, checkpoint_name)

            if df.index.name and df.index.name not in df.columns:
                df = df.reset_index()

            info = schema.get(table_name, None)

            # tag any new columns with checkpoint name
            prev_columns = info[info.checkpoint != ''].column_name.values
            new_cols = [c for c in df.columns.values if c not in prev_columns]
            is_new_column_this_checkpoint = info.column_name.isin(new_cols)
            info.checkpoint = np.where(is_new_column_this_checkpoint, checkpoint_name, info.checkpoint)

            schema[table_name] = info

    schema_df = pd.concat(schema.values())

    if csv_format:
        schema_df.to_csv(config.output_file_path(csv_format), header=True, index=False)

    if txt_format:
        with open(config.output_file_path(txt_format), 'w') as output_file:

            # get max schema column widths from omnibus table
            col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df}

            for table_name in table_names:
                info = schema.get(table_name, None)

                columns_to_print = ['column_name', 'dtype', 'checkpoint']
                info = info[columns_to_print].copy()

                # normalize schema columns widths across all table schemas for unified output formatting
                for c in info:
                    info[c] = info[c].str.pad(col_width[c], side='right')
                info.columns = [c.ljust(col_width[c]) for c in info.columns]

                info = info.to_string(index=False)

                print(f"###\n### {table_name} {final_shapes[table_name]}\n###\n", file=output_file)
                print(f"{info}\n", file=output_file)
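# Hedged sketch of a write_data_dictionary.yaml consumed by write_data_dictionary() above;
# the keys match what the step reads, and the table names are illustrative.
#
#   txt_format: data_dict.txt
#   csv_format: data_dict.csv
#   tables:
#     - households
#     - persons
#     - tours
#     - trips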
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        write_index = df.index.name is not None or isinstance(df.index, pd.core.index.MultiIndex)

        df.to_csv(file_path, index=write_index)
def cached_spec_path(spec_name): return config.output_file_path(spec_name)