def initialize_households():

    trace_label = 'initialize_households'

    with chunk.chunk_log(trace_label, base=True):

        chunk.log_rss(f"{trace_label}.inside-yield")

        households = inject.get_table('households').to_frame()
        assert not households._is_view
        chunk.log_df(trace_label, "households", households)
        del households
        chunk.log_df(trace_label, "households", None)

        persons = inject.get_table('persons').to_frame()
        assert not persons._is_view
        chunk.log_df(trace_label, "persons", persons)
        del persons
        chunk.log_df(trace_label, "persons", None)

        model_settings = config.read_model_settings('initialize_households.yaml', mandatory=True)
        annotate_tables(model_settings, trace_label)

        # - initialize shadow_pricing size tables after annotating household and person tables
        # since these are scaled to model size, they have to be created while single-process
        # this can now be called as a stand-alone model step instead: add_size_tables
        add_size_tables = model_settings.get('add_size_tables', True)
        if add_size_tables:
            # warnings.warn(f"Calling add_size_tables from initialize will be removed in the future.", FutureWarning)
            shadow_pricing.add_size_tables()

        # - preload person_windows
        person_windows = inject.get_table('person_windows').to_frame()
        chunk.log_df(trace_label, "person_windows", person_windows)
def best_paths(self, recipe, path_type, maz_od_df, access_df, egress_df, transit_df, trace_label, trace=False): trace_label = tracing.extend_trace_label(trace_label, 'best_paths') path_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}') max_paths_per_tap_set = path_settings.get('max_paths_per_tap_set', 1) max_paths_across_tap_sets = path_settings.get('max_paths_across_tap_sets', 1) units = self.units_for_recipe(recipe) smaller_is_better = (units in ['time']) maz_od_df['seq'] = maz_od_df.index # maz_od_df has one row per chooser # inner join to add rows for each access, egress, and transit segment combination path_df = maz_od_df. \ merge(access_df, on=['idx', 'omaz'], how='inner'). \ merge(egress_df, on=['idx', 'dmaz'], how='inner'). \ merge(transit_df, on=['idx', 'atap', 'btap'], how='inner') chunk.log_df(trace_label, "path_df", path_df) # transit sets are the transit_df non-join columns transit_sets = [c for c in transit_df.columns if c not in ['idx', 'atap', 'btap']] if trace: # be nice and show both tap_tap set utility and total_set = access + set + egress for c in transit_sets: path_df[f'total_{c}'] = path_df[c] + path_df['access'] + path_df['egress'] self.trace_df(path_df, trace_label, 'best_paths.full') for c in transit_sets: del path_df[f'total_{c}'] for c in transit_sets: path_df[c] = path_df[c] + path_df['access'] + path_df['egress'] path_df.drop(columns=['access', 'egress'], inplace=True) # choose best paths by tap set best_paths_list = [] for c in transit_sets: keep = path_df.index.isin( path_df[['seq', c]].sort_values(by=c, ascending=smaller_is_better). groupby(['seq']).head(max_paths_per_tap_set).index ) best_paths_for_set = path_df[keep] best_paths_for_set['path_set'] = c # remember the path set best_paths_for_set[units] = path_df[keep][c] best_paths_for_set.drop(columns=transit_sets, inplace=True) best_paths_list.append(best_paths_for_set) path_df = pd.concat(best_paths_list).sort_values(by=['seq', units], ascending=[True, smaller_is_better]) # choose best paths overall by seq path_df = path_df.sort_values(by=['seq', units], ascending=[True, smaller_is_better]) path_df = path_df[path_df.index.isin(path_df.groupby(['seq']).head(max_paths_across_tap_sets).index)] if trace: self.trace_df(path_df, trace_label, 'best_paths') return path_df
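# The core of best_paths() is a "keep the N best rows per chooser" idiom: sort by the value
# column, then groupby('seq').head(N). A minimal standalone sketch of that idiom follows;
# the function name, column names, and toy data are illustrative, not part of the model.
import pandas as pd


def keep_best_per_group(df, group_col, value_col, n_keep, smaller_is_better=False):
    # sort so the "best" rows come first within each group, then keep the top n_keep rows per group
    best_index = (df.sort_values(by=value_col, ascending=smaller_is_better)
                    .groupby(group_col)
                    .head(n_keep)
                    .index)
    return df[df.index.isin(best_index)]


# toy example: two choosers ('seq'), several candidate paths each, keep the single best-utility path
paths = pd.DataFrame({
    'seq': [0, 0, 0, 1, 1],
    'utility': [-2.0, -1.5, -3.0, -0.5, -0.9],
})
print(keep_best_per_group(paths, 'seq', 'utility', n_keep=1))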
def run_trip_scheduling( trips, tours, probs_spec, model_settings, estimator, is_last_iteration, chunk_size, chunk_tag, trace_hh_id, trace_label): # only non-initial trips require scheduling, segment handing first such trip in tour will use most space # is_outbound_chooser = (trips.trip_num > 1) & trips.outbound & (trips.primary_purpose != 'atwork') # is_inbound_chooser = (trips.trip_num < trips.trip_count) & ~trips.outbound & (trips.primary_purpose != 'atwork') # num_choosers = (is_inbound_chooser | is_outbound_chooser).sum() result_list = [] for i, trips_chunk, chunk_trace_label \ in chunk.adaptive_chunked_choosers_by_chunk_id(trips, chunk_size, trace_label, chunk_tag): if trips_chunk.outbound.any(): leg_chunk = trips_chunk[trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(chunk_trace_label, 'outbound') choices = \ schedule_trips_in_leg( outbound=True, trips=leg_chunk, probs_spec=probs_spec, model_settings=model_settings, is_last_iteration=is_last_iteration, trace_hh_id=trace_hh_id, trace_label=leg_trace_label) result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) if (~trips_chunk.outbound).any(): leg_chunk = trips_chunk[~trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(chunk_trace_label, 'inbound') choices = \ schedule_trips_in_leg( outbound=False, trips=leg_chunk, probs_spec=probs_spec, model_settings=model_settings, is_last_iteration=is_last_iteration, trace_hh_id=trace_hh_id, trace_label=leg_trace_label) result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) choices = pd.concat(result_list) return choices
def run_trip_scheduling(trips_chunk, tours, probs_spec, model_settings, estimator, is_last_iteration, chunk_size, trace_hh_id, trace_label): set_tour_hour(trips_chunk, tours) set_stop_num(trips_chunk) # only non-initial trips require scheduling, segment handing first such trip in tour will use most space # is_outbound_chooser = (trips.trip_num > 1) & trips.outbound & (trips.primary_purpose != 'atwork') # is_inbound_chooser = (trips.trip_num < trips.trip_count) & ~trips.outbound & (trips.primary_purpose != 'atwork') # num_choosers = (is_inbound_chooser | is_outbound_chooser).sum() result_list = [] if trips_chunk.outbound.any(): leg_chunk = trips_chunk[trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(trace_label, 'outbound') choices = \ schedule_trips_in_leg( outbound=True, trips=leg_chunk, probs_spec=probs_spec, model_settings=model_settings, is_last_iteration=is_last_iteration, trace_hh_id=trace_hh_id, trace_label=leg_trace_label) result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) # departure time of last outbound trips must constrain # departure times for initial inbound trips update_tour_earliest(trips_chunk, choices) if (~trips_chunk.outbound).any(): leg_chunk = trips_chunk[~trips_chunk.outbound] leg_trace_label = tracing.extend_trace_label(trace_label, 'inbound') choices = \ schedule_trips_in_leg( outbound=False, trips=leg_chunk, probs_spec=probs_spec, model_settings=model_settings, is_last_iteration=is_last_iteration, trace_hh_id=trace_hh_id, trace_label=leg_trace_label) result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) choices = pd.concat(result_list) return choices
def initialize_landuse():

    trace_label = 'initialize_landuse'

    with chunk.chunk_log(trace_label, base=True):

        model_settings = config.read_model_settings('initialize_landuse.yaml', mandatory=True)

        annotate_tables(model_settings, trace_label)

        # instantiate accessibility (must be checkpointed to be used to slice accessibility)
        accessibility = pipeline.get_table('accessibility')
        chunk.log_df(trace_label, "accessibility", accessibility)
def compute_utilities_for_attribute_tuple(network_los, scalar_attributes, data, chunk_size, trace_label): # scalar_attributes is a dict of attribute name/value pairs for this combination # (e.g. {'demographic_segment': 0, 'tod': 'AM', 'access_mode': 'walk'}) logger.info(f"{trace_label} scalar_attributes: {scalar_attributes}") uid_calculator = network_los.tvpb.uid_calculator attributes_as_columns = \ network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attributes_as_columns', []) model_settings = \ network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.tap_tap_settings') model_constants = \ network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.CONSTANTS').copy() model_constants.update(scalar_attributes) data = data.reshape(uid_calculator.fully_populated_shape) # get od skim_offset dataframe with uid index corresponding to scalar_attributes choosers_df = uid_calculator.get_od_dataframe(scalar_attributes) # choosers_df is pretty big and was custom made for compute_utilities but we don't need to chunk_log it # since it is created outside of adaptive_chunked_choosers and so will show up in baseline assert not chunk.chunk_logging() # otherwise we should chunk_log this chunk_tag = 'initialize_tvpb' # all attribute_combinations can use same cached data for row_size calc for i, chooser_chunk, chunk_trace_label \ in chunk.adaptive_chunked_choosers(choosers_df, chunk_size, trace_label, chunk_tag=chunk_tag): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities assert chooser_chunk._is_view # otherwise copying it is wasteful chooser_chunk = chooser_chunk.copy() chunk.log_df(trace_label, 'attribute_chooser_chunk', chooser_chunk) # add any attribute columns specified as column attributes in settings (the rest will be scalars in locals_dict) for attribute_name in attributes_as_columns: chooser_chunk[attribute_name] = scalar_attributes[attribute_name] chunk.log_df(trace_label, 'attribute_chooser_chunk', chooser_chunk) utilities_df = \ pathbuilder.compute_utilities(network_los, model_settings=model_settings, choosers=chooser_chunk, model_constants=model_constants, trace_label=trace_label) chunk.log_df(trace_label, 'utilities_df', utilities_df) assert len(utilities_df) == len(chooser_chunk) assert len(utilities_df.columns) == data.shape[1] assert not any_uninitialized(utilities_df.values) data[chooser_chunk.index.values, :] = utilities_df.values del chooser_chunk chunk.log_df(trace_label, 'attribute_chooser_chunk', None) logger.debug(f"{trace_label} updated utilities")
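# Each chunk writes its utilities directly into a preallocated shared array, using the chooser
# chunk's positional index values as row positions. A tiny numpy sketch of that fill pattern;
# shapes, index values, and utilities are illustrative assumptions.
import numpy as np
import pandas as pd

n_choosers, n_alternatives = 5, 3
data = np.full((n_choosers, n_alternatives), np.nan)     # preallocated, filled chunk by chunk

# a chunk of choosers whose index values are row positions into `data`
chooser_chunk = pd.DataFrame({'omaz': [10, 20]}, index=[1, 3])
utilities = np.array([[0.1, 0.2, 0.3],
                      [0.4, 0.5, 0.6]])

data[chooser_chunk.index.values, :] = utilities
print(data)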
def choose_intermediate_trip_purpose(trips, probs_spec, trace_hh_id, trace_label):
    """
    choose purpose for intermediate trips based on probs_spec
    which assigns relative weights (summing to 1) to the possible purpose choices

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    probs_join_cols = ['primary_purpose', 'outbound', 'person_type']
    non_purpose_cols = probs_join_cols + ['depart_range_start', 'depart_range_end']
    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index('trip_id')
    chunk.log_df(trace_label, 'choosers', choosers)

    # select the matching depart range (this should result in exactly one chooser row per trip)
    choosers = choosers[(choosers.start >= choosers['depart_range_start']) &
                        (choosers.start <= choosers['depart_range_end'])]

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == num_trips

    choices, rands = logit.make_choices(
        choosers[purpose_cols],
        trace_label=trace_label, trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    choices = choices.map(pd.Series(purpose_cols))

    return choices
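# logit.make_choices is ActivitySim-internal; underneath it is row-normalized probabilities plus
# an inverse-CDF draw per row. A minimal self-contained sketch of that mechanic follows; the
# function name, RNG, and toy purpose columns are illustrative assumptions, not the library API.
import numpy as np
import pandas as pd


def sketch_make_choices(probs_df, rng=None):
    """Pick one column per row according to row probabilities (assumed to sum to 1)."""
    rng = rng or np.random.default_rng(0)
    rands = rng.random(len(probs_df))
    # cumulative probabilities across columns; the first column whose cumsum exceeds the draw wins
    cum_probs = probs_df.to_numpy().cumsum(axis=1)
    choice_idx = (cum_probs > rands[:, None]).argmax(axis=1)
    return pd.Series(choice_idx, index=probs_df.index), pd.Series(rands, index=probs_df.index)


probs = pd.DataFrame({'work': [0.2, 0.7], 'shop': [0.5, 0.2], 'other': [0.3, 0.1]},
                     index=pd.Index([101, 102], name='trip_id'))
choices, rands = sketch_make_choices(probs)
# map positional choices back to column labels, mirroring choices.map(pd.Series(purpose_cols))
print(choices.map(pd.Series(probs.columns)))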
def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes, trace_label, trace):

    trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_time')

    model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS')
    tap_tap_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings')

    with memo("#TVPB CACHE compute_tap_tap_utilities all_transit_paths"):
        transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace)
        # note: transit_df index is arbitrary
        chunk.log_df(trace_label, "transit_df", transit_df)

    locals_d = {'los': self.network_los}
    locals_d.update(model_constants)

    assignment_spec = assign.read_assignment_spec(file_name=config.config_file_path(tap_tap_settings['SPEC']))

    results, _, _ = assign.assign_variables(assignment_spec, transit_df, locals_d)
    assert len(results.columns) == 1
    transit_df['transit'] = results

    # filter out unavailable btap_atap pairs
    logger.debug(f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}")
    transit_df = transit_df[transit_df.transit > 0]

    transit_df.drop(columns=chooser_attributes.columns, inplace=True)

    chunk.log_df(trace_label, "transit_df", None)

    if trace:
        self.trace_df(transit_df, trace_label, 'transit_df')

    return transit_df
def annotate_tables(model_settings, trace_label):

    trace_label = tracing.extend_trace_label(trace_label, 'annotate_tables')

    chunk.log_rss(trace_label)

    annotate_tables = model_settings.get('annotate_tables', [])

    if not annotate_tables:
        logger.warning(f"{trace_label} - annotate_tables setting is empty - nothing to do!")

    assert isinstance(annotate_tables, list), \
        f"annotate_tables settings should be a list but is {type(annotate_tables)}"

    t0 = tracing.print_elapsed_time()

    for table_info in annotate_tables:

        tablename = table_info['tablename']

        chunk.log_rss(f"{trace_label}.pre-get_table.{tablename}")

        df = inject.get_table(tablename).to_frame()
        chunk.log_df(trace_label, tablename, df)

        # - rename columns
        column_map = table_info.get('column_map', None)
        if column_map:

            warnings.warn(f"Setting 'column_map' has been changed to 'rename_columns'. "
                          f"Support for 'column_map' in annotate_tables will be removed in future versions.",
                          FutureWarning)

            logger.info(f"{trace_label} - renaming {tablename} columns {column_map}")
            df.rename(columns=column_map, inplace=True)

        # - annotate
        annotate = table_info.get('annotate', None)
        if annotate:
            logger.info(f"{trace_label} - annotating {tablename} SPEC {annotate['SPEC']}")
            expressions.assign_columns(df=df, model_settings=annotate, trace_label=trace_label)

        chunk.log_df(trace_label, tablename, df)

        # - write table to pipeline
        pipeline.replace_table(tablename, df)

        del df
        chunk.log_df(trace_label, tablename, None)
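# annotate_tables() iterates over model_settings['annotate_tables'], a list of per-table entries
# with an optional 'column_map' (deprecated in favor of 'rename_columns') and an 'annotate' block
# whose 'SPEC' names an expressions file. A hypothetical example of that structure, written as the
# equivalent Python dict rather than YAML; table, column, and spec names here are illustrative.
model_settings_example = {
    'annotate_tables': [
        {
            'tablename': 'persons',
            # deprecated: prefer 'rename_columns' at table-load time
            'column_map': {'PERID': 'person_id'},
            'annotate': {
                'SPEC': 'annotate_persons',   # expressions file evaluated against the table
                'DF': 'persons',              # assumed keys, mirroring expressions.assign_columns usage
            },
        },
        {
            'tablename': 'households',
            'annotate': {
                'SPEC': 'annotate_households',
                'DF': 'households',
            },
        },
    ],
}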
def compute_utilities_for_atttribute_tuple(network_los, scalar_attributes, data, chunk_size, trace_label): # scalar_attributes is a dict of attribute name/value pairs for this combination # (e.g. {'demographic_segment': 0, 'tod': 'AM', 'access_mode': 'walk'}) logger.info(f"{trace_label} scalar_attributes: {scalar_attributes}") uid_calculator = network_los.tvpb.uid_calculator attributes_as_columns = \ network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attributes_as_columns', []) model_settings = \ network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.tap_tap_settings') model_constants = \ network_los.setting(f'TVPB_SETTINGS.tour_mode_choice.CONSTANTS').copy() model_constants.update(scalar_attributes) data = data.reshape(uid_calculator.fully_populated_shape) # get od skim_offset dataframe with uid index corresponding to scalar_attributes choosers_df = uid_calculator.get_od_dataframe(scalar_attributes) row_size = chunk_size and initialize_tvpb_calc_row_size( choosers_df, network_los, trace_label) for i, chooser_chunk, chunk_trace_label \ in chunk.adaptive_chunked_choosers(choosers_df, chunk_size, row_size, trace_label): # we should count choosers_df as chunk overhead since its pretty big and was custom made for compute_utilities # (call log_df from inside yield loop so it is visible to adaptive_chunked_choosers chunk_log) chunk.log_df(trace_label, 'choosers_df', choosers_df) # add any attribute columns specified as column attributes in settings (the rest will be scalars in locals_dict) for attribute_name in attributes_as_columns: chooser_chunk[attribute_name] = scalar_attributes[attribute_name] chunk.log_df(trace_label, 'chooser_chunk', chooser_chunk) utilities_df = \ pathbuilder.compute_utilities(network_los, model_settings=model_settings, choosers=chooser_chunk, model_constants=model_constants, trace_label=trace_label) chunk.log_df(trace_label, 'utilities_df', utilities_df) assert len(utilities_df) == len(chooser_chunk) assert len(utilities_df.columns) == data.shape[1] assert not any_uninitialized(utilities_df.values) data[chooser_chunk.index.values, :] = utilities_df.values logger.debug(f"{trace_label} updated utilities")
def compute_maz_tap_utilities(self, recipe, maz_od_df, chooser_attributes, leg, mode, trace_label, trace):

    trace_label = tracing.extend_trace_label(trace_label, f'maz_tap_utils.{leg}')

    with chunk.chunk_log(trace_label):

        maz_tap_settings = \
            self.network_los.setting(f'TVPB_SETTINGS.{recipe}.maz_tap_settings.{mode}')
        chooser_columns = maz_tap_settings['CHOOSER_COLUMNS']
        attribute_columns = list(chooser_attributes.columns) if chooser_attributes is not None else []
        model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS')

        if leg == 'access':
            maz_col = 'omaz'
            tap_col = 'btap'
        else:
            maz_col = 'dmaz'
            tap_col = 'atap'

        # maz_to_tap access/egress utilities
        # deduped utilities_df - one row per chooser for each boarding tap (btap) accessible from omaz
        utilities_df = self.network_los.maz_to_tap_dfs[mode]

        utilities_df = utilities_df[chooser_columns]. \
            reset_index(drop=False). \
            rename(columns={'MAZ': maz_col, 'TAP': tap_col})

        utilities_df = pd.merge(
            maz_od_df[['idx', maz_col]].drop_duplicates(),
            utilities_df,
            on=maz_col, how='inner')

        # add any supplemental chooser attributes (e.g. demographic_segment, tod)
        for c in attribute_columns:
            utilities_df[c] = reindex(chooser_attributes[c], utilities_df['idx'])

        chunk.log_df(trace_label, "utilities_df", utilities_df)

        if self.units_for_recipe(recipe) == 'utility':

            utilities_df[leg] = compute_utilities(
                self.network_los,
                maz_tap_settings,
                utilities_df,
                model_constants=model_constants,
                trace_label=trace_label,
                trace=trace,
                trace_column_names=['idx', maz_col, tap_col] if trace else None)

            chunk.log_df(trace_label, "utilities_df", utilities_df)  # annotated

        else:

            assignment_spec = \
                assign.read_assignment_spec(file_name=config.config_file_path(maz_tap_settings['SPEC']))

            results, _, _ = assign.assign_variables(assignment_spec, utilities_df, model_constants)
            assert len(results.columns) == 1
            utilities_df[leg] = results

        chunk.log_df(trace_label, "utilities_df", utilities_df)

        if trace:
            self.trace_df(utilities_df, trace_label, 'utilities_df')

        # drop utility computation columns ('tod', 'demographic_segment' and maz_to_tap_df time/distance columns)
        utilities_df.drop(columns=attribute_columns + chooser_columns, inplace=True)

    return utilities_df
def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label): """ trip purpose - main functionality separated from model step so it can be called iteratively For each intermediate stop on a tour (i.e. trip other than the last trip outbound or inbound) each trip is assigned a purpose based on an observed frequency distribution The distribution should always be segmented by tour purpose and tour direction. By default it is also segmented by person type. The join columns can be overwritten using the "probs_join_cols" parameter in the model settings. The model will attempt to segment by trip depart time as well if necessary and depart time ranges are specified in the probability lookup table. Returns ------- purpose: pandas.Series of purpose (str) indexed by trip_id """ # uniform across trip_purpose chunk_tag = 'trip_purpose' model_settings_file_name = 'trip_purpose.yaml' model_settings = config.read_model_settings(model_settings_file_name) probs_join_cols = model_settings.get('probs_join_cols', PROBS_JOIN_COLUMNS) spec_file_name = model_settings.get('PROBS_SPEC', 'trip_purpose_probs.csv') probs_spec = pd.read_csv(config.config_file_path(spec_file_name), comment='#') # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation # coefficients_df = simulate.read_model_coefficients(model_settings) # probs_spec = map_coefficients(probs_spec, coefficients_df) if estimator: estimator.write_spec(model_settings, tag='PROBS_SPEC') estimator.write_model_settings(model_settings, model_settings_file_name) # estimator.write_coefficients(coefficients_df, model_settings) result_list = [] # - last trip of outbound tour gets primary_purpose last_trip = (trips_df.trip_num == trips_df.trip_count) purpose = trips_df.primary_purpose[last_trip & trips_df.outbound] result_list.append(purpose) logger.info("assign purpose to %s last outbound trips", purpose.shape[0]) # - last trip of inbound tour gets home (or work for atwork subtours) purpose = trips_df.primary_purpose[last_trip & ~trips_df.outbound] purpose = pd.Series(np.where(purpose == 'atwork', 'work', 'home'), index=purpose.index) result_list.append(purpose) logger.info("assign purpose to %s last inbound trips", purpose.shape[0]) # - intermediate stops (non-last trips) purpose assigned by probability table trips_df = trips_df[~last_trip] logger.info("assign purpose to %s intermediate trips", trips_df.shape[0]) preprocessor_settings = model_settings.get('preprocessor', None) if preprocessor_settings: locals_dict = config.get_model_constants(model_settings) expressions.assign_columns(df=trips_df, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label) use_depart_time = model_settings.get('use_depart_time', True) for i, trips_chunk, chunk_trace_label in \ chunk.adaptive_chunked_choosers(trips_df, chunk_size, chunk_tag, trace_label): choices = choose_intermediate_trip_purpose( trips_chunk, probs_spec, estimator, probs_join_cols=probs_join_cols, use_depart_time=use_depart_time, trace_hh_id=trace_hh_id, trace_label=chunk_trace_label) result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) if len(result_list) > 1: choices = pd.concat(result_list) return choices
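# Before chunking, run_trip_purpose() handles the deterministic cases: the last outbound trip
# inherits the tour's primary purpose and the last inbound trip returns 'home' (or 'work' for
# at-work subtours). A small standalone sketch of that vectorized assignment with toy data;
# the trip ids and purposes are illustrative.
import numpy as np
import pandas as pd

trips_df = pd.DataFrame({
    'trip_num': [1, 2, 1, 2],
    'trip_count': [2, 2, 2, 2],
    'outbound': [True, True, False, False],
    'primary_purpose': ['work', 'work', 'atwork', 'atwork'],
}, index=pd.Index([1, 2, 3, 4], name='trip_id'))

last_trip = trips_df.trip_num == trips_df.trip_count

# last outbound trip inherits the tour's primary purpose
outbound_purpose = trips_df.primary_purpose[last_trip & trips_df.outbound]

# last inbound trip returns 'home' (or 'work' for at-work subtours)
inbound = trips_df.primary_purpose[last_trip & ~trips_df.outbound]
inbound_purpose = pd.Series(np.where(inbound == 'atwork', 'work', 'home'), index=inbound.index)

print(pd.concat([outbound_purpose, inbound_purpose]))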
def choose_intermediate_trip_purpose(trips, probs_spec, estimator, probs_join_cols, use_depart_time, trace_hh_id, trace_label): """ chose purpose for intermediate trips based on probs_spec which assigns relative weights (summing to 1) to the possible purpose choices Returns ------- purpose: pandas.Series of purpose (str) indexed by trip_id """ non_purpose_cols = probs_join_cols.copy() if use_depart_time: non_purpose_cols += ['depart_range_start', 'depart_range_end'] purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols] num_trips = len(trips.index) have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips) # probs should sum to 1 across rows sum_probs = probs_spec[purpose_cols].sum(axis=1) probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0) # left join trips to probs (there may be multiple rows per trip for multiple depart ranges) choosers = pd.merge(trips.reset_index(), probs_spec, on=probs_join_cols, how='left').set_index('trip_id') chunk.log_df(trace_label, 'choosers', choosers) if use_depart_time: # select the matching depart range (this should result on in exactly one chooser row per trip) chooser_probs = \ (choosers.start >= choosers['depart_range_start']) & (choosers.start <= choosers['depart_range_end']) # if we failed to match a row in probs_spec if chooser_probs.sum() < num_trips: # this can happen if the spec doesn't have probs for the trips matching a trip's probs_join_cols missing_trip_ids = trips.index[ ~trips.index.isin(choosers.index[chooser_probs])].values unmatched_choosers = choosers[choosers.index.isin( missing_trip_ids)] unmatched_choosers = unmatched_choosers[['person_id', 'start'] + non_purpose_cols] # join to persons for better diagnostics persons = inject.get_table('persons').to_frame() persons_cols = [ 'age', 'is_worker', 'is_student', 'is_gradeschool', 'is_highschool', 'is_university' ] unmatched_choosers = pd.merge(unmatched_choosers, persons[[ col for col in persons_cols if col in persons.columns ]], left_on='person_id', right_index=True, how='left') file_name = '%s.UNMATCHED_PROBS' % trace_label logger.error( "%s %s of %s intermediate trips could not be matched to probs based on join columns %s" % (trace_label, len(unmatched_choosers), len(choosers), probs_join_cols)) logger.info("Writing %s unmatched choosers to %s" % ( len(unmatched_choosers), file_name, )) tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False) raise RuntimeError( "Some trips could not be matched to probs based on join columns %s." % probs_join_cols) # select the matching depart range (this should result on in exactly one chooser row per trip) choosers = choosers[chooser_probs] # choosers should now match trips row for row assert choosers.index.identical(trips.index) if estimator: probs_cols = list(probs_spec.columns) print(choosers[probs_cols]) estimator.write_table(choosers[probs_cols], 'probs', append=True) choices, rands = logit.make_choices(choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers) if have_trace_targets: tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose']) tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) choices = choices.map(pd.Series(purpose_cols)) return choices
def schedule_trips_in_leg(outbound, trips, probs_spec, model_settings, is_last_iteration, trace_hh_id, trace_label): """ Parameters ---------- outbound trips probs_spec depart_alt_base is_last_iteration trace_hh_id trace_label Returns ------- choices: pd.Series depart choice for trips, indexed by trip_id """ failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT) # logger.debug("%s scheduling %s trips" % (trace_label, trips.shape[0])) assert len(trips) > 0 assert (trips.outbound == outbound).all() # initial trip of leg and all atwork trips get tour_hour is_initial = (trips.trip_num == 1) if outbound else (trips.trip_num == trips.trip_count) no_scheduling = is_initial | (trips.primary_purpose == 'atwork') choices = trips.tour_hour[no_scheduling] if no_scheduling.all(): return choices result_list = [] result_list.append(choices) trips = trips[~no_scheduling] # add next_trip_id temp column (temp as trips is now a copy, as result of slicing) trips = trips.sort_index() trips['next_trip_id'] = np.roll(trips.index, -1 if outbound else 1) is_final = (trips.trip_num == trips.trip_count) if outbound else (trips.trip_num == 1) trips.next_trip_id = trips.next_trip_id.where(~is_final, NO_TRIP_ID) # iterate over outbound trips in ascending trip_num order, skipping the initial trip # iterate over inbound trips in descending trip_num order, skipping the finial trip first_trip_in_leg = True for i in range(trips.trip_num.min(), trips.trip_num.max() + 1): if outbound: nth_trips = trips[trips.trip_num == i] else: nth_trips = trips[trips.trip_num == trips.trip_count - i] nth_trace_label = tracing.extend_trace_label(trace_label, 'num_%s' % i) choices = schedule_nth_trips(nth_trips, probs_spec, model_settings, first_trip_in_leg=first_trip_in_leg, report_failed_trips=is_last_iteration, trace_hh_id=trace_hh_id, trace_label=nth_trace_label) # if outbound, this trip's depart constrains next trip's earliest depart option # if inbound, we are handling in reverse order, so it constrains latest depart instead ADJUST_NEXT_DEPART_COL = 'earliest' if outbound else 'latest' # most initial departure (when no choice was made because all probs were zero) if is_last_iteration and (failfix == FAILFIX_CHOOSE_MOST_INITIAL): choices = choices.reindex(nth_trips.index) logger.warning("%s coercing %s depart choices to most initial" % (nth_trace_label, choices.isna().sum())) choices = choices.fillna(trips[ADJUST_NEXT_DEPART_COL]) # adjust allowed depart range of next trip has_next_trip = (nth_trips.next_trip_id != NO_TRIP_ID) if has_next_trip.any(): next_trip_ids = nth_trips.next_trip_id[has_next_trip] # patch choice any trips with next_trips that weren't scheduled trips.loc[next_trip_ids, ADJUST_NEXT_DEPART_COL] = \ choices.reindex(next_trip_ids.index).fillna(trips[ADJUST_NEXT_DEPART_COL]).values result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) first_trip_in_leg = False if len(result_list) > 1: choices = pd.concat(result_list) return choices
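# schedule_trips_in_leg() links each trip to the next trip in the leg by rolling the sorted index
# and blanking the final trip with a sentinel. A standalone sketch of that np.roll idiom; the toy
# trip ids and the NO_TRIP_ID value of 0 are illustrative assumptions.
import numpy as np
import pandas as pd

NO_TRIP_ID = 0  # sentinel assumed for illustration

trips = pd.DataFrame({'trip_num': [1, 2, 3], 'trip_count': [3, 3, 3]},
                     index=pd.Index([11, 12, 13], name='trip_id')).sort_index()

outbound = True

# np.roll(-1) pairs each trip with the following trip_id; np.roll(1) pairs it with the preceding one
trips['next_trip_id'] = np.roll(trips.index, -1 if outbound else 1)

# the final trip in the leg has no successor, so blank it out with the sentinel
is_final = (trips.trip_num == trips.trip_count) if outbound else (trips.trip_num == 1)
trips.next_trip_id = trips.next_trip_id.where(~is_final, NO_TRIP_ID)

print(trips)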
def run_trip_scheduling_choice(spec, tours, skims, locals_dict, chunk_size, trace_hh_id, trace_label): NUM_TOUR_LEGS = 3 trace_label = tracing.extend_trace_label(trace_label, 'interaction_sample_simulate') # FIXME: The duration, start, and end should be ints well before we get here... tours[TOUR_DURATION_COLUMN] = tours[TOUR_DURATION_COLUMN].astype(np.int8) # Setup boolean columns to make it easier to identify # intermediate stops later in the model. tours[HAS_OB_STOPS] = tours[NUM_OB_STOPS] >= 1 tours[HAS_IB_STOPS] = tours[NUM_IB_STOPS] >= 1 # Calculate a matrix with the appropriate alternative sizes # based on the total tour duration. This is used to calculate # chunk sizes. max_duration = tours[TOUR_DURATION_COLUMN].max() alt_sizes = generate_alternative_sizes(max_duration, NUM_TOUR_LEGS) # Assert the number of tour leg schedule alternatives for each tour tours[NUM_ALTERNATIVES] = 1 tours.loc[tours[HAS_OB_STOPS] != tours[HAS_IB_STOPS], NUM_ALTERNATIVES] = tours[TOUR_DURATION_COLUMN] + 1 tours.loc[tours[HAS_OB_STOPS] & tours[HAS_IB_STOPS], NUM_ALTERNATIVES] = \ tours.apply(lambda x: alt_sizes[1, x.duration], axis=1) # If no intermediate stops on the tour, then then main leg duration # equals the tour duration and the intermediate durations are zero tours.loc[~tours[HAS_OB_STOPS] & ~tours[HAS_IB_STOPS], MAIN_LEG_DURATION] = tours[TOUR_DURATION_COLUMN] tours.loc[~tours[HAS_OB_STOPS] & ~tours[HAS_IB_STOPS], [IB_DURATION, OB_DURATION]] = 0 # We only need to determine schedules for tours with intermediate stops indirect_tours = tours.loc[tours[HAS_OB_STOPS] | tours[HAS_IB_STOPS]] if len(indirect_tours) > 0: # Iterate through the chunks result_list = [] for i, choosers, chunk_trace_label in \ chunk.adaptive_chunked_choosers(indirect_tours, chunk_size, trace_label): # Sort the choosers and get the schedule alternatives choosers = choosers.sort_index() schedules = generate_schedule_alternatives(choosers).sort_index() # Assuming we did the max_alt_size calculation correctly, # we should get the same sizes here. assert choosers[NUM_ALTERNATIVES].sum() == schedules.shape[0] # Run the simulation choices = _interaction_sample_simulate( choosers=choosers, alternatives=schedules, spec=spec, choice_column=SCHEDULE_ID, allow_zero_probs=True, zero_prob_choice_val=-999, log_alt_losers=False, want_logsums=False, skims=skims, locals_d=locals_dict, trace_label=chunk_trace_label, trace_choice_name='trip_schedule_stage_1', estimator=None) assert len(choices.index) == len(choosers.index) choices = schedules[schedules[SCHEDULE_ID].isin(choices)] result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: # http://pandas.pydata.org/pandas-docs/stable/io.html#id2 if len(result_list) > 1: choices = pd.concat(result_list) assert len(choices.index) == len(indirect_tours.index) # The choices here are only the indirect tours, so the durations # need to be updated on the main tour dataframe. tours.update(choices[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]]) # Cleanup data types and drop temporary columns tours[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]] = \ tours[[MAIN_LEG_DURATION, OB_DURATION, IB_DURATION]].astype(np.int8) tours = tours.drop(columns=TEMP_COLS) return tours
def _run_cdap(
        persons,
        cdap_indiv_spec,
        interaction_coefficients,
        cdap_fixed_relative_proportions,
        locals_d,
        trace_hh_id, trace_label):
    """
    Implements core run_cdap functionality on persons df (or chunked subset thereof)
    Aside from chunking of persons df, params are passed through from run_cdap unchanged
    """

    # assign integer cdap_rank to each household member
    # persons with cdap_rank 1..MAX_HHSIZE will have their activities chosen by the CDAP model
    # extra household members will have activities assigned in fixed proportions
    assign_cdap_rank(persons, trace_hh_id, trace_label)

    # Calculate CDAP utilities for each individual, ignoring interactions
    # ind_utils has index of 'person_id' and a column for each alternative
    # i.e. three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home)
    indiv_utils = individual_utilities(persons[persons.cdap_rank <= MAX_HHSIZE],
                                       cdap_indiv_spec, locals_d,
                                       trace_hh_id, trace_label)

    # compute interaction utilities, probabilities, and hh activity pattern choices
    # for each size household separately in turn up to MAX_HHSIZE
    hh_choices_list = []
    for hhsize in range(1, MAX_HHSIZE + 1):

        choices = household_activity_choices(
            indiv_utils, interaction_coefficients, hhsize=hhsize,
            trace_hh_id=trace_hh_id, trace_label=trace_label)

        hh_choices_list.append(choices)

    del indiv_utils

    # concat all the household choices into a single series indexed on _hh_index_
    hh_activity_choices = pd.concat(hh_choices_list)

    # unpack the household activity choice list into choices for each (non-extra) household member
    # resulting series contains one activity per individual hh member, indexed on _persons_index_
    cdap_person_choices \
        = unpack_cdap_indiv_activity_choices(persons, hh_activity_choices,
                                             trace_hh_id, trace_label)

    # assign activities to extra household members (with cdap_rank > MAX_HHSIZE)
    # resulting series contains one activity per individual hh member, indexed on _persons_index_
    extra_person_choices \
        = extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d,
                                  trace_hh_id, trace_label)

    # concat cdap and extra person choices into a single series
    # this series will be the same length as the persons dataframe and be indexed on _persons_index_
    person_choices = pd.concat([cdap_person_choices, extra_person_choices])

    persons['cdap_activity'] = person_choices

    cdap_results = persons[['cdap_rank', 'cdap_activity']]

    # if DUMP:
    #     tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label,
    #                      transpose=False, slicer='NONE')
    #     tracing.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label,
    #                      transpose=False, slicer='NONE')

    chunk.log_df(trace_label, 'persons', persons)

    # return dataframe with two columns
    return cdap_results
def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes, path_info, trace_label, trace): trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_time') with chunk.chunk_log(trace_label): model_constants = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.CONSTANTS') tap_tap_settings = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.tap_tap_settings') with memo( "#TVPB CACHE compute_tap_tap_utilities all_transit_paths"): transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace) # note: transit_df index is arbitrary chunk.log_df(trace_label, "transit_df", transit_df) # some expressions may want to know access mode - locals_dict = path_info.copy() locals_dict['los'] = self.network_los locals_dict.update(model_constants) assignment_spec = assign.read_assignment_spec( file_name=config.config_file_path(tap_tap_settings['SPEC'])) DEDUPE = True if DEDUPE: # assign uid for reduping max_atap = transit_df.atap.max() + 1 transit_df[ 'uid'] = transit_df.btap * max_atap + transit_df.atap # dedupe chooser_attribute_columns = list(chooser_attributes.columns) unique_transit_df = \ transit_df.loc[~transit_df.uid.duplicated(), ['btap', 'atap', 'uid'] + chooser_attribute_columns] unique_transit_df.set_index('uid', inplace=True) chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) logger.debug( f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}" ) # assign_variables results, _, _ = assign.assign_variables( assignment_spec, unique_transit_df, locals_dict) assert len(results.columns == 1) unique_transit_df['transit'] = results # redupe results back into transit_df with memo("#TVPB compute_tap_tap_time redupe transit_df"): transit_df['transit'] = reindex(unique_transit_df.transit, transit_df.uid) del transit_df['uid'] del unique_transit_df chunk.log_df(trace_label, "transit_df", transit_df) chunk.log_df(trace_label, "unique_transit_df", None) else: results, _, _ = assign.assign_variables( assignment_spec, transit_df, locals_dict) assert len(results.columns == 1) transit_df['transit'] = results # filter out unavailable btap_atap pairs logger.debug( f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}" ) transit_df = transit_df[transit_df.transit > 0] transit_df.drop(columns=chooser_attributes.columns, inplace=True) chunk.log_df(trace_label, "transit_df", None) if trace: self.trace_df(transit_df, trace_label, 'transit_df') return transit_df
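# The DEDUPE branch evaluates the assignment expressions only once per unique (btap, atap) pair
# and then broadcasts the results back onto the full table. A minimal standalone sketch of that
# dedupe/redupe idiom; the 'cost' computation stands in for assign_variables() and the toy taps
# are illustrative.
import pandas as pd

transit_df = pd.DataFrame({
    'btap': [10, 10, 20, 10],
    'atap': [30, 30, 40, 30],
})

# build a unique id per (btap, atap) pair, as the model does with btap * max_atap + atap
max_atap = transit_df.atap.max() + 1
transit_df['uid'] = transit_df.btap * max_atap + transit_df.atap

# evaluate the (stand-in) expression only on the unique pairs
unique_df = transit_df.loc[~transit_df.uid.duplicated(), ['btap', 'atap', 'uid']].set_index('uid')
unique_df['cost'] = unique_df.btap * 0.1 + unique_df.atap * 0.2   # placeholder for assign_variables()

# "redupe": broadcast the unique results back onto the full table by uid
transit_df['cost'] = unique_df.cost.reindex(transit_df.uid).values
print(transit_df.drop(columns='uid'))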
def _run_cdap( persons, cdap_indiv_spec, interaction_coefficients, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label): """ Implements core run_cdap functionality on persons df (or chunked subset thereof) Aside from chunking of persons df, params are passed through from run_cdap unchanged Returns pandas Dataframe with two columns: cdap_activity : str activity for that person expressed as 'M', 'N', 'H' cdap_rank : int activities for persons with cdap_rank <= MAX_HHSIZE are determined by cdap 'extra' household members activities are assigned by cdap_fixed_relative_proportions """ # assign integer cdap_rank to each household member # persons with cdap_rank 1..MAX_HHSIZE will be have their activities chose by CDAP model # extra household members, will have activities assigned by in fixed proportions assign_cdap_rank(persons, trace_hh_id, trace_label) # Calculate CDAP utilities for each individual, ignoring interactions # ind_utils has index of 'person_id' and a column for each alternative # i.e. three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home) indiv_utils = individual_utilities(persons[persons.cdap_rank <= MAX_HHSIZE], cdap_indiv_spec, locals_d, trace_hh_id, trace_label) # compute interaction utilities, probabilities, and hh activity pattern choices # for each size household separately in turn up to MAX_HHSIZE hh_choices_list = [] for hhsize in range(1, MAX_HHSIZE+1): choices = household_activity_choices( indiv_utils, interaction_coefficients, hhsize=hhsize, trace_hh_id=trace_hh_id, trace_label=trace_label) hh_choices_list.append(choices) del indiv_utils # concat all the household choices into a single series indexed on _hh_index_ hh_activity_choices = pd.concat(hh_choices_list) # unpack the household activity choice list into choices for each (non-extra) household member # resulting series contains one activity per individual hh member, indexed on _persons_index_ cdap_person_choices \ = unpack_cdap_indiv_activity_choices(persons, hh_activity_choices, trace_hh_id, trace_label) # assign activities to extra household members (with cdap_rank > MAX_HHSIZE) # resulting series contains one activity per individual hh member, indexed on _persons_index_ extra_person_choices \ = extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d, trace_hh_id, trace_label) # concat cdap and extra persoin choices into a single series # this series will be the same length as the persons dataframe and be indexed on _persons_index_ person_choices = pd.concat([cdap_person_choices, extra_person_choices]) persons['cdap_activity'] = person_choices # if DUMP: # tracing.trace_df(hh_activity_choices, '%s.DUMP.hh_activity_choices' % trace_label, # transpose=False, slicer='NONE') # tracing.trace_df(cdap_results, '%s.DUMP.cdap_results' % trace_label, # transpose=False, slicer='NONE') chunk.log_df(trace_label, 'persons', persons) return persons[['cdap_rank', 'cdap_activity']]
def tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col, trace_label):
    """
    interaction_sample_simulate expects
        alts index same as choosers (e.g. tour_id)
        name of choice column in alts

    Parameters
    ----------
    tours : pandas DataFrame
        must have person_id column and index on tour_id
    alts : pandas DataFrame
        alts index must be timetable tdd id
    timetable : TimeTable object
    choice_column : str
        name of column to store alt index in alt_tdd DataFrame
        (since alt_tdd is duplicate index on person_id but unique on person_id,alt_id)

    Returns
    -------
    alt_tdd : pandas DataFrame
        columns: start, end, duration, <choice_column>
        index: tour_id
    """

    trace_label = tracing.extend_trace_label(trace_label, 'tdd_interaction_dataset')

    with chunk.chunk_log(trace_label):
        alts_ids = np.tile(alts.index, len(tours.index))
        chunk.log_df(trace_label, 'alts_ids', alts_ids)

        tour_ids = np.repeat(tours.index, len(alts.index))
        window_row_ids = np.repeat(tours[window_id_col], len(alts.index))

        alt_tdd = alts.take(alts_ids)

        alt_tdd.index = tour_ids

        alt_tdd[window_id_col] = window_row_ids

        # add tdd alternative id
        # by convention, the choice column is the first column in the interaction dataset
        alt_tdd.insert(loc=0, column=choice_column, value=alts_ids)

        # slice out all non-available tours
        available = timetable.tour_available(alt_tdd[window_id_col], alt_tdd[choice_column])

        logger.debug(f"tdd_interaction_dataset keeping {available.sum()} of ({len(available)}) available alt_tdds")
        assert available.any()

        chunk.log_df(trace_label, 'alt_tdd', alt_tdd)  # catch this before we slice on available

        alt_tdd = alt_tdd[available]
        chunk.log_df(trace_label, 'alt_tdd', alt_tdd)

        # FIXME - don't need this any more after slicing
        del alt_tdd[window_id_col]

    return alt_tdd
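# The interaction dataset is a full cross join of tours x alternatives built with np.tile and
# np.repeat rather than a merge. A tiny self-contained sketch of that construction with toy
# tours and alternatives; the column names here are illustrative.
import numpy as np
import pandas as pd

tours = pd.DataFrame({'person_id': [7, 8]}, index=pd.Index([100, 200], name='tour_id'))
alts = pd.DataFrame({'start': [5, 6], 'end': [10, 12]}, index=pd.Index([0, 1], name='tdd'))

# every alternative repeated for every tour...
alts_ids = np.tile(alts.index, len(tours.index))        # [0, 1, 0, 1]
# ...and every tour id repeated for every alternative
tour_ids = np.repeat(tours.index, len(alts.index))      # [100, 100, 200, 200]

alt_tdd = alts.take(alts_ids)
alt_tdd.index = tour_ids
alt_tdd.insert(loc=0, column='tdd', value=alts_ids)     # choice column first, by convention

print(alt_tdd)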
def schedule_trips_in_leg(outbound, trips, probs_spec, model_settings, is_last_iteration, trace_hh_id, trace_label): """ Parameters ---------- outbound trips probs_spec depart_alt_base is_last_iteration trace_hh_id trace_label Returns ------- choices: pd.Series depart choice for trips, indexed by trip_id """ failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT) depart_alt_base = model_settings.get('DEPART_ALT_BASE', 0) scheduling_mode = model_settings.get('scheduling_mode', 'departure') if scheduling_mode == 'departure': probs_join_cols = model_settings.get( 'probs_join_cols', PROBS_JOIN_COLUMNS_DEPARTURE_BASED) elif scheduling_mode == 'stop_duration': probs_join_cols = model_settings.get( 'probs_join_cols', PROBS_JOIN_COLUMNS_DURATION_BASED) else: logger.error( "Invalid scheduling mode specified: {0}.".format(scheduling_mode), "Please select one of ['departure', 'stop_duration'] and try again." ) # logger.debug("%s scheduling %s trips" % (trace_label, trips.shape[0])) assert len(trips) > 0 assert (trips.outbound == outbound).all() result_list = [] # trips to/from tour origin or atwork get tour_hour departure times # no need to schedule them if there are no intermediate stops to_from_tour_orig = (trips.trip_num == 1) if outbound else ( trips.trip_num == trips.trip_count) do_not_schedule = to_from_tour_orig | (trips.primary_purpose == 'atwork') choices = trips.tour_hour[do_not_schedule] if do_not_schedule.all(): return choices result_list.append(choices) trips = trips[~do_not_schedule] # add next_trip_id temp column, and specificy departure constraint column to update trips = trips.sort_index() if outbound or scheduling_mode == DURATION_MODE: trips['next_trip_id'] = np.roll(trips.index, -1) is_final = trips.trip_num == trips.trip_count # each trip's depart constrains next trip's earliest depart option ADJUST_NEXT_DEPART_COL = 'earliest' else: trips['next_trip_id'] = np.roll(trips.index, 1) is_final = trips.trip_num == 1 # if inbound, we are handling in reverse order, so each choice # constrains latest depart of the preceding trip ADJUST_NEXT_DEPART_COL = 'latest' trips.next_trip_id = trips.next_trip_id.where(~is_final, NO_TRIP_ID) first_trip_in_leg = True for i in range(trips.trip_num.min(), trips.trip_num.max() + 1): if outbound or scheduling_mode == DURATION_MODE: # iterate in ascending trip_num order nth_trips = trips[trips.trip_num == i] else: # iterate over inbound trips in descending trip_num order, skipping the final trip nth_trips = trips[trips.trip_num == trips.trip_count - i] nth_trace_label = tracing.extend_trace_label(trace_label, 'num_%s' % i) choices = ps.make_scheduling_choices( nth_trips, scheduling_mode, probs_spec, probs_join_cols, depart_alt_base, first_trip_in_leg=first_trip_in_leg, report_failed_trips=is_last_iteration, trace_hh_id=trace_hh_id, trace_label=nth_trace_label) # most initial departure (when no choice was made because all probs were zero) if is_last_iteration and (failfix == FAILFIX_CHOOSE_MOST_INITIAL): choices = choices.reindex(nth_trips.index) logger.warning("%s coercing %s depart choices to most initial" % (nth_trace_label, choices.isna().sum())) choices = choices.fillna(trips[ADJUST_NEXT_DEPART_COL]) # adjust allowed depart range of next trip has_next_trip = (nth_trips.next_trip_id != NO_TRIP_ID) if has_next_trip.any(): next_trip_ids = nth_trips.next_trip_id[has_next_trip] # patch choice any trips with next_trips that weren't scheduled trips.loc[next_trip_ids, ADJUST_NEXT_DEPART_COL] = \ 
choices.reindex(next_trip_ids.index).fillna(trips[ADJUST_NEXT_DEPART_COL]).values result_list.append(choices) chunk.log_df(trace_label, f'result_list', result_list) first_trip_in_leg = False if len(result_list) > 1: choices = pd.concat(result_list) return choices
def compute_tap_tap_utilities(self, recipe, access_df, egress_df, chooser_attributes, path_info, trace_label, trace): """ create transit_df and compute utilities for all atap-btap pairs between omaz in access and dmaz in egress_df compute the utilities using the tap_tap utility expressions file specified in tap_tap_settings transit_df contains all possible access omaz/btap to egress dmaz/atap transit path pairs for each chooser trace should be True as we don't encourage/support dynamic utility computation except when tracing (precompute being fairly fast) Parameters ---------- recipe: str 'recipe' key in network_los.yaml TVPB_SETTINGS e.g. tour_mode_choice access_df: pandas.DataFrame dataframe with 'idx' and 'omaz' columns egress_df: pandas.DataFrame dataframe with 'idx' and 'dmaz' columns chooser_attributes: dict path_info trace_label: str trace: boolean Returns ------- transit_df: pandas.dataframe """ assert trace trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_utils') with chunk.chunk_log(trace_label): model_constants = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.CONSTANTS') tap_tap_settings = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.tap_tap_settings') with memo( "#TVPB CACHE compute_tap_tap_utilities all_transit_paths"): transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace) # note: transit_df index is arbitrary chunk.log_df(trace_label, "transit_df", transit_df) # FIXME some expressions may want to know access mode - locals_dict = path_info.copy() locals_dict.update(model_constants) # columns needed for compute_utilities chooser_columns = ['btap', 'atap'] + list( chooser_attributes.columns) # deduplicate transit_df to unique_transit_df with memo( "#TVPB compute_tap_tap_utilities deduplicate transit_df"): attribute_segments = \ self.network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments') scalar_attributes = { k: locals_dict[k] for k in attribute_segments.keys() if k not in transit_df } transit_df['uid'] = self.uid_calculator.get_unique_ids( transit_df, scalar_attributes) unique_transit_df = transit_df.loc[ ~transit_df.uid.duplicated(), chooser_columns + ['uid']] logger.debug( f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}" ) unique_transit_df.set_index('uid', inplace=True) chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) transit_df = transit_df[['idx', 'btap', 'atap', 'uid']] # don't need chooser columns chunk.log_df(trace_label, "transit_df", transit_df) logger.debug( f"#TVPB CACHE compute_tap_tap_utilities dedupe transit_df " f"from {len(transit_df)} to {len(unique_transit_df)} rows") num_unique_transit_rows = len(unique_transit_df) # errcheck logger.debug( f"#TVPB CACHE compute_tap_tap_utilities compute_utilities for {len(unique_transit_df)} rows" ) with memo("#TVPB compute_tap_tap_utilities compute_utilities"): unique_utilities_df = compute_utilities( self.network_los, tap_tap_settings, choosers=unique_transit_df, model_constants=locals_dict, trace_label=trace_label, trace=trace, trace_column_names=chooser_columns if trace else None) chunk.log_df(trace_label, "unique_utilities_df", unique_utilities_df) chunk.log_df(trace_label, "unique_transit_df", unique_transit_df) # annotated if trace: # combine unique_transit_df with unique_utilities_df for legibility omnibus_df = pd.merge(unique_transit_df, unique_utilities_df, left_index=True, right_index=True, how='left') self.trace_df(omnibus_df, trace_label, 
'unique_utilities_df') chunk.log_df(trace_label, "omnibus_df", omnibus_df) del omnibus_df chunk.log_df(trace_label, "omnibus_df", None) assert num_unique_transit_rows == len( unique_utilities_df) # errcheck # redupe unique_transit_df back into transit_df with memo("#TVPB compute_tap_tap_utilities redupe transit_df"): # idx = transit_df.index transit_df = pd.merge(transit_df, unique_utilities_df, left_on='uid', right_index=True) del transit_df['uid'] # transit_df.index = idx # note: left merge on columns does not preserve index, # but transit_df index is arbitrary so no need to restore chunk.log_df(trace_label, "transit_df", transit_df) for c in unique_utilities_df: assert ERR_CHECK and not transit_df[c].isnull().any() if len(unique_transit_df) > 0: # if all rows were cached, then unique_utilities_df is just a ref to cache del unique_utilities_df chunk.log_df(trace_label, "unique_utilities_df", None) chunk.log_df(trace_label, "transit_df", None) if trace: self.trace_df(transit_df, trace_label, 'transit_df') return transit_df
def run_cdap(persons, person_type_map, cdap_indiv_spec, cdap_interaction_coefficients, cdap_fixed_relative_proportions, locals_d, chunk_size=0, trace_hh_id=None, trace_label=None): """ Choose individual activity patterns for persons. Parameters ---------- persons : pandas.DataFrame Table of persons data. Must contain at least a household ID, household size, person type category, and age, plus any columns used in cdap_indiv_spec cdap_indiv_spec : pandas.DataFrame CDAP spec for individuals without taking any interactions into account. cdap_interaction_coefficients : pandas.DataFrame Rules and coefficients for generating interaction specs for different household sizes cdap_fixed_relative_proportions : pandas.DataFrame Spec to for the relative proportions of each activity (M, N, H) to choose activities for additional household members not handled by CDAP locals_d : Dict This is a dictionary of local variables that will be the environment for an evaluation of an expression that begins with @ in either the cdap_indiv_spec or cdap_fixed_relative_proportions expression files chunk_size: int Chunk size or 0 for no chunking trace_hh_id : int hh_id to trace or None if no hh tracing trace_label : str label for tracing or None if no tracing Returns ------- choices : pandas.DataFrame dataframe is indexed on _persons_index_ and has two columns: cdap_activity : str activity for that person expressed as 'M', 'N', 'H' """ trace_label = tracing.extend_trace_label(trace_label, 'cdap') result_list = [] # segment by person type and pick the right spec for each person type for i, persons_chunk, chunk_trace_label \ in chunk.adaptive_chunked_choosers_by_chunk_id(persons, chunk_size, trace_label): cdap_results = \ _run_cdap(persons_chunk, person_type_map, cdap_indiv_spec, cdap_interaction_coefficients, cdap_fixed_relative_proportions, locals_d, trace_hh_id, chunk_trace_label) result_list.append(cdap_results) chunk.log_df(trace_label, f'result_list', result_list) # FIXME: this will require 2X RAM # if necessary, could append to hdf5 store on disk: # http://pandas.pydata.org/pandas-docs/stable/io.html#id2 if len(result_list) > 1: cdap_results = pd.concat(result_list) if trace_hh_id: tracing.trace_df(cdap_results, label="cdap", columns=['cdap_rank', 'cdap_activity'], warn_if_empty=True) # return choices column as series return cdap_results['cdap_activity']
def eval_and_sum(assignment_expressions, df, locals_dict, group_by_column_names=None, df_alias=None, chunk_size=0, trace_rows=None): """ Evaluate assignment_expressions against df, and sum the results (sum by group if list of group_by_column_names is specified. e.g. group by coc column names and return sums grouped by community of concern.) Parameters ---------- assignment_expressions df locals_dict group_by_column_names : array of str list of names of the columns to group by (e.g. coc_column_names of trip_coc_end) df_alias : str assign_variables df_alias (name of df in assignment_expressions) chunk_size : int trace_rows : array of bool array indicating which rows in df are to be traced Returns ------- """ if group_by_column_names is None: group_by_column_names = [] rows_per_chunk, effective_chunk_size = \ calc_rows_per_chunk(chunk_size, df, assignment_expressions, extra_columns=len(group_by_column_names), trace_label='eval_and_sum') logger.info("eval_and_sum chunk_size %s rows_per_chunk %s df rows %s" % (effective_chunk_size, rows_per_chunk, df.shape[0])) summary = None result_list = [] trace_results = [] trace_assigned_locals = {} for i, num_chunks, df_chunk, trace_rows_chunk in chunked_df( df, rows_per_chunk, trace_rows): logger.info("eval_and_sum chunk %s of %s" % (i, num_chunks)) logger.debug("eval_and_sum chunk %s assign variables" % (i, )) assigned_chunk, trace_chunk, trace_assigned_locals_chunk = \ assign.assign_variables(assignment_expressions, df_chunk, locals_dict=locals_dict, df_alias=df_alias, trace_rows=trace_rows_chunk) # sum this chunk logger.debug("eval_and_sum chunk %s sum" % (i, )) if group_by_column_names: # concat in the group_by columns for c in group_by_column_names: assigned_chunk[c] = df_chunk[c] # sum this chunk summary = assigned_chunk.groupby(group_by_column_names).sum() else: summary = assigned_chunk.sum().to_frame().T result_list.append(summary) if trace_chunk is not None: trace_results.append(trace_chunk) if trace_assigned_locals_chunk is not None: trace_assigned_locals.update(trace_assigned_locals_chunk) # note: chunk size will log low if there are more spec temp vars than extra_columns trace_label = 'eval_and_sum chunk_%s' % i chunk.log_open(trace_label, chunk_size, effective_chunk_size) chunk.log_df(trace_label, 'df_chunk', df_chunk) chunk.log_df(trace_label, 'assigned_chunk', assigned_chunk) chunk.log_close(trace_label) assert result_list # squash multiple chunk summaries if len(result_list) > 1: logger.debug("eval_and_sum squash chunk summaries") summary = pd.concat(result_list) if group_by_column_names: summary.reset_index(inplace=True) summary = summary.groupby(group_by_column_names).sum() else: summary = summary.sum().to_frame().T if trace_results: trace_results = pd.concat(trace_results) # trace_rows index values should match index of original df trace_results.index = df[trace_rows].index else: trace_results = None return summary, trace_results, trace_assigned_locals
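# eval_and_sum() keeps memory bounded by summing each chunk and then summing the per-chunk
# summaries. A small standalone sketch of that two-stage reduction; the chunking scheme and
# column names are illustrative, not the model's calc_rows_per_chunk logic.
import pandas as pd


def chunked_group_sum(df, group_cols, value_cols, rows_per_chunk):
    """Sum value_cols by group_cols without holding more than one chunk of rows at a time."""
    partial_sums = []
    for start in range(0, len(df), rows_per_chunk):
        df_chunk = df.iloc[start:start + rows_per_chunk]
        partial_sums.append(df_chunk.groupby(group_cols)[value_cols].sum())

    # squash the per-chunk summaries: concat, then group and sum again
    summary = pd.concat(partial_sums).reset_index().groupby(group_cols)[value_cols].sum()
    return summary


df = pd.DataFrame({'coc': ['a', 'b', 'a', 'b', 'a'], 'benefit': [1.0, 2.0, 3.0, 4.0, 5.0]})
print(chunked_group_sum(df, ['coc'], ['benefit'], rows_per_chunk=2))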
def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_segment, want_choices, trace_label, filter_targets=None, trace=False, override_choices=None): trace_label = tracing.extend_trace_label(trace_label, 'build_virtual_path') # Tracing is implemented as a seperate, second call that operates ONLY on filter_targets assert not (trace and filter_targets is None) if filter_targets is not None: assert filter_targets.any() # slice orig and dest orig = orig[filter_targets] dest = dest[filter_targets] assert len(orig) > 0 assert len(dest) > 0 # slice tod and demographic_segment if not scalar if not isinstance(tod, str): tod = tod[filter_targets] if demographic_segment is not None: demographic_segment = demographic_segment[filter_targets] assert len(demographic_segment) > 0 # slice choices # (requires actual choices from the previous call lest rands change on second call) assert want_choices == (override_choices is not None) if want_choices: override_choices = override_choices[filter_targets] units = self.units_for_recipe(recipe) assert units == 'utility' or not want_choices, "'want_choices' only supported supported if units is utility" access_mode = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.access') egress_mode = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.egress') path_types_settings = self.network_los.setting( f'TVPB_SETTINGS.{recipe}.path_types.{path_type}') attributes_as_columns = \ self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings.attributes_as_columns', []) path_info = { 'path_type': path_type, 'access_mode': access_mode, 'egress_mode': egress_mode } # maz od pairs requested with memo("#TVPB build_virtual_path maz_od_df"): maz_od_df = pd.DataFrame({ 'idx': orig.index.values, 'omaz': orig.values, 'dmaz': dest.values, 'seq': range(len(orig)) }) chunk.log_df(trace_label, "maz_od_df", maz_od_df) self.trace_maz_tap(maz_od_df, access_mode, egress_mode) # for location choice, there will be multiple alt dest rows per chooser and duplicate orig.index values # but tod and demographic_segment should be the same for all chooser rows (unique orig index values) # knowing this allows us to eliminate redundant computations (e.g. utilities of maz_tap pairs) duplicated = orig.index.duplicated(keep='first') chooser_attributes = pd.DataFrame(index=orig.index[~duplicated]) if not isinstance(tod, str): chooser_attributes['tod'] = tod.loc[~duplicated] elif 'tod' in attributes_as_columns: chooser_attributes['tod'] = tod else: path_info['tod'] = tod if demographic_segment is not None: chooser_attributes[ 'demographic_segment'] = demographic_segment.loc[~duplicated] with memo("#TVPB build_virtual_path access_df"): access_df = self.compute_maz_tap_utilities(recipe, maz_od_df, chooser_attributes, leg='access', mode=access_mode, trace_label=trace_label, trace=trace) chunk.log_df(trace_label, "access_df", access_df) with memo("#TVPB build_virtual_path egress_df"): egress_df = self.compute_maz_tap_utilities(recipe, maz_od_df, chooser_attributes, leg='egress', mode=egress_mode, trace_label=trace_label, trace=trace) chunk.log_df(trace_label, "egress_df", egress_df) # path_info for use by expressions (e.g. 
penalty for drive access if no parking at access tap) with memo("#TVPB build_virtual_path compute_tap_tap"): transit_df = self.compute_tap_tap(recipe, maz_od_df, access_df, egress_df, chooser_attributes, path_info=path_info, trace_label=trace_label, trace=trace) chunk.log_df(trace_label, "transit_df", transit_df) with memo("#TVPB build_virtual_path best_paths"): path_df = self.best_paths(recipe, path_type, maz_od_df, access_df, egress_df, transit_df, trace_label, trace) chunk.log_df(trace_label, "path_df", path_df) # now that we have created path_df, we are done with the dataframes for the separate legs del access_df chunk.log_df(trace_label, "access_df", None) del egress_df chunk.log_df(trace_label, "egress_df", None) del transit_df chunk.log_df(trace_label, "transit_df", None) if units == 'utility': # logsums with memo("#TVPB build_virtual_path logsums"): # one row per seq with utilities in columns # path_num 0-based to aligh with logit.make_choices 0-based choice indexes path_df['path_num'] = path_df.groupby('seq').cumcount() chunk.log_df(trace_label, "path_df", path_df) utilities_df = path_df[['seq', 'path_num', units]].set_index(['seq', 'path_num' ]).unstack() utilities_df.columns = utilities_df.columns.droplevel( ) # for legibility # add rows missing because no access or egress availability utilities_df = pd.concat( [pd.DataFrame(index=maz_od_df.seq), utilities_df], axis=1) utilities_df = utilities_df.fillna( UNAVAILABLE ) # set utilities for missing paths to UNAVAILABLE chunk.log_df(trace_label, "utilities_df", utilities_df) with warnings.catch_warnings(record=True) as w: # Cause all warnings to always be triggered. # most likely "divide by zero encountered in log" caused by all transit sets non-viable warnings.simplefilter("always") paths_nest_nesting_coefficient = path_types_settings.get( 'paths_nest_nesting_coefficient', 1) exp_utilities = np.exp(utilities_df.values / paths_nest_nesting_coefficient) logsums = np.maximum( np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE) if len(w) > 0: for wrn in w: logger.warning( f"{trace_label} - {type(wrn).__name__} ({wrn.message})" ) DUMP = False if DUMP: zero_utilities_df = utilities_df[np.nansum( np.exp(utilities_df.values), axis=1) == 0] zero_utilities_df.to_csv(config.output_file_path( 'warning_utilities_df.csv'), index=True) bug if want_choices: # orig index to identify appropriate random number channel to use making choices utilities_df.index = orig.index with memo("#TVPB build_virtual_path make_choices"): probs = logit.utils_to_probs(utilities_df, allow_zero_probs=True, trace_label=trace_label) chunk.log_df(trace_label, "probs", probs) if trace: choices = override_choices utilities_df['choices'] = choices self.trace_df(utilities_df, trace_label, 'utilities_df') probs['choices'] = choices self.trace_df(probs, trace_label, 'probs') else: choices, rands = logit.make_choices( probs, allow_bad_probs=True, trace_label=trace_label) chunk.log_df(trace_label, "rands", rands) del rands chunk.log_df(trace_label, "rands", None) del probs chunk.log_df(trace_label, "probs", None) # we need to get path_set, btap, atap from path_df row with same seq and path_num # drop seq join column, but keep path_num of choice to override_choices when tracing columns_to_cache = ['btap', 'atap', 'path_set', 'path_num'] logsum_df = \ pd.merge(pd.DataFrame({'seq': range(len(orig)), 'path_num': choices.values}), path_df[['seq'] + columns_to_cache], on=['seq', 'path_num'], how='left')\ .drop(columns=['seq'])\ .set_index(orig.index) logsum_df['logsum'] = logsums 
else: assert len(logsums) == len(orig) logsum_df = pd.DataFrame({'logsum': logsums}, index=orig.index) chunk.log_df(trace_label, "logsum_df", logsum_df) del utilities_df chunk.log_df(trace_label, "utilities_df", None) if trace: self.trace_df(logsum_df, trace_label, 'logsum_df') chunk.log_df(trace_label, "logsum_df", logsum_df) results = logsum_df else: assert units == 'time' # return a series results = pd.Series(path_df[units].values, index=path_df['idx']) # zero-fill rows for O-D pairs where no best path exists because there was no tap-tap transit availability results = reindex(results, maz_od_df.idx).fillna(0.0) chunk.log_df(trace_label, "results", results) assert len(results) == len(orig) del path_df chunk.log_df(trace_label, "path_df", None) # diagnostic # maz_od_df['DIST'] = self.network_los.get_default_skim_dict().get('DIST').get(maz_od_df.omaz, maz_od_df.dmaz) # maz_od_df[units] = results.logsum if units == 'utility' else results.values # print(f"maz_od_df\n{maz_od_df}") return results
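# Illustrative sketch (not the pathbuilder implementation; UNAVAILABLE is assumed to be a large
# negative utility such as -999): how a per-chooser logsum can be computed from a seq x path_num
# utility matrix like the utilities_df built above, applying a paths-nest nesting coefficient
# and flooring rows with no viable path at the unavailable utility.
def _path_logsum_sketch(utilities_df, nesting_coefficient=1.0, unavailable=-999.0):
    import numpy as np

    utilities = utilities_df.fillna(unavailable).values        # rows: choosers, cols: paths
    exp_utilities = np.exp(utilities / nesting_coefficient)    # scaled exponentiated utilities
    with np.errstate(divide='ignore'):                          # rows with no viable path give log(0)
        logsums = np.log(np.nansum(exp_utilities, axis=1))
    return np.maximum(logsums, unavailable)                     # floor at the unavailable utility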
def physical_activity_processor(trips_with_demographics, persons_merged, physical_activity_trip_spec, physical_activity_person_spec, physical_activity_settings, coc_column_names, settings, chunk_size, trace_hh_id): """ Compute physical benefits Physical activity benefits generally accrue if the net physical activity for an individual exceeds a certain threshold. We calculate individual physical activity based on trips, so we need to compute trip activity and then sum up to the person level to calculate benefits. We chunk trips by household id to ensure that all of a persons trips are in the same chunk. """ trips_df = trips_with_demographics.to_frame() persons_df = persons_merged.to_frame() trace_label = 'physical_activity' logger.info( "Running physical_activity_processor with %d trips for %d persons " % (len(trips_df), len(persons_df))) locals_dict = config.get_model_constants(physical_activity_settings) locals_dict.update(config.setting('globals')) trip_trace_rows = trace_hh_id and trips_df.household_id == trace_hh_id rows_per_chunk, effective_chunk_size = \ physical_activity_rpc(chunk_size, trips_df, persons_df, physical_activity_trip_spec, trace_label) logger.info("physical_activity_processor chunk_size %s rows_per_chunk %s" % (chunk_size, rows_per_chunk)) coc_summary = None result_list = [] # iterate over trips df chunked by hh_id for i, num_chunks, trips_chunk, trace_rows_chunk \ in bca.chunked_df_by_chunk_id(trips_df, trip_trace_rows, rows_per_chunk): logger.info("%s chunk %s of %s" % (trace_label, i, num_chunks)) trip_activity, trip_trace_results, trip_trace_assigned_locals = \ assign.assign_variables(physical_activity_trip_spec, trips_chunk, locals_dict=locals_dict, df_alias='trips', trace_rows=trace_rows_chunk) # since tracing is at household level, trace_results will occur in only one chunk # we can just write them out when we see them without need to accumulate across chunks if trip_trace_results is not None: tracing.write_csv(trip_trace_results, file_name="physical_activity_trips", index_label='trip_id', column_labels=['label', 'trip']) if trip_trace_assigned_locals: tracing.write_csv(trip_trace_assigned_locals, file_name="physical_activity_trips_locals") # sum trip activity for each unique person trip_activity = trip_activity.groupby(trips_chunk.person_id).sum() # merge in persons columns for this chunk persons_chunk = pd.merge(trip_activity, persons_df, left_index=True, right_index=True) # trace rows array for this chunk person_trace_rows = trace_hh_id and persons_chunk[ 'household_id'] == trace_hh_id person_activity, person_trace_results, person_trace_assigned_locals = \ assign.assign_variables(physical_activity_person_spec, persons_chunk, locals_dict=locals_dict, df_alias='persons', trace_rows=person_trace_rows) # since tracing is at household level, trace_results will occur in only one chunk # we can just write them out when we see them without need to accumulate across chunks if person_trace_results is not None: tracing.write_csv(person_trace_results, file_name="physical_activity_persons", index_label='persons_merged_table_index', column_labels=['label', 'person']) if person_trace_assigned_locals: tracing.write_csv(person_trace_assigned_locals, file_name="physical_activity_persons_locals") # concat in the coc columns and summarize the chunk by coc person_activity = pd.concat( [persons_chunk[coc_column_names], person_activity], axis=1) coc_summary = person_activity.groupby(coc_column_names).sum() result_list.append(coc_summary) chunk_trace_label = 'trace_label chunk_%s' % 
i chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size) chunk.log_df(chunk_trace_label, 'trips_chunk', trips_chunk) chunk.log_df(chunk_trace_label, 'persons_chunk', persons_chunk) chunk.log_close(chunk_trace_label) if len(result_list) > 1: # (if there was only one chunk, then concat is redundant) coc_summary = pd.concat(result_list) # squash the accumulated chunk summaries by reapplying group and sum coc_summary.reset_index(inplace=True) coc_summary = coc_summary.groupby(coc_column_names).sum() result_prefix = 'PA_' add_result_columns("coc_results", coc_summary, result_prefix) add_summary_results(coc_summary, prefix=result_prefix, spec=physical_activity_person_spec)
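# Illustrative sketch (hypothetical 'person_id' and 'activity_minutes' columns): the
# trip -> person -> community-of-concern aggregation performed above, without chunking,
# tracing, or assignment specs. persons_df is assumed to be indexed by person id and to
# carry the coc columns.
def _person_activity_by_coc_sketch(trips_df, persons_df, coc_column_names):
    import pandas as pd

    # sum trip-level activity for each person
    trip_activity = trips_df.groupby('person_id')[['activity_minutes']].sum()

    # merge in person attributes (including the coc columns) for those persons
    persons_with_activity = pd.merge(trip_activity, persons_df,
                                     left_index=True, right_index=True)

    # summarize by community-of-concern group
    return persons_with_activity.groupby(coc_column_names)[['activity_minutes']].sum()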
def _schedule_tours(tours, persons_merged, alts, spec, logsum_tour_purpose, model_settings, skims, timetable, window_id_col, previous_tour, tour_owner_id_col, estimator, tour_trace_label): """ previous_tour stores values used to add columns that can be used in the spec which have to do with the previous tours per person. Every column in the alternatives table is appended with the suffix "_previous" and made available. So if your alternatives table has columns for start and end, then start_previous and end_previous will be set to the start and end of the most recent tour for a person. The first time through, start_previous and end_previous are undefined, so make sure to protect with a tour_num >= 2 in the variable computation. Parameters ---------- tours : DataFrame chunk of tours to schedule with unique timetable window_id_col persons_merged : DataFrame DataFrame of persons to be merged with tours containing attributes referenced by expressions in spec alts : DataFrame DataFrame of alternatives which represent all possible time slots. tdd_interaction_dataset function will use timetable to filter them to omit unavailable alternatives spec : DataFrame The spec which will be passed to interaction_simulate. model_settings : dict timetable : TimeTable timetable of timewidows for person (or subtour) with rows for tours[window_id_col] window_id_col : str column name from tours that identifies timetable owner (or None if tours index) - person_id for non/mandatory tours - parent_tour_id for subtours, - None (tours index) for joint_tours since every tour may have different participants) previous_tour: Series series with value of tdd_alt choice for last previous tour scheduled for tour_owner_id_col : str column name from tours that identifies 'owner' of this tour (person_id for non/mandatory tours, parent_tour_id for subtours, household_id for joint_tours) tour_trace_label Returns ------- """ logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours))) # merge persons into tours # avoid dual suffix for redundant columns names (e.g. 
household_id) that appear in both tours = pd.merge(tours, persons_merged, left_on='person_id', right_index=True, suffixes=('', '_y')) chunk.log_df(tour_trace_label, "tours", tours) # - add explicit window_id_col for timetable owner if it is index # if no timetable window_id_col specified, then add index as an explicit column # (this is not strictly necessary but its presence makes code simpler in several places) if window_id_col is None: window_id_col = tours.index.name tours[window_id_col] = tours.index # timetable can't handle multiple tours per window_id assert not tours[window_id_col].duplicated().any() # - build interaction dataset filtered to include only available tdd alts # dataframe columns start, end , duration, person_id, tdd # indexed (not unique) on tour_id choice_column = TDD_CHOICE_COLUMN alt_tdd = tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col, tour_trace_label) # print(f"tours {tours.shape} alts {alts.shape}") chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - add logsums if logsum_tour_purpose: logsums = \ compute_logsums(alt_tdd, tours, logsum_tour_purpose, model_settings, skims, tour_trace_label) else: logsums = 0 alt_tdd['mode_choice_logsum'] = logsums # - merge in previous tour columns # adds start_previous and end_previous, joins on index tours = \ tours.join(get_previous_tour_by_tourid(tours[tour_owner_id_col], previous_tour, alts)) chunk.log_df(tour_trace_label, "tours", tours) # - make choices locals_d = {'tt': timetable} constants = config.get_model_constants(model_settings) if constants is not None: locals_d.update(constants) if not RUN_ALTS_PREPROCESSOR_BEFORE_MERGE: # Note: Clint was running alts_preprocessor here on tdd_interaction_dataset instead of on raw (unmerged) alts # and he was using logsum_tour_purpose as selector, although logically it should be the spec_segment # It just happened to work for example_arc.mandatory_tour_scheduling because, in that model, (unlike semcog) # logsum_tour_purpose and spec_segments are aligned (both logsums and spec are segmented on work, school, univ) # In any case, I don't see any benefit to doing this here - at least not for any existing implementations # but if we do, it will require passing spec_segment to schedule_tours and _schedule_tours # or redundently segmenting alts (yuck!) to conform to more granular tour_segmentation (e.g. univ do school) spec_segment = logsum_tour_purpose # FIXME this is not always right - see note above alt_tdd = run_alts_preprocessor(model_settings, alt_tdd, spec_segment, locals_d, tour_trace_label) chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) if estimator: # write choosers after annotation estimator.write_choosers(tours) estimator.set_alt_id(choice_column) estimator.write_interaction_sample_alternatives(alt_tdd) choices = interaction_sample_simulate(tours, alt_tdd, spec, choice_column=choice_column, locals_d=locals_d, chunk_size=0, trace_label=tour_trace_label, estimator=estimator) # - update previous_tour and timetable parameters # update previous_tour (series with most recent previous tdd choices) with latest values previous_tour.loc[tours[tour_owner_id_col]] = choices.values # update timetable with chosen tdd footprints timetable.assign(tours[window_id_col], choices) return choices
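# Illustrative sketch (a simplified stand-in for get_previous_tour_by_tourid, not the actual
# helper): how the "_previous" columns described in the docstring can be built by looking up
# each tour owner's most recent tdd choice in the alternatives table and suffixing its columns.
def _previous_tour_columns_sketch(tour_owner_ids, previous_tour, alts):
    # previous_tour maps owner id -> tdd alternative chosen for that owner's last tour
    previous_alt_ids = previous_tour.loc[tour_owner_ids]

    # pull the alternative attributes (e.g. start, end) for those choices
    previous_alts = alts.loc[previous_alt_ids.values]

    # suffix the columns and re-index by tour so the result can be joined onto tours
    previous_alts.columns = [c + '_previous' for c in previous_alts.columns]
    previous_alts.index = tour_owner_ids.index
    return previous_alts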
def schedule_tours(
        tours, persons_merged, alts,
        spec, logsum_tour_purpose,
        model_settings,
        timetable, timetable_window_id_col,
        previous_tour, tour_owner_id_col,
        estimator,
        chunk_size, tour_trace_label, tour_chunk_tag):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours, persons
    dataframe and the tdd_interaction_dataset are very big, so we want to create them inside
    the chunking loop to minimize memory footprint. So we implement the chunking loop here,
    and pass a chunk_size of 0 to interaction_sample_simulate to disable its chunking support.
    """

    if not tours.index.is_monotonic_increasing:
        logger.info("schedule_tours %s tours not monotonic_increasing - sorting df" %
                    tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    if 'LOGSUM_SETTINGS' in model_settings:
        # we need skims to calculate tvpb skim overhead in 3_ZONE systems for use by calc_rows_per_chunk
        skims = skims_for_logsums(logsum_tour_purpose, model_settings, tour_trace_label)
    else:
        skims = None

    result_list = []
    for i, chooser_chunk, chunk_trace_label \
            in chunk.adaptive_chunked_choosers(tours, chunk_size, tour_trace_label, tour_chunk_tag):

        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings, skims,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  estimator,
                                  tour_trace_label=chunk_trace_label)

        result_list.append(choices)

        chunk.log_df(tour_trace_label, 'result_list', result_list)

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    assert len(choices.index) == len(tours.index)

    return choices
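# Illustrative sketch (plain pandas with fixed-size chunks rather than adaptive chunking):
# the wrapper pattern above - run the per-chunk scheduler on slices of the choosers and
# concatenate the resulting choice Series. schedule_chunk stands in for _schedule_tours.
def _chunked_choices_sketch(choosers, rows_per_chunk, schedule_chunk):
    import pandas as pd

    result_list = []
    for start in range(0, len(choosers), rows_per_chunk):
        chooser_chunk = choosers.iloc[start:start + rows_per_chunk]
        result_list.append(schedule_chunk(chooser_chunk))

    choices = pd.concat(result_list) if len(result_list) > 1 else result_list[0]
    assert len(choices.index) == len(choosers.index)
    return choices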
def schedule_nth_trips(trips, probs_spec, model_settings, first_trip_in_leg, report_failed_trips, trace_hh_id, trace_label): """ We join each trip with the appropriate row in probs_spec by joining on probs_join_cols, which should exist in both trips, probs_spec dataframe. Parameters ---------- trips: pd.DataFrame probs_spec: pd.DataFrame Dataframe of probs for choice of depart times and join columns to match them with trips. Depart columns names are irrelevant. Instead, they are position dependent, time period choice is their index + depart_alt_base depart_alt_base: int int to add to probs column index to get time period it represents. e.g. depart_alt_base = 5 means first column (column 0) represents 5 am report_failed_trips : bool trace_hh_id trace_label Returns ------- choices: pd.Series time periods depart choices, one per trip (except for trips with zero probs) """ depart_alt_base = model_settings.get('DEPART_ALT_BASE') probs_cols = [c for c in probs_spec.columns if c not in PROBS_JOIN_COLUMNS] # left join trips to probs (there may be multiple rows per trip for multiple depart ranges) choosers = pd.merge(trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS, how='left').set_index('trip_id') chunk.log_df(trace_label, "choosers", choosers) if trace_hh_id and tracing.has_trace_targets(trips): tracing.trace_df(choosers, '%s.choosers' % trace_label) # choosers should now match trips row for row assert choosers.index.is_unique assert len(choosers.index) == len(trips.index) # zero out probs outside earliest-latest window chooser_probs = clip_probs(trips, choosers[probs_cols], model_settings) chunk.log_df(trace_label, "chooser_probs", chooser_probs) if first_trip_in_leg: # probs should sum to 1 unless all zero chooser_probs = chooser_probs.div(chooser_probs.sum(axis=1), axis=0).fillna(0) # probs should sum to 1 with residual probs resulting in choice of 'fail' chooser_probs['fail'] = 1 - chooser_probs.sum(axis=1).clip(0, 1) chunk.log_df(trace_label, "chooser_probs", chooser_probs) if trace_hh_id and tracing.has_trace_targets(trips): tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label) choices, rands = logit.make_choices(chooser_probs, trace_label=trace_label, trace_choosers=choosers) chunk.log_df(trace_label, "choices", choices) chunk.log_df(trace_label, "rands", rands) if trace_hh_id and tracing.has_trace_targets(trips): tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart']) tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) # convert alt choice index to depart time (setting failed choices to -1) failed = (choices == chooser_probs.columns.get_loc('fail')) choices = (choices + depart_alt_base).where(~failed, -1) chunk.log_df(trace_label, "failed", failed) # report failed trips while we have the best diagnostic info if report_failed_trips and failed.any(): report_bad_choices(bad_row_map=failed, df=choosers, filename='failed_choosers', trace_label=trace_label, trace_choosers=None) # trace before removing failures if trace_hh_id and tracing.has_trace_targets(trips): tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart']) tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) # remove any failed choices if failed.any(): choices = choices[~failed] assert (choices >= trips.earliest[~failed]).all() assert (choices <= trips.latest[~failed]).all() return choices
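# Illustrative sketch (simplified; random draws are passed in rather than taken from the
# model's channelled random streams): adding the residual 'fail' probability column and
# converting a chosen column position into a depart period via depart_alt_base, as
# schedule_nth_trips does above.
def _choose_depart_sketch(chooser_probs, depart_alt_base, rands):
    import numpy as np

    probs = chooser_probs.div(chooser_probs.sum(axis=1), axis=0).fillna(0)
    probs['fail'] = 1 - probs.sum(axis=1).clip(0, 1)        # residual probability lands on 'fail'

    # choose the first column whose cumulative probability reaches the random draw
    choices = (probs.cumsum(axis=1).values < np.asarray(rands)[:, None]).sum(axis=1)
    choices = np.minimum(choices, len(probs.columns) - 1)   # guard against float round-off

    failed = choices == probs.columns.get_loc('fail')
    departs = np.where(failed, -1, choices + depart_alt_base)  # -1 flags failed choices
    return departs, failed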
def _schedule_tours( tours, persons_merged, alts, spec, logsum_tour_purpose, model_settings, timetable, window_id_col, previous_tour, tour_owner_id_col, tour_trace_label): """ previous_tour stores values used to add columns that can be used in the spec which have to do with the previous tours per person. Every column in the alternatives table is appended with the suffix "_previous" and made available. So if your alternatives table has columns for start and end, then start_previous and end_previous will be set to the start and end of the most recent tour for a person. The first time through, start_previous and end_previous are undefined, so make sure to protect with a tour_num >= 2 in the variable computation. Parameters ---------- tours : DataFrame chunk of tours to schedule with unique timetable window_id_col persons_merged : DataFrame DataFrame of persons to be merged with tours containing attributes referenced by expressions in spec alts : DataFrame DataFrame of alternatives which represent all possible time slots. tdd_interaction_dataset function will use timetable to filter them to omit unavailable alternatives spec : DataFrame The spec which will be passed to interaction_simulate. model_settings : dict timetable : TimeTable timetable of timewidows for person (or subtour) with rows for tours[window_id_col] window_id_col : str column name from tours that identifies timetable owner (or None if tours index) - person_id for non/mandatory tours - parent_tour_id for subtours, - None (tours index) for joint_tours since every tour may have different participants) previous_tour: Series series with value of tdd_alt choice for last previous tour scheduled for tour_owner_id_col : str column name from tours that identifies 'owner' of this tour (person_id for non/mandatory tours, parent_tour_id for subtours, household_id for joint_tours) tour_trace_label Returns ------- """ logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours))) # merge persons into tours # avoid dual suffix for redundant columns names (e.g. 
household_id) that appear in both tours = pd.merge(tours, persons_merged, left_on='person_id', right_index=True, suffixes=('', '_y')) chunk.log_df(tour_trace_label, "tours", tours) # - add explicit window_id_col for timetable owner if it is index # if no timetable window_id_col specified, then add index as an explicit column # (this is not strictly necessary but its presence makes code simpler in several places) if window_id_col is None: window_id_col = tours.index.name tours[window_id_col] = tours.index # timetable can't handle multiple tours per window_id assert not tours[window_id_col].duplicated().any() # - build interaction dataset filtered to include only available tdd alts # dataframe columns start, end , duration, person_id, tdd # indexed (not unique) on tour_id choice_column = 'tdd' alt_tdd = tdd_interaction_dataset(tours, alts, timetable, choice_column, window_id_col, tour_trace_label) chunk.log_df(tour_trace_label, "alt_tdd", alt_tdd) # - add logsums if logsum_tour_purpose: logsums = \ compute_logsums(alt_tdd, tours, logsum_tour_purpose, model_settings, tour_trace_label) else: logsums = 0 alt_tdd['mode_choice_logsum'] = logsums # - merge in previous tour columns # adds start_previous and end_previous, joins on index tours = \ tours.join(get_previous_tour_by_tourid(tours[tour_owner_id_col], previous_tour, alts)) chunk.log_df(tour_trace_label, "tours", tours) # - make choices locals_d = { 'tt': timetable } constants = config.get_model_constants(model_settings) if constants is not None: locals_d.update(constants) choices = interaction_sample_simulate( tours, alt_tdd, spec, choice_column=choice_column, locals_d=locals_d, chunk_size=0, trace_label=tour_trace_label ) # - update previous_tour and timetable parameters # update previous_tour (series with most recent previous tdd choices) with latest values previous_tour.loc[tours[tour_owner_id_col]] = choices.values # update timetable with chosen tdd footprints timetable.assign(tours[window_id_col], choices) return choices
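# Illustrative sketch (hypothetical representation - the real TimeTable is not a dict): the
# effect of timetable.assign(...) at the end of _schedule_tours, marking the chosen time-window
# footprint for each window owner so later tours see those periods as unavailable.
def _assign_timetable_sketch(timetable, window_ids, tdd_choices, alts):
    for window_id, tdd in zip(window_ids, tdd_choices):
        start = int(alts.loc[tdd, 'start'])
        end = int(alts.loc[tdd, 'end'])
        occupied = timetable.setdefault(window_id, set())
        occupied.update(range(start, end + 1))   # mark the chosen periods as used
    return timetable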
def lookup_tap_tap_utilities(self, recipe, maz_od_df, access_df, egress_df, chooser_attributes, path_info, trace_label): """ create transit_df and compute utilities for all atap-btap pairs between omaz in access and dmaz in egress_df look up the utilities in the precomputed tap_cache data (which is indexed by uid_calculator unique_ids) (unique_id can used as a zero-based index into the data array) transit_df contains all possible access omaz/btap to egress dmaz/atap transit path pairs for each chooser Parameters ---------- recipe maz_od_df access_df egress_df chooser_attributes path_info trace_label Returns ------- """ trace_label = tracing.extend_trace_label(trace_label, 'lookup_tap_tap_utils') with chunk.chunk_log(trace_label): with memo( "#TVPB CACHE lookup_tap_tap_utilities all_transit_paths"): transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes, trace_label, trace=False) # note: transit_df index is arbitrary chunk.log_df(trace_label, "transit_df", transit_df) if TRACE_COMPLEXITY: # diagnostic: log the omaz,dmaz pairs with the greatest number of virtual tap-tap paths num_paths = transit_df.groupby(['idx']).size().to_frame('n') num_paths = pd.merge(maz_od_df, num_paths, left_on='idx', right_index=True) num_paths = num_paths[[ 'omaz', 'dmaz', 'n' ]].drop_duplicates(subset=['omaz', 'dmaz']) num_paths = num_paths.sort_values( 'n', ascending=False).reset_index(drop=True) logger.debug(f"num_paths\n{num_paths.head(10)}") # FIXME some expressions may want to know access mode - locals_dict = path_info.copy() # add uid column to transit_df with memo("#TVPB lookup_tap_tap_utilities assign uid"): attribute_segments = \ self.network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attribute_segments') scalar_attributes = { k: locals_dict[k] for k in attribute_segments.keys() if k not in transit_df } transit_df.index = self.uid_calculator.get_unique_ids( transit_df, scalar_attributes) transit_df = transit_df[[ 'idx', 'btap', 'atap' ]] # just needed chooser_columns for uid calculation chunk.log_df(trace_label, "transit_df add uid index", transit_df) with memo("#TVPB lookup_tap_tap_utilities reindex transit_df"): utilities = self.tap_cache.data i = 0 for column_name in self.uid_calculator.set_names: transit_df[column_name] = utilities[ transit_df.index.values, i] i += 1 for c in self.uid_calculator.set_names: assert ERR_CHECK and not transit_df[c].isnull().any() chunk.log_df(trace_label, "transit_df", None) return transit_df
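# Illustrative sketch (assumes a precomputed 2-D utility array indexed by the same zero-based
# unique ids the uid calculator produces): looking up one cached utility column per transit
# set for each tap-tap row, as the cache lookup above does with self.tap_cache.data.
def _lookup_cached_utilities_sketch(transit_df, utilities, set_names):
    import numpy as np

    uids = transit_df.index.values                  # zero-based unique ids into the cache
    for i, set_name in enumerate(set_names):
        transit_df[set_name] = utilities[uids, i]   # fancy-index one column per tap set
    assert not np.isnan(utilities[uids]).any()      # cached rows should all be populated
    return transit_df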
def compute_accessibilities_for_zones(accessibility_df, land_use_df, assignment_spec, constants, network_los, trace_od, trace_label): orig_zones = accessibility_df.index.values dest_zones = land_use_df.index.values orig_zone_count = len(orig_zones) dest_zone_count = len(dest_zones) logger.info("Running %s with %d orig zones %d dest zones" % (trace_label, orig_zone_count, dest_zone_count)) # create OD dataframe od_df = pd.DataFrame( data={ 'orig': np.repeat(orig_zones, dest_zone_count), 'dest': np.tile(dest_zones, orig_zone_count) }) if trace_od: trace_orig, trace_dest = trace_od trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest) else: trace_od_rows = None # merge land_use_columns into od_df od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index() chunk.log_df(trace_label, "od_df", od_df) locals_d = { 'log': np.log, 'exp': np.exp, 'network_los': network_los, } locals_d.update(constants) skim_dict = network_los.get_default_skim_dict() locals_d['skim_od'] = skim_dict.wrap('orig', 'dest').set_df(od_df) locals_d['skim_do'] = skim_dict.wrap('dest', 'orig').set_df(od_df) if network_los.zone_system == los.THREE_ZONE: locals_d['tvpb'] = network_los.tvpb results, trace_results, trace_assigned_locals \ = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows, trace_label=trace_label, chunk_log=True) chunk.log_df(trace_label, "results", results) # accessibility_df = accessibility_df.copy() for column in results.columns: data = np.asanyarray(results[column]) data.shape = (orig_zone_count, dest_zone_count) # (o,d) accessibility_df[column] = np.log(np.sum(data, axis=1) + 1) if trace_od: if not trace_od_rows.any(): logger.warning( f"trace_od not found origin = {trace_orig}, dest = {trace_dest}" ) else: # add OD columns to trace results df = pd.concat([od_df[trace_od_rows], trace_results], axis=1) # dump the trace results table (with _temp variables) to aid debugging tracing.trace_df(df, label='accessibility', index_label='skim_offset', slicer='NONE', warn_if_empty=True) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals") return (accessibility_df)
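# Illustrative sketch: reshaping the flat origin-destination results back to an
# (origins x destinations) matrix and collapsing it to the per-origin aggregate measure
# log(1 + sum over destinations), as done for each results column above.
def _od_accessibility_sketch(od_values, orig_zone_count, dest_zone_count):
    import numpy as np

    data = np.asanyarray(od_values, dtype=float)
    data.shape = (orig_zone_count, dest_zone_count)   # rows: origins, cols: destinations
    return np.log(np.sum(data, axis=1) + 1)           # one accessibility value per origin zone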
def compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, skims, trace_label): """ Compute logsums for the tour alt_tdds, which will differ based on their different start, stop times of day, which translate to different odt_skim out_period and in_periods. In mtctm1, tdds are hourly, but there are only 5 skim time periods, so some of the tdd_alts will be the same, once converted to skim time periods. With 5 skim time periods there are 15 unique out-out period pairs but 190 tdd alternatives. For efficiency, rather compute a lot of redundant logsums, we compute logsums for the unique (out-period, in-period) pairs and then join them back to the alt_tdds. """ trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums') network_los = inject.get_injectable('network_los') # - in_period and out_period assert 'out_period' not in alt_tdd assert 'in_period' not in alt_tdd alt_tdd['out_period'] = network_los.skim_time_period_label( alt_tdd['start']) alt_tdd['in_period'] = network_los.skim_time_period_label(alt_tdd['end']) alt_tdd['duration'] = alt_tdd['end'] - alt_tdd['start'] # outside chunk_log context because we extend log_df call for alt_tdd made by our only caller _schedule_tours chunk.log_df(trace_label, "alt_tdd", alt_tdd) with chunk.chunk_log(trace_label): if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS: # compute logsums for all the tour alt_tdds (inefficient) logsums = _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label) return logsums index_name = alt_tdd.index.name deduped_alt_tdds, redupe_columns = dedupe_alt_tdd( alt_tdd, tour_purpose, trace_label) chunk.log_df(trace_label, "deduped_alt_tdds", deduped_alt_tdds) logger.info( f"{trace_label} compute_logsums " f"deduped_alt_tdds reduced number of rows by " f"{round(100 * (len(alt_tdd) - len(deduped_alt_tdds)) / len(alt_tdd), 2)}% " f"from {len(alt_tdd)} to {len(deduped_alt_tdds)} compared to USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS" ) t0 = tracing.print_elapsed_time() # - compute logsums for the alt_tdd_periods deduped_alt_tdds['logsums'] = \ _compute_logsums(deduped_alt_tdds, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label) # tracing.log_runtime(model_name=trace_label, start_time=t0) # redupe - join the alt_tdd_period logsums to alt_tdd to get logsums for alt_tdd logsums = pd.merge(alt_tdd.reset_index(), deduped_alt_tdds.reset_index(), on=[index_name] + redupe_columns, how='left').set_index(index_name).logsums chunk.log_df(trace_label, "logsums", logsums) del deduped_alt_tdds chunk.log_df(trace_label, "deduped_alt_tdds", None) # this is really expensive TRACE = False if TRACE: trace_logsums_df = logsums.to_frame('representative_logsum') trace_logsums_df['brute_force_logsum'] = \ _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, network_los, skims, trace_label) tracing.trace_df(trace_logsums_df, label=tracing.extend_trace_label( trace_label, 'representative_logsums'), slicer='NONE', transpose=False) # leave it to our caller to pick up logsums with call to chunk.log_df return logsums
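# Illustrative sketch (expensive_fn stands in for _compute_logsums): the dedupe/compute/redupe
# pattern used above - compute an expensive value only for unique combinations of the deduping
# columns, then join the results back to every original row.
def _dedupe_redupe_sketch(df, dedupe_columns, expensive_fn):
    import pandas as pd

    index_name = df.index.name or 'index'
    deduped = df.reset_index().drop_duplicates(subset=dedupe_columns)
    deduped = deduped[[index_name] + dedupe_columns].copy()
    deduped['logsums'] = expensive_fn(deduped)        # one value per unique combination

    # join the deduped results back so every original row gets the value for its combination
    reduped = pd.merge(df.reset_index()[[index_name] + dedupe_columns],
                       deduped[dedupe_columns + ['logsums']],
                       on=dedupe_columns, how='left').set_index(index_name)
    return reduped['logsums']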