def create_mandatory_tours():
    """
    Create mandatory tours for persons with a mandatory tour frequency and
    append them to the pipeline "tours" table.

    Reads the 'persons' table plus the 'configs_dir' and
    'mandatory_tour_frequency_alternatives' injectables, annotates the new
    tours with the 'annotate_tours_with_dest' settings, then registers them
    for tracing and adds them to the 'tours' random number channel.
    """
    # FIXME - move this to body?

    person_columns = [
        "mandatory_tour_frequency",
        "is_worker",
        "school_taz",
        "workplace_taz",
    ]

    persons_table = inject.get_table('persons')
    configs_dir = inject.get_injectable('configs_dir')

    # only persons that actually have a mandatory tour frequency get tours
    persons_df = persons_table.to_frame(columns=person_columns)
    has_frequency = persons_df.mandatory_tour_frequency.notnull()
    persons_df = persons_df[has_frequency]

    alternatives = inject.get_injectable('mandatory_tour_frequency_alternatives')
    mandatory_tours = process_mandatory_tours(persons_df, alternatives)

    expressions.assign_columns(
        df=mandatory_tours,
        model_settings='annotate_tours_with_dest',
        configs_dir=configs_dir,
        trace_label='create_mandatory_tours')

    pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel(mandatory_tours, 'tours')
def create_simple_trips(tours, households, persons, trace_hh_id):
    """
    Create a simple trip table with one outbound and one inbound trip per tour.

    Parameters
    ----------
    tours : table wrapper
        Must provide ``index`` and ``to_frame()``; the frame is read for
        person_id, destination, start and end columns.
    households : table wrapper
        Read for its ``TAZ`` column.
    persons : table wrapper
        Read for its ``household_id`` column.
    trace_hh_id : int or None
        When truthy, the resulting trips table is traced.

    Returns
    -------
    None
        The trips table is registered with orca as "trips", registered as a
        traceable table and added to the 'trips' random number channel.
    """

    logger.info("Running simple trips table creation with %d tours", len(tours.index))

    tours_df = tours.to_frame()

    # move the index into a column
    # (presumably the index is named tour_id - it is read as trips.tour_id below)
    tours_df.reset_index(inplace=True)

    tours_df['household_id'] = reindex(persons.household_id, tours_df.person_id)
    tours_df['TAZ'] = reindex(households.TAZ, tours_df.household_id)

    # create inbound and outbound records
    n_tours = len(tours_df.index)
    trips = pd.concat([tours_df, tours_df], ignore_index=True)

    # first half are outbound, second half are inbound
    # (repeat count must be an int: true division would hand np.repeat a float)
    trips['INBOUND'] = np.repeat([False, True], n_tours)

    # TRIPID for outbound trips = 1, inbound_trips = 2
    trips['trip_num'] = np.repeat([1, 2], n_tours)

    # set key fields from tour fields: 'TAZ','destination','start','end'
    # use a single .loc assignment per column instead of chained indexing, which
    # may write to a temporary copy and leave trips unchanged
    inbound = trips.INBOUND

    trips['OTAZ'] = trips.TAZ
    trips.loc[inbound, 'OTAZ'] = trips.loc[inbound, 'destination']

    trips['DTAZ'] = trips.destination
    trips.loc[inbound, 'DTAZ'] = trips.loc[inbound, 'TAZ']

    trips['start_trip'] = trips.start
    trips.loc[inbound, 'start_trip'] = trips.loc[inbound, 'end']

    trips['end_trip'] = trips.end
    trips.loc[inbound, 'end_trip'] = trips.loc[inbound, 'start']

    # create a stable (predictable) index based on tour_id and trip_num
    possible_trips_count = 2
    trips['trip_id'] = (trips.tour_id * possible_trips_count) + (trips.trip_num - 1)
    trips.set_index('trip_id', inplace=True, verify_integrity=True)

    trip_columns = ['tour_id', 'INBOUND', 'trip_num', 'OTAZ', 'DTAZ', 'start_trip', 'end_trip']
    trips = trips[trip_columns]

    orca.add_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel(trips, 'trips')

    if trace_hh_id:
        tracing.trace_df(trips,
                         label="trips",
                         warn_if_empty=True)
def create_households(trace_hh_id):
    """
    Register a three-row 'households' table for use by tests.

    Parameters
    ----------
    trace_hh_id : int or None
        When truthy, the table is also registered as a traceable table.
    """
    # TAZ must be a list: a set literal is unordered and would collapse the
    # duplicate 100 into a single value, leaving two values for three households
    df = pd.DataFrame({
        'HHID': [1, 2, 3],
        'TAZ': [100, 100, 101],
    })

    inject.add_table('households', df)
    pipeline.get_rn_generator().add_channel(df, 'households')

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
def trip_departure_choice(trips, trips_merged, skim_dict, chunk_size, trace_hh_id):
    """
    Run the stage-two departure model on the merged trips and store the
    resulting choices as new columns of the pipeline "trips" table.

    Settings come from 'trip_departure_choice.yaml'; the model spec is the file
    named by its SPECIFICATION entry. If a PREPROCESSOR entry is present, the
    merged trips are annotated first with origin/destination skim wrappers
    available to the expressions as ``od_skims`` and ``do_skims``.
    """
    trace_label = 'trip_departure_choice'

    model_settings = config.read_model_settings('trip_departure_choice.yaml')
    spec = simulate.read_model_spec(file_name=model_settings['SPECIFICATION'])

    trips_merged_df = trips_merged.to_frame()

    # add tour-based chunk_id so we can chunk all trips in tour together
    tour_ids = trips_merged[TOUR_ID].unique()
    chunk_id_by_tour = pd.Series(list(range(len(tour_ids))), tour_ids)
    trips_merged_df['chunk_id'] = reindex(chunk_id_by_tour, trips_merged_df.tour_id)

    # stash a power of ten derived from the largest tour id on the function object
    max_tour_id = trips_merged[TOUR_ID].max()
    exponent = np.ceil(np.log10(max_tour_id))
    trip_departure_choice.MAX_TOUR_ID = int(np.power(10, exponent))

    locals_d = config.get_model_constants(model_settings).copy()

    preprocessor_settings = model_settings.get('PREPROCESSOR')

    tour_legs = get_tour_legs(trips_merged_df)
    pipeline.get_rn_generator().add_channel('tour_legs', tour_legs)

    if preprocessor_settings:
        od_skim = skim_dict.wrap('origin', 'destination')
        do_skim = skim_dict.wrap('destination', 'origin')
        simulate.set_skim_wrapper_targets(trips_merged_df, [od_skim, do_skim])

        locals_d["od_skims"] = od_skim
        locals_d["do_skims"] = do_skim

        expressions.assign_columns(
            df=trips_merged_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_d,
            trace_label=trace_label)

    choices = apply_stage_two_model(spec, trips_merged_df, chunk_size, trace_label)

    trips_df = trips.to_frame()
    n_trips_before = len(trips_df)
    trips_df = pd.concat([trips_df, choices], axis=1)

    # the column-wise concat must not add rows, and every trip needs a departure
    assert len(trips_df) == n_trips_before
    missing_depart = trips_df[trips_df['depart'].isnull()]
    assert missing_depart.empty

    pipeline.replace_table("trips", trips_df)
def create_households(trace_hh_id):
    """
    Register a three-row 'households' table for use by tests.

    Parameters
    ----------
    trace_hh_id : int or None
        Accepted but not read by this function.
    """
    # home_zone_id must be a list: a set literal is unordered and would collapse
    # the duplicate 100, leaving two values for three households
    df = pd.DataFrame({
        'household_id': [1, 2, 3],
        'home_zone_id': [100, 100, 101],
    })

    inject.add_table('households', df)
    pipeline.get_rn_generator().add_channel('households', df)
    tracing.register_traceable_table('households', df)
def create_mandatory_tours_table():
    """
    Build the "mandatory_tours" orca table from the persons that have a
    mandatory tour frequency, register it for tracing and add it to the
    'tours' random number channel.
    """
    wanted_columns = [
        "mandatory_tour_frequency",
        "is_worker",
        "school_taz",
        "workplace_taz",
    ]
    persons_df = orca.get_table('persons').to_frame(columns=wanted_columns)

    # drop persons without a mandatory tour frequency
    has_frequency = persons_df.mandatory_tour_frequency.notnull()
    mandatory_tours = process_mandatory_tours(persons_df[has_frequency])

    orca.add_table("mandatory_tours", mandatory_tours)
    tracing.register_traceable_table('mandatory_tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel(mandatory_tours, 'tours')
def create_non_mandatory_tours_table():
    """
    Build the "non_mandatory_tours" orca table from the persons' non-null
    non_mandatory_tour_frequency values and the frequency alternatives table,
    register it for tracing and add it to the 'tours' random number channel.
    """
    persons_table = orca.get_table('persons')
    alts_table = orca.get_table('non_mandatory_tour_frequency_alts')

    frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    non_mandatory_tours = process_non_mandatory_tours(frequencies, alts_table.local)

    orca.add_table("non_mandatory_tours", non_mandatory_tours)
    tracing.register_traceable_table('non_mandatory_tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel(non_mandatory_tours, 'tours')
def initialize_tours(network_los, households, persons, trace_hh_id):
    """
    Load the input "tours" table, annotate it, and register it as the
    pipeline 'tours' table.

    Parameters
    ----------
    network_los : not read by this function
    households : table wrapper
        Only its index is read, for a debug message.
    persons : table wrapper
        Its index is used to slice tours and to validate tour person_ids.
    trace_hh_id : int or None
        When truthy, the tours table is traced.

    Raises
    ------
    RuntimeError
        If any tour refers to a person_id that is not in persons.index.
    """
    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    # NOTE(review): this used to test households_sample_size > 0 twice joined by
    # `or`; the second operand was probably meant to check another slicing
    # trigger - confirm against the households table loader.
    slice_happened = inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s", tours.shape)
        # keep all persons in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    if not model_settings.get('skip_patch_tour_ids', False):
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    pipeline.get_rn_generator().add_channel('tours', tours)

    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        n_bad = tours_without_persons.sum()
        # report the count against the number of tours, and list only the
        # person_ids that could not be found
        bad_person_ids = tours.person_id[tours_without_persons].unique()
        logger.error(f"{n_bad} tours out of {len(tours)} without persons\n"
                     f"{pd.Series({'person_id': bad_person_ids})}")
        raise RuntimeError(f"{n_bad} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours,
                         label='initialize_tours',
                         warn_if_empty=True)
def test_rng_access():
    """
    The base seed can be set before the pipeline is opened but not after,
    and the random number generator is reachable while the pipeline is open.
    """
    test_dir = os.path.dirname(__file__)

    configs_dir = os.path.join(test_dir, 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    orca.add_injectable("output_dir", os.path.join(test_dir, 'output'))

    orca.add_injectable("data_dir", os.path.join(test_dir, 'data'))

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    pipeline.set_rn_generator_base_seed(0)

    pipeline.open_pipeline()

    # re-seeding once the pipeline is open must be rejected
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.set_rn_generator_base_seed(0)
    expected_message = "call set_rn_generator_base_seed before the first step"
    assert expected_message in str(excinfo.value)

    # accessing the generator itself must not raise
    pipeline.get_rn_generator()

    pipeline.close_pipeline()
    orca.clear_cache()
def persons(households, trace_hh_id):
    """
    Load the raw persons for the given households, register the result as the
    'persons' table and random number channel, and return it.

    Tracing registration and output only happen when trace_hh_id is truthy.
    """
    persons_df = read_raw_persons(households)

    logger.info("loaded persons %s" % (persons_df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', persons_df)

    pipeline.get_rn_generator().add_channel('persons', persons_df)

    if not trace_hh_id:
        return persons_df

    tracing.register_traceable_table('persons', persons_df)
    tracing.trace_df(persons_df, "raw.persons", warn_if_empty=True)

    return persons_df
def persons(households, trace_hh_id):
    """
    Load the raw persons for the given households, name the index 'person_id',
    register the result as the 'persons' table and random number channel, and
    return it.

    Tracing registration and output only happen when trace_hh_id is truthy.
    """
    persons_df = read_raw_persons(households)

    logger.info("loaded persons %s" % (persons_df.shape,))

    persons_df.index.name = 'person_id'

    # replace table function with dataframe
    inject.add_table('persons', persons_df)

    pipeline.get_rn_generator().add_channel('persons', persons_df)

    if not trace_hh_id:
        return persons_df

    tracing.register_traceable_table('persons', persons_df)
    tracing.trace_df(persons_df, "raw.persons", warn_if_empty=True)

    return persons_df
def create_non_mandatory_tours():
    """
    Turn the non-mandatory tour frequencies stored on the persons table into
    rows of the "tours" table: one row per generated tour, carrying the id of
    the person it belongs to.
    """
    persons_table = inject.get_table('persons')
    frequency_alts = inject.get_injectable('non_mandatory_tour_frequency_alts')

    # persons without a non-mandatory tour frequency generate no tours
    frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    non_mandatory_tours = process_non_mandatory_tours(frequencies, frequency_alts)

    pipeline.extend_table("tours", non_mandatory_tours)
    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel(non_mandatory_tours, 'tours')
def persons(households, trace_hh_id):
    """
    Load the raw persons for the given households, register them as the
    'persons' table, and check persons and households against each other.

    Parameters
    ----------
    households : table wrapper
        Passed to read_raw_persons; its index is used for the checks.
    trace_hh_id : int or None
        When truthy, the raw persons table is traced.

    Returns
    -------
    pandas.DataFrame
        The persons table.

    Raises
    ------
    RuntimeError
        If any person refers to a household_id that is not in
        households.index, or any household has no persons.
    """
    df = read_raw_persons(households)

    logger.info("loaded persons %s", df.shape)

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    tracing.register_traceable_table('persons', df)
    if trace_hh_id:
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    logger.debug(f"{len(df.household_id.unique())} unique household_ids in persons")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not households.index.duplicated().any()
    assert not df.index.duplicated().any()

    persons_without_households = ~df.household_id.isin(households.index)
    if persons_without_households.any():
        n_orphans = persons_without_households.sum()
        # use len(df): inside this function the name `persons` is the function
        # itself, so len(persons) raised TypeError and hid the RuntimeError below
        orphan_ids = df.index[persons_without_households].values
        logger.error(
            f"{n_orphans} persons out of {len(df)} without households\n"
            f"{pd.Series({'person_id': orphan_ids})}")
        raise RuntimeError(f"{n_orphans} persons with bad household_id")

    # households that are absent from the per-household person counts have no persons
    households_without_persons = \
        df.groupby('household_id').size().reindex(households.index).isnull()
    if households_without_persons.any():
        n_empty = households_without_persons.sum()
        empty_ids = households_without_persons.index[households_without_persons].values
        logger.error(
            f"{n_empty} households out of {len(households.index)} without persons\n"
            f"{pd.Series({'household_id': empty_ids})}")
        raise RuntimeError(f"{n_empty} households with no persons")

    return df
def persons(store, households_sample_size, households, trace_hh_id):
    """
    Read "persons" from the store, restrict it to the given households when a
    household sample size is set, register it as the 'persons' table and
    random number channel, and return it.

    Tracing registration and output only happen when trace_hh_id is truthy.
    """
    persons_df = store["persons"]

    if households_sample_size > 0:
        # keep all persons in the sampled households
        in_sample = persons_df.household_id.isin(households.index)
        persons_df = persons_df[in_sample]

    logger.info("loaded persons %s" % (persons_df.shape, ))

    # replace table function with dataframe
    orca.add_table('persons', persons_df)

    pipeline.get_rn_generator().add_channel(persons_df, 'persons')

    if not trace_hh_id:
        return persons_df

    tracing.register_traceable_table('persons', persons_df)
    tracing.trace_df(persons_df, "persons", warn_if_empty=True)

    return persons_df
def households(store, households_sample_size, trace_hh_id):
    """
    Read "households" from the store, optionally reduce it to a sample or to
    the traced household, register it as the 'households' table and random
    number channel, and return it.
    """
    all_households = store["households"]
    n_all = all_households.shape[0]

    if trace_hh_id and households_sample_size == 1:
        # tracing a household exclusively:
        # keep only trace_hh (or nothing if it is not in the full store)
        hh_df = tracing.slice_ids(all_households, trace_hh_id)

    elif households_sample_size > 0 and n_all > households_sample_size:
        # a subset of the full store was requested
        logger.info("sampling %s of %s households" % (households_sample_size, n_all))

        # take the requested random sample
        hh_df = asim.random_rows(all_households, households_sample_size)

        # if tracing and we missed trace_hh in sample, but it is in full store
        trace_hh_missed = (
            trace_hh_id
            and trace_hh_id not in hh_df.index
            and trace_hh_id in all_households.index
        )
        if trace_hh_missed:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (hh_df.index[0], trace_hh_id))
            traced_hh = tracing.slice_ids(all_households, trace_hh_id)
            hh_df = pd.concat([traced_hh, hh_df[1:]])

    else:
        hh_df = all_households

    logger.info("loaded households %s" % (hh_df.shape,))

    # replace table function with dataframe
    inject.add_table('households', hh_df)

    pipeline.get_rn_generator().add_channel(hh_df, 'households')

    if trace_hh_id:
        tracing.register_traceable_table('households', hh_df)
        tracing.trace_df(hh_df, "households", warn_if_empty=True)

    return hh_df
def get_trip_mc_logsums_for_all_modes(tours, segment_column_name, model_settings, trace_label):
    """Run trip mode choice on pseudo-trips built from tours to obtain logsums

    Parameters
    ----------
    tours : pandas.DataFrame
    segment_column_name : str
        column in tours table used for segmenting model spec
    model_settings : dict
    trace_label : str

    Returns
    -------
    tours : pd.DataFrame
        Adds two * n_modes logsum columns to each tour row, e.g. "logsum_DRIVE_outbound"
    """

    # create pseudo-trips from tours for all tour modes
    pseudo_trips = create_logsum_trips(tours, segment_column_name, model_settings, trace_label)

    # the pseudo-trips stand in for the pipeline 'trips' table while the step runs
    rn_generator = pipeline.get_rn_generator
    pipeline.replace_table('trips', pseudo_trips)
    tracing.register_traceable_table('trips', pseudo_trips)
    rn_generator().add_channel('trips', pseudo_trips)

    # run trip mode choice on pseudo-trips. use orca instead of pipeline to
    # execute the step because pipeline can only handle one open step at a time
    orca.run(['trip_mode_choice'])

    # add trip mode choice logsums as new cols in tours
    tours = append_tour_leg_trip_mode_choice_logsums(tours)

    # de-register logsum trips table
    rn_generator().drop_channel('trips')
    tracing.deregister_traceable_table('trips')

    return tours
def create_non_mandatory_tours(trace_hh_id):
    """
    Turn the non-mandatory tour frequencies stored on the persons table into
    rows of the "tours" table: one row per generated tour, carrying the id of
    the person it belongs to.

    The extended tours table is registered for tracing; only the newly created
    tours are added to the 'tours' random number channel and, when trace_hh_id
    is truthy, traced.
    """
    persons_table = inject.get_table('persons')
    frequency_alts = inject.get_injectable('non_mandatory_tour_frequency_alts')

    # persons without a non-mandatory tour frequency generate no tours
    frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    new_tours = process_non_mandatory_tours(frequencies, frequency_alts)

    all_tours = pipeline.extend_table("tours", new_tours)
    tracing.register_traceable_table('tours', all_tours)
    pipeline.get_rn_generator().add_channel(new_tours, 'tours')

    if not trace_hh_id:
        return

    tracing.trace_df(new_tours,
                     label="non_mandatory_tour_frequency.non_mandatory_tours",
                     warn_if_empty=True)
def test_rng_access():
    """
    With the pipeline open, the pipeline's random number generator must be a
    random.Random instance.
    """
    setup_dirs()
    inject.add_injectable('rng_base_seed', 0)

    pipeline.open_pipeline()

    generator = pipeline.get_rn_generator()
    assert isinstance(generator, random.Random)

    pipeline.close_pipeline()
    inject.clear_cache()
def test_rng_access():
    """
    With the pipeline open, the pipeline's random number generator must be a
    random.Random instance.
    """
    test_dir = os.path.dirname(__file__)
    setup_dirs(os.path.join(test_dir, 'configs'))

    inject.add_injectable('rng_base_seed', 0)

    pipeline.open_pipeline()

    generator = pipeline.get_rn_generator()
    assert isinstance(generator, random.Random)

    pipeline.close_pipeline()
    inject.clear_cache()
def mandatory_tour_frequency(persons_merged,
                             mandatory_tour_frequency_spec,
                             mandatory_tour_frequency_settings,
                             trace_hh_id):
    """
    Predict how often each person makes mandatory trips (work and school in
    some combination - see the alternatives above).

    The choice is stored as the persons column "mandatory_tour_frequency" and
    the mandatory tours table is then created from it.
    """
    model_name = 'mandatory_tour_frequency'

    # filter based on results of CDAP
    choosers = persons_merged.to_frame()
    has_mandatory_pattern = choosers.cdap_activity == 'M'
    choosers = choosers[has_mandatory_pattern]
    logger.info("Running mandatory_tour_frequency with %d persons" % len(choosers))

    nest_spec = config.get_logit_model_settings(mandatory_tour_frequency_settings)
    constants = config.get_model_constants(mandatory_tour_frequency_settings)

    alt_positions = asim.simple_simulate(
        choosers,
        spec=mandatory_tour_frequency_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        # only pass a trace label when tracing a household
        trace_label=model_name if trace_hh_id else trace_hh_id,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    alt_names = mandatory_tour_frequency_spec.columns[alt_positions.values]
    choices = pd.Series(alt_names, index=alt_positions.index)
    choices = choices.reindex(persons_merged.local.index)

    tracing.print_summary('mandatory_tour_frequency', choices, value_counts=True)

    orca.add_column("persons", "mandatory_tour_frequency", choices)
    pipeline.add_dependent_columns("persons", "persons_mtf")

    create_mandatory_tours_table()

    # FIXME - test prng repeatability
    random_rows = pipeline.get_rn_generator().random_for_df(choices)
    flat_randoms = [value for row in random_rows for value in row]
    orca.add_column("persons", "mtf_rand", flat_randoms)

    if trace_hh_id:
        tracing.trace_df(orca.get_table('persons_merged').to_frame(),
                         label="mandatory_tour_frequency",
                         columns=['mandatory_tour_frequency'],
                         warn_if_empty=True)
def local_utilities():
    """
    Collect the modules and helper functions made available as locals when
    expressions are evaluated.

    Returns
    -------
    utility_dict : dict
        name, entity pairs of locals
    """
    return dict(
        pd=pd,
        np=np,
        reindex=util.reindex,
        setting=config.setting,
        other_than=util.other_than,
        rng=pipeline.get_rn_generator(),
    )
def local_utilities():
    """
    Collect the modules and helper functions made available as locals when
    expressions are evaluated, together with the global constants.

    Returns
    -------
    utility_dict : dict
        name, entity pairs of locals
    """
    utility_dict = dict(
        pd=pd,
        np=np,
        reindex=util.reindex,
        reindex_i=util.reindex_i,
        setting=config.setting,
        other_than=util.other_than,
        skim_time_period_label=expressions.skim_time_period_label,
        rng=pipeline.get_rn_generator(),
    )

    # global constants are layered on top, so they win on a name clash
    utility_dict.update(config.get_global_constants())

    return utility_dict
def households(households_sample_size, override_hh_ids, trace_hh_id):
    """
    Read the "households" input table, optionally reduce it to an override
    list, the traced household, or a random sample, then register it as the
    'households' table and random number channel and return it.

    Also sets the 'households_sliced' injectable to say whether any slicing
    happened, and adds a sequential 'chunk_id' column.

    Raises
    ------
    RuntimeError
        If override_hh_ids is given and none of its ids are in the input table.
    """
    all_households = read_input_table("households")
    n_all = all_households.shape[0]

    households_sliced = False

    logger.info("full household list contains %s households" % n_all)

    if override_hh_ids is not None:
        # only using households listed in override_hh_ids
        # trace_hh_id will not used if it is not in list of override_hh_ids
        n_override = len(override_hh_ids)
        logger.info("override household list containing %s households" % n_override)

        hh_df = all_households[all_households.index.isin(override_hh_ids)]
        households_sliced = True

        n_found = hh_df.shape[0]
        if n_found < n_override:
            logger.info("found %s of %s households in override household list" %
                        (n_found, n_override))

        if n_found == 0:
            raise RuntimeError('No override households found in store')

    elif trace_hh_id and households_sample_size == 1:
        # tracing a household exclusively:
        # keep only trace_hh (or nothing if it is not in the full store)
        hh_df = tracing.slice_ids(all_households, trace_hh_id)
        households_sliced = True

    elif households_sample_size > 0 and n_all > households_sample_size:
        # a subset of the full store was requested
        logger.info("sampling %s of %s households" % (households_sample_size, n_all))

        # Because random seed is set differently for each step, sampling of households using
        # Random.global_rng would sample differently depending upon which step it was called from.
        # We use a one-off rng seeded with the pseudo step name 'sample_households' to provide
        # repeatable sampling no matter when the table is loaded.
        #
        # Note that the external_rng is also seeded with base_seed so the sample will (rightly)
        # change if the pipeline rng's base_seed is changed
        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        sampled_positions = prng.choice(len(all_households),
                                        size=households_sample_size,
                                        replace=False)
        hh_df = all_households.take(sampled_positions)
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        trace_hh_missed = (
            trace_hh_id
            and trace_hh_id not in hh_df.index
            and trace_hh_id in all_households.index
        )
        if trace_hh_missed:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (hh_df.index[0], trace_hh_id))
            traced_hh = all_households.loc[[trace_hh_id]]
            hh_df = pd.concat([traced_hh, hh_df[1:]])

    else:
        hh_df = all_households

    # persons table
    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (hh_df.shape,))

    hh_df.index.name = 'household_id'

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in hh_df.columns
    hh_df['chunk_id'] = pd.Series(list(range(len(hh_df))), hh_df.index)

    # replace table function with dataframe
    inject.add_table('households', hh_df)

    pipeline.get_rn_generator().add_channel('households', hh_df)

    if trace_hh_id:
        tracing.register_traceable_table('households', hh_df)
        tracing.trace_df(hh_df, "raw.households", warn_if_empty=True)

    return hh_df
def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id):
    """
    Predict how often each person makes mandatory trips (work and school in
    some combination - see the alternatives above), create the corresponding
    mandatory tours, and annotate the persons table with the choice.
    """
    model_name = 'mandatory_tour_frequency'
    trace_label = model_name

    model_settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')
    alternatives = simulate.read_model_alts(
        config.config_file_path('mandatory_tour_frequency_alternatives.csv'),
        set_index='alt')

    # filter based on results of CDAP
    choosers = persons_merged.to_frame()
    has_mandatory_pattern = choosers.cdap_activity == 'M'
    choosers = choosers[has_mandatory_pattern]

    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - if no mandatory tours
    if choosers.shape[0] == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor')
    if preprocessor_settings:
        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict={},
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    alt_positions = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    alt_names = model_spec.columns[alt_positions.values]
    choices = pd.Series(alt_names, index=alt_positions.index)
    choices = choices.reindex(persons_merged.local.index)

    # - create mandatory tours
    # This reprocesses the choice of index of the mandatory tour frequency
    # alternatives into an actual dataframe of tours. Ending format is
    # the same as got non_mandatory_tours except trip types are "work" and "school"
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alternatives)

    pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # need to reindex as we only handled persons with cdap_activity == 'M'
    frequency_for_all = choices.reindex(persons.index).fillna('')
    persons['mandatory_tour_frequency'] = frequency_for_all.astype(str)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency,
                          value_counts=True)

    if not trace_hh_id:
        return

    tracing.trace_df(mandatory_tours,
                     label="mandatory_tour_frequency.mandatory_tours",
                     warn_if_empty=True)

    tracing.trace_df(persons,
                     label="mandatory_tour_frequency.persons",
                     warn_if_empty=True)
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts for each eligible person to participate or not participate in each joint tour.

    Parameters
    ----------
    tours : table wrapper
        Converted with to_frame(); rows with tour_category 'joint' are modelled.
    persons_merged : table wrapper
        Converted with to_frame() and used to build the participation candidates.
    chunk_size : int
        Passed through to the simulation.
    trace_hh_id : int or None
        When truthy, participants and joint tours are traced.

    Returns
    -------
    None
        Replaces the pipeline tables "joint_tour_participants" and "tours".
        When there are no joint tours, null results are added instead.
    """
    trace_label = 'joint_tour_participation'
    model_settings_file_name = 'joint_tour_participation.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate

    estimator = estimation.manager.begin_estimation('joint_tour_participation')

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_model_settings(model_settings, model_settings_file_name)
        estimator.write_spec(model_settings)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(candidates)

    # add household-based chunk_id so we can chunk all candidates of a household together
    assert 'chunk_id' not in candidates.columns
    unique_household_ids = candidates.household_id.unique()
    household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids)
    candidates['chunk_id'] = reindex(household_chunk_ids, candidates.household_id)

    choices = simulate.simple_simulate_by_chunk_id(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser,
        estimator=estimator)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    # the message is now actually formatted with the missing column name
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    if estimator:
        estimator.write_choices(choices)

        # we override the 'participate' boolean series, instead of raw alternative index in 'choices' series
        # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index
        survey_participants_df = estimator.get_survey_table('joint_tour_participants')
        participate = pd.Series(choices.index.isin(survey_participants_df.index.values), index=choices.index)

        # but estimation software wants to know the choices value (alternative index)
        choices = participate.replace({True: PARTICIPATE_CHOICE, False: 1-PARTICIPATE_CHOICE})
        # estimator.write_override_choices(participate)  # write choices as boolean participate
        estimator.write_override_choices(choices)  # write choices as int alt indexes

        estimator.end_estimation()

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
def assign_cdap_rank(persons, trace_hh_id=None, trace_label=None):
    """
    Assign an integer index, cdap_rank, to each household member. (Starting with 1, not 0)

    Modifies persons df in place

    The cdap_rank order is important, because cdap only assigns activities to the first
    MAX_HHSIZE persons in each household.

    This will preferentially be two working adults and the three youngest children.

    Rank is assigned starting at 1. This necessitates some care indexing, but is preferred as it
    follows the convention of 1-based pnums in expression files.

    According to the documentation of reOrderPersonsForCdap in mtctm2.abm.ctramp
    HouseholdCoordinatedDailyActivityPatternModel:

    "Method reorders the persons in the household for use with the CDAP model,
    which only explicitly models the interaction of five persons in a HH. Priority
    in the reordering is first given to full time workers (up to two), then to
    part time workers (up to two workers, of any type), then to children (youngest
    to oldest, up to three). If the method is called for a household with less
    than 5 people, the cdapPersonArray is the same as the person array."

    We diverge from the above description in that a cdap_rank is assigned to all persons,
    including 'extra' household members, whose activity is assigned subsequently.
    The pair _hh_id_, cdap_rank will uniquely identify each household member.

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data. Must contain columns _hh_size_, _hh_id_, _ptype_, _age_
    trace_hh_id : int or None
        When truthy, the persons table is traced after ranks are assigned.
    trace_label : str or None
        Prefix of the label used for the trace output.

    Returns
    -------
    cdap_rank : pandas.Series
        integer cdap_rank of every person, indexed on _persons_index_
    """

    # transient categories used to categorize persons in cdap_rank before assigning final rank
    RANK_WORKER = 1
    RANK_CHILD = 2
    RANK_BACKFILL = 3
    RANK_UNASSIGNED = 9
    persons['cdap_rank'] = RANK_UNASSIGNED

    # choose up to 2 workers, preferring full over part, older over younger
    # NOTE(review): the sort keys are household and ptype only; the preference
    # order therefore relies on the ptype coding, and age is not a sort key here
    # - confirm this matches the intent stated above
    workers = \
        persons.loc[persons[_ptype_].isin(WORKER_PTYPES), [_hh_id_, _ptype_]]\
        .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\
        .groupby(_hh_id_).head(2)
    # tag the selected workers
    persons.loc[workers.index, 'cdap_rank'] = RANK_WORKER
    del workers

    # choose up to 3, preferring youngest
    # NOTE(review): _age_ is selected but the sort is by household and ptype,
    # not age - confirm that ptype ordering gives the intended preference
    children = \
        persons.loc[persons[_ptype_].isin(CHILD_PTYPES), [_hh_id_, _ptype_, _age_]]\
        .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\
        .groupby(_hh_id_).head(3)
    # tag the selected children
    persons.loc[children.index, 'cdap_rank'] = RANK_CHILD
    del children

    # choose up to MAX_HHSIZE, preferring anyone already chosen
    # others = \
    #     persons[[_hh_id_, 'cdap_rank']]\
    #     .sort_values(by=[_hh_id_, 'cdap_rank'], ascending=[True, True])\
    #     .groupby(_hh_id_).head(MAX_HHSIZE)

    # choose up to MAX_HHSIZE, choosing randomly
    # (every person gets a random sort key; the first MAX_HHSIZE per household are kept)
    others = persons[[_hh_id_, 'cdap_rank']].copy()
    others['random_order'] = pipeline.get_rn_generator().random_for_df(persons)
    others = \
        others\
        .sort_values(by=[_hh_id_, 'random_order'], ascending=[True, True])\
        .groupby(_hh_id_).head(MAX_HHSIZE)
    # tag the backfilled persons
    # (only those kept above that are not already tagged as worker or child)
    persons.loc[others[others.cdap_rank == RANK_UNASSIGNED].index, 'cdap_rank'] \
        = RANK_BACKFILL
    del others

    # assign person number in cdapPersonArray preference order
    # i.e. convert cdap_rank from category to index in order of category rank within household
    # groupby rank() is slow, so we compute rank artisanally
    # save time by sorting only the columns we need (persons is big, and sort moves data)
    p = persons[[_hh_id_, 'cdap_rank', _age_]]\
        .sort_values(by=[_hh_id_, 'cdap_rank', _age_], ascending=[True, True, True])
    # one range(household size) per household, in household order ...
    rank = p.groupby(_hh_id_).size().map(range)
    # ... flattened into a single 1-based list that lines up with the sorted rows of p
    rank = [item+1 for sublist in rank for item in sublist]
    p['cdap_rank'] = rank
    persons['cdap_rank'] = p['cdap_rank']  # assignment aligns on index values

    # if DUMP:
    #     tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label,
    #                      transpose=False, slicer='NONE')

    if trace_hh_id:
        tracing.trace_df(persons, '%s.cdap_rank' % trace_label)

    return persons['cdap_rank']
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predict, for each eligible person, whether they participate in each joint tour.

    Selects the tours whose tour_category is 'joint', builds the table of candidate
    participants, runs simple_simulate (with participants_chooser as custom chooser),
    stores the chosen participants as the "joint_tour_participants" pipeline table and
    updates person_id (the point person) and number_of_participants in the "tours" table.

    Parameters
    ----------
    tours : table wrapper
        must provide to_frame(); the resulting frame needs a tour_category column
    persons_merged : table wrapper
        must provide to_frame(); passed to the candidate builder and the preprocessor
    chunk_size : passed through unchanged to simulate.simple_simulate
    trace_hh_id : if truthy, participants and joint_tours are written with tracing.trace_df

    Returns
    -------
    None (results are written to the pipeline)
    """
    trace_label = 'joint_tour_participation'
    model_settings = config.read_model_settings('joint_tour_participation.yaml')
    model_spec = simulate.read_model_spec(file_name='joint_tour_participation.csv')

    tours = tours.to_frame()
    # explicit copy: new columns are assigned to joint_tours below, and assigning into
    # a boolean-mask slice of tours triggers pandas chained-assignment warnings
    joint_tours = tours[tours.tour_category == 'joint'].copy()

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)",
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate
    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    # the message is now actually interpolated with the missing column name
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ
    choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns:
        <alt_dest_col_name>, prob, pick_count
        (not modified: the column rename below is applied to a local copy)
    MAZ_size_terms: object whose get(<MAZ ids>, <purposes>) returns the size term of each
        MAZ for the given purpose (it is only used through that call here)
    trips: object with a 'purpose' attribute used to map trip_id to trip purpose
    network_los: object with a maz_taz_df dataframe having 'MAZ' and 'TAZ' columns
    alt_dest_col_name: str
        name of the destination column in taz_sample (holding TAZ ids) and in the
        returned table (holding MAZ ids)
    trace_label: str

    Returns
    -------
    dataframe with with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    """

    if len(taz_sample) == 0:
        # it can happen that all trips have no viable destinations (and so are dropped from the sample)
        # in which case we can just return the empty taz_sample, since it has the same columns
        return taz_sample.copy()

    # we had to use alt_dest_col_name as specified in model_settings for interaction_sample
    # because expressions reference it to look up size_terms by trip purpose
    DEST_MAZ = alt_dest_col_name
    DEST_TAZ = f"{alt_dest_col_name}_TAZ"

    # rename on a new frame rather than in place, so the caller's table is left untouched
    taz_sample = taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ})

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ')

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(trace_label, 'taz_sample'),
                         transpose=False)

    # print(f"taz_sample\n{taz_sample}")
    #           alt_dest_TAZ      prob  pick_count
    # trip_id
    # 4343721             12  0.000054           1
    # 4343721             20  0.001864           2

    # one row per pick: repeat each sampled TAZ row pick_count times
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #    trip_id  alt_dest_TAZ      prob
    # 0  4343721            12  0.000054
    # 1  4343721            20  0.001864
    # 2  4343721            20  0.001864

    # print(f"MAZ_size_terms\n{MAZ_size_terms.df}")
    #           work  escort  shopping  eatout  othmaint  social  othdiscr   univ
    # alt_dest
    # 2         31.0   9.930     0.042   0.258     0.560   0.520    10.856  0.042
    # 3          0.0   3.277     0.029   0.000     0.029   0.029     7.308  0.029
    # 4          0.0   1.879     0.023   0.000     0.023   0.023     5.796  0.023

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'trip_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].rename(columns={'TAZ': DEST_TAZ, 'MAZ': DEST_MAZ})
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         maz_taz,
                         how='left', on=DEST_TAZ).set_index('index')

    # NOTE: 'trip_id' is hard-coded here (and in the trace slicers below) because purpose
    # is looked up in trips; chooser_id_col is therefore expected to be 'trip_id'
    purpose = maz_sizes['trip_id'].map(trips.purpose)  # size term varies by purpose
    maz_sizes['size_term'] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose)

    # print(f"maz_sizes\n{maz_sizes}")
    #          trip_id  alt_dest_TAZ  alt_dest  size_term
    # index
    # 0        4343721            12      3445      0.019
    # 0        4343721            12     11583      0.017
    # 0        4343721            12     21142      0.020

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer='trip_id')
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0)
    padded_maz_sizes = padded_maz_sizes.reshape(-1, max_maz_count)

    # prob array with one row TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size).reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    # the taken rows carry the maz_sizes index (offset into taz_choices), so the
    # assignment aligns each chosen MAZ with its taz_choices row
    taz_choices[DEST_MAZ] = maz_sizes[DEST_MAZ].take(positions + first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer='trip_id')
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[['trip_id', DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = trace_maz_counts.cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets, max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        trace_padded = np.insert(trace_maz_sizes[DEST_MAZ].values,
                                 trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=trace_padded,
                          columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        trace_padded = np.insert(trace_maz_sizes['size_term'].values,
                                 trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=trace_padded,
                          columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'),
                         transpose=False)

    # collapse repeated (chooser, MAZ) picks back into one row with a pick_count
    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg(prob=('prob', 'max'),
                                                                      pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
def stop_frequency(tours, tours_merged, stop_frequency_alts, skim_dict,
                   chunk_size, trace_hh_id):
    """
    stop frequency model

    For each tour, choose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Adds stop_frequency str column to tours (and primary_purpose, if tours lacks it)

    creates trips table (via process_trips), documented as having columns:

    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count

    Parameters
    ----------
    tours, tours_merged : table wrappers providing to_frame()
    stop_frequency_alts : passed unchanged to process_trips
    skim_dict : must provide wrap('origin', 'destination') (used by the preprocessor)
    chunk_size : passed unchanged to simulate.simple_simulate
    trace_hh_id : if truthy, tours, trips, annotations and tours_merged are traced
    """

    trace_label = 'stop_frequency'
    model_settings = config.read_model_settings('stop_frequency.yaml')

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()

    assert not tours_merged.household_id.isnull().any()
    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # annotations only exist when a preprocessor is configured; start from None so the
    # tracing code at the end does not hit an unbound local
    annotations = None

    # - run preprocessor to annotate tours_merged
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {"od_skims": od_skim_stack_wrapper}
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose, value_counts=True)

    # one spec file (and one simulation) per primary_purpose segment
    choices_list = []
    for segment_type, choosers in tours_merged.groupby('primary_purpose'):

        logger.info("%s running segment %s with %s chooser rows",
                    trace_label, segment_type, choosers.shape[0])

        spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' % segment_type)

        assert spec is not None, "spec for segment_type %s not found" % segment_type

        choices = simulate.simple_simulate(
            choosers=choosers,
            spec=spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_type),
            trace_choice_name='stops')

        # convert indexes to alternative names
        choices = pd.Series(spec.columns[choices.values], index=choices.index)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    if 'primary_purpose' not in tours.columns:
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        if annotations is not None:
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
def stop_frequency(
        tours, tours_merged,
        stop_frequency_alts,
        skim_dict,
        chunk_size,
        trace_hh_id):
    """
    stop frequency model

    For each tour, choose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Adds stop_frequency str column to tours (and primary_purpose, if tours lacks it)

    creates trips table (via process_trips), documented as having columns:

    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count

    Parameters
    ----------
    tours, tours_merged : table wrappers providing to_frame()
    stop_frequency_alts : passed unchanged to process_trips
    skim_dict : must provide wrap('origin', 'destination') (used by the preprocessor)
    chunk_size : passed unchanged to simulate.simple_simulate
    trace_hh_id : if truthy, tours, trips, annotations and tours_merged are traced
    """

    trace_label = 'stop_frequency'
    model_settings = config.read_model_settings('stop_frequency.yaml')

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()

    assert not tours_merged.household_id.isnull().any()
    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # bound up front: without a preprocessor there are no annotations, and the trace
    # block at the bottom must not reference an unassigned name
    annotations = None

    # - run preprocessor to annotate tours_merged
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {
            "od_skims": od_skim_stack_wrapper
        }
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose, value_counts=True)

    # each primary_purpose segment is simulated with its own spec file
    choices_list = []
    for segment_type, choosers in tours_merged.groupby('primary_purpose'):

        logger.info("%s running segment %s with %s chooser rows",
                    trace_label, segment_type, choosers.shape[0])

        spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' % segment_type)

        assert spec is not None, "spec for segment_type %s not found" % segment_type

        choices = simulate.simple_simulate(
            choosers=choosers,
            spec=spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_type),
            trace_choice_name='stops')

        # convert indexes to alternative names
        choices = pd.Series(spec.columns[choices.values], index=choices.index)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    if 'primary_purpose' not in tours.columns:
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        # only present when the preprocessor ran
        if annotations is not None:
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
def non_mandatory_tour_frequency(persons, persons_merged,
                                 chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Parameters
    ----------
    persons, persons_merged : table wrappers providing to_frame()
    chunk_size : passed unchanged to interaction_simulate
    trace_hh_id : passed to extend_tour_counts; if truthy, the new tours, the choosers
        and the annotated persons are traced

    Side effects: extends the "tours" pipeline table with the non-mandatory tours and
    replaces the "persons" table (with a non_mandatory_tour_frequency column added).
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill non-travelers)
    # select the index label of the first all-zero alternative; taking .index[0] of the
    # boolean mask itself would return the first label whether or not it has zero tours
    alt_tour_totals = alternatives.sum(axis=1)
    no_tours_alts = alt_tour_totals.index[alt_tour_totals == 0]
    if len(no_tours_alts) > 0:
        no_tours_alt = no_tours_alts[0]
    else:
        # keep the historical fallback (first alternative) but make the problem visible
        logger.warning("%s: no alternative with zero tours found;"
                       " backfilling non-travelers with the first alternative", trace_label)
        no_tours_alt = alternatives.index[0]
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    # We have now generated non-mandatory tours, but they are attributes of the person table
    # Now we create a "tours" table which has one row per tour that has been generated
    # (and the person id it is associated with)

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    #
    #            escort  shopping  othmaint  othdiscr    eatout    social
    # parent_id
    # 2588676         2         0         0         1         1         0
    # 2588677         0         1         0         1         0         0
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    extended_tour_count = tour_counts.sum().sum()

    logger.info("extend_tour_counts increased nmtf tour count by %s from %s to %s",
                extended_tour_count - prev_tour_count, prev_tour_count, extended_tour_count)

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
def non_mandatory_tour_frequency(persons, persons_merged,
                                 chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Parameters
    ----------
    persons, persons_merged : table wrappers providing to_frame()
    chunk_size : passed unchanged to interaction_simulate
    trace_hh_id : passed to extend_tour_counts; if truthy, the new tours, the choosers
        and the annotated persons are traced

    Side effects: extends the "tours" pipeline table with the non-mandatory tours and
    replaces the "persons" table (with a non_mandatory_tour_frequency column added).
    When an estimator is active, choices and tour counts are overridden with survey
    values and the created tours are checked against the survey tours.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings_file_name = 'non_mandatory_tour_frequency.yaml'

    model_settings = config.read_model_settings(model_settings_file_name)

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives = simulate.read_model_alts('non_mandatory_tour_frequency_alternatives.csv', set_index=None)
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # filter based on results of CDAP
    choosers = persons_merged.to_frame()
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    spec_segments = model_settings.get('SPEC_SEGMENTS', {})

    # estimator is consulted again after the loop; bind it here so that code does not
    # depend on a name that only exists once a non-empty segment has been processed
    estimator = None

    # segment by person type and pick the right spec for each person type
    choices_list = []
    for segment_settings in spec_segments:

        segment_name = segment_settings['NAME']
        ptype = segment_settings['PTYPE']

        # pick the spec column for the segment
        segment_spec = model_spec[[segment_name]]

        chooser_segment = choosers[choosers.ptype == ptype]

        logger.info("Running segment '%s' of size %d", segment_name, len(chooser_segment))

        if len(chooser_segment) == 0:
            # skip empty segments
            continue

        estimator = \
            estimation.manager.begin_estimation(model_name=segment_name, bundle_name='non_mandatory_tour_frequency')

        coefficients_df = simulate.read_model_coefficients(segment_settings)
        segment_spec = simulate.eval_coefficients(segment_spec, coefficients_df, estimator)

        if estimator:
            estimator.write_spec(model_settings, bundle_directory=True)
            estimator.write_model_settings(model_settings, model_settings_file_name, bundle_directory=True)
            # preserving coefficients file name makes bringing back updated coefficients more straightforward
            estimator.write_coefficients(coefficients_df, segment_settings)
            estimator.write_choosers(chooser_segment)
            estimator.write_alternatives(alternatives, bundle_directory=True)

            # FIXME #interaction_simulate_estimation_requires_chooser_id_in_df_column
            #  should we do it here or have interaction_simulate do it?
            # chooser index must be duplicated in column or it will be omitted from interaction_dataset
            # estimation requires that chooser_id is either in index or a column of interaction_dataset
            # so it can be reformatted (melted) and indexed by chooser_id and alt_id
            assert chooser_segment.index.name == 'person_id'
            assert 'person_id' not in chooser_segment.columns
            chooser_segment['person_id'] = chooser_segment.index

            # FIXME set_alt_id - do we need this for interaction_simulate estimation bundle tables?
            estimator.set_alt_id('alt_id')

            estimator.set_chooser_id(chooser_segment.index.name)

        choices = interaction_simulate(
            chooser_segment,
            alternatives,
            spec=segment_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % segment_name,
            trace_choice_name='non_mandatory_tour_frequency',
            estimator=estimator)

        if estimator:
            estimator.write_choices(choices)
            choices = estimator.get_survey_values(choices, 'persons', 'non_mandatory_tour_frequency')
            estimator.write_override_choices(choices)
            estimator.end_estimation()

        choices_list.append(choices)

        # FIXME - force garbage collection?
        force_garbage_collect()

    del alternatives['tot_tours']  # del tot_tours column we added above

    # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate
    # is the index value of the chosen alternative in the alternatives table.
    choices = pd.concat(choices_list).sort_index()

    # add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    # select the index label of the first all-zero alternative; taking .index[0] of the
    # boolean mask itself would return the first label whether or not it has zero tours
    alt_tour_totals = alternatives.sum(axis=1)
    no_tours_alts = alt_tour_totals.index[alt_tour_totals == 0]
    if len(no_tours_alts) > 0:
        no_tours_alt = no_tours_alts[0]
    else:
        # keep the historical fallback (first alternative) but make the problem visible
        logger.warning("%s: no alternative with zero tours found;"
                       " backfilling non-travelers with the first alternative", trace_label)
        no_tours_alt = alternatives.index[0]

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    # We have now generated non-mandatory tour frequencies, but they are attributes of the person table
    # Now we create a "tours" table which has one row per tour that has been generated
    # (and the person id it is associated with)
    #
    # But before we do that, we run an additional probabilistic step to extend/increase tour counts
    # beyond the strict limits of the tour_frequency alternatives chosen above (which are currently limited
    # to at most 2 escort tours and 1 each of shopping, othmaint, othdiscr, eatout, and social tours)
    #
    # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate is simply the
    # index value of the chosen alternative in the alternatives table.
    #
    # get counts of each of the tour type alternatives (so we can extend)
    #            escort  shopping  othmaint  othdiscr    eatout    social
    # parent_id
    # 2588676         2         0         0         1         1         0
    # 2588677         0         1         0         1         0         0

    # counts of each of the tour type alternatives (so we can extend)
    modeled_tour_counts = alternatives.loc[choices]
    modeled_tour_counts.index = choices.index  # assign person ids to the index

    # - extend_tour_counts - probabilistic
    # (a copy is passed so modeled_tour_counts stays available for the comparison below)
    extended_tour_counts = \
        extend_tour_counts(choosers, modeled_tour_counts.copy(), alternatives, trace_hh_id,
                           tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    num_modeled_tours = modeled_tour_counts.sum().sum()
    num_extended_tours = extended_tour_counts.sum().sum()
    logger.info("extend_tour_counts increased tour count by %s from %s to %s",
                num_extended_tours - num_modeled_tours, num_modeled_tours, num_extended_tours)

    # in estimation mode, replace the extended counts with the survey values
    # (read from the '_<tour_type>' columns of the survey persons table)
    if estimator:
        override_tour_counts = \
            estimation.manager.get_survey_values(extended_tour_counts,
                                                 table_name='persons',
                                                 column_names=['_%s' % c for c in extended_tour_counts.columns])
        override_tour_counts = \
            override_tour_counts.rename(columns={('_%s' % c): c for c in extended_tour_counts.columns})
        logger.info("estimation get_survey_values override_tour_counts %s changed cells",
                    (override_tour_counts != extended_tour_counts).sum().sum())
        extended_tour_counts = override_tour_counts

    # create the non_mandatory tours based on extended_tour_counts
    non_mandatory_tours = process_non_mandatory_tours(persons, extended_tour_counts)
    assert len(non_mandatory_tours) == extended_tour_counts.sum().sum()

    if estimator:

        # make sure they created the right tours
        survey_tours = estimation.manager.get_survey_table('tours').sort_index()
        non_mandatory_survey_tours = survey_tours[survey_tours.tour_category == 'non_mandatory']
        assert len(non_mandatory_survey_tours) == len(non_mandatory_tours)
        assert non_mandatory_survey_tours.index.equals(non_mandatory_tours.sort_index().index)

        # make sure they created tours with the expected tour_ids
        columns = ['person_id', 'household_id', 'tour_type', 'tour_category']
        survey_tours = \
            estimation.manager.get_survey_values(non_mandatory_tours,
                                                 table_name='tours',
                                                 column_names=columns)

        tours_differ = (non_mandatory_tours[columns] != survey_tours[columns]).any(axis=1)

        if tours_differ.any():
            print("tours_differ\n%s" % tours_differ)
            print("%s of %s tours differ" % (tours_differ.sum(), len(tours_differ)))
            print("differing survey_tours\n%s" % survey_tours[tours_differ])
            print("differing modeled_tours\n%s" % non_mandatory_tours[columns][tours_differ])

        assert (not tours_differ.any())

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label):
    """
    Convert a table of sampled destination TAZs into a table of sampled destination MAZs.

    Each (chooser, DEST_TAZ) pick in taz_sample is expanded pick_count times, and for every
    expanded pick one MAZ is drawn from the MAZs belonging to that TAZ, with probability
    proportional to the MAZ's size_term. The resulting picks are then re-aggregated by
    (chooser, DEST_MAZ).

    DEST_TAZ and DEST_MAZ are column-name constants defined outside this function.

    Parameters
    ----------
    taz_sample : pandas.DataFrame
        Indexed by the (duplicated, named) chooser id, with columns <DEST_TAZ>, prob, pick_count.
        The index name must not be None when tracing is active.
    MAZ_size_terms : pandas.DataFrame
        One row per MAZ with columns zone_id, <DEST_TAZ>, size_term. Rows are joined to the
        TAZ picks on <DEST_TAZ>; the frame's own index is not used by the merge.
    trace_label : str
        Base label for trace output; extended with 'choose_MAZ_for_TAZ' when tracing.

    Returns
    -------
    pandas.DataFrame
        Indexed by the (duplicated) chooser id, with columns <DEST_MAZ>, prob, pick_count, where
        prob is the max of TAZ_prob * MAZ_prob over the picks that landed on the same MAZ and
        pick_count is the number of such picks.
    """

    # print(f"taz_sample\n{taz_sample}")
    #           dest_TAZ      prob  pick_count  person_id
    # tour_id
    # 542963          18  0.004778           1      13243
    # 542963          53  0.004224           2      13243
    # 542963          59  0.008628           1      13243

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    # tracing is only active when a trace household is configured AND taz_sample contains targets
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ')

        # CHOOSER_ID is only bound (and only used) inside the tracing branches;
        # it holds the same value as chooser_id_col below
        CHOOSER_ID = taz_sample.index.name  # zone_id for tours, but person_id for location choice
        assert CHOOSER_ID is not None

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(trace_label, 'taz_sample'),
                         transpose=False)

    # redupe taz_sample[[DEST_TAZ, 'prob']] using pick_count to repeat rows
    # (after this, taz_choices has a fresh 0..n-1 integer index and the chooser id as a column)
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #    tour_id  dest_TAZ  TAZ_prob
    # 0   542963        18  0.004778
    # 1   542963        53  0.004224
    # 2   542963        53  0.004224
    # 3   542963        59  0.008628

    # print(f"MAZ_size_terms\n{MAZ_size_terms}")
    #    zone_id  dest_TAZ  size_term
    # 0     6097         2      7.420
    # 1    16421         2      9.646
    # 2    24251         2     10.904

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'person_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    # (this also requires every chooser to have exactly taz_sample_size rows, in chooser_df order,
    # since np.repeat is given a single scalar repeat count)
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    # maz_sizes.index is the integer offset into taz_choices of the taz for which the maz_size row is a candidate
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         MAZ_size_terms,
                         how='left', on=DEST_TAZ).set_index('index')

    #         tour_id  dest_TAZ  zone_id  size_term
    # index
    # 0        542963        18      498     12.130
    # 0        542963        18     7696     18.550
    # 0        542963        18    15431      8.678
    # 0        542963        18    21429     29.938
    # 1        542963        53    17563     34.252

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term]

        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID)
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(trace_label, 'maz_sizes'),
                         transpose=False)

    # number of MAZ candidate rows for each taz_choices row (grouped by taz_choices offset)
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets into maz_sizes: last_row_offsets[i] is one past the final candidate row of
    # taz_choices row i, first_row_offsets[i] is its first candidate row
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0).reshape(-1, max_maz_count)

    # prob array with one row TAZ_choice, one column per alternative
    # NOTE(review): a TAZ whose size_terms sum to zero would give a zero row_sum here — confirm
    # callers never pass such TAZs
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    # one random number per taz_choices row; flattened to a column so it broadcasts across
    # the cumulative probability columns below
    # (presumably random_for_df returns a (num_choosers, n) array — verify against the rn generator)
    rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size)
    rands = rands.reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    # (np.argmax on the boolean array yields the first column whose cumulative prob exceeds the rand)
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    # positions + first_row_offsets converts the per-row column choice into a row offset in maz_sizes;
    # the taken rows carry maz_sizes' index (the taz_choices offset), so assignment aligns on it
    taz_choices[DEST_MAZ] = maz_sizes['zone_id'].take(positions + first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer=CHOOSER_ID)
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[[CHOOSER_ID, DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets, max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        # NOTE(review): this pads the CHOOSER_ID column of trace_maz_sizes, not 'zone_id', so the
        # 'dest_maz_alts' trace contains chooser ids — confirm whether 'zone_id' was intended
        padded_maz_sizes = np.insert(trace_maz_sizes[CHOOSER_ID].values, trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'), transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values, trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'), transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df, label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'), transpose=False)

    # collapse repeated picks of the same MAZ for a chooser into one row with a pick_count
    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg(prob=('prob', 'max'),
                                                                      pick_count=('prob', 'count'))

    # leave only the chooser id in the index; DEST_MAZ becomes a regular column
    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
def joint_tour_frequency(households, persons, chunk_size, trace_hh_id):
    """
    Predict the frequency of fully joint tours per household and create those tours.

    Only households flagged ``participates_in_jtf_model`` are simulated. The chosen
    alternative for each such household is expanded into joint tour records, which are
    appended to the pipeline 'tours' table. All households are then annotated with
    ``joint_tour_frequency`` (back-filled with the zero-tour alternative for households
    that were not simulated) and ``num_hh_joint_tours``, and the pipeline 'households'
    table is replaced.

    Parameters
    ----------
    households : table wrapper exposing ``to_frame()``
        Must provide a boolean ``participates_in_jtf_model`` column.
    persons : table wrapper exposing ``to_frame()``
        Must provide ``household_id``, ``PNUM`` and ``home_zone_id`` columns.
    chunk_size : int
        Passed through to ``simulate.simple_simulate``.
    trace_hh_id : int or None
        When truthy, household and joint tour trace tables are written.

    Returns
    -------
    None
        Results are written to the pipeline tables as side effects.
    """
    trace_label = 'joint_tour_frequency'
    model_settings_file_name = 'joint_tour_frequency.yaml'

    estimator = estimation.manager.begin_estimation('joint_tour_frequency')

    model_settings = config.read_model_settings(model_settings_file_name)

    alternatives = simulate.read_model_alts(
        'joint_tour_frequency_alternatives.csv', set_index='alt')

    # - only interested in households with more than one cdap travel_active person and
    # - at least one non-preschooler
    households = households.to_frame()
    multi_person_households = households[households.participates_in_jtf_model].copy()

    # - only interested in persons in multi_person_households
    # FIXME - gratuitous pathological efficiency move, just let yaml specify persons?
    persons = persons.to_frame()
    persons = persons[persons.household_id.isin(multi_person_households.index)]

    logger.info("Running joint_tour_frequency with %d multi-person households",
                multi_person_households.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'persons': persons,
            'hh_time_window_overlap': hh_time_window_overlap
        }

        expressions.assign_columns(
            df=multi_person_households,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_spec(model_settings)
        estimator.write_model_settings(model_settings, model_settings_file_name)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(multi_person_households)

    choices = simulate.simple_simulate(
        choosers=multi_person_households,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='joint_tour_frequency',
        estimator=estimator)

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values], index=choices.index)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'households', 'joint_tour_frequency')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - create joint_tours based on joint_tour_frequency choices

    # - we need a person_id in order to generate the tour index (and for register_traceable_table)
    # - but we don't know the tour participants yet
    # - so we arbitrarily choose the first person in the household
    # - to be point person for the purpose of generating an index and setting origin
    # (explicit copy so the column assignment below is not a chained assignment on a slice)
    temp_point_persons = persons.loc[persons.PNUM == 1].copy()
    temp_point_persons['person_id'] = temp_point_persons.index
    temp_point_persons = temp_point_persons.set_index('household_id')
    temp_point_persons = temp_point_persons[['person_id', 'home_zone_id']]

    joint_tours = \
        process_joint_tours(choices, alternatives, temp_point_persons)

    pipeline.extend_table("tours", joint_tours)

    tracing.register_traceable_table('tours', joint_tours)
    pipeline.get_rn_generator().add_channel('tours', joint_tours)

    # - annotate households

    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    # (select from the filtered index: taking .index[0] of the boolean mask itself would
    # return the first alternative whether or not it has zero tours)
    zero_tour_alts = alternatives.index[alternatives.sum(axis=1) == 0]
    if len(zero_tour_alts) > 0:
        no_tours_alt = zero_tour_alts[0]
    else:
        # keep the previous fallback (first alternative) so existing configs still run
        no_tours_alt = alternatives.index[0]
        logger.warning(
            "%s: no alternative with zero tours found; backfilling with first alternative %s",
            trace_label, no_tours_alt)

    households['joint_tour_frequency'] = \
        choices.reindex(households.index).fillna(no_tours_alt).astype(str)

    households['num_hh_joint_tours'] = joint_tours.groupby('household_id').size().\
        reindex(households.index).fillna(0).astype(np.int8)

    pipeline.replace_table("households", households)

    tracing.print_summary('joint_tour_frequency', households.joint_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(households,
                         label="joint_tour_frequency.households")

        tracing.trace_df(joint_tours,
                         label="joint_tour_frequency.joint_tours",
                         slicer='household_id')

    if estimator:
        # cross-check that the modeled joint tours carry exactly the survey tour ids
        survey_tours = estimation.manager.get_survey_table('tours')
        survey_tours = survey_tours[survey_tours.tour_category == 'joint']

        print(f"len(survey_tours) {len(survey_tours)}")
        print(f"len(joint_tours) {len(joint_tours)}")

        different = False
        survey_tours_not_in_tours = survey_tours[~survey_tours.index.isin(joint_tours.index)]
        if len(survey_tours_not_in_tours) > 0:
            print(f"survey_tours_not_in_tours\n{survey_tours_not_in_tours}")
            different = True
        tours_not_in_survey_tours = joint_tours[~joint_tours.index.isin(survey_tours.index)]
        # test the modeled-but-not-surveyed set here (not the surveyed-but-not-modeled one again)
        if len(tours_not_in_survey_tours) > 0:
            print(f"tours_not_in_survey_tours\n{tours_not_in_survey_tours}")
            different = True
        assert not different
def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id):
    """
    Choose a mandatory tour frequency alternative (some combination of work and school
    tours) for every person whose CDAP activity is 'M', create the corresponding tours,
    and annotate the persons table with the choice.

    Side effects: extends the pipeline 'tours' table, registers the new tours for
    tracing and random-number channels, and replaces the pipeline 'persons' table.
    When there are no mandatory-activity persons, add_null_results is called instead.
    """

    trace_label = 'mandatory_tour_frequency'

    settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')
    alts_path = config.config_file_path('mandatory_tour_frequency_alternatives.csv')
    alts = simulate.read_model_alts(alts_path, set_index='alt')

    # keep only the persons CDAP assigned a mandatory activity pattern
    merged_df = persons_merged.to_frame()
    has_mandatory_pattern = merged_df.cdap_activity == 'M'
    choosers = merged_df[has_mandatory_pattern]

    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # nothing to simulate: record null results and stop
    if len(choosers) == 0:
        add_null_results(trace_label, settings)
        return

    # optional preprocessor expressions add columns to the choosers
    preprocessor = settings.get('preprocessor', None)
    if preprocessor:
        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor,
            locals_dict={},
            trace_label=trace_label)

    logit_nests = config.get_logit_model_settings(settings)
    model_constants = config.get_model_constants(settings)

    chosen_positions = simulate.simple_simulate(
        choosers=choosers,
        spec=spec,
        nest_spec=logit_nests,
        locals_d=model_constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # translate chosen column positions into alternative names, aligned to all merged persons
    chosen_names = pd.Series(spec.columns[chosen_positions.values],
                             index=chosen_positions.index)
    choices = chosen_names.reindex(persons_merged.local.index)

    # - create mandatory tours
    # Expand each person's chosen alternative into actual tour rows; the result has the
    # same layout as the non_mandatory tours, with "work" and "school" as the tour types.
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alts)

    pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # only cdap_activity == 'M' persons were simulated, so align to everyone and blank-fill the rest
    all_person_choices = choices.reindex(persons.index)
    persons['mandatory_tour_frequency'] = all_person_choices.fillna('').astype(str)

    annotate_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons,
        model_settings=settings.get('annotate_persons'),
        trace_label=annotate_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency',
                          persons.mandatory_tour_frequency,
                          value_counts=True)

    if not trace_hh_id:
        return

    tracing.trace_df(mandatory_tours,
                     label="mandatory_tour_frequency.mandatory_tours",
                     warn_if_empty=True)
    tracing.trace_df(persons,
                     label="mandatory_tour_frequency.persons",
                     warn_if_empty=True)