def create_mandatory_tours():
    """
    Build the mandatory tours and append them to the pipeline "tours" table.

    Reads the injected 'persons' table, keeps the persons whose
    mandatory_tour_frequency is not null, expands them into tours with
    process_mandatory_tours, annotates the tours using the
    'annotate_tours_with_dest' settings, then extends the "tours" table,
    registers the new tours for tracing and adds them to the 'tours' channel
    of the pipeline's rn generator.
    """
    # FIXME - move this to body?

    persons_table = inject.get_table('persons')
    configs_dir = inject.get_injectable('configs_dir')

    person_columns = [
        "mandatory_tour_frequency",
        "is_worker",
        "school_taz",
        "workplace_taz",
    ]
    persons_df = persons_table.to_frame(columns=person_columns)

    # only persons that were assigned a mandatory tour frequency get tours
    persons_df = persons_df[persons_df.mandatory_tour_frequency.notnull()]

    alternatives = inject.get_injectable('mandatory_tour_frequency_alternatives')
    mandatory_tours = process_mandatory_tours(persons_df, alternatives)

    expressions.assign_columns(
        df=mandatory_tours,
        model_settings='annotate_tours_with_dest',
        configs_dir=configs_dir,
        trace_label='create_mandatory_tours')

    pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel(mandatory_tours, 'tours')
def create_simple_trips(tours, households, persons, trace_hh_id):
    """
    Create a simple trip table with one outbound and one inbound trip per tour.

    Parameters
    ----------
    tours : table wrapper providing to_frame() and index
        tours with person_id, destination, start and end columns
    households : table wrapper
        provides the TAZ column, looked up by household_id
    persons : table wrapper
        provides the household_id column, looked up by person_id
    trace_hh_id : truthy to trace the resulting trips table

    The resulting table is registered with orca as "trips", registered for
    tracing, and added to the 'trips' channel of the pipeline's rn generator.
    """

    logger.info("Running simple trips table creation with %d tours" % len(tours.index))

    tours_df = tours.to_frame()

    # we now have a tour_id column
    tours_df.reset_index(inplace=True)

    tours_df['household_id'] = reindex(persons.household_id, tours_df.person_id)
    tours_df['TAZ'] = reindex(households.TAZ, tours_df.household_id)

    # create inbound and outbound records
    tour_count = len(tours_df.index)
    trips = pd.concat([tours_df, tours_df], ignore_index=True)

    # first half are outbound, second half are inbound
    # (repeat count must be an int: len(trips) / 2 is a float on python 3)
    trips['INBOUND'] = np.repeat([False, True], tour_count)

    # TRIPID for outbound trips = 1, inbound_trips = 2
    trips['trip_num'] = np.repeat([1, 2], tour_count)

    # set key fields from tour fields: 'TAZ','destination','start','end'
    # inbound trips swap origin/destination and start/end relative to the tour.
    # Series.where keeps the value where the condition holds and takes the
    # alternative elsewhere; this avoids chained assignment, which pandas does
    # not guarantee will write through to the frame.
    outbound = ~trips['INBOUND']
    trips['OTAZ'] = trips['TAZ'].where(outbound, trips['destination'])
    trips['DTAZ'] = trips['destination'].where(outbound, trips['TAZ'])
    trips['start_trip'] = trips['start'].where(outbound, trips['end'])
    trips['end_trip'] = trips['end'].where(outbound, trips['start'])

    # create a stable (predictable) index based on tour_id and trip_num
    possible_trips_count = 2
    trips['trip_id'] = (trips.tour_id * possible_trips_count) + (trips.trip_num - 1)
    trips.set_index('trip_id', inplace=True, verify_integrity=True)

    trip_columns = ['tour_id', 'INBOUND', 'trip_num', 'OTAZ', 'DTAZ', 'start_trip', 'end_trip']
    trips = trips[trip_columns]

    orca.add_table("trips", trips)

    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel(trips, 'trips')

    if trace_hh_id:
        tracing.trace_df(trips,
                         label="trips",
                         warn_if_empty=True)
def create_households(trace_hh_id):
    """
    Create a three-row 'households' table and register it.

    The table is added via inject, added to the 'households' channel of the
    pipeline's rn generator, and registered for tracing when trace_hh_id is
    truthy.
    """
    # TAZ must be a list: a set literal would collapse the duplicate 100 and
    # pandas does not accept (unordered) sets as column data
    df = pd.DataFrame({'HHID': [1, 2, 3], 'TAZ': [100, 100, 101]})

    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel(df, 'households')

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
def create_households(trace_hh_id):
    """
    Create a three-row 'households' table and register it.

    The table is added via inject, added to the 'households' channel of the
    pipeline's rn generator, and registered for tracing. trace_hh_id is
    accepted but not used in this body.
    """
    # home_zone_id must be a list: a set literal would collapse the duplicate
    # 100 and pandas does not accept (unordered) sets as column data
    df = pd.DataFrame({
        'household_id': [1, 2, 3],
        'home_zone_id': [100, 100, 101],
    })

    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    tracing.register_traceable_table('households', df)
def create_mandatory_tours_table():
    """
    Build the "mandatory_tours" table from persons with a tour frequency.

    Persons whose mandatory_tour_frequency is null are skipped. The resulting
    tours are registered with orca as "mandatory_tours", registered for
    tracing, and added to the 'tours' channel of the pipeline's rn generator.
    """
    person_columns = [
        "mandatory_tour_frequency",
        "is_worker",
        "school_taz",
        "workplace_taz",
    ]
    persons_df = orca.get_table('persons').to_frame(columns=person_columns)

    # drop persons that were not assigned a mandatory tour frequency
    has_frequency = persons_df.mandatory_tour_frequency.notnull()
    mandatory_tours = process_mandatory_tours(persons_df[has_frequency])

    orca.add_table("mandatory_tours", mandatory_tours)
    tracing.register_traceable_table('mandatory_tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel(mandatory_tours, 'tours')
def create_non_mandatory_tours_table():
    """
    Build the "non_mandatory_tours" table from the persons' chosen frequencies.

    Null non_mandatory_tour_frequency values are dropped before the choices
    are expanded by process_non_mandatory_tours. The result is registered with
    orca, registered for tracing, and added to the 'tours' channel of the
    pipeline's rn generator.
    """
    persons_table = orca.get_table('persons')
    alts_table = orca.get_table('non_mandatory_tour_frequency_alts')

    chosen_frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    non_mandatory_tours = process_non_mandatory_tours(chosen_frequencies, alts_table.local)

    orca.add_table("non_mandatory_tours", non_mandatory_tours)
    tracing.register_traceable_table('non_mandatory_tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel(non_mandatory_tours, 'tours')
def initialize_tours(network_los, households, persons, trace_hh_id):
    """
    Load the input "tours" table, annotate it and register it in the pipeline.

    Parameters
    ----------
    network_los : accepted but not used in this body
    households : table providing an index of household ids
    persons : table providing an index of person ids
    trace_hh_id : truthy to trace the resulting tours table

    Raises
    ------
    RuntimeError
        if any tour refers to a person_id that is not in persons.index
    """
    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    # NOTE(review): the original OR-ed this exact test with itself; a second,
    # different injectable may have been intended - confirm
    slice_happened = inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s" % (tours.shape,))
        # keep all persons in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    skip_patch_tour_ids = model_settings.get('skip_patch_tour_ids', False)
    if not skip_patch_tour_ids:
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    pipeline.get_rn_generator().add_channel('tours', tours)

    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        # report only the offending tours, counted against the number of tours
        orphan_tour_ids = tours_without_persons[tours_without_persons].index.values
        logger.error(f"{tours_without_persons.sum()} tours out of {len(tours)} without persons\n"
                     f"{pd.Series({'tour_id': orphan_tour_ids})}")
        raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours,
                         label='initialize_tours',
                         warn_if_empty=True)
def persons(households, trace_hh_id):
    """
    Load the raw persons table and register it.

    The table returned by read_raw_persons(households) replaces the 'persons'
    table function and is added to the 'persons' channel of the pipeline's rn
    generator. When trace_hh_id is truthy it is also registered for tracing
    and traced as "raw.persons".

    Returns the persons dataframe.
    """
    raw_persons = read_raw_persons(households)

    logger.info("loaded persons %s" % (raw_persons.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', raw_persons)

    pipeline.get_rn_generator().add_channel('persons', raw_persons)

    if not trace_hh_id:
        return raw_persons

    tracing.register_traceable_table('persons', raw_persons)
    tracing.trace_df(raw_persons, "raw.persons", warn_if_empty=True)

    return raw_persons
def create_non_mandatory_tours():
    """
    Expand the persons' non-mandatory tour frequency choices into tour rows.

    The choices are stored as an attribute of the persons table; this turns
    them into one row per generated tour (via process_non_mandatory_tours) and
    appends those rows to the pipeline "tours" table. The new tours are also
    registered for tracing and added to the 'tours' channel of the pipeline's
    rn generator.
    """
    persons_table = inject.get_table('persons')
    frequency_alts = inject.get_injectable('non_mandatory_tour_frequency_alts')

    # persons without a choice (null) generate no tours
    chosen_frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    non_mandatory_tours = process_non_mandatory_tours(chosen_frequencies, frequency_alts)

    pipeline.extend_table("tours", non_mandatory_tours)
    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel(non_mandatory_tours, 'tours')
def persons(households, trace_hh_id):
    """
    Load the raw persons table, name its index 'person_id' and register it.

    The table replaces the 'persons' table function and is added to the
    'persons' channel of the pipeline's rn generator. When trace_hh_id is
    truthy it is also registered for tracing and traced as "raw.persons".

    Returns the persons dataframe.
    """
    raw_persons = read_raw_persons(households)

    logger.info("loaded persons %s" % (raw_persons.shape,))

    raw_persons.index.name = 'person_id'

    # replace table function with dataframe
    inject.add_table('persons', raw_persons)

    pipeline.get_rn_generator().add_channel('persons', raw_persons)

    if not trace_hh_id:
        return raw_persons

    tracing.register_traceable_table('persons', raw_persons)
    tracing.trace_df(raw_persons, "raw.persons", warn_if_empty=True)

    return raw_persons
def persons(households, trace_hh_id):
    """
    Load the raw persons table, register it and check it against households.

    Parameters
    ----------
    households : table providing an index of household ids
    trace_hh_id : truthy to trace the loaded table as "raw.persons"

    Returns
    -------
    the persons dataframe

    Raises
    ------
    RuntimeError
        if a person refers to a household_id missing from households.index,
        or if a household has no persons
    """
    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    tracing.register_traceable_table('persons', df)
    if trace_hh_id:
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    logger.debug(f"{len(df.household_id.unique())} unique household_ids in persons")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not households.index.duplicated().any()
    assert not df.index.duplicated().any()

    persons_without_households = ~df.household_id.isin(households.index)
    if persons_without_households.any():
        # count against the loaded dataframe: the bare name `persons` is this
        # function, so len(persons) would raise TypeError and hide the real error
        orphan_person_ids = persons_without_households[persons_without_households].index.values
        logger.error(
            f"{persons_without_households.sum()} persons out of {len(df)} without households\n"
            f"{pd.Series({'person_id': orphan_person_ids})}")
        raise RuntimeError(
            f"{persons_without_households.sum()} persons with bad household_id")

    # households with no matching person rows come back null after the reindex
    households_without_persons = \
        df.groupby('household_id').size().reindex(households.index).isnull()
    if households_without_persons.any():
        empty_household_ids = households_without_persons[households_without_persons].index.values
        logger.error(
            f"{households_without_persons.sum()} households out of {len(households.index)} without persons\n"
            f"{pd.Series({'household_id': empty_household_ids})}")
        raise RuntimeError(
            f"{households_without_persons.sum()} households with no persons")

    return df
def persons(store, households_sample_size, households, trace_hh_id):
    """
    Load the persons table from the store and register it.

    When households_sample_size is positive only persons belonging to the
    households in households.index are kept. The table replaces the orca
    'persons' table function and is added to the 'persons' channel of the
    pipeline's rn generator; when trace_hh_id is truthy it is also registered
    for tracing and traced as "persons".

    Returns the persons dataframe.
    """
    persons_df = store["persons"]

    sampled = households_sample_size > 0
    if sampled:
        # keep all persons in the sampled households
        in_sampled_household = persons_df.household_id.isin(households.index)
        persons_df = persons_df[in_sampled_household]

    logger.info("loaded persons %s" % (persons_df.shape, ))

    # replace table function with dataframe
    orca.add_table('persons', persons_df)

    pipeline.get_rn_generator().add_channel(persons_df, 'persons')

    if not trace_hh_id:
        return persons_df

    tracing.register_traceable_table('persons', persons_df)
    tracing.trace_df(persons_df, "persons", warn_if_empty=True)

    return persons_df
def households(store, households_sample_size, trace_hh_id):
    """
    Load the households table from the store, optionally sample it, register it.

    Sampling rules, in order:
    - tracing with a sample size of exactly 1: only the traced household
    - a positive sample size smaller than the store: a random sample of that
      size, with its first row swapped for the traced household when that
      household exists in the store but was not drawn
    - otherwise: the full table

    Returns the (possibly sampled) households dataframe.
    """
    all_households = store["households"]
    total_count = all_households.shape[0]

    trace_only = trace_hh_id and households_sample_size == 1
    want_sample = households_sample_size > 0 and total_count > households_sample_size

    if trace_only:
        # if we are tracing hh exclusively
        # result contains only trace_hh (or empty if not in full store)
        sample = tracing.slice_ids(all_households, trace_hh_id)

    elif want_sample:
        # if we need sample a subset of full store
        logger.info("sampling %s of %s households" % (households_sample_size, total_count))

        # take the requested random sample
        sample = asim.random_rows(all_households, households_sample_size)

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in sample.index and trace_hh_id in all_households.index:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (sample.index[0], trace_hh_id))
            traced_household = tracing.slice_ids(all_households, trace_hh_id)
            sample = pd.concat([traced_household, sample[1:]])

    else:
        sample = all_households

    logger.info("loaded households %s" % (sample.shape,))

    # replace table function with dataframe
    inject.add_table('households', sample)

    pipeline.get_rn_generator().add_channel(sample, 'households')

    if trace_hh_id:
        tracing.register_traceable_table('households', sample)
        tracing.trace_df(sample, "households", warn_if_empty=True)

    return sample
def get_trip_mc_logsums_for_all_modes(tours, segment_column_name, model_settings, trace_label):
    """Creates pseudo-trips from tours and runs trip mode choice to get logsums

    Parameters
    ----------
    tours : pandas.DataFrame
    segment_column_name : str
        column in tours table used for segmenting model spec
    model_settings : dict
    trace_label : str

    Returns
    -------
    tours : pd.DataFrame
        Adds two * n_modes logsum columns to each tour row, e.g. "logsum_DRIVE_outbound"
    """

    # create pseudo-trips from tours for all tour modes
    logsum_trips = create_logsum_trips(tours, segment_column_name, model_settings, trace_label)

    # temporarily register trips in the pipeline
    pipeline.replace_table('trips', logsum_trips)
    tracing.register_traceable_table('trips', logsum_trips)
    pipeline.get_rn_generator().add_channel('trips', logsum_trips)

    try:
        # run trip mode choice on pseudo-trips. use orca instead of pipeline to
        # execute the step because pipeline can only handle one open step at a time
        orca.run(['trip_mode_choice'])

        # add trip mode choice logsums as new cols in tours
        tours = append_tour_leg_trip_mode_choice_logsums(tours)
    finally:
        # de-register logsum trips table, even if trip mode choice failed, so the
        # temporary channel and traceable registration do not outlive this call
        pipeline.get_rn_generator().drop_channel('trips')
        tracing.deregister_traceable_table('trips')

    return tours
def create_non_mandatory_tours(trace_hh_id):
    """
    Expand the persons' non-mandatory tour frequency choices into tour rows.

    The choices are stored as an attribute of the persons table; this turns
    them into one row per generated tour (via process_non_mandatory_tours) and
    appends those rows to the pipeline "tours" table. The extended tours table
    is registered for tracing, and the new rows are added to the 'tours'
    channel of the pipeline's rn generator. When trace_hh_id is truthy the new
    tours are traced.
    """
    persons_table = inject.get_table('persons')
    frequency_alts = inject.get_injectable('non_mandatory_tour_frequency_alts')

    # persons without a choice (null) generate no tours
    chosen_frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    new_tours = process_non_mandatory_tours(chosen_frequencies, frequency_alts)

    # tracing is registered on the full extended table, the channel only
    # receives the newly created rows
    all_tours = pipeline.extend_table("tours", new_tours)
    tracing.register_traceable_table('tours', all_tours)
    pipeline.get_rn_generator().add_channel(new_tours, 'tours')

    if not trace_hh_id:
        return

    tracing.trace_df(new_tours,
                     label="non_mandatory_tour_frequency.non_mandatory_tours",
                     warn_if_empty=True)
def stop_frequency(
        tours,
        tours_merged,
        stop_frequency_alts,
        skim_dict,
        chunk_size,
        trace_hh_id):
    """
    stop frequency model

    For each tour, choose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Adds stop_frequency str column to tours (and primary_purpose if tours lacks it)

    creates trips table with columns:

    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count

    The model is run separately for each primary_purpose segment, using the
    spec file 'stop_frequency_<segment>.csv'.
    """

    trace_label = 'stop_frequency'
    model_settings = config.read_model_settings('stop_frequency.yaml')

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()

    assert not tours_merged.household_id.isnull().any()
    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # - run preprocessor to annotate tours_merged
    # annotations stays None when no preprocessor is configured, so that the
    # tracing code at the end can tell whether there is anything to trace
    annotations = None
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {
            "od_skims": od_skim_stack_wrapper
        }
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose, value_counts=True)

    choices_list = []
    for segment_type, choosers in tours_merged.groupby('primary_purpose'):

        logging.info("%s running segment %s with %s chooser rows" %
                     (trace_label, segment_type, choosers.shape[0]))

        spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' % segment_type)

        assert spec is not None, "spec for segment_type %s not found" % segment_type

        choices = simulate.simple_simulate(
            choosers=choosers,
            spec=spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_type),
            trace_choice_name='stops')

        # convert indexes to alternative names
        choices = pd.Series(spec.columns[choices.values], index=choices.index)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    if 'primary_purpose' not in tours.columns:
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        # annotations only exist when a preprocessor was configured
        if annotations is not None:
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts for each eligible person to participate or not participate in each joint tour.

    Writes the chosen participants to the pipeline table
    "joint_tour_participants" and updates person_id (the point person) and
    number_of_participants on the joint tours in the "tours" table. When there
    are no joint tours, add_null_results is called instead and nothing else is
    done.
    """
    trace_label = 'joint_tour_participation'
    model_settings_file_name = 'joint_tour_participation.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    tours = tours.to_frame()
    # explicit copy: columns are assigned on joint_tours further down
    joint_tours = tours[tours.tour_category == 'joint'].copy()

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate

    estimator = estimation.manager.begin_estimation('joint_tour_participation')

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_model_settings(model_settings, model_settings_file_name)
        estimator.write_spec(model_settings)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(candidates)

    # add household-based chunk_id so all candidates of a household are chunked together
    assert 'chunk_id' not in candidates.columns
    unique_household_ids = candidates.household_id.unique()
    household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids)
    candidates['chunk_id'] = reindex(household_chunk_ids, candidates.household_id)

    choices = simulate.simple_simulate_by_chunk_id(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser,
        estimator=estimator)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    if estimator:
        estimator.write_choices(choices)

        # we override the 'participate' boolean series, instead of raw alternative index in 'choices' series
        # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index
        survey_participants_df = estimator.get_survey_table('joint_tour_participants')
        participate = pd.Series(choices.index.isin(survey_participants_df.index.values), index=choices.index)

        # but estimation software wants to know the choices value (alternative index)
        choices = participate.replace({True: PARTICIPATE_CHOICE, False: 1-PARTICIPATE_CHOICE})
        # estimator.write_override_choices(participate)  # write choices as boolean participate
        estimator.write_override_choices(choices)  # write choices as int alt indexes

        estimator.end_estimation()

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts for each eligible person to participate or not participate in each joint tour.

    Writes the chosen participants to the pipeline table
    "joint_tour_participants" and updates person_id (the point person) and
    number_of_participants on the joint tours in the "tours" table. When there
    are no joint tours, add_null_results is called instead and nothing else is
    done.
    """
    trace_label = 'joint_tour_participation'
    model_settings = config.read_model_settings('joint_tour_participation.yaml')
    model_spec = simulate.read_model_spec(file_name='joint_tour_participation.csv')

    tours = tours.to_frame()
    # explicit copy: columns are assigned on joint_tours further down
    joint_tours = tours[tours.tour_category == 'joint'].copy()

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id):
    """
    Choose a mandatory tour frequency for each person and create the tours.

    Only persons whose cdap_activity is 'M' are choosers. The chosen
    alternative names are expanded into tour rows that are appended to the
    pipeline "tours" table, and the choice is written back to the "persons"
    table as the string column mandatory_tour_frequency (empty string for
    persons that were not choosers). When there are no choosers,
    add_null_results is called instead and nothing else is done.
    """
    trace_label = 'mandatory_tour_frequency'

    model_settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')

    alts_file_path = config.config_file_path('mandatory_tour_frequency_alternatives.csv')
    alternatives = simulate.read_model_alts(alts_file_path, set_index='alt')

    # filter based on results of CDAP
    choosers = persons_merged.to_frame()
    choosers = choosers[choosers.cdap_activity == 'M']
    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - if no mandatory tours
    if len(choosers) == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict={},
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    alt_positions = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    alt_names = pd.Series(model_spec.columns[alt_positions.values], index=alt_positions.index)
    choices = alt_names.reindex(persons_merged.local.index)

    # - create mandatory tours
    # This reprocesses the choice of index of the mandatory tour frequency
    # alternatives into an actual dataframe of tours. Ending format is
    # the same as got non_mandatory_tours except trip types are "work" and "school"
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alternatives)

    pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # need to reindex as we only handled persons with cdap_activity == 'M'
    all_person_choices = choices.reindex(persons.index)
    persons['mandatory_tour_frequency'] = all_person_choices.fillna('').astype(str)

    annotate_trace_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=annotate_trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency,
                          value_counts=True)

    if not trace_hh_id:
        return

    tracing.trace_df(mandatory_tours,
                     label="mandatory_tour_frequency.mandatory_tours",
                     warn_if_empty=True)

    tracing.trace_df(persons,
                     label="mandatory_tour_frequency.persons",
                     warn_if_empty=True)
def stop_frequency(tours, tours_merged, stop_frequency_alts, skim_dict,
                   chunk_size, trace_hh_id):
    """
    stop frequency model

    For each tour, choose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Adds stop_frequency str column to tours (and primary_purpose if tours lacks it)

    creates trips table with columns:

    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count

    The model is run separately for each primary_purpose segment, using the
    spec file 'stop_frequency_<segment>.csv'.
    """

    trace_label = 'stop_frequency'
    model_settings = config.read_model_settings('stop_frequency.yaml')

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()

    assert not tours_merged.household_id.isnull().any()
    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # - run preprocessor to annotate tours_merged
    # annotations stays None when no preprocessor is configured, so that the
    # tracing code at the end can tell whether there is anything to trace
    annotations = None
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {"od_skims": od_skim_stack_wrapper}
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose,
                          value_counts=True)

    choices_list = []
    for segment_type, choosers in tours_merged.groupby('primary_purpose'):

        logging.info("%s running segment %s with %s chooser rows" %
                     (trace_label, segment_type, choosers.shape[0]))

        spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' % segment_type)

        assert spec is not None, "spec for segment_type %s not found" % segment_type

        choices = simulate.simple_simulate(
            choosers=choosers,
            spec=spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_type),
            trace_choice_name='stops')

        # convert indexes to alternative names
        choices = pd.Series(spec.columns[choices.values], index=choices.index)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    if 'primary_purpose' not in tours.columns:
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        # annotations only exist when a preprocessor was configured
        if annotations is not None:
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
def atwork_subtour_frequency(tours,
                             persons_merged,
                             atwork_subtour_frequency_spec,
                             atwork_subtour_frequency_settings,
                             atwork_subtour_frequency_alternatives,
                             chunk_size,
                             trace_hh_id):
    """
    This model predicts the frequency of making at-work subtour tours
    (alternatives for this model come from a separate csv file which is
    configured by the user).

    Only tours with tour_type == 'work' are choosers. The chosen alternative name is
    stored in an atwork_subtour_frequency column on the tours table (null for non-work
    tours), and the subtours built by process_atwork_subtours are appended to the
    pipeline "tours" table.

    Parameters
    ----------
    tours : table wrapper
        converted with to_frame(); must have tour_type and person_id columns
    persons_merged : table wrapper
        converted with to_frame(); merged onto work tours by person_id
    atwork_subtour_frequency_spec
        model spec whose columns are the alternative names
    atwork_subtour_frequency_settings
        settings from which nest spec and constants are read
    atwork_subtour_frequency_alternatives
        passed through unchanged to process_atwork_subtours
    chunk_size
        passed through to simulate.simple_simulate
    trace_hh_id
        when truthy, the resulting tours table is written with tracing.trace_df

    Returns
    -------
    None - results are written to the pipeline "tours" table
    """

    trace_label = 'atwork_subtour_frequency'

    tours = tours.to_frame()
    persons_merged = persons_merged.to_frame()

    work_tours = tours[tours.tour_type == 'work']

    # merge persons into work_tours
    work_tours = pd.merge(work_tours, persons_merged, left_on='person_id', right_index=True)

    logger.info("Running atwork_subtour_frequency with %d work tours", len(work_tours))

    nest_spec = config.get_logit_model_settings(atwork_subtour_frequency_settings)
    constants = config.get_model_constants(atwork_subtour_frequency_settings)

    # chunk_size was previously accepted by this step but never forwarded
    choices = simulate.simple_simulate(
        choosers=work_tours,
        spec=atwork_subtour_frequency_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='atwork_subtour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(atwork_subtour_frequency_spec.columns[choices.values],
                        index=choices.index)

    tracing.print_summary('atwork_subtour_frequency', choices, value_counts=True)

    # reindex since we are working with a subset of tours
    choices = choices.reindex(tours.index)

    # add atwork_subtour_frequency column to tours
    tours['atwork_subtour_frequency'] = choices
    pipeline.replace_table("tours", tours)

    # - create atwork_subtours based on atwork_subtour_frequency choice names
    work_tours = tours[tours.tour_type == 'work']
    assert not work_tours.atwork_subtour_frequency.isnull().any()

    subtours = process_atwork_subtours(work_tours, atwork_subtour_frequency_alternatives)

    pipeline.extend_table("tours", subtours)
    tracing.register_traceable_table('tours', subtours)
    pipeline.get_rn_generator().add_channel(subtours, 'tours')

    if trace_hh_id:
        trace_columns = ['atwork_subtour_frequency']
        tracing.trace_df(inject.get_table('tours').to_frame(),
                         label=trace_label,
                         columns=trace_columns,
                         warn_if_empty=True)
def non_mandatory_tour_frequency(persons, persons_merged,
                                 chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Choosers are persons whose cdap_activity is 'M' or 'N', segmented by the
    SPEC_SEGMENTS model setting (each segment selects persons by PTYPE and the spec
    column named NAME). The chosen alternative index is stored in a
    non_mandatory_tour_frequency column on the persons table (back-filled with the
    no-tours alternative for persons that were not choosers), and one tour per
    (extended) tour count is appended to the pipeline "tours" table.

    Parameters
    ----------
    persons : table wrapper
        converted with to_frame(); annotated and written back to the pipeline
    persons_merged : table wrapper
        converted with to_frame(); source of choosers
    chunk_size
        passed through to interaction_simulate
    trace_hh_id
        passed to extend_tour_counts; when truthy, results are also written with
        tracing.trace_df

    Returns
    -------
    None - results are written to the pipeline "tours" and "persons" tables
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings_file_name = 'non_mandatory_tour_frequency.yaml'

    model_settings = config.read_model_settings(model_settings_file_name)

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives = simulate.read_model_alts(
        'non_mandatory_tour_frequency_alternatives.csv', set_index=None)
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # filter based on results of CDAP
    choosers = persons_merged.to_frame()
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    spec_segments = model_settings.get('SPEC_SEGMENTS', {})

    # estimator is (re)bound per segment inside the loop but is also consulted after
    # it; bind it here so the later checks are safe even when no segment is run
    estimator = None

    # segment by person type and pick the right spec for each person type
    choices_list = []
    for segment_settings in spec_segments:

        segment_name = segment_settings['NAME']
        ptype = segment_settings['PTYPE']

        # pick the spec column for the segment
        segment_spec = model_spec[[segment_name]]

        chooser_segment = choosers[choosers.ptype == ptype]

        logger.info("Running segment '%s' of size %d", segment_name, len(chooser_segment))

        if len(chooser_segment) == 0:
            # skip empty segments
            continue

        estimator = \
            estimation.manager.begin_estimation(model_name=segment_name,
                                                bundle_name='non_mandatory_tour_frequency')

        coefficients_df = simulate.read_model_coefficients(segment_settings)
        segment_spec = simulate.eval_coefficients(segment_spec, coefficients_df, estimator)

        if estimator:
            estimator.write_spec(model_settings, bundle_directory=True)
            estimator.write_model_settings(model_settings, model_settings_file_name,
                                           bundle_directory=True)
            # preserving coefficients file name makes bringing back updated coefficients
            # more straightforward
            estimator.write_coefficients(coefficients_df, segment_settings)
            estimator.write_choosers(chooser_segment)
            estimator.write_alternatives(alternatives, bundle_directory=True)

            # FIXME #interaction_simulate_estimation_requires_chooser_id_in_df_column
            #  shuold we do it here or have interaction_simulate do it?
            # chooser index must be duplicated in column or it will be omitted from
            # interaction_dataset
            # estimation requires that chooser_id is either in index or a column of
            # interaction_dataset
            # so it can be reformatted (melted) and indexed by chooser_id and alt_id
            assert chooser_segment.index.name == 'person_id'
            assert 'person_id' not in chooser_segment.columns
            chooser_segment['person_id'] = chooser_segment.index

            # FIXME set_alt_id - do we need this for interaction_simulate estimation
            #  bundle tables?
            estimator.set_alt_id('alt_id')

            estimator.set_chooser_id(chooser_segment.index.name)

        choices = interaction_simulate(
            chooser_segment,
            alternatives,
            spec=segment_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % segment_name,
            trace_choice_name='non_mandatory_tour_frequency',
            estimator=estimator)

        if estimator:
            estimator.write_choices(choices)
            choices = estimator.get_survey_values(
                choices, 'persons', 'non_mandatory_tour_frequency')
            estimator.write_override_choices(choices)
            estimator.end_estimation()

        choices_list.append(choices)

        # FIXME - force garbage collection?
        force_garbage_collect()

    del alternatives['tot_tours']  # del tot_tours column we added above

    # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate
    # is the index value of the chosen alternative in the alternatives table.
    choices = pd.concat(choices_list).sort_index()

    # add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # we expect there to be an alt with no tours - which we can use to backfill
    # non-travelers. Select the label of the first alternative whose tour counts
    # actually sum to zero (indexing the boolean mask directly would just return the
    # first alternative, whatever its counts).
    alt_has_no_tours = alternatives.sum(axis=1) == 0
    assert alt_has_no_tours.any(), \
        "expected an alternative with no tours to backfill non-travelers"
    no_tours_alt = alt_has_no_tours[alt_has_no_tours].index[0]

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    # We have now generated non-mandatory tour frequencies, but they are attributes of
    # the person table. Now we create a "tours" table which has one row per tour that
    # has been generated (and the person id it is associated with).
    #
    # But before we do that, we run an additional probablilistic step to extend/increase
    # tour counts beyond the strict limits of the tour_frequency alternatives chosen
    # above (which are currently limited to at most 2 escort tours and 1 each of
    # shopping, othmaint, othdiscr, eatout, and social tours).
    #
    # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate
    # is simply the index value of the chosen alternative in the alternatives table.
    #
    # get counts of each of the tour type alternatives (so we can extend)
    #
    #                escort  shopping  othmaint  othdiscr    eatout    social
    #     parent_id
    #     2588676         2         0         0         1         1         0
    #     2588677         0         1         0         1         0         0

    # counts of each of the tour type alternatives (so we can extend)
    modeled_tour_counts = alternatives.loc[choices]
    modeled_tour_counts.index = choices.index  # assign person ids to the index

    # - extend_tour_counts - probabalistic
    extended_tour_counts = \
        extend_tour_counts(choosers, modeled_tour_counts.copy(), alternatives,
                           trace_hh_id,
                           tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    num_modeled_tours = modeled_tour_counts.sum().sum()
    num_extended_tours = extended_tour_counts.sum().sum()
    logger.info("extend_tour_counts increased tour count by %s from %s to %s",
                num_extended_tours - num_modeled_tours,
                num_modeled_tours,
                num_extended_tours)

    # - in estimation mode, override the extended counts with the survey values
    if estimator:
        override_tour_counts = \
            estimation.manager.get_survey_values(
                extended_tour_counts,
                table_name='persons',
                column_names=['_%s' % c for c in extended_tour_counts.columns])
        override_tour_counts = \
            override_tour_counts.rename(
                columns={('_%s' % c): c for c in extended_tour_counts.columns})
        logger.info("estimation get_survey_values override_tour_counts %s changed cells",
                    (override_tour_counts != extended_tour_counts).sum().sum())
        extended_tour_counts = override_tour_counts

    # - create the non_mandatory tours based on extended_tour_counts
    non_mandatory_tours = process_non_mandatory_tours(persons, extended_tour_counts)
    assert len(non_mandatory_tours) == extended_tour_counts.sum().sum()

    if estimator:

        # make sure they created the right tours
        survey_tours = estimation.manager.get_survey_table('tours').sort_index()
        non_mandatory_survey_tours = \
            survey_tours[survey_tours.tour_category == 'non_mandatory']
        assert len(non_mandatory_survey_tours) == len(non_mandatory_tours)
        assert non_mandatory_survey_tours.index.equals(
            non_mandatory_tours.sort_index().index)

        # make sure they created tours with the expected tour_ids
        columns = ['person_id', 'household_id', 'tour_type', 'tour_category']
        survey_tours = \
            estimation.manager.get_survey_values(non_mandatory_tours,
                                                 table_name='tours',
                                                 column_names=columns)

        tours_differ = (non_mandatory_tours[columns] != survey_tours[columns]).any(axis=1)

        if tours_differ.any():
            print("tours_differ\n%s" % tours_differ)
            print("%s of %s tours differ" % (tours_differ.sum(), len(tours_differ)))
            print("differing survey_tours\n%s" % survey_tours[tours_differ])
            print("differing modeled_tours\n%s" % non_mandatory_tours[columns][tours_differ])

        assert (not tours_differ.any())

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
def atwork_subtour_frequency(tours,
                             persons_merged,
                             chunk_size,
                             trace_hh_id):
    """
    Choose an at-work subtour frequency alternative for every work tour and create
    the corresponding subtours.

    The spec is read from 'atwork_subtour_frequency.csv', settings from
    'atwork_subtour_frequency.yaml' and alternatives (indexed by 'alt') from
    'atwork_subtour_frequency_alternatives.csv'. The chosen alternative name is written
    to an atwork_subtour_frequency column on the tours table (null for non-work tours)
    and the generated subtours are appended to the pipeline "tours" table.

    When there are no work tours, add_null_results is called and the step returns early.
    """

    trace_label = 'atwork_subtour_frequency'

    settings = config.read_model_settings('atwork_subtour_frequency.yaml')
    spec = simulate.read_model_spec(file_name='atwork_subtour_frequency.csv')

    alts_file_path = config.config_file_path('atwork_subtour_frequency_alternatives.csv')
    alts = simulate.read_model_alts(alts_file_path, set_index='alt')

    tours_df = tours.to_frame()
    persons_df = persons_merged.to_frame()

    is_work_tour = tours_df.tour_type == 'work'
    choosers = tours_df[is_work_tour]

    # - nothing to simulate if there are no work tours
    if not len(choosers):
        add_null_results(trace_label, tours_df)
        return

    # attach person attributes to each work tour
    choosers = pd.merge(choosers, persons_df, left_on='person_id', right_index=True)

    logger.info("Running atwork_subtour_frequency with %d work tours", len(choosers))

    nest_spec = config.get_logit_model_settings(settings)
    constants = config.get_model_constants(settings)

    # - optional preprocessor annotates the choosers in place
    preprocessor_settings = settings.get('preprocessor', None)
    if preprocessor_settings:
        assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            trace_label=trace_label)

    alt_positions = simulate.simple_simulate(
        choosers=choosers,
        spec=spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='atwork_subtour_frequency')

    # map positional choices onto the alternative names (the spec column labels)
    choices = pd.Series(spec.columns[alt_positions.values], index=alt_positions.index)

    tracing.print_summary('atwork_subtour_frequency', choices, value_counts=True)

    # choices only cover work tours, so align them to the full tours index
    tours_df['atwork_subtour_frequency'] = choices.reindex(tours_df.index)

    pipeline.replace_table("tours", tours_df)

    # - build the subtours from the chosen alternative names
    # (the work-tour mask is still valid: only a column was added to tours_df)
    work_tours = tours_df[is_work_tour]
    assert not work_tours.atwork_subtour_frequency.isnull().any()

    subtours = process_atwork_subtours(work_tours, alts)

    all_tours = pipeline.extend_table("tours", subtours)

    tracing.register_traceable_table('tours', subtours)
    pipeline.get_rn_generator().add_channel('tours', subtours)

    if trace_hh_id:
        tracing.trace_df(all_tours,
                         label='atwork_subtour_frequency.tours')
def joint_tour_frequency(households, persons,
                         chunk_size,
                         trace_hh_id):
    """
    Choose a joint tour frequency alternative for each household with more than one
    travel-active person and create the corresponding joint tours.

    The spec is read from 'joint_tour_frequency.csv', settings from
    'joint_tour_frequency.yaml' and alternatives (indexed by 'alt') from
    'joint_tour_frequency_alternatives.csv'. The joint tours are appended to the
    pipeline "tours" table, and the households table gains joint_tour_frequency
    ('' for households that were not choosers) and num_hh_joint_tours columns.
    """

    trace_label = 'joint_tour_frequency'

    settings = config.read_model_settings('joint_tour_frequency.yaml')
    spec = simulate.read_model_spec(file_name='joint_tour_frequency.csv')

    alts_file_path = config.config_file_path('joint_tour_frequency_alternatives.csv')
    alts = simulate.read_model_alts(alts_file_path, set_index='alt')

    # - choosers are households with more than one cdap travel_active person
    households_df = households.to_frame()
    choosers = households_df[households_df.num_travel_active > 1].copy()

    # - only persons living in those households are of interest
    # FIXME - gratuitous pathological efficiency move, just let yaml specify persons?
    persons_df = persons.to_frame()
    in_chooser_household = persons_df.household_id.isin(choosers.index)
    persons_df = persons_df[in_chooser_household]

    logger.info("Running joint_tour_frequency with %d multi-person households",
                choosers.shape[0])

    # - preprocessor
    preprocessor_settings = settings.get('preprocessor', None)
    if preprocessor_settings:
        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict={
                'persons': persons_df,
                'hh_time_window_overlap': hh_time_window_overlap,
            },
            trace_label=trace_label)

    # - simple_simulate
    nest_spec = config.get_logit_model_settings(settings)
    constants = config.get_model_constants(settings)

    alt_positions = simulate.simple_simulate(
        choosers=choosers,
        spec=spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='joint_tour_frequency')

    # map positional choices onto the alternative names (the spec column labels)
    choices = pd.Series(spec.columns[alt_positions.values], index=alt_positions.index)

    # - create joint_tours based on joint_tour_frequency choices
    # A person_id is needed to generate the tour index (and for
    # register_traceable_table) but the tour participants are not known yet, so the
    # person with PNUM == 1 in each household is arbitrarily used as point person for
    # the purpose of generating an index and setting origin.
    point_persons = persons_df.loc[persons_df.PNUM == 1]
    point_persons['person_id'] = point_persons.index
    point_persons = point_persons.set_index('household_id')
    point_persons = point_persons[['person_id', 'home_taz']]

    joint_tours = process_joint_tours(choices, alts, point_persons)

    pipeline.extend_table("tours", joint_tours)

    tracing.register_traceable_table('tours', joint_tours)
    pipeline.get_rn_generator().add_channel('tours', joint_tours)

    # - annotate households
    # the model only ran on a subset of households, so align results to all of them
    all_hh_choices = choices.reindex(households_df.index)
    households_df['joint_tour_frequency'] = all_hh_choices.fillna('').astype(str)

    tours_per_hh = joint_tours.groupby('household_id').size()
    tours_per_hh = tours_per_hh.reindex(households_df.index)
    households_df['num_hh_joint_tours'] = tours_per_hh.fillna(0).astype(np.int8)

    pipeline.replace_table("households", households_df)

    tracing.print_summary('joint_tour_frequency', households_df.joint_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(households_df,
                         label="joint_tour_frequency.households")

        tracing.trace_df(joint_tours,
                         label="joint_tour_frequency.joint_tours",
                         slicer='household_id')
def non_mandatory_tour_frequency(persons, persons_merged,
                                 chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Choosers are persons whose cdap_activity is 'M' or 'N', segmented by ptype; each
    segment uses the spec column named PTYPE_NAME[ptype]. The chosen alternative index
    is stored in a non_mandatory_tour_frequency column on the persons table
    (back-filled with the no-tours alternative for persons that were not choosers), and
    one tour per (extended) tour count is appended to the pipeline "tours" table.

    Parameters
    ----------
    persons : table wrapper
        converted with to_frame(); annotated and written back to the pipeline
    persons_merged : table wrapper
        converted with to_frame(); source of choosers
    chunk_size
        passed through to interaction_simulate
    trace_hh_id
        passed to extend_tour_counts; when truthy, results are also written with
        tracing.trace_df

    Returns
    -------
    None - results are written to the pipeline "tours" and "persons" tables
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill
    # non-travelers). Select the label of the first alternative whose tour counts
    # actually sum to zero (indexing the boolean mask directly would just return the
    # first alternative, whatever its counts).
    alt_has_no_tours = alternatives.sum(axis=1) == 0
    assert alt_has_no_tours.any(), \
        "expected an alternative with no tours to backfill non-travelers"
    no_tours_alt = alt_has_no_tours[alt_has_no_tours].index[0]

    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    # We have now generated non-mandatory tours, but they are attributes of the person
    # table. Now we create a "tours" table which has one row per tour that has been
    # generated (and the person id it is associated with).

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    #
    #                escort  shopping  othmaint  othdiscr    eatout    social
    #     parent_id
    #     2588676         2         0         0         1         1         0
    #     2588677         0         1         0         1         0         0
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    extended_tour_count = tour_counts.sum().sum()

    # use the same logger as the rest of this function (was the root logging module)
    logger.info("extend_tour_counts increased nmtf tour count by %s from %s to %s",
                extended_tour_count - prev_tour_count,
                prev_tour_count,
                extended_tour_count)

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
def stop_frequency(
        tours, tours_merged,
        stop_frequency_alts,
        network_los,
        chunk_size,
        trace_hh_id):
    """
    stop frequency model

    For each tour, choose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Tours are segmented according to the SPEC_SEGMENTS and SEGMENT_COL model settings;
    each segment names its own SPEC and COEFFICIENTS files. The chosen alternative name
    is stored in a stop_frequency str column on the tours table, and the trips built by
    process_trips are appended to the pipeline "trips" table.

    The trips table is documented upstream as having columns (produced by process_trips,
    not verifiable from this function):
    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count

    Parameters
    ----------
    tours : table wrapper
        converted with to_frame(); receives the stop_frequency column
    tours_merged : table wrapper
        converted with to_frame(); must have household_id, origin, destination,
        primary_purpose and SEGMENT_COL columns (origin and destination may not be -1)
    stop_frequency_alts
        passed through unchanged to process_trips
    network_los
        provides the default skim dict wrapped for the preprocessor, and is itself
        exposed to preprocessor expressions as 'network_los'
    chunk_size
        passed through to simulate.simple_simulate
    trace_hh_id
        when truthy, intermediate tables are written with tracing.trace_df

    Returns
    -------
    None - results are written to the pipeline "tours" and "trips" tables
    """

    trace_label = 'stop_frequency'
    model_settings_file_name = 'stop_frequency.yaml'

    model_settings = config.read_model_settings(model_settings_file_name)

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()
    assert not tours_merged.household_id.isnull().any()
    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # annotations only exist when a preprocessor is configured; bind the name up front
    # so the tracing code at the bottom cannot hit an unbound local
    annotations = None

    # - run preprocessor to annotate tours_merged
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        assert 'origin' in tours_merged
        assert 'destination' in tours_merged
        od_skim_stack_wrapper = network_los.get_default_skim_dict().wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {
            "od_skims": od_skim_stack_wrapper,
            'network_los': network_los
        }
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose, value_counts=True)

    spec_segments = model_settings.get('SPEC_SEGMENTS')
    assert spec_segments is not None, \
        f"SPEC_SEGMENTS setting not found in model settings: {model_settings_file_name}"
    segment_col = model_settings.get('SEGMENT_COL')
    assert segment_col is not None, \
        f"SEGMENT_COL setting not found in model settings: {model_settings_file_name}"

    # estimator is (re)bound per segment inside the loop but is also consulted after
    # it; bind it here so the later check is safe even when every segment is empty
    estimator = None

    choices_list = []
    for segment_settings in spec_segments:

        # the segment is both named and selected by its SEGMENT_COL value
        segment_name = segment_settings[segment_col]
        segment_value = segment_settings[segment_col]

        chooser_segment = tours_merged[tours_merged[segment_col] == segment_value]

        if len(chooser_segment) == 0:
            logging.info(f"{trace_label} skipping empty segment {segment_name}")
            continue

        logging.info(f"{trace_label} running segment {segment_name} with {chooser_segment.shape[0]} chooser rows")

        estimator = estimation.manager.begin_estimation(model_name=segment_name,
                                                        bundle_name='stop_frequency')

        segment_spec = simulate.read_model_spec(file_name=segment_settings['SPEC'])
        assert segment_spec is not None, "spec for segment_type %s not found" % segment_name

        coefficients_file_name = segment_settings['COEFFICIENTS']
        coefficients_df = simulate.read_model_coefficients(file_name=coefficients_file_name)
        segment_spec = simulate.eval_coefficients(segment_spec, coefficients_df, estimator)

        if estimator:
            estimator.write_spec(segment_settings, bundle_directory=False)
            estimator.write_model_settings(model_settings, model_settings_file_name,
                                           bundle_directory=True)
            estimator.write_coefficients(coefficients_df, segment_settings)
            estimator.write_choosers(chooser_segment)

            estimator.set_chooser_id(chooser_segment.index.name)

        choices = simulate.simple_simulate(
            choosers=chooser_segment,
            spec=segment_spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_name),
            trace_choice_name='stops',
            estimator=estimator)

        # convert indexes to alternative names
        choices = pd.Series(segment_spec.columns[choices.values], index=choices.index)

        if estimator:
            estimator.write_choices(choices)
            choices = estimator.get_survey_values(choices, 'tours', 'stop_frequency')  # override choices
            estimator.write_override_choices(choices)
            estimator.end_estimation()

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    # FIXME should have added this when tours created?
    # NOTE: while this assert is active the membership test below is always true
    assert 'primary_purpose' not in tours
    if 'primary_purpose' not in tours.columns:
        # if not already there, then it will have been added by annotate tours preprocessor
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if estimator:
        # make sure they created trips with the expected tour_ids
        columns = ['person_id', 'household_id', 'tour_id', 'outbound']

        survey_trips = estimation.manager.get_survey_table(table_name='trips')
        different = False

        survey_trips_not_in_trips = survey_trips[~survey_trips.index.isin(trips.index)]
        if len(survey_trips_not_in_trips) > 0:
            print(f"survey_trips_not_in_trips\n{survey_trips_not_in_trips}")
            different = True

        trips_not_in_survey_trips = trips[~trips.index.isin(survey_trips.index)]
        # test the modeled-but-not-surveyed set (this previously re-tested
        # survey_trips_not_in_trips, so extra modeled trips went unreported)
        if len(trips_not_in_survey_trips) > 0:
            print(f"trips_not_in_survey_trips\n{trips_not_in_survey_trips}")
            different = True

        assert not different

        survey_trips = \
            estimation.manager.get_survey_values(trips,
                                                 table_name='trips',
                                                 column_names=columns)

        trips_differ = (trips[columns] != survey_trips[columns]).any(axis=1)

        if trips_differ.any():
            print("trips_differ\n%s" % trips_differ)
            print("%s of %s tours differ" % (trips_differ.sum(), len(trips_differ)))
            print("differing survey_trips\n%s" % survey_trips[trips_differ])
            print("differing modeled_trips\n%s" % trips[columns][trips_differ])

        assert (not trips_differ.any())

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        # skip when no preprocessor ran (there is nothing to trace)
        if annotations is not None:
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
def mandatory_tour_frequency(persons_merged,
                             chunk_size,
                             trace_hh_id):
    """
    Choose a mandatory tour frequency alternative for every person whose
    cdap_activity is 'M' and create the corresponding mandatory tours.

    The spec is read from 'mandatory_tour_frequency.csv', settings from
    'mandatory_tour_frequency.yaml' and alternatives (indexed by 'alt') from
    'mandatory_tour_frequency_alternatives.csv'. The tours are appended to the pipeline
    "tours" table, and the persons table gains a mandatory_tour_frequency str column
    ('' for persons that were not choosers) before the annotate_persons expressions
    are applied.

    When there are no choosers, add_null_results is called and the step returns early.
    """

    trace_label = 'mandatory_tour_frequency'

    settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')

    alts_file_path = config.config_file_path('mandatory_tour_frequency_alternatives.csv')
    alts = simulate.read_model_alts(alts_file_path, set_index='alt')

    # filter based on results of CDAP
    merged_df = persons_merged.to_frame()
    choosers = merged_df[merged_df.cdap_activity == 'M']

    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - nothing to simulate if nobody has a mandatory activity pattern
    if not choosers.shape[0]:
        add_null_results(trace_label, settings)
        return

    # - optional preprocessor annotates the choosers in place
    preprocessor_settings = settings.get('preprocessor', None)
    if preprocessor_settings:
        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict={},
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(settings)
    constants = config.get_model_constants(settings)

    alt_positions = simulate.simple_simulate(
        choosers=choosers,
        spec=spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # map positional choices onto the alternative names (the spec column labels),
    # then align them to the index of the persons_merged local frame
    alt_names = pd.Series(spec.columns[alt_positions.values], index=alt_positions.index)
    choices = alt_names.reindex(persons_merged.local.index)

    # - create mandatory tours
    # The chosen alternative names are expanded by process_mandatory_tours into a
    # dataframe of tours. Per the original note, the format matches the non_mandatory
    # tours except that the trip types are "work" and "school".
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alts)

    pipeline.extend_table("tours", mandatory_tours)

    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons_df = inject.get_table('persons').to_frame()

    # choices only cover persons with cdap_activity == 'M', so align to all persons
    all_person_choices = choices.reindex(persons_df.index)
    persons_df['mandatory_tour_frequency'] = all_person_choices.fillna('').astype(str)

    expressions.assign_columns(
        df=persons_df,
        model_settings=settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons_df)

    tracing.print_summary('mandatory_tour_frequency', persons_df.mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_frequency.mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(persons_df,
                         label="mandatory_tour_frequency.persons",
                         warn_if_empty=True)
def run_od_logsums(spec_segment_name, tours_merged_df, od_sample,
                   model_settings, network_los, estimator, chunk_size,
                   trace_hh_id, trace_label):
    """
    add logsum column to existing tour_destination_sample table

    logsum is calculated by running the mode_choice model for each sample
    (person, OD_id) pair in od_sample, and computing the logsum of all the
    utilities

    Parameters
    ----------
    spec_segment_name
        passed through unchanged to logsum.compute_logsums
    tours_merged_df : pandas.DataFrame
        tour attributes; filtered by logsum.filter_chooser_columns and then
        left-joined onto od_sample by index
    od_sample : pandas.DataFrame
        sampled origin-destination alternatives. Modified in place: logsum
        columns are added to it, and the same object is returned.
    model_settings : dict
        must provide 'LOGSUM_SETTINGS', 'ORIG_COL_NAME' and 'DEST_COL_NAME'
    network_los, chunk_size
        passed through unchanged to logsum.compute_logsums
    estimator, trace_hh_id
        accepted but not referenced in this function body
    trace_label : str
        label used for logging, dumps and downstream tracing

    Returns
    -------
    pandas.DataFrame
        od_sample with a 'tour_mode_choice_logsum' column and, when the logsum
        settings enable COMPUTE_TRIP_MODE_CHOICE_LOGSUMS, one
        'logsum_<tour_mode>_<outbound|inbound>' column per pivoted
        (tour_mode, direction) pair.

    Notes
    -----
    When COMPUTE_TRIP_MODE_CHOICE_LOGSUMS is enabled, the pipeline 'trips'
    table is replaced with pseudo-trips and the 'trip_mode_choice' step is run
    via orca. Afterwards only the rng channel and the traceable-table
    registration are dropped; this function does not put back any previous
    'trips' table.
    """
    chunk_tag = 'tour_od.logsums'
    logsum_settings = config.read_model_settings(
        model_settings['LOGSUM_SETTINGS'])
    origin_id_col = model_settings['ORIG_COL_NAME']
    dest_id_col = model_settings['DEST_COL_NAME']
    tour_od_id_col = get_od_id_col(origin_id_col, dest_id_col)

    # FIXME - MEMORY HACK - only include columns actually used in spec
    tours_merged_df = \
        logsum.filter_chooser_columns(tours_merged_df, logsum_settings,
                                      model_settings)

    # merge ods into choosers table
    choosers = od_sample.join(tours_merged_df, how='left')
    # composite "<origin>_<destination>" string key identifying each OD pair
    choosers[tour_od_id_col] = choosers[origin_id_col].astype(
        str) + '_' + choosers[dest_id_col].astype(str)

    logger.info("Running %s with %s rows", trace_label, len(choosers))

    tracing.dump_df(DUMP, choosers, trace_label, 'choosers')

    # run trip mode choice to compute tour mode choice logsums
    if logsum_settings.get('COMPUTE_TRIP_MODE_CHOICE_LOGSUMS', False):

        pseudo_tours = choosers.copy()
        trip_mode_choice_settings = config.read_model_settings(
            'trip_mode_choice')

        # tours_merged table doesn't yet have all the cols it needs to be called (e.g.
        # home_zone_id), so in order to compute tour mode choice/trip mode choice logsums
        # in this step we have to pass all tour-level attributes in with the main trips
        # table. see trip_mode_choice.py L56-61 for more details.
        # NOTE(review): append() mutates the list object returned by .get();
        # if read_model_settings hands back a shared/cached dict this would
        # alter it for other readers - confirm.
        tour_cols_needed = trip_mode_choice_settings.get(
            'TOURS_MERGED_CHOOSER_COLUMNS', [])
        tour_cols_needed.append(tour_od_id_col)

        # from tour_mode_choice.py
        # 'school' tours made by university students get the purpose 'univ'
        not_university = (pseudo_tours.tour_type != 'school') | ~pseudo_tours.is_university
        pseudo_tours['tour_purpose'] = \
            pseudo_tours.tour_type.where(not_university, 'univ')

        # every pseudo-tour is given the '0out_0in' stop frequency
        pseudo_tours['stop_frequency'] = '0out_0in'
        pseudo_tours['primary_purpose'] = pseudo_tours['tour_purpose']
        # remember the index name so the choosers index can be restored
        # after the merge further down
        choosers_og_index = choosers.index.name
        pseudo_tours.reset_index(inplace=True)
        pseudo_tours.index.name = 'unique_id'

        # need dest_id_col to create dest col in trips, but need to preserve
        # tour dest as separate column in the trips table bc the trip mode choice
        # preprocessor isn't able to get the tour dest from the tours table bc the
        # tours don't yet have ODs.
        stop_frequency_alts = inject.get_injectable('stop_frequency_alts')
        pseudo_tours['tour_destination'] = pseudo_tours[dest_id_col]
        trips = trip.initialize_from_tours(
            pseudo_tours, stop_frequency_alts,
            [origin_id_col, dest_id_col, 'tour_destination', 'unique_id'])
        outbound = trips['outbound']
        # outbound trips depart at the tour start, all other trips at the
        # tour end
        trips['depart'] = reindex(pseudo_tours.start, trips.unique_id)
        trips.loc[~outbound, 'depart'] = reindex(pseudo_tours.end, trips.loc[~outbound, 'unique_id'])

        logsum_trips = pd.DataFrame()
        nest_spec = config.get_logit_model_settings(logsum_settings)

        # actual coeffs dont matter here, just need them to load the nest structure
        coefficients = simulate.get_segment_coefficients(
            logsum_settings, pseudo_tours.iloc[0]['tour_purpose'])
        nest_spec = simulate.eval_nest_coefficients(nest_spec, coefficients,
                                                    trace_label)
        # the leaf nests of the nest structure are the tour mode alternatives
        tour_mode_alts = []
        for nest in logit.each_nest(nest_spec):
            if nest.is_leaf:
                tour_mode_alts.append(nest.name)

        # repeat rows from the trips table iterating over tour mode
        # NOTE(review): each pass re-copies the accumulated frame and leaves
        # trips['tour_mode'] set to the last mode; the 'col not in trips'
        # test below sees that column.
        for tour_mode in tour_mode_alts:
            trips['tour_mode'] = tour_mode
            logsum_trips = pd.concat((logsum_trips, trips), ignore_index=True)
        assert len(logsum_trips) == len(trips) * len(tour_mode_alts)
        logsum_trips.index.name = 'trip_id'

        # copy over any required tour-level columns the trips do not have yet
        for col in tour_cols_needed:
            if col not in trips:
                logsum_trips[col] = reindex(pseudo_tours[col],
                                            logsum_trips.unique_id)

        pipeline.replace_table('trips', logsum_trips)
        tracing.register_traceable_table('trips', logsum_trips)
        pipeline.get_rn_generator().add_channel('trips', logsum_trips)

        # run trip mode choice on pseudo-trips. use orca instead of pipeline to
        # execute the step because pipeline can only handle one open step at a time
        orca.run(['trip_mode_choice'])

        # grab trip mode choice logsums and pivot by tour mode and direction, index
        # on tour_id to enable merge back to choosers table
        trips = inject.get_table('trips').to_frame()
        trip_dir_mode_logsums = trips.pivot(index=['tour_id', tour_od_id_col],
                                            columns=['tour_mode', 'outbound'],
                                            values='trip_mode_choice_logsum')
        # flatten the (tour_mode, outbound) column pairs into
        # 'logsum_<mode>_<direction>' names; the comprehension's 'outbound'
        # is scoped to the comprehension and does not rebind the Series above
        new_cols = [
            '_'.join(['logsum', mode, 'outbound' if outbound else 'inbound'])
            for mode, outbound in trip_dir_mode_logsums.columns
        ]
        trip_dir_mode_logsums.columns = new_cols

        # merge on (tour_id, OD id), then restore the original choosers index
        choosers.reset_index(inplace=True)
        choosers.set_index(['tour_id', tour_od_id_col], inplace=True)
        choosers = pd.merge(choosers, trip_dir_mode_logsums,
                            left_index=True, right_index=True)
        choosers.reset_index(inplace=True)
        choosers.set_index(choosers_og_index, inplace=True)

        pipeline.get_rn_generator().drop_channel('trips')
        tracing.deregister_traceable_table('trips')

        # the column copy below relies on the merge having kept choosers in
        # the same row order as od_sample
        assert (od_sample.index == choosers.index).all()
        for col in new_cols:
            od_sample[col] = choosers[col]

    logsums = logsum.compute_logsums(choosers, spec_segment_name,
                                     logsum_settings, model_settings,
                                     network_los, chunk_size, chunk_tag,
                                     trace_label, 'end', 'start', 'duration')

    assert (od_sample.index == logsums.index).all()
    od_sample['tour_mode_choice_logsum'] = logsums

    return od_sample
def households(households_sample_size, override_hh_ids, trace_hh_id):
    """
    Read the "households" input table, cut it down to the requested subset
    (if any), and publish the result as the 'households' table.

    The subset is selected by the first rule that applies:

    1. override_hh_ids is not None: keep only the listed households
       (RuntimeError if none of them are in the input table)
    2. trace_hh_id is set and households_sample_size == 1: keep only the
       traced household
    3. households_sample_size is positive and smaller than the input table:
       draw a repeatable random sample of that size, forcing the traced
       household into the sample when it exists in the input table
    4. otherwise: keep the whole table

    Also sets the 'households_sliced' injectable, names the index
    'household_id', adds a 'chunk_id' column, and registers an rng channel
    (plus tracing, when trace_hh_id is set). Returns the resulting dataframe.
    """
    all_hh = read_input_table("households")
    sliced = False

    logger.info("full household list contains %s households" % all_hh.shape[0])

    if override_hh_ids is not None:
        # - only using households listed in override_hh_ids
        # (trace_hh_id will not be used if it is not in that list)
        n_override = len(override_hh_ids)
        logger.info("override household list containing %s households" % n_override)

        hh = all_hh[all_hh.index.isin(override_hh_ids)]
        sliced = True

        n_found = hh.shape[0]
        if n_found < n_override:
            logger.info("found %s of %s households in override household list" %
                        (n_found, n_override))

        if n_found == 0:
            raise RuntimeError('No override households found in store')

    elif trace_hh_id and households_sample_size == 1:
        # - tracing a single household exclusively
        # hh contains only trace_hh (or is empty if not in full store)
        hh = tracing.slice_ids(all_hh, trace_hh_id)
        sliced = True

    elif households_sample_size > 0 and all_hh.shape[0] > households_sample_size:
        # - we need a subset of the full store
        logger.info("sampling %s of %s households" %
                    (households_sample_size, all_hh.shape[0]))

        # Because random seed is set differently for each step, sampling of
        # households using Random.global_rng would sample differently
        # depending upon which step it was called from. We use a one-off rng
        # seeded with the pseudo step name 'sample_households' to provide
        # repeatable sampling no matter when the table is loaded.
        #
        # Note that the external_rng is also seeded with base_seed so the
        # sample will (rightly) change if the pipeline rng's base_seed is
        # changed
        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        sampled_rows = prng.choice(len(all_hh),
                                   size=households_sample_size,
                                   replace=False)
        hh = all_hh.take(sampled_rows)
        sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in hh.index and trace_hh_id in all_hh.index:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (hh.index[0], trace_hh_id))
            traced_row = all_hh.loc[[trace_hh_id]]
            hh = pd.concat([traced_row, hh[1:]])

    else:
        hh = all_hh

    # persons table
    inject.add_injectable('households_sliced', sliced)

    logger.info("loaded households %s" % (hh.shape,))

    hh.index.name = 'household_id'

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in hh.columns
    hh['chunk_id'] = pd.Series(list(range(len(hh))), hh.index)

    # replace table function with dataframe
    inject.add_table('households', hh)

    pipeline.get_rn_generator().add_channel('households', hh)

    if trace_hh_id:
        tracing.register_traceable_table('households', hh)
        tracing.trace_df(hh, "raw.households", warn_if_empty=True)

    return hh
def households(households_sample_size, override_hh_ids, trace_hh_id):
    """
    Load the "households" input table and register it (or a subset of it) as
    the 'households' table.

    Depending on the arguments, the table is restricted to override_hh_ids,
    to the single traced household, or to a repeatable random sample of
    households_sample_size rows; otherwise it is used whole. The
    'households_sliced' injectable records whether any restriction was
    applied. A 'chunk_id' column is added, an rng channel is registered, and
    the dataframe is returned.

    Raises RuntimeError when override_hh_ids is given but none of those
    households exist in the input table.
    """
    source_df = read_input_table("households")
    n_source = source_df.shape[0]
    households_sliced = False

    logger.info("full household list contains %s households" % n_source)

    # only using households listed in override_hh_ids
    if override_hh_ids is not None:

        # trace_hh_id will not be used if it is not in list of override_hh_ids
        logger.info("override household list containing %s households" %
                    len(override_hh_ids))

        in_override = source_df.index.isin(override_hh_ids)
        result = source_df[in_override]
        households_sliced = True

        if result.shape[0] < len(override_hh_ids):
            logger.info("found %s of %s households in override household list" %
                        (result.shape[0], len(override_hh_ids)))

        if result.shape[0] == 0:
            raise RuntimeError('No override households found in store')

    # if we are tracing hh exclusively
    elif trace_hh_id and households_sample_size == 1:

        # result contains only trace_hh (or empty if not in full store)
        result = tracing.slice_ids(source_df, trace_hh_id)
        households_sliced = True

    # if we need a subset of full store
    elif households_sample_size > 0 and n_source > households_sample_size:

        logger.info("sampling %s of %s households" %
                    (households_sample_size, n_source))

        # The random seed differs from step to step, so sampling with
        # Random.global_rng would give a different sample depending on which
        # step triggered the load. A one-off rng seeded with the pseudo step
        # name 'sample_households' makes the sample repeatable no matter
        # when the table is loaded.
        #
        # The external_rng is also seeded with base_seed, so the sample
        # will (rightly) change if the pipeline rng's base_seed is changed.
        rng = pipeline.get_rn_generator().get_external_rng('sample_households')
        result = source_df.take(
            rng.choice(len(source_df), size=households_sample_size, replace=False))
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        missed_trace_hh = (trace_hh_id
                           and trace_hh_id not in result.index
                           and trace_hh_id in source_df.index)
        if missed_trace_hh:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (result.index[0], trace_hh_id))
            result = pd.concat([source_df.loc[[trace_hh_id]], result[1:]])

    else:
        result = source_df

    # persons table
    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (result.shape,))

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in result.columns
    chunk_ids = list(range(len(result)))
    result['chunk_id'] = pd.Series(chunk_ids, result.index)

    # replace table function with dataframe
    inject.add_table('households', result)

    pipeline.get_rn_generator().add_channel('households', result)

    if trace_hh_id:
        tracing.register_traceable_table('households', result)
        tracing.trace_df(result, "raw.households", warn_if_empty=True)

    return result
def joint_tour_frequency(households, persons, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making fully joint trips (see the
    alternatives above).

    Choosers are the households flagged participates_in_jtf_model. The chosen
    alternative is expanded into joint tours that are appended to the 'tours'
    table, and the households table is rewritten with two new columns:
    'joint_tour_frequency' (non-choosers are backfilled with the zero-tour
    alternative) and 'num_hh_joint_tours'.

    Parameters
    ----------
    households, persons
        table wrappers providing to_frame()
    chunk_size
        passed through to simulate.simple_simulate
    trace_hh_id
        when truthy, the households and joint tours tables are traced

    Raises
    ------
    RuntimeError
        if no alternative in joint_tour_frequency_alternatives.csv has a row
        sum of zero (needed to backfill non-participating households)
    AssertionError
        in estimation mode, if the generated joint tours and the survey joint
        tours do not have the same index values
    """
    trace_label = 'joint_tour_frequency'
    model_settings_file_name = 'joint_tour_frequency.yaml'

    estimator = estimation.manager.begin_estimation('joint_tour_frequency')

    model_settings = config.read_model_settings(model_settings_file_name)

    alternatives = simulate.read_model_alts(
        'joint_tour_frequency_alternatives.csv', set_index='alt')

    # - only interested in households with more than one cdap travel_active person and
    # - at least one non-preschooler
    households = households.to_frame()
    multi_person_households = households[
        households.participates_in_jtf_model].copy()

    # - only interested in persons in multi_person_households
    # FIXME - gratuitous pathological efficiency move, just let yaml specify persons?
    persons = persons.to_frame()
    persons = persons[persons.household_id.isin(multi_person_households.index)]

    logger.info(
        "Running joint_tour_frequency with %d multi-person households",
        multi_person_households.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'persons': persons,
            'hh_time_window_overlap': hh_time_window_overlap
        }

        expressions.assign_columns(df=multi_person_households,
                                   model_settings=preprocessor_settings,
                                   locals_dict=locals_dict,
                                   trace_label=trace_label)

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df,
                                            estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_spec(model_settings)
        estimator.write_model_settings(model_settings,
                                       model_settings_file_name)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(multi_person_households)

    choices = simulate.simple_simulate(
        choosers=multi_person_households,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='joint_tour_frequency',
        estimator=estimator)

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values],
                        index=choices.index)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'households',
                                              'joint_tour_frequency')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - create joint_tours based on joint_tour_frequency choices

    # - we need a person_id in order to generate the tour index (and for register_traceable_table)
    # - but we don't know the tour participants yet
    # - so we arbitrarily choose the first person in the household
    # - to be point person for the purpose of generating an index and setting origin
    # explicit copy: a column is added below, and assigning into a slice of
    # persons is chained assignment
    temp_point_persons = persons.loc[persons.PNUM == 1].copy()
    temp_point_persons['person_id'] = temp_point_persons.index
    temp_point_persons = temp_point_persons.set_index('household_id')
    temp_point_persons = temp_point_persons[['person_id', 'home_zone_id']]

    joint_tours = \
        process_joint_tours(choices, alternatives, temp_point_persons)

    pipeline.extend_table("tours", joint_tours)

    tracing.register_traceable_table('tours', joint_tours)
    pipeline.get_rn_generator().add_channel('tours', joint_tours)

    # - annotate households

    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    # select on the row sums themselves: taking .index[0] of the boolean
    # comparison would return the first alternative whether or not it is the
    # zero-tour one
    tours_per_alt = alternatives.sum(axis=1)
    zero_tour_alts = tours_per_alt[tours_per_alt == 0].index
    if len(zero_tour_alts) == 0:
        raise RuntimeError(
            "joint_tour_frequency alternatives have no alternative with zero tours")
    no_tours_alt = zero_tour_alts[0]

    households['joint_tour_frequency'] = choices.reindex(
        households.index).fillna(no_tours_alt).astype(str)

    households['num_hh_joint_tours'] = joint_tours.groupby('household_id').size().\
        reindex(households.index).fillna(0).astype(np.int8)

    pipeline.replace_table("households", households)

    tracing.print_summary('joint_tour_frequency',
                          households.joint_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(households,
                         label="joint_tour_frequency.households")

        tracing.trace_df(joint_tours,
                         label="joint_tour_frequency.joint_tours",
                         slicer='household_id')

    if estimator:
        # sanity check: the generated joint tours should have exactly the
        # same index values as the survey's joint tours
        survey_tours = estimation.manager.get_survey_table('tours')
        survey_tours = survey_tours[survey_tours.tour_category == 'joint']

        print(f"len(survey_tours) {len(survey_tours)}")
        print(f"len(joint_tours) {len(joint_tours)}")

        different = False

        survey_tours_not_in_tours = survey_tours[
            ~survey_tours.index.isin(joint_tours.index)]
        if len(survey_tours_not_in_tours) > 0:
            print(f"survey_tours_not_in_tours\n{survey_tours_not_in_tours}")
            different = True

        tours_not_in_survey_tours = joint_tours[
            ~joint_tours.index.isin(survey_tours.index)]
        # test the frame computed just above, not the survey-side frame again
        if len(tours_not_in_survey_tours) > 0:
            print(f"tours_not_in_survey_tours\n{tours_not_in_survey_tours}")
            different = True

        assert not different