def trip_destination(trips, tours_merged, chunk_size, trace_hh_id):
    """
    Choose a destination for all 'intermediate' trips based on trip purpose.

    Final trips already have a destination (the primary tour destination for outbound trips,
    and home for inbound trips.)
    """
    trace_label = 'trip_destination'
    model_settings = config.read_model_settings('trip_destination.yaml')
    CLEANUP = model_settings.get('CLEANUP', True)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    trips_df = run_trip_destination(
        trips_df,
        tours_merged_df,
        chunk_size, trace_hh_id,
        trace_label)

    if trips_df.failed.any():
        logger.warning("%s %s failed trips", trace_label, trips_df.failed.sum())
        file_name = "%s_failed_trips" % trace_label
        logger.info("writing failed trips to %s", file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

    if CLEANUP:
        trips_df = cleanup_failed_trips(trips_df)
    elif trips_df.failed.any():
        logger.warning("%s keeping %s sidelined failed trips" %
                       (trace_label, trips_df.failed.sum()))

    pipeline.replace_table("trips", trips_df)

    logger.debug("%s trips_df shape %s" % (trace_label, (trips_df.shape,)))

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def write_trace_files(self, iteration): """ Write trace files for this iteration Writes desired_size, modeled_size, and shadow_prices tables Trace file names are tagged with model_selector and iteration number (e.g. self.desired_size => shadow_price_school_desired_size_1) Parameters ---------- iteration: int current iteration to tag trace file """ logger.info("write_trace_files iteration %s" % iteration) if iteration == 1: # write desired_size only on first iteration, as it doesn't change tracing.write_csv(self.desired_size, 'shadow_price_%s_desired_size' % self.model_selector, transpose=False) tracing.write_csv(self.modeled_size, 'shadow_price_%s_modeled_size_%s' % (self.model_selector, iteration), transpose=False) if self.use_shadow_pricing: tracing.write_csv(self.shadow_prices, 'shadow_price_%s_shadow_prices_%s' % (self.model_selector, iteration), transpose=False)
def report_bad_choices(bad_row_map, df, filename, trace_label, trace_choosers=None): """ Parameters ---------- bad_row_map df : pandas.DataFrame utils or probs dataframe trace_choosers : pandas.dataframe the choosers df (for interaction_simulate) to facilitate the reporting of hh_id because we can't deduce hh_id from the interaction_dataset which is indexed on index values from alternatives df """ df = df[bad_row_map] if trace_choosers is None: hh_ids = tracing.hh_id_for_chooser(df.index, df) else: hh_ids = tracing.hh_id_for_chooser(df.index, trace_choosers) df['household_id'] = hh_ids filename = "%s.%s" % (trace_label, filename) logger.info("dumping %s" % filename) tracing.write_csv(df, file_name=filename, transpose=False) # log the indexes of the first MAX_PRINT offending rows MAX_PRINT = 0 for idx in df.index[:MAX_PRINT].values: row_msg = "%s : failed %s = %s (hh_id = %s)" % \ (trace_label, df.index.name, idx, df.household_id.loc[idx]) logger.warning(row_msg)
def eval_link_spec(link_spec, link_file_names, data_dir, link_file_column_map, link_index_fields, model_settings, trace_tag=None, trace_od=None): # accept a single string as well as a dict of {suffix: filename} if isinstance(link_file_names, str): link_file_names = {"": link_file_names} locals_dict = config.get_model_constants(model_settings) locals_dict.update(config.setting('globals')) locals_dict = add_tables_to_locals(data_dir, model_settings, locals_dict) results = {} for scenario in ['base', 'build']: logger.debug("eval_link_spec scenario %s" % scenario) link_data_subdir = 'base-data' if scenario == 'base' else 'build-data' df_list = [] for suffix, link_file_name in list(link_file_names.items()): df = read_csv_file(data_dir=os.path.join(data_dir, link_data_subdir), file_name=link_file_name, column_map=link_file_column_map) if link_index_fields: df.set_index(link_index_fields, drop=True, inplace=True) if suffix: df = df.add_suffix("_" + suffix) df_list.append(df) links_df = pd.concat(df_list, axis=1) # copy index fields into columns if link_index_fields: links_df = links_df.reset_index().set_index(link_index_fields, drop=False) if trace_od: od_column = model_settings.get('od_column', None) if od_column: o, d = trace_od trace_rows = (links_df[od_column] == o) | (links_df[od_column] == d) else: # just dump first row trace_rows = (links_df.index == 1) else: trace_rows = None summary, trace_results, trace_assigned_locals = \ bca.eval_and_sum(link_spec, links_df, locals_dict, df_alias='links', trace_rows=trace_rows) results[scenario] = summary if trace_results is not None: # FIXME: manually setting df.index.name to prevent # activitysim.tracing.write_df_csv() from attempting to reset the index. # write_df_csv() should be able to handle a multi-index dataframe. trace_results.index.name = trace_results.index.names[0] tracing.write_csv(trace_results, file_name="%s_results_%s" % (trace_tag, scenario), index_label='index', column_labels=['label', 'link']) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="%s_locals_%s" % (trace_tag, scenario)) results = results['build'] - results['base'] results.reset_index(drop=True, inplace=True) return results
def trip_purpose_and_destination( trips, tours_merged, chunk_size, trace_hh_id): trace_label = "trip_purpose_and_destination" model_settings = config.read_model_settings('trip_purpose_and_destination.yaml') MAX_ITERATIONS = model_settings.get('MAX_ITERATIONS', 5) trips_df = trips.to_frame() tours_merged_df = tours_merged.to_frame() if trips_df.empty: logger.info("%s - no trips. Nothing to do." % trace_label) return # FIXME could allow MAX_ITERATIONS=0 to allow for cleanup-only run # in which case, we would need to drop bad trips, WITHOUT failing bad_trip leg_mates assert (MAX_ITERATIONS > 0) # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry if 'destination' in trips_df: if trips_df.failed.any(): logger.info('trip_destination has already been run. Rerunning failed trips') flag_failed_trip_leg_mates(trips_df, 'failed') trips_df = trips_df[trips_df.failed] tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)] logger.info('Rerunning %s failed trips and leg-mates' % trips_df.shape[0]) else: # no failed trips from prior run of trip_destination logger.info("%s - no failed trips from prior model run." % trace_label) del trips_df['failed'] pipeline.replace_table("trips", trips_df) return results = [] i = 0 RESULT_COLUMNS = ['purpose', 'destination', 'origin', 'failed'] while True: i += 1 for c in RESULT_COLUMNS: if c in trips_df: del trips_df[c] trips_df = run_trip_purpose_and_destination( trips_df, tours_merged_df, chunk_size, trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "i%s" % i)) num_failed_trips = trips_df.failed.sum() # if there were no failed trips, we are done if num_failed_trips == 0: results.append(trips_df[RESULT_COLUMNS]) break logger.warning("%s %s failed trips in iteration %s" % (trace_label, num_failed_trips, i)) file_name = "%s_i%s_failed_trips" % (trace_label, i) logger.info("writing failed trips to %s" % file_name) tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False) # if max iterations reached, add remaining trips to results and give up # note that we do this BEFORE failing leg_mates so resulting trip legs are complete if i >= MAX_ITERATIONS: logger.warning("%s too many iterations %s" % (trace_label, i)) results.append(trips_df[RESULT_COLUMNS]) break # otherwise, if any trips failed, then their leg-mates trips must also fail flag_failed_trip_leg_mates(trips_df, 'failed') # add the good trips to results results.append(trips_df[~trips_df.failed][RESULT_COLUMNS]) # and keep the failed ones to retry trips_df = trips_df[trips_df.failed] tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)] # - assign result columns to trips results = pd.concat(results) logger.info("%s %s failed trips after %s iterations" % (trace_label, results.failed.sum(), i)) trips_df = trips.to_frame() assign_in_place(trips_df, results) trips_df = cleanup_failed_trips(trips_df) pipeline.replace_table("trips", trips_df) if trace_hh_id: tracing.trace_df(trips_df, label=trace_label, slicer='trip_id', index_label='trip_id', warn_if_empty=True)
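# A minimal, self-contained sketch of the fail-and-retry pattern used by
# trip_purpose_and_destination above (toy data; run_model() is a hypothetical
# stand-in for run_trip_purpose_and_destination, not ActivitySim code).
import numpy as np
import pandas as pd

MAX_ITERATIONS = 5
rng = np.random.default_rng(0)


def run_model(df):
    # pretend each trip fails with 20% probability on every attempt
    df = df.copy()
    df['failed'] = rng.random(len(df)) < 0.2
    return df


trips = pd.DataFrame({'tour_id': [1, 1, 2, 2]},
                     index=pd.Index([10, 11, 20, 21], name='trip_id'))

results = []
i = 0
while True:
    i += 1
    trips = run_model(trips)
    if not trips.failed.any() or i >= MAX_ITERATIONS:
        # done (or give up): keep whatever is left, failed or not
        results.append(trips)
        break
    results.append(trips[~trips.failed])  # keep the good trips
    trips = trips[trips.failed]           # retry only the failed ones

print(pd.concat(results).sort_index())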
def best_transit_path(set_random_seed, network_los, best_transit_path_spec): model_settings = config.read_model_settings('best_transit_path.yaml') logger.info("best_transit_path VECTOR_TEST_SIZE %s", VECTOR_TEST_SIZE) omaz = network_los.maz_df.sample(VECTOR_TEST_SIZE, replace=True).index dmaz = network_los.maz_df.sample(VECTOR_TEST_SIZE, replace=True).index tod = np.random.choice(['AM', 'PM'], VECTOR_TEST_SIZE) od_df = pd.DataFrame({'omaz': omaz, 'dmaz': dmaz, 'tod': tod}) trace_od = (od_df.omaz[0], od_df.dmaz[0]) logger.info("trace_od omaz %s dmaz %s" % trace_od) # build exploded atap_btap_df # FIXME - pathological knowledge about mode - should be parameterized # filter out rows with no drive time omaz-btap or no walk time from dmaz-atap atap_btap_df = network_los.get_tappairs_mazpairs(od_df.omaz, od_df.dmaz, ofilter='drive_time', dfilter='walk_alightingActual') # add in tod column atap_btap_df = atap_btap_df.merge( right=od_df[['tod']], left_on='idx', right_index=True, how='left' ) logger.info("len od_df %s", len(od_df.index)) logger.info("len atap_btap_df %s", len(atap_btap_df.index)) logger.info("avg explosion %s", (len(atap_btap_df.index) / (1.0 * len(od_df.index)))) if trace_od: trace_orig, trace_dest = trace_od trace_oabd_rows = (atap_btap_df.omaz == trace_orig) & (atap_btap_df.dmaz == trace_dest) else: trace_oabd_rows = None constants = config.get_model_constants(model_settings) locals_d = { 'np': np, 'network_los': network_los } if constants is not None: locals_d.update(constants) results, trace_results, trace_assigned_locals \ = assign.assign_variables(best_transit_path_spec, atap_btap_df, locals_d, trace_rows=trace_oabd_rows) # copy results for column in results.columns: atap_btap_df[column] = results[column] # drop rows if no utility n = len(atap_btap_df.index) atap_btap_df = atap_btap_df.dropna(subset=['utility']) logger.info("Dropped %s of %s rows with null utility", n - len(atap_btap_df.index), n) # choose max utility atap_btap_df = atap_btap_df.sort_values(by='utility').groupby('idx').tail(1) if trace_od: if not trace_oabd_rows.any(): logger.warning("trace_od not found origin = %s, dest = %s", trace_orig, trace_dest) else: tracing.trace_df(atap_btap_df, label='best_transit_path', slicer='NONE', transpose=False) tracing.trace_df(trace_results, label='trace_best_transit_path', slicer='NONE', transpose=False) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="trace_best_transit_path_locals")
def trip_purpose_and_destination(trips, tours_merged, chunk_size, trace_hh_id):

    trace_label = "trip_purpose_and_destination"
    model_settings = config.read_model_settings('trip_purpose_and_destination.yaml')

    # for consistency, read sample_table_name setting from trip_destination settings file
    trip_destination_model_settings = config.read_model_settings('trip_destination.yaml')
    sample_table_name = trip_destination_model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME')
    want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None

    MAX_ITERATIONS = model_settings.get('MAX_ITERATIONS', 5)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    if trips_df.empty:
        logger.info("%s - no trips. Nothing to do." % trace_label)
        return

    # FIXME could allow MAX_ITERATIONS=0 to allow for cleanup-only run
    # in which case, we would need to drop bad trips, WITHOUT failing bad_trip leg_mates
    assert MAX_ITERATIONS > 0

    # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry
    if 'destination' in trips_df:

        if 'failed' not in trips_df.columns:
            # trip_destination model cleaned up any failed trips
            logger.info("%s - no failed column from prior model run." % trace_label)
            return

        elif not trips_df.failed.any():
            # 'failed' column but no failed trips from prior run of trip_destination
            logger.info("%s - no failed trips from prior model run." % trace_label)
            trips_df.drop(columns='failed', inplace=True)
            pipeline.replace_table("trips", trips_df)
            return

        else:
            logger.info("trip_destination has already been run. Rerunning failed trips")
            flag_failed_trip_leg_mates(trips_df, 'failed')
            trips_df = trips_df[trips_df.failed]
            tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)]
            logger.info("Rerunning %s failed trips and leg-mates" % trips_df.shape[0])

            # drop any previously saved samples of failed trips
            if want_sample_table and pipeline.is_table(sample_table_name):
                logger.info("Dropping any previously saved samples of failed trips")
                save_sample_df = pipeline.get_table(sample_table_name)
                save_sample_df.drop(trips_df.index, level='trip_id', inplace=True)
                pipeline.replace_table(sample_table_name, save_sample_df)
                del save_sample_df

    # if we estimated trip_destination, there should have been no failed trips
    # if we didn't, but it is enabled, it is probably a configuration error
    # if we just estimated trip_purpose, it isn't clear what they are trying to do, nor how to handle it
    assert not (estimation.manager.begin_estimation('trip_purpose')
                or estimation.manager.begin_estimation('trip_destination'))

    processed_trips = []
    save_samples = []
    i = 0
    TRIP_RESULT_COLUMNS = ['purpose', 'destination', 'origin', 'failed']
    while True:

        i += 1

        for c in TRIP_RESULT_COLUMNS:
            if c in trips_df:
                del trips_df[c]

        trips_df, save_sample_df = run_trip_purpose_and_destination(
            trips_df,
            tours_merged_df,
            chunk_size=chunk_size,
            trace_hh_id=trace_hh_id,
            trace_label=tracing.extend_trace_label(trace_label, "i%s" % i))

        # if testing, make sure at least one trip fails
        if config.setting('testing_fail_trip_destination', False) \
                and (i == 1) and not trips_df.failed.any():
            fail_o = trips_df[trips_df.trip_num < trips_df.trip_count].origin.max()
            trips_df.failed = (trips_df.origin == fail_o) & \
                              (trips_df.trip_num < trips_df.trip_count)

        num_failed_trips = trips_df.failed.sum()

        # if there were no failed trips, we are done
        if num_failed_trips == 0:
            processed_trips.append(trips_df[TRIP_RESULT_COLUMNS])
            if save_sample_df is not None:
                save_samples.append(save_sample_df)
            break

        logger.warning("%s %s failed trips in iteration %s" % (trace_label, num_failed_trips, i))
        file_name = "%s_i%s_failed_trips" % (trace_label, i)
        logger.info("writing failed trips to %s" % file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

        # if max iterations reached, add remaining trips to processed_trips and give up
        # note that we do this BEFORE failing leg_mates so resulting trip legs are complete
        if i >= MAX_ITERATIONS:
            logger.warning("%s too many iterations %s" % (trace_label, i))
            processed_trips.append(trips_df[TRIP_RESULT_COLUMNS])
            if save_sample_df is not None:
                save_sample_df.drop(trips_df[trips_df.failed].index, level='trip_id', inplace=True)
                save_samples.append(save_sample_df)
            break

        # otherwise, if any trips failed, then their leg-mates trips must also fail
        flag_failed_trip_leg_mates(trips_df, 'failed')

        # add the good trips to processed_trips
        processed_trips.append(trips_df[~trips_df.failed][TRIP_RESULT_COLUMNS])

        # and keep the failed ones to retry
        trips_df = trips_df[trips_df.failed]
        tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)]

        # add trip samples of processed trips to save_samples
        if save_sample_df is not None:
            # drop failed trip samples
            save_sample_df.drop(trips_df.index, level='trip_id', inplace=True)
            save_samples.append(save_sample_df)

    # - assign result columns to trips
    processed_trips = pd.concat(processed_trips)

    if len(save_samples) > 0:
        save_sample_df = pd.concat(save_samples)
        logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name))
        pipeline.extend_table(sample_table_name, save_sample_df)

    logger.info("%s %s failed trips after %s iterations" %
                (trace_label, processed_trips.failed.sum(), i))

    trips_df = trips.to_frame()
    assign_in_place(trips_df, processed_trips)

    trips_df = cleanup_failed_trips(trips_df)

    pipeline.replace_table("trips", trips_df)

    # check to make sure we wrote sample file if requested
    if want_sample_table and len(trips_df) > 0:
        assert pipeline.is_table(sample_table_name)
        # since we have saved samples for all successful trips,
        # once we discard failed trips, we should have samples for all remaining trips
        save_sample_df = pipeline.get_table(sample_table_name)
        # expect samples only for intermediate trip destinations
        assert len(save_sample_df.index.get_level_values(0).unique()) == \
               len(trips_df[trips_df.trip_num < trips_df.trip_count])
        del save_sample_df

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def compute_accessibility(accessibility, network_los, land_use, trace_od):
    """
    Compute accessibility for each zone in land use file using expressions from accessibility_spec

    The actual results depend on the expressions in accessibility_spec, but this is initially
    intended to permit implementation of the mtc accessibility calculation as implemented by
    Accessibility.job

    Compute measures of accessibility used by the automobile ownership model.
    The accessibility measure first multiplies an employment variable by a mode-specific decay
    function. The product reflects the difficulty of accessing the activities the farther
    (in terms of round-trip travel time) the jobs are from the location in question. The products
    to each destination zone are next summed over each origin zone, and the logarithm of the
    product mutes large differences. The decay function on the walk accessibility measure is
    steeper than automobile or transit. The minimum accessibility is zero.
    """

    trace_label = 'compute_accessibility'
    model_settings = config.read_model_settings('accessibility.yaml')
    assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv'))

    accessibility_df = accessibility.to_frame()

    logger.info("Running %s with %d dest zones" % (trace_label, len(accessibility_df)))

    constants = config.get_model_constants(model_settings)

    land_use_columns = model_settings.get('land_use_columns', [])
    land_use_df = land_use.to_frame()
    land_use_df = land_use_df[land_use_columns]

    # don't assume they are the same: accessibility may be sliced if we are multiprocessing
    orig_zones = accessibility_df.index.values
    dest_zones = land_use_df.index.values

    orig_zone_count = len(orig_zones)
    dest_zone_count = len(dest_zones)

    logger.info("Running %s with %d dest zones %d orig zones" %
                (trace_label, dest_zone_count, orig_zone_count))

    # create OD dataframe
    od_df = pd.DataFrame(
        data={
            'orig': np.repeat(orig_zones, dest_zone_count),
            'dest': np.tile(dest_zones, orig_zone_count)
        }
    )

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest)
    else:
        trace_od_rows = None

    # merge land_use_columns into od_df
    od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index()

    locals_d = {
        'log': np.log,
        'exp': np.exp,
        'network_los': network_los,
    }

    skim_dict = network_los.get_default_skim_dict()
    locals_d['skim_od'] = skim_dict.wrap('orig', 'dest').set_df(od_df)
    locals_d['skim_do'] = skim_dict.wrap('dest', 'orig').set_df(od_df)

    if network_los.zone_system == los.THREE_ZONE:
        locals_d['tvpb'] = TransitVirtualPathBuilder(network_los)

    if constants is not None:
        locals_d.update(constants)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows)

    for column in results.columns:
        data = np.asanyarray(results[column])
        data.shape = (orig_zone_count, dest_zone_count)  # (o, d)
        accessibility_df[column] = np.log(np.sum(data, axis=1) + 1)

    logger.info(f"{trace_label} added {len(results.columns)} columns")

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)

    if trace_od:

        if not trace_od_rows.any():
            logger.warning(f"trace_od not found origin = {trace_orig}, dest = {trace_dest}")
        else:

            # add OD columns to trace results
            df = pd.concat([od_df[trace_od_rows], trace_results], axis=1)

            # dump the trace results table (with _temp variables) to aid debugging
            tracing.trace_df(df,
                             label='accessibility',
                             index_label='skim_offset',
                             slicer='NONE',
                             warn_if_empty=True)

            if trace_assigned_locals:
                tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals")
def compute_accessibility(accessibility, skim_dict, land_use, trace_od):
    """
    Compute accessibility for each zone in land use file using expressions from accessibility_spec

    The actual results depend on the expressions in accessibility_spec, but this is initially
    intended to permit implementation of the mtc accessibility calculation as implemented by
    Accessibility.job

    Compute measures of accessibility used by the automobile ownership model.
    The accessibility measure first multiplies an employment variable by a mode-specific decay
    function. The product reflects the difficulty of accessing the activities the farther
    (in terms of round-trip travel time) the jobs are from the location in question. The products
    to each destination zone are next summed over each origin zone, and the logarithm of the
    product mutes large differences. The decay function on the walk accessibility measure is
    steeper than automobile or transit. The minimum accessibility is zero.
    """

    trace_label = 'compute_accessibility'
    model_settings = config.read_model_settings('accessibility.yaml')
    assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv'))

    accessibility_df = accessibility.to_frame()

    logger.info("Running %s with %d dest zones" % (trace_label, len(accessibility_df)))

    constants = config.get_model_constants(model_settings)
    land_use_columns = model_settings.get('land_use_columns', [])

    land_use_df = land_use.to_frame()

    orig_zones = accessibility_df.index.values
    dest_zones = land_use_df.index.values

    orig_zone_count = len(orig_zones)
    dest_zone_count = len(dest_zones)

    logger.info("Running %s with %d dest zones %d orig zones" %
                (trace_label, dest_zone_count, orig_zone_count))

    # create OD dataframe
    od_df = pd.DataFrame(
        data={
            'orig': np.repeat(np.asanyarray(accessibility_df.index), dest_zone_count),
            'dest': np.tile(np.asanyarray(land_use_df.index), orig_zone_count)
        }
    )

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest)
    else:
        trace_od_rows = None

    # merge land_use_columns into od_df
    land_use_df = land_use_df[land_use_columns]
    od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index()

    locals_d = {
        'log': np.log,
        'exp': np.exp,
        'skim_od': AccessibilitySkims(skim_dict, orig_zones, dest_zones),
        'skim_do': AccessibilitySkims(skim_dict, orig_zones, dest_zones, transpose=True)
    }
    if constants is not None:
        locals_d.update(constants)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows)

    for column in results.columns:
        data = np.asanyarray(results[column])
        data.shape = (orig_zone_count, dest_zone_count)
        accessibility_df[column] = np.log(np.sum(data, axis=1) + 1)

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)

    if trace_od:

        if not trace_od_rows.any():
            logger.warning("trace_od not found origin = %s, dest = %s" % (trace_orig, trace_dest))
        else:

            # add OD columns to trace results
            df = pd.concat([od_df[trace_od_rows], trace_results], axis=1)

            # dump the trace results table (with _temp variables) to aid debugging
            tracing.trace_df(df,
                             label='accessibility',
                             index_label='skim_offset',
                             slicer='NONE',
                             warn_if_empty=True)

            if trace_assigned_locals:
                tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals")
def demographics_processor(persons, persons_merged, demographics_spec, demographics_settings, chunk_size, trace_hh_id): # the choice model will be applied to each row of the choosers table (a pandas.DataFrame) persons_df = persons_merged.to_frame() logger.info( "Running demographics_processor with %d persons (chunk size = %s)" % (len(persons_df), chunk_size)) # locals whose values will be accessible to the execution context # when the expressions in spec are applied to choosers locals_dict = config.get_model_constants(demographics_settings) locals_dict.update(config.setting('globals')) trace_rows = trace_hh_id and persons_df['household_id'] == trace_hh_id # eval_variables evaluates each of the expressions in spec # in the context of each row in of the choosers dataframe results, trace_results, trace_assigned_locals \ = assign.assign_variables(demographics_spec, persons_df, locals_dict, df_alias='persons', trace_rows=trace_rows) # add assigned columns to persons as they are needed by downstream processors persons = persons.to_frame() assign_in_place(persons, results) pipeline.replace_table("persons", persons) # coc groups with counts # TODO - should we allow specifying which assigned columns are coc (e.g. in settings?) # for now, assume all assigned columns are coc, but this could cramp modelers style # if they want to create additional demographic columns for downstream use that aren't coc coc_columns = list(results.columns) inject.add_injectable("coc_column_names", coc_columns) # - create table with coc columns as indexes and a single column 'persons' with counts # index persons # coc_poverty coc_age # False False 20 # True 3 # True False 4 coc_grouped = results.groupby(coc_columns) coc_grouped = coc_grouped[coc_columns[0]].count().to_frame(name='persons') pipeline.replace_table("coc_results", coc_grouped) add_summary_results(coc_grouped) if trace_hh_id: if trace_results is not None: tracing.write_csv(trace_results, file_name="demographics", index_label='person_idx', column_labels=['label', 'person']) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="demographics_locals")
def compute_accessibilities_for_zones(accessibility_df, land_use_df, assignment_spec, constants, network_los, trace_od, trace_label): orig_zones = accessibility_df.index.values dest_zones = land_use_df.index.values orig_zone_count = len(orig_zones) dest_zone_count = len(dest_zones) logger.info("Running %s with %d orig zones %d dest zones" % (trace_label, orig_zone_count, dest_zone_count)) # create OD dataframe od_df = pd.DataFrame( data={ 'orig': np.repeat(orig_zones, dest_zone_count), 'dest': np.tile(dest_zones, orig_zone_count) }) if trace_od: trace_orig, trace_dest = trace_od trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest) else: trace_od_rows = None # merge land_use_columns into od_df od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index() chunk.log_df(trace_label, "od_df", od_df) locals_d = { 'log': np.log, 'exp': np.exp, 'network_los': network_los, } locals_d.update(constants) skim_dict = network_los.get_default_skim_dict() locals_d['skim_od'] = skim_dict.wrap('orig', 'dest').set_df(od_df) locals_d['skim_do'] = skim_dict.wrap('dest', 'orig').set_df(od_df) if network_los.zone_system == los.THREE_ZONE: locals_d['tvpb'] = network_los.tvpb results, trace_results, trace_assigned_locals \ = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows, trace_label=trace_label, chunk_log=True) chunk.log_df(trace_label, "results", results) # accessibility_df = accessibility_df.copy() for column in results.columns: data = np.asanyarray(results[column]) data.shape = (orig_zone_count, dest_zone_count) # (o,d) accessibility_df[column] = np.log(np.sum(data, axis=1) + 1) if trace_od: if not trace_od_rows.any(): logger.warning( f"trace_od not found origin = {trace_orig}, dest = {trace_dest}" ) else: # add OD columns to trace results df = pd.concat([od_df[trace_od_rows], trace_results], axis=1) # dump the trace results table (with _temp variables) to aid debugging tracing.trace_df(df, label='accessibility', index_label='skim_offset', slicer='NONE', warn_if_empty=True) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals") return (accessibility_df)
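# A small, self-contained sketch of the OD expansion and log-sum aggregation shared
# by the compute_accessibility variants above (toy numbers stand in for the values
# that assign.assign_variables() would return from the expression spec).
import numpy as np
import pandas as pd

orig_zones = np.array([1, 2])
dest_zones = np.array([1, 2, 3])

# one row per origin-destination pair: origins repeated, destinations tiled
od_df = pd.DataFrame({
    'orig': np.repeat(orig_zones, len(dest_zones)),
    'dest': np.tile(dest_zones, len(orig_zones)),
})

# pretend this is one result column (e.g. decayed employment) from the spec
od_df['emp_decayed'] = [10.0, 5.0, 1.0, 2.0, 8.0, 4.0]

# reshape to (orig, dest), sum over destinations, and mute large differences with log
data = od_df['emp_decayed'].to_numpy().reshape(len(orig_zones), len(dest_zones))
accessibility = np.log(data.sum(axis=1) + 1)
print(pd.Series(accessibility, index=pd.Index(orig_zones, name='zone_id')))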
def compute_columns(df, model_settings, locals_dict={}, trace_label=None): """ Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals Parameters ---------- df : pandas DataFrame or if None, expect name of pipeline table to be specified by DF in model_settings model_settings : dict or str dict with keys: DF - df_alias and (additionally, if df is None) name of pipeline table to load as df SPEC - name of expressions file (csv suffix optional) if different from model_settings TABLES - list of pipeline tables to load and make available as (read only) locals str: name of yaml file in configs_dir to load dict from locals_dict : dict dict of locals (e.g. utility functions) to add to the execution environment trace_label Returns ------- results: pandas.DataFrame one column for each expression (except temps with ALL_CAP target names) same index as df """ if isinstance(model_settings, str): model_settings_name = model_settings model_settings = config.read_model_settings('%s.yaml' % model_settings) assert model_settings, "Found no model settings for %s" % model_settings_name else: model_settings_name = 'dict' assert isinstance(model_settings, dict) assert 'DF' in model_settings, \ "Expected to find 'DF' in %s" % model_settings_name df_name = model_settings.get('DF') helper_table_names = model_settings.get('TABLES', []) expressions_spec_name = model_settings.get('SPEC', None) assert expressions_spec_name is not None, \ "Expected to find 'SPEC' in %s" % model_settings_name trace_label = tracing.extend_trace_label(trace_label or '', expressions_spec_name) if not expressions_spec_name.endswith(".csv"): expressions_spec_name = '%s.csv' % expressions_spec_name logger.debug( f"{trace_label} compute_columns using expression spec file {expressions_spec_name}" ) expressions_spec = assign.read_assignment_spec( config.config_file_path(expressions_spec_name)) assert expressions_spec.shape[0] > 0, \ "Expected to find some assignment expressions in %s" % expressions_spec_name tables = {t: inject.get_table(t).to_frame() for t in helper_table_names} # if df was passed in, df might be a slice, or any other table, but DF is it's local alias assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name tables[df_name] = df # be nice and also give it to them as df? tables['df'] = df _locals_dict = assign.local_utilities() _locals_dict.update(locals_dict) _locals_dict.update(tables) # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES? _locals_dict.update({ # 'los': inject.get_injectable('network_los', None), 'skim_dict': inject.get_injectable('skim_dict', None), }) results, trace_results, trace_assigned_locals \ = assign.assign_variables(expressions_spec, df, _locals_dict, trace_rows=tracing.trace_targets(df)) if trace_results is not None: tracing.trace_df(trace_results, label=trace_label, slicer='NONE') if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label) return results
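# Hypothetical model_settings dict of the shape compute_columns() expects, per its
# docstring; the table and spec names below are illustrative, not from a real configs dir.
model_settings = {
    'DF': 'persons',                       # df_alias (and pipeline table name if df is None)
    'SPEC': 'annotate_persons',            # expressions file; '.csv' suffix optional
    'TABLES': ['households', 'land_use'],  # read-only pipeline tables exposed as locals
}
# results = compute_columns(df=None, model_settings=model_settings,
#                           trace_label='annotate_persons')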
def physical_activity_processor(
        trips_with_demographics,
        persons_merged,
        physical_activity_trip_spec,
        physical_activity_person_spec,
        physical_activity_settings,
        coc_column_names,
        settings,
        chunk_size,
        trace_hh_id):
    """
    Compute physical benefits

    Physical activity benefits generally accrue if the net physical activity for an individual
    exceeds a certain threshold. We calculate individual physical activity based on trips,
    so we need to compute trip activity and then sum up to the person level to calculate
    benefits. We chunk trips by household id to ensure that all of a person's trips are in
    the same chunk.
    """

    trips_df = trips_with_demographics.to_frame()
    persons_df = persons_merged.to_frame()
    trace_label = 'physical_activity'

    logger.info("Running physical_activity_processor with %d trips for %d persons" %
                (len(trips_df), len(persons_df)))

    locals_dict = config.get_model_constants(physical_activity_settings)
    locals_dict.update(config.setting('globals'))

    trip_trace_rows = trace_hh_id and trips_df.household_id == trace_hh_id

    rows_per_chunk, effective_chunk_size = \
        physical_activity_rpc(chunk_size, trips_df, persons_df,
                              physical_activity_trip_spec, trace_label)

    logger.info("physical_activity_processor chunk_size %s rows_per_chunk %s" %
                (chunk_size, rows_per_chunk))

    coc_summary = None
    result_list = []

    # iterate over trips df chunked by hh_id
    for i, num_chunks, trips_chunk, trace_rows_chunk \
            in bca.chunked_df_by_chunk_id(trips_df, trip_trace_rows, rows_per_chunk):

        logger.info("%s chunk %s of %s" % (trace_label, i, num_chunks))

        trip_activity, trip_trace_results, trip_trace_assigned_locals = \
            assign.assign_variables(physical_activity_trip_spec,
                                    trips_chunk,
                                    locals_dict=locals_dict,
                                    df_alias='trips',
                                    trace_rows=trace_rows_chunk)

        # since tracing is at household level, trace_results will occur in only one chunk
        # we can just write them out when we see them without need to accumulate across chunks
        if trip_trace_results is not None:
            tracing.write_csv(trip_trace_results,
                              file_name="physical_activity_trips",
                              index_label='trip_id',
                              column_labels=['label', 'trip'])

        if trip_trace_assigned_locals:
            tracing.write_csv(trip_trace_assigned_locals,
                              file_name="physical_activity_trips_locals")

        # sum trip activity for each unique person
        trip_activity = trip_activity.groupby(trips_chunk.person_id).sum()

        # merge in persons columns for this chunk
        persons_chunk = pd.merge(trip_activity, persons_df, left_index=True, right_index=True)

        # trace rows array for this chunk
        person_trace_rows = trace_hh_id and persons_chunk['household_id'] == trace_hh_id

        person_activity, person_trace_results, person_trace_assigned_locals = \
            assign.assign_variables(physical_activity_person_spec,
                                    persons_chunk,
                                    locals_dict=locals_dict,
                                    df_alias='persons',
                                    trace_rows=person_trace_rows)

        # since tracing is at household level, trace_results will occur in only one chunk
        # we can just write them out when we see them without need to accumulate across chunks
        if person_trace_results is not None:
            tracing.write_csv(person_trace_results,
                              file_name="physical_activity_persons",
                              index_label='persons_merged_table_index',
                              column_labels=['label', 'person'])

        if person_trace_assigned_locals:
            tracing.write_csv(person_trace_assigned_locals,
                              file_name="physical_activity_persons_locals")

        # concat in the coc columns and summarize the chunk by coc
        person_activity = pd.concat([persons_chunk[coc_column_names], person_activity], axis=1)
        coc_summary = person_activity.groupby(coc_column_names).sum()

        result_list.append(coc_summary)

        chunk_trace_label = '%s chunk_%s' % (trace_label, i)
        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        chunk.log_df(chunk_trace_label, 'trips_chunk', trips_chunk)
        chunk.log_df(chunk_trace_label, 'persons_chunk', persons_chunk)
        chunk.log_close(chunk_trace_label)

    if len(result_list) > 1:

        # (if there was only one chunk, then concat is redundant)
        coc_summary = pd.concat(result_list)

        # squash the accumulated chunk summaries by reapplying group and sum
        coc_summary.reset_index(inplace=True)
        coc_summary = coc_summary.groupby(coc_column_names).sum()

    result_prefix = 'PA_'
    add_result_columns("coc_results", coc_summary, result_prefix)
    add_summary_results(coc_summary, prefix=result_prefix, spec=physical_activity_person_spec)
def choose_intermediate_trip_purpose(trips, probs_spec, estimator, probs_join_cols, use_depart_time, trace_hh_id, trace_label): """ chose purpose for intermediate trips based on probs_spec which assigns relative weights (summing to 1) to the possible purpose choices Returns ------- purpose: pandas.Series of purpose (str) indexed by trip_id """ non_purpose_cols = probs_join_cols.copy() if use_depart_time: non_purpose_cols += ['depart_range_start', 'depart_range_end'] purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols] num_trips = len(trips.index) have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips) # probs should sum to 1 across rows sum_probs = probs_spec[purpose_cols].sum(axis=1) probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0) # left join trips to probs (there may be multiple rows per trip for multiple depart ranges) choosers = pd.merge(trips.reset_index(), probs_spec, on=probs_join_cols, how='left').set_index('trip_id') chunk.log_df(trace_label, 'choosers', choosers) if use_depart_time: # select the matching depart range (this should result on in exactly one chooser row per trip) chooser_probs = \ (choosers.start >= choosers['depart_range_start']) & (choosers.start <= choosers['depart_range_end']) # if we failed to match a row in probs_spec if chooser_probs.sum() < num_trips: # this can happen if the spec doesn't have probs for the trips matching a trip's probs_join_cols missing_trip_ids = trips.index[ ~trips.index.isin(choosers.index[chooser_probs])].values unmatched_choosers = choosers[choosers.index.isin( missing_trip_ids)] unmatched_choosers = unmatched_choosers[['person_id', 'start'] + non_purpose_cols] # join to persons for better diagnostics persons = inject.get_table('persons').to_frame() persons_cols = [ 'age', 'is_worker', 'is_student', 'is_gradeschool', 'is_highschool', 'is_university' ] unmatched_choosers = pd.merge(unmatched_choosers, persons[[ col for col in persons_cols if col in persons.columns ]], left_on='person_id', right_index=True, how='left') file_name = '%s.UNMATCHED_PROBS' % trace_label logger.error( "%s %s of %s intermediate trips could not be matched to probs based on join columns %s" % (trace_label, len(unmatched_choosers), len(choosers), probs_join_cols)) logger.info("Writing %s unmatched choosers to %s" % ( len(unmatched_choosers), file_name, )) tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False) raise RuntimeError( "Some trips could not be matched to probs based on join columns %s." % probs_join_cols) # select the matching depart range (this should result on in exactly one chooser row per trip) choosers = choosers[chooser_probs] # choosers should now match trips row for row assert choosers.index.identical(trips.index) if estimator: probs_cols = list(probs_spec.columns) print(choosers[probs_cols]) estimator.write_table(choosers[probs_cols], 'probs', append=True) choices, rands = logit.make_choices(choosers[purpose_cols], trace_label=trace_label, trace_choosers=choosers) if have_trace_targets: tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose']) tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand']) choices = choices.map(pd.Series(purpose_cols)) return choices
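# Toy illustration of the row-normalisation and probabilistic draw that
# choose_intermediate_trip_purpose() above delegates to logit.make_choices()
# (the cumulative-sum draw mimics, but is not, the actual logit implementation).
import numpy as np
import pandas as pd

purpose_cols = ['shopping', 'othmaint', 'eatout']
probs = pd.DataFrame([[2.0, 1.0, 1.0],
                      [1.0, 1.0, 2.0]], columns=purpose_cols)

# probs should sum to 1 across rows
probs = probs.div(probs.sum(axis=1), axis=0)

# pick a column index per row by comparing a uniform draw to the cumulative probabilities
rng = np.random.default_rng(42)
rands = rng.random(len(probs))
choices = (probs.cumsum(axis=1).to_numpy() < rands[:, None]).sum(axis=1)

# map positional choices back to purpose labels, as the function does with pd.Series(purpose_cols)
print(pd.Series(choices).map(pd.Series(purpose_cols)))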
def participants_chooser(probs, choosers, spec, trace_label):
    """
    custom alternative to logit.make_choices for simulate.simple_simulate

    Choosing participants for mixed tours is trickier than adult or child tours because we
    need at least one adult and one child participant in a mixed tour. We call
    logit.make_choices and then check to see if the tour satisfies this requirement, and
    rechoose for any that fail until all are satisfied.

    In principle, this should always occur eventually, but we fail after MAX_ITERATIONS,
    just in case there is some failure in program logic (haven't seen this occur.)

    Parameters
    ----------
    probs : pandas.DataFrame
        Rows for choosers and columns for the alternatives from which they are choosing.
        Values are expected to be valid probabilities across each row, e.g. they should sum to 1.
    choosers : pandas.dataframe
        simple_simulate choosers df
    spec : pandas.DataFrame
        simple_simulate spec df
        We only need spec so we can know the column index of the 'participate' alternative
        indicating that the participant has been chosen to participate in the tour
    trace_label : str

    Returns
    -------
    choices, rands
        choices, rands as returned by logit.make_choices (in same order as probs)
    """

    assert probs.index.equals(choosers.index)

    # choice is boolean (participate or not)
    model_settings = config.read_model_settings('joint_tour_participation.yaml')

    choice_col = model_settings.get('participation_choice', 'participate')
    assert choice_col in spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = spec.columns.get_loc(choice_col)

    MAX_ITERATIONS = model_settings.get('max_participation_choice_iterations', 5000)

    trace_label = tracing.extend_trace_label(trace_label, 'participants_chooser')

    candidates = choosers.copy()
    choices_list = []
    rands_list = []

    num_tours_remaining = len(candidates.tour_id.unique())
    logger.info('%s %s joint tours to satisfy.', trace_label, num_tours_remaining)

    iter = 0
    while candidates.shape[0] > 0:

        iter += 1

        if iter > MAX_ITERATIONS:
            logger.warning('%s max iterations exceeded (%s).', trace_label, MAX_ITERATIONS)
            diagnostic_cols = ['tour_id', 'household_id', 'composition', 'adult']
            unsatisfied_candidates = candidates[diagnostic_cols].join(probs)
            tracing.write_csv(unsatisfied_candidates,
                              file_name='%s.UNSATISFIED' % trace_label, transpose=False)
            logger.warning('%s first 20 unsatisfied candidates\n%s',
                           trace_label, unsatisfied_candidates.head(20))
            raise RuntimeError('%s max iterations exceeded (%s)' % (trace_label, MAX_ITERATIONS))

        choices, rands = logit.make_choices(probs, trace_label=trace_label, trace_choosers=choosers)

        participate = (choices == PARTICIPATE_CHOICE)

        # satisfaction indexed by tour_id
        tour_satisfaction = get_tour_satisfaction(candidates, participate)
        num_tours_satisfied_this_iter = tour_satisfaction.sum()

        if num_tours_satisfied_this_iter > 0:

            num_tours_remaining -= num_tours_satisfied_this_iter

            satisfied = reindex(tour_satisfaction, candidates.tour_id)

            choices_list.append(choices[satisfied])
            rands_list.append(rands[satisfied])

            # remove candidates of satisfied tours
            probs = probs[~satisfied]
            candidates = candidates[~satisfied]

        logger.info('%s iteration %s : %s joint tours satisfied %s remaining' %
                    (trace_label, iter, num_tours_satisfied_this_iter, num_tours_remaining))

    choices = pd.concat(choices_list)
    rands = pd.concat(rands_list).reindex(choosers.index)

    # reindex choices and rands to match probs and v index
    choices = choices.reindex(choosers.index)
    rands = rands.reindex(choosers.index)

    assert choices.index.equals(choosers.index)
    assert rands.index.equals(choosers.index)

    logger.info('%s %s iterations to satisfy all joint tours.', trace_label, iter)

    return choices, rands
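# Toy check mirroring the rule described in the participants_chooser docstring:
# a mixed tour is "satisfied" only if at least one adult and one child participate
# (get_tour_satisfaction() in the real model encapsulates this kind of logic).
import pandas as pd

candidates = pd.DataFrame({
    'tour_id':     [1, 1, 1, 2, 2],
    'adult':       [True, False, False, True, True],
    'participate': [True, True, False, True, False],
})

participants = candidates[candidates.participate]
satisfied = participants.groupby('tour_id')['adult'].agg(
    has_adult='any',
    has_child=lambda adult: (~adult).any()).all(axis=1)
print(satisfied)  # tour 1 -> True (adult and child), tour 2 -> False (no child participant)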
def trip_destination(trips, tours_merged, chunk_size, trace_hh_id):
    """
    Choose a destination for all 'intermediate' trips based on trip purpose.

    Final trips already have a destination (the primary tour destination for outbound trips,
    and home for inbound trips.)
    """
    trace_label = 'trip_destination'
    model_settings_file_name = 'trip_destination.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    CLEANUP = model_settings.get('CLEANUP', True)
    fail_some_trips_for_testing = model_settings.get('fail_some_trips_for_testing', False)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    estimator = estimation.manager.begin_estimation('trip_destination')
    if estimator:
        estimator.write_coefficients(model_settings=model_settings)
        # estimator.write_spec(model_settings, tag='SAMPLE_SPEC')
        estimator.write_spec(model_settings, tag='SPEC')
        estimator.set_alt_id(model_settings["ALT_DEST_COL_NAME"])
        estimator.write_table(inject.get_injectable('size_terms'), 'size_terms', append=False)
        estimator.write_table(inject.get_table('land_use').to_frame(), 'landuse', append=False)
        estimator.write_model_settings(model_settings, model_settings_file_name)

    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    trips_df, save_sample_df = run_trip_destination(
        trips_df,
        tours_merged_df,
        estimator=estimator,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label,
        fail_some_trips_for_testing=fail_some_trips_for_testing)

    # testing feature to make sure at least one trip fails so trip_purpose_and_destination model is run
    if config.setting('testing_fail_trip_destination', False) and not trips_df.failed.any():
        if (trips_df.trip_num < trips_df.trip_count).sum() == 0:
            raise RuntimeError("can't honor 'testing_fail_trip_destination' setting "
                               "because no intermediate trips")
        fail_o = trips_df[trips_df.trip_num < trips_df.trip_count].origin.max()
        trips_df.failed = (trips_df.origin == fail_o) & \
                          (trips_df.trip_num < trips_df.trip_count)

    if trips_df.failed.any():
        logger.warning("%s %s failed trips", trace_label, trips_df.failed.sum())
        if inject.get_injectable('pipeline_file_prefix', None):
            file_name = f"{trace_label}_failed_trips_{inject.get_injectable('pipeline_file_prefix')}"
        else:
            file_name = f"{trace_label}_failed_trips"
        logger.info("writing failed trips to %s", file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

    if estimator:
        estimator.end_estimation()
        # no trips should have failed since we overwrite choices and sample should have not failed trips
        assert not trips_df.failed.any()

    if CLEANUP:

        if trips_df.failed.any():
            flag_failed_trip_leg_mates(trips_df, 'failed')

            if save_sample_df is not None:
                save_sample_df.drop(trips_df.index[trips_df.failed], level='trip_id', inplace=True)

            trips_df = cleanup_failed_trips(trips_df)

        trips_df.drop(columns='failed', inplace=True, errors='ignore')

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)

    if save_sample_df is not None:
        # might be none if want_sample_table but there are no intermediate trips
        # expect samples only for intermediate trip destinations
        assert len(save_sample_df.index.get_level_values(0).unique()) == \
               len(trips_df[trips_df.trip_num < trips_df.trip_count])

        sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME')
        assert sample_table_name is not None

        logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name))

        # lest they try to put tour samples into the same table
        if pipeline.is_table(sample_table_name):
            raise RuntimeError("sample table %s already exists" % sample_table_name)
        pipeline.extend_table(sample_table_name, save_sample_df)
def compute_accessibility(settings, accessibility_spec, accessibility_settings, skim_dict, omx_file, land_use, trace_od): """ Compute accessibility for each zone in land use file using expressions from accessibility_spec The actual results depend on the expressions in accessibility_spec, but this is initially intended to permit implementation of the mtc accessibility calculation as implemented by Accessibility.job Compute measures of accessibility used by the automobile ownership model. The accessibility measure first multiplies an employment variable by a mode-specific decay function. The product reflects the difficulty of accessing the activities the farther (in terms of round-trip travel time) the jobs are from the location in question. The products to each destination zone are next summed over each origin zone, and the logarithm of the product mutes large differences. The decay function on the walk accessibility measure is steeper than automobile or transit. The minimum accessibility is zero. """ logger.info("Running compute_accessibility") constants = config.get_model_constants(accessibility_settings) land_use_columns = accessibility_settings.get('land_use_columns', []) land_use_df = land_use.to_frame() zone_count = len(land_use_df.index) # create OD dataframe od_df = pd.DataFrame( data={ 'orig': np.repeat(np.asanyarray(land_use_df.index), zone_count), 'dest': np.tile(np.asanyarray(land_use_df.index), zone_count) }) if trace_od: trace_orig, trace_dest = trace_od trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest) else: trace_od_rows = None # merge land_use_columns into od_df land_use_df = land_use_df[land_use_columns] od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index() locals_d = { 'log': np.log, 'exp': np.exp, 'skim_od': AccessibilitySkims(skim_dict, omx_file, zone_count), 'skim_do': AccessibilitySkims(skim_dict, omx_file, zone_count, transpose=True) } if constants is not None: locals_d.update(constants) results, trace_results, trace_assigned_locals \ = assign.assign_variables(accessibility_spec, od_df, locals_d, trace_rows=trace_od_rows) accessibility_df = pd.DataFrame(index=land_use.index) for column in results.columns: data = np.asanyarray(results[column]) data.shape = (zone_count, zone_count) accessibility_df[column] = np.log(np.sum(data, axis=1) + 1) inject.add_column("accessibility", column, accessibility_df[column]) if trace_od: if not trace_od_rows.any(): logger.warn("trace_od not found origin = %s, dest = %s" % (trace_orig, trace_dest)) else: # add OD columns to trace results df = pd.concat([od_df[trace_od_rows], trace_results], axis=1) # dump the trace results table (with _temp variables) to aid debugging # note that this is not the same as the orca-injected accessibility table # FIXME - should we name this differently and also dump the updated accessibility table? tracing.trace_df(df, label='accessibility', index_label='skim_offset', slicer='NONE', warn_if_empty=True) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals")
def aggregate_demographics_processor(zone_hhs, aggregate_demographics_spec, settings, trace_od): """ Parameters ---------- zone_hhs : orca table input zone demographics """ trace_label = 'aggregate_demographics' model_settings = config.read_model_settings('aggregate_demographics.yaml') zone_hhs_df = zone_hhs.to_frame() logger.info("Running %s with %d zones" % ( trace_label, len(zone_hhs_df), )) if trace_od: trace_orig, trace_dest = trace_od trace_od_rows = (zone_hhs_df.index == trace_orig) | (zone_hhs_df.index == trace_dest) else: trace_od_rows = None # locals whose values will be accessible to the execution context # when the expressions in spec are applied to choosers locals_dict = config.get_model_constants(model_settings) locals_dict.update(config.setting('globals')) trace_rows = None # eval_variables evaluates each of the expressions in spec # in the context of each row in of the choosers dataframe results, trace_results, trace_assigned_locals = \ assign.assign_variables(aggregate_demographics_spec, zone_hhs_df, locals_dict, df_alias='hhs', trace_rows=trace_od_rows) pipeline.replace_table("zone_demographics", results) # expression file can use silos column to designate result targets (e.g. count of households) add_aggregate_results(results, aggregate_demographics_spec, source=trace_label) if trace_results is not None: tracing.write_csv(trace_results, file_name="aggregate_demographics", index_label='zone', column_labels=['label', 'zone']) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="aggregate_demographics_locals")
def aggregate_od_processor(zone_districts, zones, data_dir, trace_od): trace_label = 'aggregate_od' logger.info("Running %s" % (trace_label, )) model_settings = config.read_model_settings('aggregate_od.yaml') spec_file_name = model_settings.get('spec_file_name', 'aggregate_od.csv') aggregate_od_spec = bca.read_assignment_spec(spec_file_name) zones = zones.to_frame() zone_districts = zone_districts.to_frame() zone_count = zone_districts.shape[0] assert zones.index.equals(zone_districts.index) # create OD dataframe in order compatible with ODSkims od_df = pd.DataFrame( data={ 'orig': np.repeat(np.asanyarray(zones.index), zone_count), 'dest': np.tile(np.asanyarray(zones.index), zone_count), }) # locals whose values will be accessible to the execution context # when the expressions in spec are applied to choosers locals_dict = config.get_model_constants(model_settings) locals_dict.update(config.setting('globals')) locals_dict['logger'] = logger logger.debug('%s mem before create_skim_locals_dict, %s' % ( trace_label, memory_info(), )) # - add ODSkims to locals (note: we use local_skims list later to close omx files) cache_skims = model_settings.get('cache_skims', False) local_skims = create_skim_locals_dict(model_settings, data_dir, zones, cache_skims) locals_dict.update(local_skims) # - create_zone_matrices dicts locals_dict.update(create_zone_matrices(model_settings, zones)) if trace_od: trace_orig, trace_dest = trace_od trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest) else: trace_od_rows = None logger.debug("%s assigning variables" % (trace_label, )) results, trace_results, trace_assigned_locals = \ assign.assign_variables(aggregate_od_spec, od_df, locals_dict=locals_dict, df_alias='od', trace_rows=trace_od_rows) logger.debug('%s mem after assign_variables, %s' % ( trace_label, memory_info(), )) for local_name, od_skims in local_skims.items(): logger.debug("closing %s" % local_name) od_skims.log_skim_usage() od_skims.close() # summarize aggregate_od_benefits by orig and dest districts logger.debug("%s district summary" % (trace_label, )) results['orig'] = np.repeat(np.asanyarray(zone_districts.district), zone_count) results['dest'] = np.tile(np.asanyarray(zone_districts.district), zone_count) district_summary = results.groupby(['orig', 'dest']).sum() pipeline.replace_table('aggregate_od_district_summary', district_summary) # attribute aggregate_results benefits to origin zone logger.debug("%s zone summary" % (trace_label, )) results['orig'] = od_df['orig'] del results['dest'] zone_summary = results.groupby(['orig']).sum() pipeline.replace_table('aggregate_od_zone_summary', zone_summary) add_aggregate_results(zone_summary, aggregate_od_spec, source=trace_label) if trace_results is not None: tracing.write_csv(trace_results, file_name=trace_label, index_label='index', column_labels=['label', 'od']) if trace_assigned_locals: tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label, index_label='variable', columns='value')
def load_data(self):
    """
    Load tables and skims from files specified in network_los settings
    """

    # load maz tables
    if self.zone_system in [TWO_ZONE, THREE_ZONE]:

        # maz
        file_name = self.setting('maz')
        self.maz_taz_df = pd.read_csv(config.data_file_path(file_name, mandatory=True))
        self.maz_taz_df = self.maz_taz_df[['MAZ', 'TAZ']].sort_values(by='MAZ')  # only fields we need

        self.maz_ceiling = self.maz_taz_df.MAZ.max() + 1

        # maz_to_maz_df
        maz_to_maz_tables = self.setting('maz_to_maz.tables')
        maz_to_maz_tables = [maz_to_maz_tables] if isinstance(maz_to_maz_tables, str) else maz_to_maz_tables
        for file_name in maz_to_maz_tables:

            df = pd.read_csv(config.data_file_path(file_name, mandatory=True))

            df['i'] = df.OMAZ * self.maz_ceiling + df.DMAZ
            df.set_index('i', drop=True, inplace=True, verify_integrity=True)
            logger.debug(f"loading maz_to_maz table {file_name} with {len(df)} rows")

            # FIXME - don't really need these columns, but if we do want them,
            # we would need to merge them in since files may have different numbers of rows
            df.drop(columns=['OMAZ', 'DMAZ'], inplace=True)

            # besides, we only want data columns so we can coerce to same type as skims
            df = df.astype(np.dtype(self.skim_dtype_name))

            if self.maz_to_maz_df is None:
                self.maz_to_maz_df = df
            else:
                self.maz_to_maz_df = pd.concat([self.maz_to_maz_df, df], axis=1)

    # load tap tables
    if self.zone_system == THREE_ZONE:

        # tap_df should already have been loaded by load_skim_info because,
        # during multiprocessing, it is required by TapTapUidCalculator to size TVPBCache
        # self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True))
        assert self.tap_df is not None

        # maz_to_tap_dfs - different sized sparse arrays with different columns, so we keep them separate
        for mode, maz_to_tap_settings in self.setting('maz_to_tap').items():

            assert 'table' in maz_to_tap_settings, \
                f"Expected setting maz_to_tap.{mode}.table not found in {LOS_SETTINGS_FILE_NAME}"

            file_name = maz_to_tap_settings['table']
            df = pd.read_csv(config.data_file_path(file_name, mandatory=True))

            # trim tap set
            # if provided, use tap_line_distance_col together with tap_lines table to trim the near tap set
            # to only include the nearest tap to origin when more than one tap serves the same line
            distance_col = maz_to_tap_settings.get('tap_line_distance_col')
            if distance_col:

                if self.tap_lines_df is None:
                    # load tap_lines on demand (required if they specify tap_line_distance_col)
                    tap_lines_file_name = self.setting('tap_lines')
                    self.tap_lines_df = pd.read_csv(config.data_file_path(tap_lines_file_name, mandatory=True))

                    # csv file has one row per TAP with space-delimited list of lines served by that TAP
                    #  TAP                                       LINES
                    # 6020  GG_024b_SB GG_068_RT GG_228_WB GG_023X_RT

                    # stack to create dataframe with one column 'line' indexed by TAP with one row per line served
                    #  TAP        line
                    # 6020  GG_024b_SB
                    # 6020   GG_068_RT
                    # 6020   GG_228_WB
                    self.tap_lines_df = \
                        self.tap_lines_df.set_index('TAP').LINES.str.split(expand=True)\
                            .stack().droplevel(1).to_frame('line')

                old_len = len(df)

                # NOTE - merge will remove unused taps (not appearing in tap_lines)
                df = pd.merge(df, self.tap_lines_df, left_on='TAP', right_index=True)

                # find nearest TAP to MAZ that serves line
                df = df.sort_values(by=distance_col).drop_duplicates(subset=['MAZ', 'line'])

                # we don't need to remember which lines are served by which TAPs
                df = df.drop(columns='line').drop_duplicates(subset=['MAZ', 'TAP']).sort_values(['MAZ', 'TAP'])

                logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows")
                logger.debug(f"maz_to_tap table {file_name} max {distance_col} {df[distance_col].max()}")

                max_dist = maz_to_tap_settings.get('max_dist', None)
                if max_dist:
                    old_len = len(df)
                    df = df[df[distance_col] <= max_dist]
                    logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows "
                                 f"based on max_dist {max_dist}")

                if TRACE_TRIMMED_MAZ_TO_TAP_TABLES:
                    tracing.write_csv(df,
                                      file_name=f"trimmed_{maz_to_tap_settings['table']}",
                                      transpose=False)

            df.set_index(['MAZ', 'TAP'], drop=True, inplace=True, verify_integrity=True)
            logger.debug(f"loaded maz_to_tap table {file_name} with {len(df)} rows")

            assert mode not in self.maz_to_tap_dfs
            self.maz_to_tap_dfs[mode] = df

    mem.trace_memory_info('#MEM network_los.load_data before create_skim_dicts')

    # create taz skim dict
    assert 'taz' not in self.skim_dicts
    self.skim_dicts['taz'] = self.create_skim_dict('taz')
    # make sure skim has all taz_ids
    # FIXME - weird that there is no list of tazs?

    # create MazSkimDict facade
    if self.zone_system in [TWO_ZONE, THREE_ZONE]:
        # create MazSkimDict facade skim_dict
        # (must have already loaded dependencies: taz skim_dict, maz_to_maz_df, and maz_taz_df)
        assert 'maz' not in self.skim_dicts
        self.skim_dicts['maz'] = self.create_skim_dict('maz')
        # make sure skim has all maz_ids
        assert set(self.maz_taz_df['MAZ'].values).issubset(set(self.skim_dicts['maz'].zone_ids))

    # create tap skim dict
    if self.zone_system == THREE_ZONE:
        assert 'tap' not in self.skim_dicts
        self.skim_dicts['tap'] = self.create_skim_dict('tap')
        # make sure skim has all tap_ids
        assert set(self.tap_df['TAP'].values).issubset(set(self.skim_dicts['tap'].zone_ids))

    mem.trace_memory_info("network_los.load_data after create_skim_dicts")
def trip_destination(
        trips,
        tours_merged,
        chunk_size, trace_hh_id):
    """
    Choose a destination for all 'intermediate' trips based on trip purpose.

    Final trips already have a destination (the primary tour destination for outbound trips,
    and home for inbound trips.)
    """
    trace_label = 'trip_destination'
    model_settings = config.read_model_settings('trip_destination.yaml')
    CLEANUP = model_settings.get('CLEANUP', True)
    fail_some_trips_for_testing = model_settings.get('fail_some_trips_for_testing', False)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    trips_df, save_sample_df = run_trip_destination(
        trips_df,
        tours_merged_df,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label,
        fail_some_trips_for_testing=fail_some_trips_for_testing)

    # testing feature to make sure at least one trip fails so trip_purpose_and_destination model is run
    if config.setting('testing_fail_trip_destination', False) and not trips_df.failed.any():
        fail_o = trips_df[trips_df.trip_num < trips_df.trip_count].origin.max()
        trips_df.failed = (trips_df.origin == fail_o) & \
                          (trips_df.trip_num < trips_df.trip_count)

    if trips_df.failed.any():
        logger.warning("%s %s failed trips", trace_label, trips_df.failed.sum())
        file_name = "%s_failed_trips" % trace_label
        logger.info("writing failed trips to %s", file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

    if CLEANUP:

        if trips_df.failed.any():
            flag_failed_trip_leg_mates(trips_df, 'failed')

            if save_sample_df is not None:
                save_sample_df.drop(trips_df.index[trips_df.failed], level='trip_id', inplace=True)

            trips_df = cleanup_failed_trips(trips_df)

        trips_df.drop(columns='failed', inplace=True, errors='ignore')

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)

    if save_sample_df is not None:
        # might be none if want_sample_table but there are no intermediate trips
        # expect samples only for intermediate trip destinations
        assert len(save_sample_df.index.get_level_values(0).unique()) == \
               len(trips_df[trips_df.trip_num < trips_df.trip_count])

        sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME')
        assert sample_table_name is not None

        logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name))

        # lest they try to put tour samples into the same table
        if pipeline.is_table(sample_table_name):
            raise RuntimeError("sample table %s already exists" % sample_table_name)
        pipeline.extend_table(sample_table_name, save_sample_df)
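
# ----------------------------------------------------------------------------
# Tiny illustration (made-up trip ids) of the trip_num < trip_count test used
# above: only 'intermediate' trips get a destination chosen here; the last
# trip of each leg is 'final' and already has one.
# ----------------------------------------------------------------------------
def _example_intermediate_trips():
    import pandas as pd

    trips = pd.DataFrame({'trip_num':   [1, 2, 3, 1],
                          'trip_count': [3, 3, 3, 1]},
                         index=pd.Index([101, 102, 103, 104], name='trip_id'))

    intermediate = trips.trip_num < trips.trip_count  # True, True, False, False
    return trips[intermediate]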
def compute_columns(df, model_settings, configs_dir, trace_label=None):
    """
    Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals

    Parameters
    ----------
    df : pandas DataFrame
        or if None, expect name of pipeline table to be specified by DF in model_settings
    model_settings : dict or str
        dict with keys:
            DF - df_alias and (additionally, if df is None) name of pipeline table to load as df
            SPEC - name of expressions file (csv suffix optional) if different from model_settings
            TABLES - list of pipeline tables to load and make available as (read only) locals
        str: name of yaml file in configs_dir to load dict from
    configs_dir
    trace_label

    Returns
    -------
    results: pandas.DataFrame
        one column for each expression (except temps with ALL_CAP target names)
        same index as df
    """

    if isinstance(model_settings, str):
        model_settings_name = model_settings
        model_settings = config.read_model_settings(configs_dir, '%s.yaml' % model_settings)
        assert model_settings, "Found no model settings for %s" % model_settings_name
    else:
        model_settings_name = 'dict'

    assert 'DF' in model_settings, \
        "Expected to find 'DF' in %s" % model_settings_name

    df_name = model_settings.get('DF')
    helper_table_names = model_settings.get('TABLES', [])
    expressions_spec_name = model_settings.get('SPEC', model_settings_name)

    assert expressions_spec_name is not None, \
        "Expected to find 'SPEC' in %s" % model_settings_name

    if trace_label is None:
        trace_label = expressions_spec_name

    if not expressions_spec_name.endswith(".csv"):
        expressions_spec_name = '%s.csv' % expressions_spec_name
    expressions_spec = assign.read_assignment_spec(os.path.join(configs_dir, expressions_spec_name))

    tables = {t: inject.get_table(t).to_frame() for t in helper_table_names}

    # if df was passed in, df might be a slice, or any other table, but DF is its local alias
    assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name
    tables[df_name] = df

    locals_dict = local_utilities()
    locals_dict.update(tables)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(expressions_spec,
                                  df,
                                  locals_dict,
                                  trace_rows=tracing.trace_targets(df))

    if trace_results is not None:
        tracing.trace_df(trace_results,
                         label=trace_label,
                         slicer='NONE',
                         warn_if_empty=True)

    if trace_assigned_locals:
        tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label)

    return results
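
# ----------------------------------------------------------------------------
# Hedged usage sketch: how compute_columns might be called with an inline
# settings dict instead of a yaml file name.  The table name, spec name, and
# helper table below are hypothetical -- substitute whatever exists in your
# model configuration.
# ----------------------------------------------------------------------------
def _example_compute_columns(persons_df, configs_dir):

    model_settings = {
        'DF': 'persons',             # local alias for the dataframe passed in
        'SPEC': 'annotate_persons',  # expressions file annotate_persons.csv in configs_dir
        'TABLES': ['households'],    # extra pipeline tables exposed as read-only locals
    }

    return compute_columns(persons_df, model_settings, configs_dir,
                           trace_label='annotate_persons')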
def balance_trips(zones, trace_od):
    """Improve the match between destination zone trip totals (given by the
    DEST_TARGETS in the balance_trips config file) and the trip counts
    calculated during the destination choice step.

    The config file should contain the following parameters:

        dest_zone_trip_targets:
            total: <aggregate destination zone trip counts>
          OR
            <segment_1>: totals for segment 1 (optional)
            <segment_2>: totals for segment 2 (optional)
            <segment_3>: totals for segment 3 (optional)

        (These are optional)
        max_iterations: maximum number of iterations to pass to the balancer
        balance_closure: float precision to stop balancing totals
        input_table: path to CSV to use instead of trips table.

    The config file can also have an orig_zone_trip_targets to manually specify
    origin zone totals instead of using the logsums calculated by the
    destination choice step.

    Parameters
    ----------
    zones : DataFrameWrapper
        zone attributes
    trace_od : list or dict

    Returns
    -------
    Nothing. Balances trips table and writes trace tables
    """

    logger.info('running trip balancing step ...')

    model_settings = config.read_model_settings(YAML_FILENAME)
    trips_df = get_trips_df(model_settings)

    trace_rows = trace.trace_filter(trips_df, trace_od)
    tracing.write_csv(trips_df[trace_rows], file_name='trips_unbalanced', transpose=False)

    trips_df = trips_df.melt(
        id_vars=['orig', 'dest'],
        var_name='segment',
        value_name='trips')

    dest_targets = model_settings.get(DEST_TARGETS)
    orig_targets = model_settings.get(ORIG_TARGETS)

    max_iterations = model_settings.get('max_iterations', 50)
    closure = model_settings.get('balance_closure', 0.001)

    aggregates, dimensions = calculate_aggregates(trips_df, zones.to_frame(), dest_targets, orig_targets)

    balancer = Balancer(trips_df.reset_index(),
                        aggregates,
                        dimensions,
                        weight_col='trips',
                        max_iteration=max_iterations,
                        closure=closure)
    balanced_df = balancer.balance()

    balanced_trips = balanced_df.set_index(['orig', 'dest', 'segment'])['trips'].unstack()
    tracing.write_csv(balanced_trips.reset_index()[trace_rows],
                      file_name='trips_balanced',
                      transpose=False)

    pipeline.replace_table('trips', balanced_trips)

    logger.info('finished balancing trips.')
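
# ----------------------------------------------------------------------------
# Hedged sketch (not from any real model) of the settings dict that
# config.read_model_settings(YAML_FILENAME) could return for this step.
# Key names follow the docstring above; target column names and values are
# hypothetical placeholders.
# ----------------------------------------------------------------------------
_example_balance_trips_settings = {
    'dest_zone_trip_targets': {
        'total': 'dest_trip_target',         # aggregate destination targets (hypothetical column)
        # or per-segment targets instead, e.g. 'work': 'work_trip_target'
    },
    # 'orig_zone_trip_targets': {...},       # optional manual origin totals
    'max_iterations': 50,                    # same fallback the code uses above
    'balance_closure': 0.001,                # precision at which balancing stops
    # 'input_table': 'alternate_trips.csv',  # optional CSV used instead of the trips table
}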