def setup_data_structures(output_dir, settings, configs_dir, households, persons):
    """
    Build and register the crosswalk, control spec, per-geography control
    tables and the household incidence table.

    Households and persons are passed through filter_households and the
    returned frames replace the 'households' and 'persons' pipeline tables.
    When the GROUP_BY_INCIDENCE_SIGNATURE setting is truthy, the grouped
    incidence table is registered as 'incidence_table' and an additional
    'household_groups' table is registered.

    Parameters
    ----------
    output_dir : str
        not read by this function
    settings : mapping
        indexed with 'geographies' to obtain the list of geographies
    configs_dir : str
        passed to read_control_spec to locate the control file
    households : table object exposing to_frame()
    persons : table object exposing to_frame()
    """
    seed_geography = setting('seed_geography')

    hh_df = households.to_frame()
    per_df = persons.to_frame()

    crosswalk_df = build_crosswalk_table()
    inject.add_table('crosswalk', crosswalk_df)

    control_file_name = setting('control_file_name', 'controls.csv')
    control_spec = read_control_spec(control_file_name, configs_dir)
    inject.add_table('control_spec', control_spec)

    # one control table per geography
    for geography in settings['geographies']:
        geography_controls = build_control_table(geography, control_spec, crosswalk_df)
        inject.add_table(control_table_name(geography), geography_controls)

    hh_df, per_df = filter_households(hh_df, per_df, crosswalk_df)
    pipeline.replace_table('households', hh_df)
    pipeline.replace_table('persons', per_df)

    incidence = build_incidence_table(control_spec, hh_df, per_df, crosswalk_df)
    incidence = add_geography_columns(incidence, hh_df, crosswalk_df)

    # carry the household sample weight alongside the incidence columns
    weight_col = setting('household_weight_col')
    incidence['sample_weight'] = hh_df[weight_col]

    if not setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        inject.add_table('incidence_table', incidence)
        return

    grouped_incidence, household_groups = \
        build_grouped_incidence_table(incidence, control_spec, seed_geography)
    inject.add_table('household_groups', household_groups)
    inject.add_table('incidence_table', grouped_incidence)
def add_geography_columns(incidence_table, households_df, crosswalk_df):
    """
    Add seed and meta geography columns to incidence_table.

    The table is modified in place and also returned.

    Parameters
    ----------
    incidence_table : pandas.DataFrame
        receives the two new columns
    households_df : pandas.DataFrame
        supplies the seed geography column
    crosswalk_df : pandas.DataFrame
        must contain both the seed and the meta geography columns

    Returns
    -------
    incidence_table : pandas.DataFrame
    """
    # the first entry of the 'geographies' setting is treated as the meta geography
    meta_geography = setting('geographies')[0]
    seed_geography = setting('seed_geography')

    incidence_table[seed_geography] = households_df[seed_geography]

    # one meta id per seed id; min() picks a single value when the crosswalk
    # has several rows for the same seed zone
    seed_meta_pairs = crosswalk_df[[seed_geography, meta_geography]]
    meta_by_seed = seed_meta_pairs.groupby(seed_geography, as_index=True).min()[meta_geography]

    incidence_table[meta_geography] = incidence_table[seed_geography].map(meta_by_seed)

    return incidence_table
def meta_summary(incidence_df, control_spec, top_geography, top_id, sub_geographies):
    """
    Summarize control totals against weighted incidence for one top-level zone.

    The result has one row per control target.  Column 'control_value' holds
    the control total for top_id; the remaining columns hold the
    incidence-weighted sums using the seed-level weights and the weights of
    each sub geography for which a weight table exists.

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        household incidence table containing a top_geography column
    control_spec : pandas.DataFrame
        its 'target' column names the control columns to summarize
    top_geography : str
    top_id
        id of the zone (value in the top_geography column) to summarize
    sub_geographies : list of str

    Returns
    -------
    summary : pandas.DataFrame
        indexed by control name (index name 'control_name')
    """
    if setting('NO_INTEGERIZATION_EVER', False):
        seed_weight_cols = ['preliminary_balanced_weight', 'balanced_weight']
        sub_weight_cols = ['balanced_weight']
    else:
        seed_weight_cols = ['preliminary_balanced_weight', 'balanced_weight', 'integer_weight']
        sub_weight_cols = ['balanced_weight', 'integer_weight']

    incidence_df = incidence_df[incidence_df[top_geography] == top_id]

    control_cols = control_spec.target.values
    controls_df = get_control_table(top_geography)

    # controls for this geography as series
    controls = controls_df[control_cols].loc[top_id]

    incidence = incidence_df[control_cols]

    summary = pd.DataFrame(index=control_cols)
    summary.index.name = 'control_name'
    summary['control_value'] = controls

    seed_geography = setting('seed_geography')
    seed_weights_df = get_weight_table(seed_geography)

    # seed weight columns are optional (e.g. final balancing may not have run yet)
    for c in seed_weight_cols:
        if c in seed_weights_df:
            summary_col_name = '%s_%s' % (top_geography, c)
            summary[summary_col_name] = \
                incidence.multiply(seed_weights_df[c], axis="index").sum(axis=0)

    # household id column of the weight tables: use the configured name (as
    # summarize does), falling back to the legacy hard-coded 'hh_id'
    household_id_col = setting('household_id_col')

    for g in sub_geographies:

        sub_weights = get_weight_table(g)

        if sub_weights is None:
            continue

        sub_weights = sub_weights[sub_weights[top_geography] == top_id]

        hh_col = household_id_col if household_id_col in sub_weights.columns else 'hh_id'

        # a household can appear in several sub zones; total its weights
        sub_weights = sub_weights[[hh_col] + sub_weight_cols].groupby(hh_col).sum()

        for c in sub_weight_cols:
            summary['%s_%s' % (g, c)] = \
                incidence.multiply(sub_weights[c], axis="index").sum(axis=0)

    return summary
def build_grouped_incidence_table(incidence_table, control_spec, seed_geography):
    """
    Collapse households sharing the same incidence signature into groups.

    The signature is the combination of all control target columns plus the
    seed geography.  A 'group_id' column is added to incidence_table in place.

    Parameters
    ----------
    incidence_table : pandas.DataFrame
        household incidence table with a 'sample_weight' column
    control_spec : pandas.DataFrame
        its 'target' column names the incidence columns
    seed_geography : str

    Returns
    -------
    group_incidence_table : pandas.DataFrame
        one row per group, with summed 'sample_weight', 'group_size' and 'group_id'
    household_groups : pandas.DataFrame
        per-household 'group_id', 'sample_weight' and the household id column
    """
    household_id_col = setting('household_id_col')

    signature_cols = list(control_spec.target) + [seed_geography]
    by_signature = incidence_table.groupby(signature_cols)

    group_incidence_table = by_signature.max()
    group_incidence_table['sample_weight'] = by_signature.sum()['sample_weight']
    group_incidence_table['group_size'] = by_signature.count()['sample_weight']
    group_incidence_table = group_incidence_table.reset_index()

    n_groups = len(group_incidence_table.index)
    n_households = len(incidence_table.index)
    logger.info("grouped incidence table has %s entries, ungrouped has %s"
                % (n_groups, n_households))

    # the positional index after reset_index serves as the group id
    group_incidence_table['group_id'] = group_incidence_table.index

    # look up each household's group by joining on its signature;
    # .values keeps row order and discards the merge's fresh index
    group_lookup = group_incidence_table[signature_cols + ['group_id']]
    matched = incidence_table[signature_cols].merge(group_lookup, on=signature_cols, how='left')
    incidence_table['group_id'] = matched.group_id.astype(int).values

    # grouped and ungrouped incidence tables share an index name so that
    # later steps can treat them interchangeably
    group_incidence_table.index.name = incidence_table.index.name

    # mapping of groups to households and their sample weights, with the
    # household id available as an explicit column
    household_groups = incidence_table[['group_id', 'sample_weight']].copy()
    household_groups[household_id_col] = household_groups.index.astype(int)

    return group_incidence_table, household_groups
def filter_households(households_df, persons_df, crosswalk_df):
    """
    Drop zero-weight households and households outside the seed zones.

    Parameters
    ----------
    households_df : pandas.DataFrame
        must contain the household weight column and the seed geography column
    persons_df : pandas.DataFrame
        returned unchanged (persons are not filtered here)
    crosswalk_df : pandas.DataFrame
        its seed geography column defines the valid seed zone ids

    Returns
    -------
    households_df : pandas.DataFrame
        filtered households
    persons_df : pandas.DataFrame
    """
    # drop any zero weight households (there are some in calm data)
    hh_weight_col = setting('household_weight_col')
    households_df = households_df[households_df[hh_weight_col] > 0]

    # remove any households not in seed zones
    seed_geography = setting('seed_geography')
    seed_ids = crosswalk_df[seed_geography].unique()

    rows_in_seed_zones = households_df[seed_geography].isin(seed_ids)

    # Always apply the mask.  The previous guard (`if rows_in_seed_zones.any()`)
    # skipped the filter exactly when NO household was in a seed zone, so every
    # out-of-zone household was returned.
    households_df = households_df[rows_in_seed_zones]

    logger.info("dropped %s households not in seed zones" % (~rows_in_seed_zones).sum())
    logger.info("kept %s households in seed zones" % len(households_df))

    if len(households_df) == 0:
        logger.warning("no households remain after filtering to seed zones")

    return households_df, persons_df
def repop_setup_data_structures(settings, configs_dir, households, persons):
    """
    Rebuild crosswalk, control spec, control tables and incidence table for a
    repop run, replacing the corresponding pipeline tables.

    The 'households' and 'persons' pipeline tables are NOT replaced.

    Parameters
    ----------
    settings
        not read by this function (settings are accessed through setting())
    configs_dir : str
        passed to read_control_spec to locate the repop control file
    households : table object exposing to_frame()
    persons : table object exposing to_frame()
    """
    seed_geography = setting('seed_geography')
    geographies = setting('geographies')

    # replace crosswalk table
    crosswalk_df = build_crosswalk_table()
    pipeline.replace_table('crosswalk', crosswalk_df)

    # replace control_spec
    control_file_name = setting('repop_control_file_name', 'repop_controls.csv')
    control_spec = read_control_spec(control_file_name, configs_dir)
    pipeline.replace_table('control_spec', control_spec)

    # build incidence_table with repop controls and households in repop zones
    # filter households (dropping any not in crosswalk) in order to build incidence_table
    # We DO NOT REPLACE households and persons as we need full tables to synthesize population
    households_df = households.to_frame()
    persons_df = persons.to_frame()

    # only the filtered households are needed; the incidence table is built
    # from the persons frame as read (the second return value was never used)
    households_df, _ = filter_households(households_df, persons_df, crosswalk_df)

    incidence_table = build_incidence_table(control_spec, households_df, persons_df, crosswalk_df)
    incidence_table = add_geography_columns(incidence_table, households_df, crosswalk_df)

    # add sample_weight col to incidence table
    hh_weight_col = setting('household_weight_col')
    incidence_table['sample_weight'] = households_df[hh_weight_col]

    # rebuild control tables with only the low level controls (aggregated at higher levels)
    for g in geographies:
        controls = build_control_table(g, control_spec, crosswalk_df)
        pipeline.replace_table(control_table_name(g), controls)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        group_incidence_table, household_groups \
            = build_grouped_incidence_table(incidence_table, control_spec, seed_geography)

        pipeline.replace_table('household_groups', household_groups)
        pipeline.replace_table('incidence_table', group_incidence_table)
    else:
        pipeline.replace_table('incidence_table', incidence_table)
def summarize_geography(geography, weight_col, crosswalk_df, results_df, incidence_df):
    """
    Compare control totals with weighted incidence results for every zone of
    a geography.

    Parameters
    ----------
    geography : str
    weight_col : str
        column of results_df holding the weights to apply
    crosswalk_df : pandas.DataFrame
    results_df : pandas.DataFrame
        weights with a geography column and the household id column
    incidence_df : pandas.DataFrame
        household incidence, indexed by household id

    Returns
    -------
    summary_df : pandas.DataFrame
        indexed by '<geography>_<id>', with columns 'geography', 'id' and
        '<control>_control', '<control>_result', '<control>_diff' for each control
    """
    controls_df = get_control_table(geography)
    control_names = controls_df.columns.tolist()

    # only zones that appear both in the crosswalk and in the control table
    zone_ids = controls_df.index.intersection(crosswalk_df[geography].unique())

    control_rows = []
    result_rows = []
    for zone_id in zone_ids:
        control_rows.append(controls_df.loc[zone_id].tolist())

        zone_weights = results_df[results_df[geography] == zone_id]
        zone_incidence = incidence_df.loc[zone_weights[setting('household_id_col')]]
        weights = zone_weights[weight_col].tolist()

        # weighted incidence total for each control in this zone
        result_rows.append([(zone_incidence[name] * weights).sum() for name in control_names])

    control_values = np.asanyarray(control_rows)
    result_values = np.asanyarray(result_rows)

    frames = [
        pd.DataFrame(data=control_values,
                     columns=['%s_control' % name for name in control_names],
                     index=zone_ids),
        pd.DataFrame(data=result_values,
                     columns=['%s_result' % name for name in control_names],
                     index=zone_ids),
        pd.DataFrame(data=result_values - control_values,
                     columns=['%s_diff' % name for name in control_names],
                     index=zone_ids),
    ]
    summary_df = pd.concat(frames, axis=1)
    value_cols = summary_df.columns.tolist()

    summary_df['geography'] = geography
    summary_df['id'] = summary_df.index
    summary_df.index = summary_df['geography'] + '_' + summary_df['id'].astype(str)

    # identifying columns first, then the control/result/diff columns
    return summary_df[['geography', 'id'] + value_cols]
def merge_seed_data(expanded_household_ids, seed_data_df, seed_columns, trace_label):
    """
    Left-join selected seed data columns onto expanded_household_ids by
    household id.

    Parameters
    ----------
    expanded_household_ids : pandas.DataFrame
        must contain the household id column
    seed_data_df : pandas.DataFrame
        seed table, with the household id either as its index or as a column
    seed_columns : list of str
        columns requested from seed_data_df; names missing from seed_data_df
        are reported with a warning and skipped
    trace_label : str
        table label used in the warning message

    Returns
    -------
    merged_df : pandas.DataFrame
        the household id column is removed unless it was listed in seed_columns
    """
    seed_geography = setting('seed_geography')
    hh_col = setting('household_id_col')

    df_columns = seed_data_df.columns.values

    # warn of any columns that aren't in seed_data_df
    for c in seed_columns:
        if c not in df_columns and c != hh_col:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning("column '%s' not in %s" % (c, trace_label))

    # remove any columns that aren't in seed_data_df
    df_columns = [c for c in seed_columns if c in df_columns]

    # seed_geography column in seed_data_df is redundant (already in expanded_household_ids table)
    if seed_geography in df_columns:
        df_columns.remove(seed_geography)

    # join to seed_data on either index or hh_col (for persons)
    right_index = (seed_data_df.index.name == hh_col)
    right_on = hh_col if hh_col in seed_data_df.columns and not right_index else None
    assert right_index or right_on

    # the join key has to be among the selected columns when joining on a column
    if right_on and hh_col not in df_columns:
        df_columns.append(hh_col)

    merged_df = pd.merge(
        how="left",
        left=expanded_household_ids,
        right=seed_data_df[df_columns],
        left_on=hh_col,
        right_index=right_index,
        right_on=right_on
    )

    if hh_col not in seed_columns:
        del merged_df[hh_col]

    return merged_df
def build_crosswalk_table():
    """
    Build the crosswalk table, restricted to the configured geography columns
    and to rows whose lowest-geography zone has control data.

    Returns
    -------
    crosswalk : pandas.DataFrame
    """
    geographies = setting('geographies')

    geo_cross_walk_df = inject.get_table('geo_cross_walk').to_frame()

    # keep only the configured geography columns
    crosswalk = geo_cross_walk_df[geographies]

    # the last configured geography is the lowest one; drop crosswalk rows for
    # low zones without control data (the input may list unused zones)
    low_geography = geographies[-1]
    low_control_data_df = get_control_data_table(low_geography)
    has_low_controls = crosswalk[low_geography].isin(low_control_data_df[low_geography])

    return crosswalk[has_low_controls]
def read_control_spec(data_filename, configs_dir):
    """
    Read the control specification csv file and validate its geographies.

    Parameters
    ----------
    data_filename : str
        name of the control file inside configs_dir
    configs_dir : str

    Returns
    -------
    control_spec : pandas.DataFrame

    Raises
    ------
    RuntimeError
        if the file does not exist, has no 'geography' column, or names a
        geography that is not in the 'geographies' setting
    """
    data_file_path = os.path.join(configs_dir, data_filename)
    if not os.path.exists(data_file_path):
        raise RuntimeError(
            "initial_seed_balancing - control file not found: %s" % (data_file_path,))

    logger.info("Reading control file %s" % data_file_path)
    # lines starting with '#' in the control file are treated as comments
    control_spec = pd.read_csv(data_file_path, comment='#')

    geographies = setting('geographies')

    if 'geography' not in control_spec.columns:
        raise RuntimeError("missing geography column in controls file")

    # report the first geography that is not one of the configured geographies
    unknown = [g for g in control_spec.geography.unique() if g not in geographies]
    if unknown:
        raise RuntimeError("unknown geography column '%s' in control file" % unknown[0])

    return control_spec
def build_incidence_table(control_spec, households_df, persons_df, crosswalk_df):
    """
    Build the household incidence table: one row per household and one column
    per control target.

    Each control's expression is evaluated with assign_variable against its
    seed table; person-level results are summed to the household.

    Parameters
    ----------
    control_spec : pandas.DataFrame
        rows provide 'target', 'seed_table' and 'expression'
    households_df : pandas.DataFrame
    persons_df : pandas.DataFrame
        must contain the household id column
    crosswalk_df : pandas.DataFrame
        not read by this function

    Returns
    -------
    incidence_table : pandas.DataFrame
        indexed like households_df
    """
    hh_col = setting('household_id_col')

    incidence_table = pd.DataFrame(index=households_df.index)

    seed_tables = {'households': households_df, 'persons': persons_df}

    for control_row in control_spec.itertuples():
        target = control_row.target
        seed_table_name = control_row.seed_table
        expression = control_row.expression

        logger.info("control target %s" % target)
        logger.debug("control_row.seed_table %s" % seed_table_name)
        logger.debug("control_row.expression %s" % expression)

        # the trace results returned alongside the incidence are not used
        incidence, _ = assign_variable(
            target=target,
            expression=expression,
            df=seed_tables[seed_table_name],
            locals_dict={'np': np},
            df_alias=seed_table_name,
            trace_rows=None)

        # multiplying by one turns boolean True/False into 1/0
        incidence = incidence * 1

        if seed_table_name == 'persons':
            # total the person-level incidence within each household
            person_incidence = pd.DataFrame({
                hh_col: persons_df[hh_col],
                'incidence': incidence
            })
            incidence = person_incidence.groupby([hh_col], as_index=True).sum()

        incidence_table[target] = incidence

    return incidence_table
def write_results(output_dir):
    """
    Write pipeline tables to output_dir as csv files.

    By default all checkpointed tables are written.  The optional
    'output_tables' setting can restrict this with an 'action' of either
    'include' (write only the listed 'tables') or 'skip' (write everything
    except the listed 'tables').

    Parameters
    ----------
    output_dir : str

    Raises
    ------
    RuntimeError
        if the 'output_tables' action is neither 'include' nor 'skip'
    """
    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables = pipeline.checkpointed_tables()

    if output_tables_settings is not None:

        action = output_tables_settings.get('action')
        tables = output_tables_settings.get('tables')

        if action not in ['include', 'skip']:
            # raise a real exception: raising a plain string is itself a TypeError
            raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                               (output_tables_settings_name, action))

        if action == 'include':
            output_tables = tables
        elif action == 'skip':
            output_tables = [t for t in output_tables if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables.append("checkpoints.csv")

    for table_name in output_tables:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # only write the index when it carries a name
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
def input_pre_processor():
    """
    Read the input csv files named in the table list setting and register
    each one as a table.

    The name of the table list setting may be overridden with the
    'table_list' step argument (default 'input_table_list').  Each entry of
    the list is a dict with a 'tablename' and 'filename' and, optionally,
    'drop_columns', 'column_map' and 'index_col'.

    Raises
    ------
    RuntimeError
        if an entry has no filename or its input file does not exist
    """
    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list', default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']

        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file
        data_filename = table_info.get('filename', None)
        if data_filename is None:
            # fail with a clear message instead of inside os.path.join
            raise RuntimeError(
                "input_pre_processor %s - no filename specified" % (tablename, ))

        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError(
                "input_pre_processor %s - input file not found: %s"
                % (tablename, data_file_path, ))

        logger.info("Reading csv file %s" % data_file_path)
        df = pd.read_csv(data_file_path, comment='#')

        # was a bare debug print statement; routed through the logger instead
        logger.debug("%s columns: %s" % (tablename, list(df.columns)))

        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                # the index column must uniquely identify rows
                assert not df.duplicated(index_col).any()
                df.set_index(index_col, inplace=True)
            else:
                # no such column: just name the existing index
                df.index.names = [index_col]

        # NOTE: support for a per-table 'expression_filename' (computed
        # columns) was sketched here as commented-out code; it is not implemented.

        logger.info("adding table %s" % tablename)

        inject.add_table(tablename, df)
def repop_setup_data_structures(configs_dir, households, persons):
    """
    Setup geographic correspondence (crosswalk), control sets, and incidence tables for repop run.

    New lowest-level geography control tables should already have been read in by rerunning
    input_pre_processor with a table_list override. The control table contains one row for
    each zone, with columns specifying control field totals for that control.

    This step reads in the repop control file, which specifies which control fields in the
    control table should be used for balancing, along with their importance and the recipe
    (seed table and expression) for determining household incidence for that control.

    Parameters
    ----------
    configs_dir : str
    households : pipeline table
    persons : pipeline table

    Raises
    ------
    AssertionError
        if the repop control spec has controls for anything other than the
        lowest-level geography
    """
    seed_geography = setting('seed_geography')
    geographies = setting('geographies')
    low_geography = geographies[-1]

    # replace crosswalk table
    crosswalk_df = build_crosswalk_table()
    pipeline.replace_table('crosswalk', crosswalk_df)

    # replace control_spec
    control_file_name = setting('repop_control_file_name', 'repop_controls.csv')
    control_spec = read_control_spec(control_file_name, configs_dir)

    # repop control spec should only specify controls for lowest level geography
    # (compare as lists: comparing the ndarray from unique() to a list gives an
    # element-wise array whose truth value is ambiguous for more than one entry)
    repop_geographies = list(control_spec.geography.unique())
    assert repop_geographies == [low_geography], \
        "repop control spec should only have '%s' controls but has %s" \
        % (low_geography, repop_geographies)

    pipeline.replace_table('control_spec', control_spec)

    # build incidence_table with repop controls and households in repop zones
    # filter households (dropping any not in crosswalk) in order to build incidence_table
    # We DO NOT REPLACE households and persons as we need full tables to synthesize population
    # (There is no problem, however, with overwriting the incidence_table and household_groups
    # because the expand_households step has ALREADY created the expanded_household_ids table
    # for the original simulated population.)
    households_df = households.to_frame()
    persons_df = persons.to_frame()
    households_df, persons_df = filter_households(households_df, persons_df, crosswalk_df)
    incidence_table = build_incidence_table(control_spec, households_df, persons_df, crosswalk_df)
    incidence_table = add_geography_columns(incidence_table, households_df, crosswalk_df)

    # add sample_weight col to incidence table
    hh_weight_col = setting('household_weight_col')
    incidence_table['sample_weight'] = households_df[hh_weight_col]

    # rebuild control tables with only the low level controls (aggregated at higher levels)
    for g in geographies:
        controls = build_control_table(g, control_spec, crosswalk_df)
        pipeline.replace_table(control_table_name(g), controls)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        group_incidence_table, household_groups \
            = build_grouped_incidence_table(incidence_table, control_spec, seed_geography)

        pipeline.replace_table('household_groups', household_groups)
        pipeline.replace_table('incidence_table', group_incidence_table)
    else:
        pipeline.replace_table('incidence_table', incidence_table)
def setup_data_structures(settings, configs_dir, households, persons):
    """
    Setup geographic correspondence (crosswalk), control sets, and incidence tables.

    Control tables for the target geographies should already have been read in by
    running input_pre_processor. Each zone control table contains one row per zone,
    with columns specifying control field totals for that zone.

    This step reads in the global control file, which specifies which control fields
    in the control table should be used for balancing, along with their importance and
    the recipe (seed table and expression) for determining household incidence for
    that control.

    If the GROUP_BY_INCIDENCE_SIGNATURE setting is enabled, then incidence table rows are
    household groups and an additional household_groups table is created mapping
    group ids to actual household ids.

    Parameters
    ----------
    settings : dict
        contents of settings.yaml as dict
    configs_dir : str
    households : pipeline table
    persons : pipeline table

    creates pipeline tables:
        crosswalk
        control_spec
        geography-specific controls
        incidence_table
        household_groups (if GROUP_BY_INCIDENCE_SIGNATURE setting is enabled)

    modifies tables:
        households
        persons
    """
    seed_geography = setting('seed_geography')

    seed_households = households.to_frame()
    seed_persons = persons.to_frame()

    crosswalk_df = build_crosswalk_table()
    inject.add_table('crosswalk', crosswalk_df)

    control_spec = read_control_spec(setting('control_file_name', 'controls.csv'), configs_dir)
    inject.add_table('control_spec', control_spec)

    # register a control table for every configured geography
    for geography in settings['geographies']:
        geography_controls = build_control_table(geography, control_spec, crosswalk_df)
        inject.add_table(control_table_name(geography), geography_controls)

    # the filtered seed tables replace the ones read by the input step
    seed_households, seed_persons = \
        filter_households(seed_households, seed_persons, crosswalk_df)
    pipeline.replace_table('households', seed_households)
    pipeline.replace_table('persons', seed_persons)

    incidence_table = add_geography_columns(
        build_incidence_table(control_spec, seed_households, seed_persons, crosswalk_df),
        seed_households,
        crosswalk_df)

    # sample weights travel with the incidence rows
    incidence_table['sample_weight'] = seed_households[setting('household_weight_col')]

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        grouped_incidence, household_groups = \
            build_grouped_incidence_table(incidence_table, control_spec, seed_geography)
        inject.add_table('household_groups', household_groups)
        incidence_table = grouped_incidence

    inject.add_table('incidence_table', incidence_table)
def write_synthetic_population(expanded_household_ids, households, persons, output_dir):
    """
    Write synthetic households and persons tables to output dir as csv files.

    The 'output_synthetic_population' setting gives, for each of 'households'
    and 'persons', the seed 'columns' to include and an optional output
    'filename', plus the name of the synthetic 'household_id' column
    (default 'HH_ID').

    Parameters
    ----------
    expanded_household_ids : pipeline table
    households : pipeline table
    persons : pipeline table
    output_dir : str

    Raises
    ------
    RuntimeError
        if the settings entry or a table's options are missing, or if the
        synthetic household id column name also appears in a seed column list
    """
    expanded_household_ids = expanded_household_ids.to_frame()
    households = households.to_frame()
    persons = persons.to_frame()

    SETTINGS_NAME = 'output_synthetic_population'
    synthetic_tables_settings = setting(SETTINGS_NAME)
    if synthetic_tables_settings is None:
        raise RuntimeError("'%s' not found in settings" % SETTINGS_NAME)

    # looked up as in the original step; the value itself is not used below
    hh_col = setting('household_id_col')
    synthetic_hh_col = synthetic_tables_settings.get('household_id', 'HH_ID')

    # - assign household_ids to synthetic population: consecutive ids starting at 1
    expanded_household_ids.reset_index(drop=True, inplace=True)
    expanded_household_ids['synthetic_hh_id'] = expanded_household_ids.index + 1

    def table_options(table_name):
        # fetch and validate the output options of one table
        options = synthetic_tables_settings.get(table_name, None)
        if options is None:
            raise RuntimeError("Options for '%s' not found in '%s' in settings" %
                               (table_name, SETTINGS_NAME))

        seed_columns = options.get('columns')

        # the synthetic id column must not clash (case-insensitively) with a seed column
        if synthetic_hh_col.lower() in [c.lower() for c in seed_columns]:
            raise RuntimeError("synthetic household_id column '%s' also appears in seed column list"
                               % synthetic_hh_col)

        return options, seed_columns

    def output_path(options, table_name):
        # csv path of one table, defaulting to '<table_name>.csv'
        return os.path.join(output_dir, options.get('filename', '%s.csv' % table_name))

    id_rename = {'synthetic_hh_id': synthetic_hh_col}

    # - households
    hh_options, hh_seed_columns = table_options('households')

    synthetic_households = merge_seed_data(
        expanded_household_ids,
        households,
        seed_columns=hh_seed_columns,
        trace_label='households')

    # the synthetic household id becomes the index of the households file
    synthetic_households.rename(columns=id_rename, inplace=True)
    synthetic_households.set_index(synthetic_hh_col, inplace=True)
    synthetic_households.to_csv(output_path(hh_options, 'households'), index=True)

    # - persons
    per_options, per_seed_columns = table_options('persons')

    synthetic_persons = merge_seed_data(
        expanded_household_ids,
        persons,
        seed_columns=per_seed_columns,
        trace_label='persons')

    # FIXME drop or rename old seed hh_id column?
    synthetic_persons.rename(columns=id_rename, inplace=True)
    synthetic_persons.to_csv(output_path(per_options, 'persons'), index=False)
def build_control_table(geo, control_spec, crosswalk_df):
    """
    Build the control table for one geography, including the controls of all
    geographies listed after it aggregated (summed) up to geo.

    Parameters
    ----------
    geo : str
        target geography; must be one of the configured geographies
    control_spec : pandas.DataFrame
        with 'geography', 'control_field' and 'target' columns
    crosswalk_df : pandas.DataFrame
        used to map sub geography zone ids to geo zone ids

    Returns
    -------
    controls : pandas.DataFrame
        indexed by geo zone id, one column per control target, in control_spec order
    """
    geographies = setting('geographies')
    assert geo in geographies

    # the target geography and everything listed after it
    control_geographies = geographies[geographies.index(geo):]

    # spec rows of the relevant geographies only
    control_spec = control_spec[control_spec['geography'].isin(control_geographies)]

    controls_list = []

    for g in control_geographies:

        spec = control_spec[control_spec['geography'] == g]

        # a geography may have no controls at all (e.g. seed)
        if not len(spec.index):
            continue

        control_data_df = get_control_data_table(g)
        control_data_columns = [geo] + spec.control_field.tolist()

        if g == geo:
            # the geography's own control data: index it by the zone id column
            assert geo in control_data_df.columns
            controls = control_data_df[control_data_columns].set_index(geo)
        else:
            # sub geography control data may lack the geo column; derive it
            # from the crosswalk (one geo id per sub zone)
            if geo not in control_data_df.columns:
                sub_to_geog = crosswalk_df[[g, geo]].groupby(g, as_index=True).min()[geo]
                control_data_df[geo] = control_data_df[g].map(sub_to_geog)

            # sum sub zone control totals within each geo zone
            controls = control_data_df[control_data_columns].groupby(geo, as_index=True).sum()

        controls_list.append(controls)

    # place the per-geography control columns side by side
    controls = pd.concat(controls_list, axis=1)

    # control data field names -> control target names
    field_to_target = dict(zip(control_spec.control_field, control_spec.target))
    controls.rename(columns=field_to_target, inplace=True)

    # column order follows the control_spec rows
    return controls[control_spec.target]
def summarize(crosswalk, incidence_table, control_spec):
    """
    Produce aggregate summary tables of controls and weights for all
    geographic levels and hand them to out_table.

    Tables emitted: one '<meta_geography>_<id>' summary per meta zone; for
    each sub geography with a weight table, '<geography>_aggregate',
    '<geography>_<seed_geography>' and '<geography>'; and finally 'hh_weights'.

    Parameters
    ----------
    crosswalk : pipeline table
    incidence_table : pipeline table
    control_spec : pipeline table
    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()

    geographies = setting('geographies')
    seed_geography = setting('seed_geography')
    meta_geography = geographies[0]
    # geographies listed after the seed geography
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    household_id_col = setting('household_id_col')

    # one summary table per meta zone
    for meta_id in crosswalk_df[meta_geography].unique():
        meta_summary_df = meta_summary(
            incidence_df, control_spec, meta_geography, meta_id, sub_geographies)
        out_table('%s_%s' % (meta_geography, meta_id), meta_summary_df)

    hh_weights_summary = pd.DataFrame(index=incidence_df.index)

    # seed level weights
    seed_weights_df = get_weight_table(seed_geography)
    hh_weights_summary['%s_balanced_weight' % seed_geography] = seed_weights_df['balanced_weight']
    hh_weights_summary['%s_integer_weight' % seed_geography] = seed_weights_df['integer_weight']

    for geography in sub_geographies:

        weights_df = get_weight_table(geography)
        if weights_df is None:
            continue

        # per-household totals over all zones of this geography
        hh_weights = weights_df[
            [household_id_col, 'balanced_weight', 'integer_weight']
        ].groupby([household_id_col]).sum()
        hh_weights_summary['%s_balanced_weight' % geography] = hh_weights['balanced_weight']
        hh_weights_summary['%s_integer_weight' % geography] = hh_weights['integer_weight']

        # aggregate the sub geography weights to seed level
        hh_id_col = incidence_df.index.name
        aggregate_weights = weights_df.groupby([seed_geography, hh_id_col], as_index=False).sum()
        aggregate_weights.set_index(hh_id_col, inplace=True)
        aggregate_weights = aggregate_weights[[seed_geography, 'balanced_weight', 'integer_weight']]

        # sample and seed-level weights alongside, for comparison
        aggregate_weights['sample_weight'] = incidence_df['sample_weight']
        aggregate_weights['%s_preliminary_balanced_weight' % seed_geography] = \
            seed_weights_df['preliminary_balanced_weight']
        aggregate_weights['%s_balanced_weight' % seed_geography] = \
            seed_weights_df['balanced_weight']
        aggregate_weights['%s_integer_weight' % seed_geography] = \
            seed_weights_df['integer_weight']

        out_table('%s_aggregate' % (geography, ), aggregate_weights)

        # controls vs. integer-weighted results, by seed zone ...
        seed_level_summary = summarize_geography(
            seed_geography, 'integer_weight', crosswalk_df, weights_df, incidence_df)
        out_table('%s_%s' % (geography, seed_geography, ), seed_level_summary)

        # ... and by zone of this geography
        geography_summary = summarize_geography(
            geography, 'integer_weight', crosswalk_df, weights_df, incidence_df)
        out_table('%s' % (geography, ), geography_summary)

    out_table('hh_weights', hh_weights_summary)
def _melt_control_summary(df):
    """
    Return df's '<var>_control' / '<var>_result' / '<var>_diff' columns in long
    format (geography, id, variable, control, result, diff), or None when df
    has no '<var>_control' columns or lacks the 'geography' and 'id' columns.
    """
    id_vars = ["geography", "id"]
    suffix = "_control"

    control_vars = [
        column[:-len(suffix)] for column in df.columns
        if hasattr(column, 'endswith') and column.endswith(suffix)
    ]
    if not control_vars or not set(id_vars).issubset(df.columns):
        return None

    logger.debug("control variables for melt %s" % str(control_vars))

    melt_df = None
    for kind in ("control", "result", "diff"):
        col_names = ["%s_%s" % (cv, kind) for cv in control_vars]
        # melt one kind of column, then strip the suffix from the variable names
        part = df.melt(id_vars=id_vars, value_vars=col_names, value_name=kind) \
            .replace(to_replace=dict(zip(col_names, control_vars)))
        if melt_df is None:
            melt_df = part
        else:
            melt_df = pd.merge(left=melt_df, right=part, how="left",
                               on=id_vars + ["variable"])
    return melt_df


def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    Pipeline tables are intermediate computational tables, not to be confused with the
    synthetic population tables written by the write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls and weights
    tables for the various geographies. For example, if one of your geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    In addition, every written table that has '<var>_control' columns
    contributes rows to a combined long-format 'summary_melt.csv'
    (columns: geography, id, variable, control, result, diff).  This part is
    best-effort: a table that cannot be melted is reported and skipped.

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        if the 'output_tables' action is neither 'include' nor 'skip'
    """
    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    logger.debug("output_tables_list: %s" % str(output_tables_list))

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    melted_frames = []

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # only write the index when it carries a name
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

        # the melted summary is a convenience output: failing to build it for
        # one table must not stop the remaining tables from being written
        try:
            melt_df = _melt_control_summary(df)
        except Exception as err:
            logger.warning("could not build summary melt for '%s': %s" % (table_name, err))
            continue

        if melt_df is not None:
            melted_frames.append(melt_df)

    if melted_frames:
        # pd.concat replaces repeated DataFrame.append (removed in pandas 2.0)
        summary_melt_df = pd.concat(melted_frames)

        if len(summary_melt_df) > 0:
            file_name = "summary_melt.csv"
            logger.info("writing output file %s" % file_name)
            file_path = os.path.join(output_dir, file_name)
            # the index is only the leftover row numbers of the individual melts
            summary_melt_df.to_csv(file_path, index=False)
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Re-balance household weights per seed zone using every row of the control spec.

    Each seed zone is balanced independently, starting from the households'
    'sample_weight' and targeting that zone's row of the seed control table.
    The resulting weights are stored as column 'balanced_weight' of the seed
    weight table.

    When the control spec has no rows for the first (meta) geography, no
    balancing is run: 'balanced_weight' is copied from
    'preliminary_balanced_weight' unless the column already exists.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for some seed zone
    """

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    target_table = weight_table_name(seed_geography)

    # without meta controls the preliminary weights are already the final ones
    meta_geography = settings['geographies'][0]
    has_meta_controls = (spec.geography == meta_geography).any()
    if not has_meta_controls:
        logger.warning(
            "no need for final_seed_balancing because no meta controls")
        current_weights = get_weight_table(seed_geography)
        if 'balanced_weight' not in current_weights:
            inject.add_column(target_table, 'balanced_weight',
                              current_weights['preliminary_balanced_weight'])
        return

    # every control_spec row is used here, so (unlike initial_seed_balancing)
    # there is no filtering of the spec by geography
    seed_controls = get_control_table(seed_geography)
    assert (seed_controls.columns == spec.target).all()

    total_hh_control_col = setting('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    # per-seed relaxation factors, one column per seed id (not used further here)
    relaxation_factors = pd.DataFrame(index=seed_controls.columns.tolist())

    per_seed_weights = []
    for seed_id in xwalk[seed_geography].unique():

        logger.info("final_seed_balancing seed id %s" % seed_id)

        in_seed = incidence[seed_geography] == seed_id
        seed_incidence = incidence[in_seed]

        status, balanced, controls = do_balancing(
            control_spec=spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence,
            control_totals=seed_controls.loc[seed_id],
            initial_weights=seed_incidence['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "final_seed_balancing for seed_id %s did not converge" % seed_id)

        per_seed_weights.append(balanced['final'])
        relaxation_factors[seed_id] = controls['relaxation_factor']

    # stitch the per-seed results together and publish them
    inject.add_column(target_table, 'balanced_weight', pd.concat(per_seed_weights))
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Compute preliminary balanced household weights for every seed zone.

    Each seed zone is balanced independently against the control spec rows
    for the seed geography and the geographies listed after it in settings,
    starting from the households' 'sample_weight'.

    Adds the seed weights table to the pipeline under the name returned by
    weight_table_name(seed_geography) (e.g. PUMA_weights):

    +--------+------+-----------------------------+-------+
    | index  | PUMA | preliminary_balanced_weight | hh_id |
    | hh_id  |      |                             |       |
    +========+======+=============================+=======+
    | 0      | 600  |                    0.313555 |     0 |
    | 1      | 601  |                    0.627110 |     1 |
    | 2      | 602  |                    0.313555 |     2 |
    | ...    |      |                             |       |
    +--------+------+-----------------------------+-------+

    The table also carries a 'sample_weight' column and, when the step arg
    'final' is set, a 'balanced_weight' copy of the preliminary weights.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for some seed zone
    """

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls = get_control_table(seed_geography)

    # restrict the spec to the seed geography and everything listed after it
    geographies = settings['geographies']
    first_seed_level = geographies.index(seed_geography)
    seed_and_below = geographies[first_seed_level:]
    seed_spec = spec[spec['geography'].isin(seed_and_below)]

    total_hh_control_col = setting('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)
    min_expansion_factor = settings.get('min_expansion_factor', None)

    balanced_parts = []
    sample_parts = []
    for seed_id in xwalk[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence = incidence[incidence[seed_geography] == seed_id]
        seed_sample_weights = seed_incidence['sample_weight']

        status, seed_result, _ = do_balancing(
            control_spec=seed_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            min_expansion_factor=min_expansion_factor,
            incidence_df=seed_incidence,
            control_totals=seed_controls.loc[seed_id],
            initial_weights=seed_sample_weights)

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" % seed_id)

        seed_balanced = seed_result['final']
        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, seed_balanced.sum()))

        balanced_parts.append(seed_balanced)
        sample_parts.append(seed_incidence['sample_weight'])

    # one row per household, carrying its seed zone id plus both weight columns
    weights_table = incidence[[seed_geography]].copy()
    weights_table['preliminary_balanced_weight'] = pd.concat(balanced_parts)
    weights_table['sample_weight'] = pd.concat(sample_parts)

    # expose the index (household id) as an ordinary column as well
    weights_table[setting('household_id_col')] = weights_table.index

    # shortcut for runs without meta controls: preliminary weights are final
    if inject.get_step_arg('final', default=False):
        weights_table['balanced_weight'] = weights_table['preliminary_balanced_weight']

    replace_existing = inject.get_step_arg('repop', default=False)
    inject.add_table(weight_table_name(seed_geography), weights_table,
                     replace=replace_existing)
def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize all zones at the lowest geographic level.

    For every seed zone, and every low-geography zone the crosswalk places in
    it, the seed zone's 'balanced_weight' values are scaled by the ratio of
    the low zone's to the seed zone's total household control, balanced
    against the low zone's controls, and then integerized.

    Creates (replacing any existing tables) the weight table for the low
    geography with float 'balanced_weight' and 'integer_weight' columns, and
    its sparse counterpart holding only rows with integer_weight > 0.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec: pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for some low-geography zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)
    min_expansion_factor = settings.get('min_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # this message previously named initial_seed_balancing (copy-paste slip),
        # which made repop runs hard to follow in the log
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # initial seed weights for this seed zone, indexed by hh id
        seed_weights_df = all_seed_weights_df[
            all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id,
                                           low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # (it doesn't make sense to repop balance with integer weights)
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df['balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                min_expansion_factor=min_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" %
                        (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" %
                                   trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            # NOTE(review): the full control_spec is passed here while balancing
            # above uses low_control_spec - confirm this is intended
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info(
                "Total balanced weights for %s = %s" %
                (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s" %
                        (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries:
    # look up the crosswalk row of each weight row's low zone, in the same row
    # order, then paste the two frames side by side
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography),
                     low_weights_df, replace=True)
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0],
                     replace=True)
def expand_households():
    """
    Build the expanded synthetic household id list with assigned zone ids.

    The sparse low-geography weight table is expanded so that each row is
    repeated integer_weight times, giving one row per synthetic household with
    the geography columns (seed geography and those listed after it) and the
    seed household id.

    When GROUP_BY_INCIDENCE_SIGNATURE is set, the id column of the weight table
    holds a group id; a household is then drawn at random for every row from
    the members of that group, with probability proportional to sample_weight.

    Step args 'append' and 'replace' (mutually exclusive) merge the new rows
    with the existing expanded_household_ids table.

    Creates pipeline table expanded_household_ids
    """

    if setting('NO_INTEGERIZATION_EVER', False):
        logger.warning("skipping expand_households: NO_INTEGERIZATION_EVER")
        inject.add_table('expanded_household_ids', pd.DataFrame())
        return

    geographies = setting('geographies')
    household_id_col = setting('household_id_col')
    low_geography = geographies[-1]

    # low_geography is the only column strictly required, but keep seed and below
    seed_geography = setting('seed_geography')
    seed_position = geographies.index(seed_geography)
    geography_cols = geographies[seed_position:]

    sparse_weights = get_weight_table(low_geography, sparse=True)
    sparse_weights = sparse_weights[geography_cols + [household_id_col, 'integer_weight']]

    # - repeat each row integer_weight times: one row per desired household
    column_names = sparse_weights.columns.values
    repeated_rows = np.repeat(sparse_weights.values,
                              sparse_weights.integer_weight.values,
                              axis=0)
    expanded_weights = pd.DataFrame(data=repeated_rows, columns=column_names)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):

        # what the weight table calls the household id is actually a group id
        expanded_weights = expanded_weights.rename(columns={household_id_col: 'group_id'})

        # one row per original household, with its group and sample weight
        household_groups = pipeline.get_table('household_groups')
        household_groups = household_groups[[household_id_col, 'group_id', 'sample_weight']]

        # candidates[group_id] == [<hh ids in group>, <their relative probabilities>]
        grouped = household_groups.groupby('group_id')
        candidates = [0] * len(grouped)
        for gid, members in grouped:
            member_weights = members.sample_weight
            candidates[gid] = [
                list(members[household_id_col]),
                list(member_weights / member_weights.sum()),
            ]

        # FIXME - should sample without replacement?

        def pick_household(gid):
            # draw one household id from the group according to its probabilities
            member_ids, member_probs = candidates[gid]
            return np.random.choice(member_ids, p=member_probs)

        expanded_weights[household_id_col] = \
            expanded_weights.group_id.apply(pick_household, convert_dtype=True)

        # FIXME - omit in production?
        del expanded_weights['group_id']
        del expanded_weights['integer_weight']

    append = inject.get_step_arg('append', False)
    replace = inject.get_step_arg('replace', False)
    assert not (append and replace), \
        "can't specify both append and replace for expand_households"

    if append or replace:
        existing = inject.get_table('expanded_household_ids').to_frame()
        prev_hhs = len(existing.index)
        added_hhs = len(expanded_weights.index)

        if replace:
            # FIXME - should really get from crosswalk table?
            # drop the existing rows of every low zone present in the new rows
            replaced_low_ids = expanded_weights[low_geography].unique()
            existing = existing[~existing[low_geography].isin(replaced_low_ids)]

        expanded_weights = pd.concat([existing, expanded_weights], ignore_index=True)

        dropped_hhs = prev_hhs - len(existing.index)
        final_hhs = len(expanded_weights.index)
        op = 'append' if append else 'replace'
        logger.info(
            "expand_households op: %s prev hh count %s dropped %s added %s final %s" %
            (op, prev_hhs, dropped_hhs, added_hhs, final_hhs))

    replace_existing = inject.get_step_arg('repop', default=False)
    inject.add_table('expanded_household_ids', expanded_weights, replace=replace_existing)
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the balanced household weights of every seed zone.

    For each seed zone the households' 'balanced_weight' values are passed to
    do_integerizing together with that zone's control totals and the incidence
    columns named by the control spec targets.

    Adds integer_weight column to seed-level weight table. Does nothing when
    the NO_INTEGERIZATION_EVER setting is truthy.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    if setting('NO_INTEGERIZATION_EVER', False):
        logger.warning(
            "skipping integerize_final_seed_weights: NO_INTEGERIZATION_EVER")
        return

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls = get_control_table(seed_geography)
    seed_weights = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = spec.target
    assert (seed_controls.columns == control_cols).all()

    total_hh_control_col = setting('total_hh_control')

    def integerize_seed(seed_id):
        # integerize the balanced weights of the households in one seed zone
        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        seed_incidence = incidence[incidence[seed_geography] == seed_id]
        in_seed = seed_weights[seed_geography] == seed_id
        float_weights = seed_weights.loc[in_seed, 'balanced_weight']

        integer_weights, status = do_integerizing(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=spec,
            control_totals=seed_controls.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=float_weights,
            total_hh_control_col=total_hh_control_col)
        return integer_weights

    seed_ids = xwalk[seed_geography].unique()
    per_seed_integer_weights = [integerize_seed(seed_id) for seed_id in seed_ids]

    # combine the per-seed results into a single column
    integer_seed_weights = pd.concat(per_seed_integer_weights)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight',
                      integer_seed_weights)
# Add (and handle) 'standard' activitysim arguments: # --config : specify path to config_dir # --output : specify path to output_dir # --data : specify path to data_dir # --models : specify run_list name # --resume : resume_after handle_standard_args() tracing.config_logger() t0 = print_elapsed_time() logger = logging.getLogger('populationsim') logger.info("GROUP_BY_INCIDENCE_SIGNATURE: %s" % setting('GROUP_BY_INCIDENCE_SIGNATURE')) logger.info("INTEGERIZE_WITH_BACKSTOPPED_CONTROLS: %s" % setting('INTEGERIZE_WITH_BACKSTOPPED_CONTROLS')) logger.info("SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS: %s" % setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS')) logger.info("meta_control_data: %s" % setting('meta_control_data')) logger.info("control_file_name: %s" % setting('control_file_name')) logger.info("USE_CVXPY: %s" % lp.use_cvxpy()) logger.info("USE_SIMUL_INTEGERIZER: %s" % multi_integerizer.use_simul_integerizer()) # get the run list (name was possibly specified on the command line with the -m option) run_list_name = inject.get_injectable('run_list_name', 'run_list') # run list from settings file is dict with list of 'steps' and optional 'resume_after'
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of the 'geography' step arg, grouped by parent zone.

    The parent geography is the one listed immediately before the target
    geography in settings. For every seed zone and every parent zone the
    crosswalk places in it, balance_and_integerize is run starting from the
    parent zone's weights.

    Creates the weight table for the target geography, its sparse counterpart
    (rows with integer_weight > 0) and, when a trace id is configured for the
    geography under 'trace_geography', a trace table holding that zone's rows.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    # the target geography arrives as an injected model step arg
    geography = inject.get_step_arg('geography')

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    level = geographies.index(geography)
    parent_geography = geographies[level - 1]
    sub_geographies = geographies[level:]
    parent_geographies = geographies[:level]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls = get_control_table(geography)

    parent_weights_table = get_weight_table(parent_geography)
    assert parent_weights_table is not None

    zone_results = []

    for seed_id in xwalk[seed_geography].unique():

        seed_incidence = incidence[incidence[seed_geography] == seed_id]
        seed_xwalk = xwalk[xwalk[seed_geography] == seed_id]

        # a seed zone must map to exactly one meta zone
        assert len(seed_xwalk[meta_geography].unique()) == 1

        for parent_id in seed_xwalk[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            in_parent = parent_weights_table[parent_geography] == parent_id
            parent_rows = parent_weights_table[in_parent].set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                weight_col = 'balanced_weight'
            else:
                weight_col = 'integer_weight'
            initial_weights = parent_rows[weight_col]

            assert len(initial_weights.index) == len(seed_incidence.index)

            zone_weights = balance_and_integerize(
                incidence_df=seed_incidence,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls,
                control_spec=spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_xwalk)

            # add higher level geography id columns to facilitate summaries
            higher_ids = xwalk.loc[xwalk[parent_geography] == parent_id,
                                   parent_geographies].max(axis=0)
            for higher_geography in higher_ids.index:
                zone_weights[higher_geography] = higher_ids[higher_geography]

            zone_results.append(zone_weights)

    all_weights = pd.concat(zone_results)

    inject.add_table(weight_table_name(geography), all_weights)
    inject.add_table(weight_table_name(geography, sparse=True),
                     all_weights[all_weights['integer_weight'] > 0])

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_id = settings.get('trace_geography')[geography]
        traced_rows = all_weights[all_weights[geography] == trace_id]
        inject.add_table('trace_%s' % weight_table_name(geography), traced_rows)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a given geographic level, parent by parent.

    The level is given by the 'geography' step arg; its parent is the
    geography listed immediately before it in settings. For example, with
    geography 'TRACT' and parent 'SEED', the TRACTs contained in each seed
    zone are balanced together.

    Only parent zones that appear in the parent control table's index are
    processed.

    Creates a weight table for the target geography with float
    'balanced_weight' and 'integer_weight' columns, its sparse counterpart
    (rows with integer_weight > 0) and, when a trace id is configured for the
    geography under 'trace_geography', a trace table holding that zone's rows.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    # the target geography arrives as an injected model step arg
    geography = inject.get_step_arg('geography')

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    level = geographies.index(geography)
    parent_geography = geographies[level - 1]
    sub_geographies = geographies[level:]
    parent_geographies = geographies[:level]

    total_hh_control_col = setting('total_hh_control')

    parent_controls = get_control_table(parent_geography)
    sub_controls = get_control_table(geography)

    parent_weights_table = get_weight_table(parent_geography)
    assert parent_weights_table is not None

    zone_results = []

    # the incidence table is siloed by seed geography, so seed zones are handled one by one
    for seed_id in xwalk[seed_geography].unique():

        # rows of the incidence and crosswalk tables belonging to this seed zone
        seed_incidence = incidence[incidence[seed_geography] == seed_id]
        seed_xwalk = xwalk[xwalk[seed_geography] == seed_id]

        # a seed zone must map to exactly one meta zone
        assert len(seed_xwalk[meta_geography].unique()) == 1

        # parent zones in this seed zone (just one when the parent is the seed
        # geography), restricted to those present in the parent control table
        candidate_parent_ids = seed_xwalk[parent_geography].unique()
        parent_ids = parent_controls.index.intersection(candidate_parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            in_parent = parent_weights_table[parent_geography] == parent_id
            parent_rows = parent_weights_table[in_parent].set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                weight_col = 'balanced_weight'
            else:
                weight_col = 'integer_weight'
            initial_weights = parent_rows[weight_col]

            assert len(initial_weights.index) == len(seed_incidence.index)

            zone_weights = balance_and_integerize(
                incidence_df=seed_incidence,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls,
                control_spec=spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_xwalk)

            # add higher level geography id columns to facilitate summaries
            higher_ids = xwalk.loc[xwalk[parent_geography] == parent_id,
                                   parent_geographies].max(axis=0)
            for higher_geography in higher_ids.index:
                zone_weights[higher_geography] = higher_ids[higher_geography]

            zone_results.append(zone_weights)

    all_weights = pd.concat(zone_results)

    inject.add_table(weight_table_name(geography), all_weights)
    inject.add_table(weight_table_name(geography, sparse=True),
                     all_weights[all_weights['integer_weight'] > 0])

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_id = settings.get('trace_geography')[geography]
        traced_rows = all_weights[all_weights[geography] == trace_id]
        inject.add_table('trace_%s' % weight_table_name(geography), traced_rows)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Compute preliminary balanced household weights for every seed zone.

    Each seed zone is balanced independently against the control spec rows of
    the geographies listed after the seed geography in settings, starting from
    the households' 'sample_weight'.

    Adds the seed weights table to the pipeline, with one row per household
    and columns for the seed geography, 'preliminary_balanced_weight' and the
    household id (plus 'balanced_weight' when the step arg 'final' is set).

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for some seed zone
    """

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls = get_control_table(seed_geography)

    # restrict the spec to the geographies listed after the seed geography
    geographies = settings['geographies']
    first_sub_level = geographies.index(seed_geography) + 1
    sub_geographies = geographies[first_sub_level:]
    seed_spec = spec[spec['geography'].isin(sub_geographies)]

    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    balanced_parts = []
    for seed_id in xwalk[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence = incidence[incidence[seed_geography] == seed_id]

        status, seed_result, _ = do_balancing(
            control_spec=seed_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence,
            control_totals=seed_controls.loc[seed_id],
            initial_weights=seed_incidence['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" % seed_id)

        seed_balanced = seed_result['final']
        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, seed_balanced.sum()))

        balanced_parts.append(seed_balanced)

    # one row per household, carrying its seed zone id and preliminary weight
    weights_table = incidence[[seed_geography]].copy()
    weights_table['preliminary_balanced_weight'] = pd.concat(balanced_parts)

    # expose the index (household id) as an ordinary column as well
    weights_table[setting('household_id_col')] = weights_table.index

    # shortcut for runs without meta controls: preliminary weights are final
    if inject.get_step_arg('final', default=False):
        weights_table['balanced_weight'] = weights_table['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), weights_table)
def input_pre_processor():
    """
    Read input csv files and store them as pipeline tables for later steps.

    The files to read are listed in a table list: a list of dicts, each giving
    the input file name and the name of the pipeline table, plus optional keys
    that request simple pre-processing.

    The table list is read from 'input_table_list' in the settings unless a
    different settings key is supplied through the model step argument
    'table_list'. (This allows alternate/additional input files to be read
    for repop.)

    When the step arg 'repop' is set, tables are added with replace enabled,
    so the table list may name replacement tables (e.g. lowest geography
    controls that replace the previous low controls dataframe).

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    Raises
    ------
    RuntimeError
        if an input file does not exist
    """

    # an alternate table list name may have been supplied as a step argument
    table_list_name = inject.get_step_arg('table_list', default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']
        logger.info("input_pre_processor processing %s" % tablename)

        # - locate and read the csv file
        csv_path = os.path.join(data_dir, table_info.get('filename', None))
        if not os.path.exists(csv_path):
            raise RuntimeError(
                "input_pre_processor %s - input file not found: %s"
                % (tablename, csv_path, ))

        logger.info("Reading csv file %s" % csv_path)
        df = pd.read_csv(csv_path, comment='#')

        logger.info("input file columns: %s" % df.columns.values)

        # - drop unwanted columns (before renaming, so original names apply)
        unwanted = table_info.get('drop_columns', None)
        if unwanted:
            for column_name in unwanted:
                logger.info("dropping column '%s'" % column_name)
                del df[column_name]

        # - rename columns
        renames = table_info.get('column_map', None)
        if renames:
            df = df.rename(columns=renames)

        # - set index: use the named column if present (it must hold unique
        #   values), otherwise just give the existing index that name
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col not in df.columns:
                df.index.names = [index_col]
            else:
                assert not df.duplicated(index_col).any()
                df = df.set_index(index_col)

        # (expression-file based preprocessing is not supported by this step)

        logger.info("adding table %s" % tablename)

        # add (or, for repop, replace) the pipeline table
        replace_existing = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=replace_existing)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by the
    output_tables list in the settings file.

    Pipeline tables are intermediate computational tables, not to be confused with the
    synthetic population tables written by the write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or a list of
    output tables to skip. If no output_tables setting is specified, then no checkpointed
    tables will be written.

    Intermediate tables likely to be of particular interest or utility are the controls and
    weights tables for the various geographies. For example, if one of your geographies is
    TRACT, then:

    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    A missing or empty 'tables' entry is treated as an empty list: with action 'skip'
    every checkpointed table is written, with action 'include' nothing is written.

    Parameters
    ----------
    output_dir: str
        directory in which the "<table_name>.csv" files are created

    Raises
    ------
    RuntimeError
        if the output_tables 'action' is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # 'tables' may be absent (or an empty yaml entry), in which case .get() yields None;
    # normalize to an empty list so the membership test and the loop below do not
    # fail with a TypeError
    tables = output_tables_settings.get('tables') or []

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    else:
        # action == 'skip': start from the checkpointed tables and drop the listed ones
        output_tables_list = [t for t in pipeline.checkpointed_tables() if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            # a requested table that was never registered is reported, not fatal
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # only write the index when it is named (an unnamed index is not written)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)