def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the float 'balanced_weight' of every seed zone.

    For each seed id found in the crosswalk, the matching incidence rows and
    balanced weights are handed to do_integerizing. The per-seed results are
    concatenated and added as the 'integer_weight' column of the seed-level
    weight table.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
    """
    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls = get_control_table(seed_geography)
    seed_weights = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    targets = control_spec.target
    # the control table columns must line up one-for-one with the spec targets
    assert (seed_controls.columns == targets).all()

    # optional setting; dict.get yields None when it is absent
    hh_total_col = settings.get('total_hh_control')

    per_seed_weights = []
    for seed_id in xwalk[seed_geography].unique():

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # row selectors for this seed zone
        incidence_rows = incidence[seed_geography] == seed_id
        weight_rows = seed_weights[seed_geography] == seed_id

        seed_incidence = incidence[incidence_rows]
        float_weights = seed_weights.loc[weight_rows, 'balanced_weight']
        label = "%s_%s" % (seed_geography, seed_id)

        # the returned status is not inspected here
        seed_integer_weights, _status = do_integerizing(
            trace_label=label,
            control_spec=control_spec,
            control_totals=seed_controls.loc[seed_id],
            incidence_table=seed_incidence[targets],
            float_weights=float_weights,
            total_hh_control_col=hh_total_col)

        per_seed_weights.append(seed_integer_weights)

    # stitch the per-seed pieces back together
    all_integer_weights = pd.concat(per_seed_weights)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', all_integer_weights)
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights of each seed zone using all control_spec rows.

    Adds a 'balanced_weight' column to the seed-level weight table.

    If no control_spec row belongs to the first entry of settings['geographies']
    (the meta geography), no balancing is run: 'balanced_weight' is copied from
    'preliminary_balanced_weight' unless the column already exists.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if do_balancing reports that a seed zone did not converge
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # if there are no meta controls, then balanced_weight is simply preliminary_balanced_weight
    geographies = settings['geographies']
    if not (control_spec.geography == geographies[0]).any():
        logger.warning("no need for final_seed_balancing because no meta controls")
        seed_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' not in seed_weights_df:
            final_seed_weights = seed_weights_df['preliminary_balanced_weight']
            inject.add_column(seed_weight_table_name, 'balanced_weight', final_seed_weights)
        return

    # we use all control_spec rows, so no need to filter on geography as for initial_seed_balancing
    seed_controls_df = get_control_table(seed_geography)
    # the control table columns must line up one-for-one with the spec targets
    assert (seed_controls_df.columns == control_spec.target).all()

    # optional settings; dict.get yields None when they are absent
    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    # one column per seed id is filled in below; the frame is not stored or
    # returned by this function
    relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist())

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # name this step (not initial_seed_balancing) so log output is attributable
        logger.info("final_seed_balancing seed id %s" % seed_id)

        # incidence rows of the households in this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError("final_seed_balancing for seed_id %s did not converge" % seed_id)

        weight_list.append(weights_df['final'])

        relaxation_factors[seed_id] = controls_df['relaxation_factor']

    # bulk concat all seed level results
    final_seed_weights = pd.concat(weight_list)

    inject.add_column(seed_weight_table_name, 'balanced_weight', final_seed_weights)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of one geographic level, grouped by parent zone.

    The level to process is taken from the 'geography' model step arg; its
    parent is the entry just before it in settings['geographies']. For example,
    with geography 'TRACT' and parent 'SEED', the TRACTs of each seed zone are
    processed together through balance_and_integerize.

    The concatenated result is registered as the weight table for the
    geography, along with a sparse variant holding only the rows whose
    'integer_weight' is greater than zero. When settings['trace_geography']
    names a zone id for this geography, the rows of that zone are also
    registered as a 'trace_' table.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # position of the target geography in the ordered list of geographies
    level = geographies.index(geography)
    parent_geography = geographies[level - 1]
    sub_geographies = geographies[level:]
    parent_geographies = geographies[:level]

    total_hh_control_col = setting('total_hh_control')

    parent_controls = get_control_table(parent_geography)
    sub_controls = get_control_table(geography)

    parent_weights = get_weight_table(parent_geography)
    assert parent_weights is not None

    zone_results = []

    # the incidence table is siloed by seed geography, so each seed zone is handled in turn
    for seed_id in xwalk[seed_geography].unique():

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence = incidence[incidence[seed_geography] == seed_id]
        seed_xwalk = xwalk[xwalk[seed_geography] == seed_id]

        # a seed zone is expected to map to exactly one meta zone
        assert len(seed_xwalk[meta_geography].unique()) == 1

        # unique parent zone ids in this seed zone, restricted to the ids
        # present in the parent control table's index
        candidate_ids = seed_xwalk[parent_geography].unique()
        usable_ids = parent_controls.index.intersection(candidate_ids)

        for parent_id in usable_ids:

            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))

            # weight rows of this parent zone, re-indexed by household id
            parent_rows = parent_weights[parent_weights[parent_geography] == parent_id]
            parent_rows = parent_rows.set_index(settings.get('household_id_col'))

            # original author's note: using balanced_weight slows down simul and
            # doesn't improve results (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                weight_col = 'balanced_weight'
            else:
                weight_col = 'integer_weight'
            initial_weights = parent_rows[weight_col]

            # one parent weight per incidence row of the seed zone
            assert len(initial_weights.index) == len(seed_incidence.index)

            zone_weights = balance_and_integerize(
                incidence_df=seed_incidence,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_xwalk)

            # add higher level geography id columns to facilitate summaries
            in_parent = xwalk[parent_geography] == parent_id
            ancestor_ids = xwalk.loc[in_parent, parent_geographies].max(axis=0)
            for geo_col in ancestor_ids.index:
                zone_weights[geo_col] = ancestor_ids[geo_col]

            zone_results.append(zone_weights)

    all_zone_weights = pd.concat(zone_results)

    inject.add_table(weight_table_name(geography), all_zone_weights)

    has_households = all_zone_weights['integer_weight'] > 0
    inject.add_table(weight_table_name(geography, sparse=True), all_zone_weights[has_households])

    # optionally publish the rows of a single traced zone
    trace_spec = settings['trace_geography'] if 'trace_geography' in settings else {}
    if geography in trace_spec:
        traced_zone_id = settings.get('trace_geography')[geography]
        traced_rows = all_zone_weights[all_zone_weights[geography] == traced_zone_id]
        inject.add_table('trace_%s' % weight_table_name(geography), traced_rows)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights of each seed zone independently.

    Only the control_spec rows whose geography comes after the seed geography
    in settings['geographies'] are used, together with the matching row of the
    seed-level control table. Balancing starts from each household's
    'sample_weight'.

    Registers the seed weight table (named by weight_table_name(seed_geography),
    e.g. PUMA_weights) with one row per household:

    +--------+------+-----------------------------+-------+
    | index  | PUMA | preliminary_balanced_weight | hh_id |
    | hh_id  |      |                             |       |
    +========+======+=============================+=======+
    | 0      | 600  |                   0.313555  | 0     |
    | 1      | 601  |                   0.627110  | 1     |
    | 2      | 602  |                   0.313555  | 2     |
    | ...    |      |                             |       |
    +--------+------+-----------------------------+-------+

    When the 'final' step arg is truthy, a 'balanced_weight' column equal to
    'preliminary_balanced_weight' is added as well.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if do_balancing reports that a seed zone did not converge
    """

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls = get_control_table(seed_geography)

    # keep only the control_spec rows of the geographies below the seed level
    geographies = settings['geographies']
    first_sub_level = geographies.index(seed_geography) + 1
    sub_geographies = geographies[first_sub_level:]
    uses_sub_geography = control_spec['geography'].isin(sub_geographies)
    seed_control_spec = control_spec[uses_sub_geography]

    # optional settings; dict.get yields None when they are absent
    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    # balance each seed zone on its own and collect the resulting weights
    per_seed_weights = []
    for seed_id in xwalk[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence = incidence[incidence[seed_geography] == seed_id]

        status, balanced, _controls = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence,
            control_totals=seed_controls.loc[seed_id],
            initial_weights=seed_incidence['sample_weight'])

        logger.info("seed_balancer status: %s" % status)

        converged = status['converged']
        if not converged:
            raise RuntimeError("initial_seed_balancing for seed_id %s did not converge" % seed_id)

        seed_final_weights = balanced['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, seed_final_weights.sum()))

        per_seed_weights.append(seed_final_weights)

    # bulk concat all seed level results
    all_weights = pd.concat(per_seed_weights)

    # build canonical weights table, indexed like the incidence table
    seed_weights = incidence[[seed_geography]].copy()
    seed_weights['preliminary_balanced_weight'] = all_weights

    # copy household_id_col index to named column
    seed_weights[setting('household_id_col')] = seed_weights.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights['balanced_weight'] = seed_weights['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights)
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Convert the balanced float weights of each seed zone to integer weights.

    Every seed id in the crosswalk is passed through do_integerizing with that
    zone's control totals, incidence rows and 'balanced_weight' values.

    Adds integer_weight column to seed-level weight table

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
    """

    zone_crosswalk = crosswalk.to_frame()
    hh_incidence = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    controls_by_seed = get_control_table(seed_geography)
    weights_by_hh = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    columns_match = (controls_by_seed.columns == control_cols).all()
    assert columns_match

    # optional setting; dict.get yields None when it is absent
    total_hh_control_col = settings.get('total_hh_control')

    # integerize one seed zone at a time
    integerized = []
    unique_seed_ids = zone_crosswalk[seed_geography].unique()
    for seed_id in unique_seed_ids:

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # incidence rows and balanced weights belonging to this seed zone
        zone_incidence = hh_incidence[hh_incidence[seed_geography] == seed_id]
        zone_float_weights = weights_by_hh.loc[
            weights_by_hh[seed_geography] == seed_id, 'balanced_weight']

        integerizer_args = dict(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=control_spec,
            control_totals=controls_by_seed.loc[seed_id],
            incidence_table=zone_incidence[control_cols],
            float_weights=zone_float_weights,
            total_hh_control_col=total_hh_control_col)

        # second element of the result (status) is unused here
        zone_integer_weights = do_integerizing(**integerizer_args)[0]

        integerized.append(zone_integer_weights)

    # bulk concat all seed level results
    integer_weight_column = pd.concat(integerized)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_weight_column)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize all zones of one geography, one parent zone at a time.

    The geography comes from the 'geography' model step arg and its parent is
    the preceding entry of settings['geographies']. Each parent zone found in
    the crosswalk of a seed zone is passed to balance_and_integerize, and the
    results are concatenated into the weight table of the geography. A sparse
    table (rows with 'integer_weight' > 0) is registered too, plus a 'trace_'
    table when settings['trace_geography'] lists this geography.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    xwalk_df = crosswalk.to_frame()
    hh_incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # split the ordered geography list around the target geography
    geo_pos = geographies.index(geography)
    parent_geography = geographies[geo_pos - 1]
    sub_geographies = geographies[geo_pos:]
    parent_geographies = geographies[:geo_pos]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    parent_weights_df = get_weight_table(parent_geography)
    assert parent_weights_df is not None

    results = []

    for seed_id in xwalk_df[seed_geography].unique():

        # rows of the incidence and crosswalk tables that belong to this seed zone
        seed_incidence_df = hh_incidence_df[hh_incidence_df[seed_geography] == seed_id]
        seed_xwalk_df = xwalk_df[xwalk_df[seed_geography] == seed_id]

        # a seed zone is expected to map to exactly one meta zone
        meta_ids = seed_xwalk_df[meta_geography].unique()
        assert len(meta_ids) == 1

        for parent_id in seed_xwalk_df[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))

            # parent-level weights of this zone, indexed by household id
            is_parent_zone = parent_weights_df[parent_geography] == parent_id
            hh_weights = parent_weights_df[is_parent_zone].set_index(
                settings.get('household_id_col'))

            # original author's note: using balanced_weight slows down simul and
            # doesn't improve results (float seeds means no zero-weight households to drop)
            use_float_weights = setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True)
            seed_weights = hh_weights['balanced_weight'] if use_float_weights \
                else hh_weights['integer_weight']

            # one parent weight per incidence row of the seed zone
            assert len(seed_weights.index) == len(seed_incidence_df.index)

            balanced_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=seed_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_xwalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_rows = xwalk_df.loc[xwalk_df[parent_geography] == parent_id,
                                       parent_geographies]
            higher_ids = parent_rows.max(axis=0)
            for higher_geo in higher_ids.index:
                balanced_df[higher_geo] = higher_ids[higher_geo]

            results.append(balanced_df)

    combined_df = pd.concat(results)

    inject.add_table(weight_table_name(geography), combined_df)

    nonzero_df = combined_df[combined_df['integer_weight'] > 0]
    inject.add_table(weight_table_name(geography, sparse=True), nonzero_df)

    # optionally publish the rows of a single traced zone
    if 'trace_geography' not in settings:
        return
    if geography not in settings['trace_geography']:
        return

    trace_id = settings.get('trace_geography')[geography]
    trace_df = combined_df[combined_df[geography] == trace_id]
    inject.add_table('trace_%s' % weight_table_name(geography), trace_df)
def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize household weights for every lowest-level zone.

    The lowest geography is the last entry of settings['geographies']. For each
    low zone found in the crosswalk of a seed zone, the seed-level
    'balanced_weight' values are scaled by the ratio of the low zone's total
    household control to the seed zone's, balanced against the low-geography
    control_spec rows with do_balancing, and integerized with do_integerizing.

    Registers the low-geography weight table (one row per household and zone,
    with 'balanced_weight', 'integer_weight' and the crosswalk columns) and a
    sparse variant holding only the rows whose 'integer_weight' is greater
    than zero.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if do_balancing reports that a zone did not converge
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # name this step (not initial_seed_balancing) so log output is attributable
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # initial seed weights in series indexed by hh id
        seed_weights_df = all_seed_weights_df[all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id, low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't makes sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df['balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" % (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" % trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            # NOTE(review): this passes the full control_spec and the full
            # incidence frame, whereas balancing above uses low_control_spec -
            # confirm this is intended
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info("Total balanced weights for %s = %s" %
                        (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s" %
                        (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries
    # (one crosswalk row is looked up per weight row, in weight-row order, so
    # the two frames can be joined side by side)
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography), low_weights_df)
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0])
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Run the balancer once per seed zone and build the seed weight table.

    Balancing uses the control_spec rows of the geographies listed after the
    seed geography in settings['geographies'], the seed zone's row of the seed
    control table, and the households' 'sample_weight' as starting weights.

    The registered table is indexed like the incidence table and holds the
    seed geography column, 'preliminary_balanced_weight', and a copy of the
    index under the household id column name. If the 'final' step arg is
    truthy, 'balanced_weight' is added as a copy of
    'preliminary_balanced_weight'.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if do_balancing reports that a seed zone did not converge
    """

    zone_crosswalk = crosswalk.to_frame()
    hh_incidence = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    controls_by_seed = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    geographies = settings['geographies']
    seed_pos = geographies.index(seed_geography)
    sub_geographies = geographies[seed_pos + 1:]
    sub_spec = control_spec[control_spec['geography'].isin(sub_geographies)]

    # optional settings; dict.get yields None when they are absent
    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    collected = []
    unique_seed_ids = zone_crosswalk[seed_geography].unique()
    for seed_id in unique_seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        in_seed = hh_incidence[seed_geography] == seed_id
        zone_incidence = hh_incidence[in_seed]

        balancer_args = dict(
            control_spec=sub_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=zone_incidence,
            control_totals=controls_by_seed.loc[seed_id],
            initial_weights=zone_incidence['sample_weight'])

        # the third element of the result (controls) is not needed here
        status, zone_weights, _ = do_balancing(**balancer_args)

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError("initial_seed_balancing for seed_id %s did not converge" % seed_id)

        final_weights = zone_weights['final']
        weight_total = final_weights.sum()

        logger.info("Total balanced weights for seed %s = %s" % (seed_id, weight_total))

        collected.append(final_weights)

    # bulk concat all seed level results
    preliminary_weights = pd.concat(collected)

    # build canonical weights table
    weights_table = hh_incidence[[seed_geography]].copy()
    weights_table['preliminary_balanced_weight'] = preliminary_weights

    # copy household_id_col index to named column
    hh_id_col = setting('household_id_col')
    weights_table[hh_id_col] = weights_table.index

    # this is just a convenience if there are no meta controls
    is_final = inject.get_step_arg('final', default=False)
    if is_final:
        weights_table['balanced_weight'] = weights_table['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), weights_table)