def meta_summary(incidence_df, control_spec, top_geography, top_id, sub_geographies):
    """
    Build a per-control summary for one zone of the top-level geography.

    The result has one row per control target. It holds the zone's control value plus
    the incidence-weighted totals of each available seed-level weight column and of the
    balanced and integer weights of every sub geography that has a weight table.

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        household incidence table carrying a top_geography column and the control columns
    control_spec : object exposing a ``target`` column of control names
    top_geography : str
        name of the geography being summarized
    top_id : id of the zone within top_geography
    sub_geographies : list of str
        geographies whose weight tables (if present) are added to the summary

    Returns
    -------
    pandas.DataFrame indexed by 'control_name'
    """

    # keep only the households that belong to the requested zone
    zone_incidence_df = incidence_df[incidence_df[top_geography] == top_id]

    control_cols = control_spec.target.values
    controls_df = get_control_table(top_geography)

    # control totals of this zone as a series
    zone_controls = controls_df[control_cols].loc[top_id]

    incidence = zone_incidence_df[control_cols]

    def weighted_totals(weights):
        # scale each household's incidence by its weight, then total per control
        return incidence.multiply(weights, axis="index").sum(axis=0)

    summary = pd.DataFrame(index=control_cols)
    summary.index.name = 'control_name'
    summary['control_value'] = zone_controls

    # seed-level weights: report whichever weight columns exist so far
    seed_weights_df = get_weight_table(setting('seed_geography'))
    for weight_col in ('preliminary_balanced_weight', 'balanced_weight', 'integer_weight'):
        if weight_col not in seed_weights_df:
            continue
        summary['%s_%s' % (top_geography, weight_col)] = \
            weighted_totals(seed_weights_df[weight_col])

    # sub-geography weights: sum each household's weights over the zones inside this one
    for sub_geography in sub_geographies:

        sub_weight_cols = ['balanced_weight', 'integer_weight']

        sub_weights = get_weight_table(sub_geography)
        if sub_weights is None:
            # this geography has not been balanced (yet)
            continue

        sub_weights = sub_weights[sub_weights[top_geography] == top_id]

        hh_id_col = setting('household_id_col')
        sub_weights = sub_weights[[hh_id_col] + sub_weight_cols].groupby(hh_id_col).sum()

        for weight_col in sub_weight_cols:
            summary['%s_%s' % (sub_geography, weight_col)] = \
                weighted_totals(sub_weights[weight_col])

    return summary
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the balanced household weights of every seed zone.

    For each seed zone found in the crosswalk, the zone's 'balanced_weight' values are
    handed to do_integerizing together with the zone's control totals and incidence
    columns. The per-zone results are concatenated and added to the seed-level weight
    table as the 'integer_weight' column.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    # the seed control table must hold exactly the control_spec targets, in the same order
    assert (seed_controls_df.columns == control_cols).all()

    # name of the total-households control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    def integerize_seed(seed_id):
        # integerize the balanced weights of a single seed zone
        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # incidence rows and balanced weights belonging to this seed zone
        seed_incidence = incidence_df[incidence_df[seed_geography] == seed_id]
        in_seed = seed_weights_df[seed_geography] == seed_id
        balanced_seed_weights = seed_weights_df.loc[in_seed, 'balanced_weight']

        trace_label = "%s_%s" % (seed_geography, seed_id)

        # the returned status is not used
        integer_weights, _status = do_integerizing(
            trace_label=trace_label,
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col
        )
        return integer_weights

    weight_list = [integerize_seed(seed_id)
                   for seed_id in crosswalk_df[seed_geography].unique()]

    # bulk concat all seed level results
    integer_seed_weights = pd.concat(weight_list)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_seed_weights)
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance household weights for each seed zone against all seed-level controls.

    When control_spec holds no controls for the meta geography (the first entry of
    settings['geographies']), no rebalancing is needed: 'balanced_weight' is set to
    'preliminary_balanced_weight' unless that column already exists. Otherwise every
    seed zone in the crosswalk is balanced with do_balancing, starting from the
    households' 'sample_weight', and the final weights are added to the seed-level
    weight table as the 'balanced_weight' column.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # if there are no meta controls, then balanced_weight is simply preliminary_balanced_weight
    geographies = settings['geographies']
    if not (control_spec.geography == geographies[0]).any():
        logger.warning("no need for final_seed_balancing because no meta controls")
        seed_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' not in seed_weights_df:
            final_seed_weights = seed_weights_df['preliminary_balanced_weight']
            inject.add_column(seed_weight_table_name, 'balanced_weight', final_seed_weights)
        return

    # we use all control_spec rows, so no need to filter on geography as for initial_seed_balancing
    seed_controls_df = get_control_table(seed_geography)
    assert (seed_controls_df.columns == control_spec.target).all()

    # name of the total-households control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # label the message with this step's own name (it previously said
        # "initial_seed_balancing", which made the log misleading)
        logger.info("final_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]

        # the third return value (per-control results) is not needed here
        status, weights_df, _controls_df = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError("final_seed_balancing for seed_id %s did not converge" % seed_id)

        weight_list.append(weights_df['final'])

    # bulk concat all seed level results
    final_seed_weights = pd.concat(weight_list)

    inject.add_column(seed_weight_table_name, 'balanced_weight', final_seed_weights)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a specified geographic level
    in groups by parent zone.

    The target geography is read from the 'geography' model step argument; its parent is
    the geography listed just before it in settings['geographies']. For instance, if the
    target is 'TRACT' and the parent geography is 'SEED', then for each seed zone the
    TRACTs it contains are balanced together.

    Registers the weight table for the target geography (the rows returned by
    balance_and_integerize plus the ids of all higher-level geographies), a sparse
    variant holding only rows with integer_weight > 0, and - when the target geography is
    listed in settings['trace_geography'] - a trace table for the traced zone.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # the target geography arrives as an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # position of the target geography in the hierarchy
    geography_pos = geographies.index(geography)
    parent_geography = geographies[geography_pos - 1]
    sub_geographies = geographies[geography_pos:]
    parent_geographies = geographies[:geography_pos]

    total_hh_control_col = setting('total_hh_control')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    zone_weight_frames = []

    # the incidence table is siloed by seed geography, so each seed zone is handled in turn
    for seed_id in crosswalk_df[seed_geography].unique():

        # incidence and crosswalk rows of this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # a seed zone must map to exactly one meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        candidate_parent_ids = seed_crosswalk_df[parent_geography].unique()

        # only want ones for which there are (non-zero) controls
        parent_ids = parent_controls_df.index.intersection(candidate_parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))

            parent_weights_df = weights_df[weights_df[parent_geography] == parent_id]
            parent_weights_df = parent_weights_df.set_index(settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            use_float_weights = setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True)
            weight_col = 'balanced_weight' if use_float_weights else 'integer_weight'
            initial_weights = parent_weights_df[weight_col]

            # one initial weight per household of the seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            in_parent = crosswalk_df[parent_geography] == parent_id
            parent_geography_ids = crosswalk_df.loc[in_parent, parent_geographies].max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            zone_weight_frames.append(zone_weights_df)

    integer_weights_df = pd.concat(zone_weight_frames)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    # sparse table keeps only the households actually assigned to a zone
    has_weight = integer_weights_df['integer_weight'] > 0
    inject.add_table(weight_table_name(geography, sparse=True), integer_weights_df[has_weight])

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        trace_df = integer_weights_df[integer_weights_df[geography] == trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), trace_df)
def summarize(crosswalk, incidence_table, control_spec):
    """
    Write aggregate summary tables of controls and weights for all geographic levels.

    Tables are emitted through out_table: one meta_summary per meta zone, then for every
    sub geography with a weight table an '<geography>_aggregate' table of household
    weights at seed level and two summarize_geography tables, and finally 'hh_weights'
    with the per-household weights of the seed and sub geographies.

    Parameters
    ----------
    crosswalk : pipeline table
    incidence_table : pipeline table
    control_spec : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()

    geographies = setting('geographies')
    seed_geography = setting('seed_geography')
    meta_geography = geographies[0]
    # geographies below the seed level
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    household_id_col = setting('household_id_col')

    # one control summary per meta zone
    for meta_id in crosswalk_df[meta_geography].unique():
        summary_df = \
            meta_summary(incidence_df, control_spec, meta_geography, meta_id, sub_geographies)
        out_table('%s_%s' % (meta_geography, meta_id), summary_df)

    hh_weights_summary = pd.DataFrame(index=incidence_df.index)

    # add seed level summaries
    seed_weights_df = get_weight_table(seed_geography)
    for weight_col in ('balanced_weight', 'integer_weight'):
        hh_weights_summary['%s_%s' % (seed_geography, weight_col)] = seed_weights_df[weight_col]

    for geography in sub_geographies:

        weights_df = get_weight_table(geography)
        if weights_df is None:
            # no weight table for this geography
            continue

        # total weight of each household over all zones of this geography
        hh_weights = \
            weights_df[[household_id_col, 'balanced_weight', 'integer_weight']] \
            .groupby([household_id_col]).sum()
        for weight_col in ('balanced_weight', 'integer_weight'):
            hh_weights_summary['%s_%s' % (geography, weight_col)] = hh_weights[weight_col]

        # aggregate to seed level
        hh_id_col = incidence_df.index.name
        aggregate_weights = \
            weights_df.groupby([seed_geography, hh_id_col], as_index=False).sum()
        aggregate_weights = aggregate_weights.set_index(hh_id_col)
        aggregate_weights = \
            aggregate_weights[[seed_geography, 'balanced_weight', 'integer_weight']]

        # side-by-side comparison with the sample weight and the seed-level weights
        aggregate_weights['sample_weight'] = incidence_df['sample_weight']
        for seed_col in ('preliminary_balanced_weight', 'balanced_weight', 'integer_weight'):
            aggregate_weights['%s_%s' % (seed_geography, seed_col)] = seed_weights_df[seed_col]

        out_table('%s_aggregate' % (geography,), aggregate_weights)

        seed_level_df = summarize_geography(
            seed_geography, 'integer_weight', crosswalk_df, weights_df, incidence_df)
        out_table('%s_%s' % (geography, seed_geography,), seed_level_df)

        geography_level_df = summarize_geography(
            geography, 'integer_weight', crosswalk_df, weights_df, incidence_df)
        out_table('%s' % (geography,), geography_level_df)

    out_table('hh_weights', hh_weights_summary)
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the final balanced weights of each seed (puma) zone.

    Every seed zone in the crosswalk is processed separately: its 'balanced_weight'
    values, control totals and control incidence columns are passed to do_integerizing.

    Adds integer_weight column to seed-level weight table

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    # control table columns must match the control_spec targets one-for-one
    assert (seed_controls_df.columns == control_cols).all()

    # name of the total-households control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    # integerize each seed zone on its own and collect the results
    per_seed_integer_weights = []

    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # rows of this seed zone in the incidence and weight tables
        incidence_mask = incidence_df[seed_geography] == seed_id
        weights_mask = seed_weights_df[seed_geography] == seed_id

        seed_incidence = incidence_df[incidence_mask]
        balanced_seed_weights = seed_weights_df.loc[weights_mask, 'balanced_weight']

        trace_label = "%s_%s" % (seed_geography, seed_id)

        # do_integerizing also returns a status, which is not used here
        result = do_integerizing(
            trace_label=trace_label,
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col)
        integer_weights, _status = result

        per_seed_integer_weights.append(integer_weights)

    # bulk concat all seed level results
    integer_seed_weights = pd.concat(per_seed_integer_weights)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_seed_weights)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of one geography, grouped by parent zone.

    The geography to process comes from the 'geography' model step argument and its
    parent is the preceding entry of settings['geographies']. Each seed zone is handled
    in turn; for every parent zone inside it, balance_and_integerize is run with the
    parent's household weights as initial weights.

    Registers the resulting weight table, a sparse variant with only the rows having
    integer_weight > 0 and, if the geography is listed in settings['trace_geography'],
    a trace table restricted to the traced zone.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # split the hierarchy around the target geography
    level = geographies.index(geography)
    parent_geography = geographies[level - 1]
    sub_geographies = geographies[level:]
    parent_geographies = geographies[:level]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    results = []

    for seed_id in crosswalk_df[seed_geography].unique():

        # rows of this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # each seed zone is expected to sit in a single meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        for parent_id in seed_crosswalk_df[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))

            # weights of the parent zone, indexed by household id
            parent_rows = weights_df[parent_geography] == parent_id
            initial_weights = \
                weights_df[parent_rows].set_index(settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if not setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['integer_weight']
            else:
                initial_weights = initial_weights['balanced_weight']

            # there must be one initial weight per household of the seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            higher_level_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies] \
                .max(axis=0)
            for z in higher_level_ids.index:
                zone_weights_df[z] = higher_level_ids[z]

            results.append(zone_weights_df)

    integer_weights_df = pd.concat(results)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    # sparse table: drop households that received no integer weight
    positive_rows = integer_weights_df['integer_weight'] > 0
    inject.add_table(weight_table_name(geography, sparse=True),
                     integer_weights_df[positive_rows])

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        traced_rows = integer_weights_df[geography] == trace_geography_id
        inject.add_table('trace_%s' % weight_table_name(geography),
                         integer_weights_df[traced_rows])
def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize household weights for every zone of the lowest geography.

    For each low-geography zone of each seed zone, the seed's 'balanced_weight' values
    are scaled by the ratio of the zone's to the seed's total household control and
    used as initial weights for do_balancing against the zone's controls. The balanced
    weights are then integerized with do_integerizing.

    Registers the weight table of the lowest geography (with the crosswalk columns
    appended) and a sparse variant holding only rows with integer_weight > 0.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    Raises
    ------
    RuntimeError
        if balancing does not converge for a zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # label the message with this step's own name (it previously said
        # "initial_seed_balancing", which made the log misleading)
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # initial seed weights in series indexed by hh id
        seed_weights_df = all_seed_weights_df[all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id, low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't make sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df['balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" % (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" % trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            # NOTE(review): the integerizer receives the full control_spec and the
            # unfiltered seed incidence table, while balancing above uses only
            # low_control_spec - confirm this asymmetry is intended.
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info("Total balanced weights for %s = %s"
                        % (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s"
                        % (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries
    # (crosswalk rows are looked up per weight row, so both frames align positionally)
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography), low_weights_df)
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0])
def expand_households():
    """
    Build the 'expanded_household_ids' table with one row per household to synthesize.

    The sparse weight table of the lowest geography is expanded so that each row is
    repeated integer_weight times. When the GROUP_BY_INCIDENCE_SIGNATURE setting is on,
    the household id column of the weight table holds group ids; each expanded row then
    draws a household id from its group in the 'household_groups' table, with
    probability proportional to sample_weight.

    The optional 'append' and 'replace' step args (mutually exclusive) combine the new
    rows with the existing 'expanded_household_ids' table; 'replace' first drops the
    existing rows of the low-geography zones present in the new rows.

    Returns
    -------

    """

    geographies = setting('geographies')
    household_id_col = setting('household_id_col')

    low_geography = geographies[-1]

    # only one we really need is low_geography
    seed_geography = setting('seed_geography')
    geography_cols = geographies[geographies.index(seed_geography):]

    weights = get_weight_table(low_geography, sparse=True)
    weights = weights[geography_cols + [household_id_col, 'integer_weight']]

    # - expand weights table by integer_weight, so there is one row per desired hh
    weight_cols = weights.columns.values
    # DataFrame.values is the replacement for as_matrix(), which pandas 1.0 removed
    weights_np = np.repeat(weights.values, weights.integer_weight.values, axis=0)
    expanded_weights = pd.DataFrame(data=weights_np, columns=weight_cols)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):

        # the household_id_col is really the group_id
        expanded_weights.rename(columns={household_id_col: 'group_id'}, inplace=True)

        # the original incidence table with one row per hh, with index hh_id
        household_groups = pipeline.get_table('household_groups')
        household_groups = household_groups[[household_id_col, 'group_id', 'sample_weight']]

        # for each group, the hh_ids and their sample_weights (as relative probabilities)
        # keyed by group_id, so group ids need not be the contiguous range 0..n-1
        group_hh_probs = {}
        for group_id, df in household_groups.groupby('group_id'):
            hh_ids = list(df[household_id_col])
            probs = list(df.sample_weight / df.sample_weight.sum())
            group_hh_probs[group_id] = (hh_ids, probs)

        # FIXME - should sample without replacement?

        # now make a hh_id choice for each group_id in expanded_weights
        def chooser(group_id):
            hh_ids, hh_probs = group_hh_probs[group_id]
            return np.random.choice(hh_ids, p=hh_probs)

        # convert_dtype is left at its default (True); passing it explicitly is deprecated
        expanded_weights[household_id_col] = expanded_weights.group_id.apply(chooser)

        # FIXME - omit in production?
        del expanded_weights['group_id']
        del expanded_weights['integer_weight']

    append = inject.get_step_arg('append', False)
    replace = inject.get_step_arg('replace', False)
    assert not (append and replace), "can't specify both append and replace for expand_households"

    if append or replace:
        t = inject.get_table('expanded_household_ids').to_frame()
        prev_hhs = len(t.index)
        added_hhs = len(expanded_weights.index)

        if replace:
            # FIXME - should really get from crosswalk table?
            low_ids_to_replace = expanded_weights[low_geography].unique()
            t = t[~t[low_geography].isin(low_ids_to_replace)]

        expanded_weights = pd.concat([t, expanded_weights], ignore_index=True)

        # rows of the previous table removed by 'replace' (zero when appending)
        dropped_hhs = prev_hhs - len(t.index)
        final_hhs = len(expanded_weights.index)
        op = 'append' if append else 'replace'
        logger.info("expand_households op: %s prev hh count %s dropped %s added %s final %s"
                    % (op, prev_hhs, dropped_hhs, added_hhs, final_hhs))

    inject.add_table('expanded_household_ids', expanded_weights)
def meta_control_factoring(settings, control_spec, incidence_table):
    """
    Apply simple factoring to summed household fractional weights based on original
    meta control values relative to summed household fractional weights by meta zone.

    The resulting factored meta control weights will be new meta controls, to be
    appended to the original controls, for final balancing.

    If control_spec has no controls for the meta geography, a warning is logged and
    nothing else is done. Otherwise the seed-level control table is replaced by one that
    also carries the (rounded, integer) seed-level shares of the meta controls, with
    columns ordered as in control_spec.target.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # FIXME - if there is only one seed zone in the meta zone, just copy meta control values?

    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # - if there are no meta controls, then we don't have to do anything
    if not (control_spec.geography == meta_geography).any():
        # Logger.warn is a deprecated alias (removed in Python 3.13); use warning()
        logger.warning("meta_control_factoring: no meta targets so nothing to do")
        return

    meta_controls_df = get_control_table(meta_geography)
    dump_table("meta_controls_df", meta_controls_df)

    # slice control_spec to select only the rows for meta level controls
    meta_controls_spec = control_spec[control_spec.geography == meta_geography]
    meta_control_targets = meta_controls_spec['target']

    logger.info("meta_control_factoring %s targets" % len(meta_control_targets))

    dump_table("meta_controls_spec", meta_controls_spec)
    dump_table("meta_control_targets", meta_control_targets)

    # seed level weights of all households (rows aligned with incidence_df rows)
    seed_weights_df = get_weight_table(seed_geography)
    assert len(incidence_df.index) == len(seed_weights_df.index)

    # expand person weights by incidence (incidence will simply be 1 for household targets)
    hh_level_weights = incidence_df[[seed_geography, meta_geography]].copy()
    for target in meta_control_targets:
        hh_level_weights[target] = \
            incidence_df[target] * seed_weights_df['preliminary_balanced_weight']

    dump_table("hh_level_weights", hh_level_weights)

    # weights of meta targets at seed level
    factored_seed_weights = \
        hh_level_weights.groupby([seed_geography, meta_geography], as_index=False).sum()
    factored_seed_weights.set_index(seed_geography, inplace=True)
    dump_table("factored_seed_weights", factored_seed_weights)

    # weights of meta targets summed from seed level to meta level
    factored_meta_weights = factored_seed_weights.groupby(meta_geography, as_index=True).sum()
    dump_table("factored_meta_weights", factored_meta_weights)

    # only the meta level controls from meta_controls table
    meta_controls_df = meta_controls_df[meta_control_targets]
    dump_table("meta_controls_df", meta_controls_df)

    # compute the scaling factors to be applied to the seed-level totals:
    meta_factors = pd.DataFrame(index=meta_controls_df.index)
    for target in meta_control_targets:
        meta_factors[target] = meta_controls_df[target] / factored_meta_weights[target]
    dump_table("meta_factors", meta_factors)

    # compute seed-level controls from meta-level controls
    seed_level_meta_controls = pd.DataFrame(index=factored_seed_weights.index)
    for target in meta_control_targets:
        # meta level scaling_factor for this meta_control
        # (each seed zone looks up the factor of the meta zone it belongs to)
        scaling_factor = factored_seed_weights[meta_geography].map(meta_factors[target])
        # scale the seed_level_meta_controls by meta_level scaling_factor
        seed_level_meta_controls[target] = factored_seed_weights[target] * scaling_factor
        # FIXME - why round scaled factored seed_weights to int prior to final seed balancing?
        seed_level_meta_controls[target] = seed_level_meta_controls[target].round().astype(int)
    dump_table("seed_level_meta_controls", seed_level_meta_controls)

    # create final balancing controls
    # add newly created seed_level_meta_controls to the existing set of seed level controls

    seed_controls_df = get_control_table(seed_geography)

    assert len(seed_controls_df.index) == len(seed_level_meta_controls.index)
    seed_controls_df = pd.concat([seed_controls_df, seed_level_meta_controls], axis=1)

    # ensure columns are in right order for orca-extended table
    seed_controls_df = seed_controls_df[control_spec.target]
    assert (seed_controls_df.columns == control_spec.target).all()

    dump_table("seed_controls_df", seed_controls_df)

    pipeline.replace_table(control_table_name(seed_geography), seed_controls_df)