예제 #1
0
def meta_summary(incidence_df, control_spec, top_geography, top_id,
                 sub_geographies):
    """
    Summarize control values and weighted incidence totals for one zone.

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        incidence table with a top_geography column and one column per
        control target
    control_spec : table exposing a ``target`` column of control names
    top_geography : str
        name of the geography column used to select rows
    top_id : id of the zone (within top_geography) to summarize
    sub_geographies : iterable of str
        geographies whose weight tables are summarized when they exist

    Returns
    -------
    summary : pandas.DataFrame
        indexed by control_name, holding a control_value column plus one
        column of weighted incidence sums per available weight column
    """

    # keep only the incidence rows belonging to the requested zone
    in_zone = incidence_df[top_geography] == top_id
    incidence_df = incidence_df[in_zone]

    control_cols = control_spec.target.values

    # control totals for this zone, as a series keyed by control name
    zone_controls = get_control_table(top_geography)[control_cols].loc[top_id]

    incidence = incidence_df[control_cols]

    summary = pd.DataFrame(index=control_cols)
    summary.index.name = 'control_name'
    summary['control_value'] = zone_controls

    # seed-level weights: only summarize the weight columns that exist so far
    seed_weights_df = get_weight_table(setting('seed_geography'))
    for weight_col in ('preliminary_balanced_weight', 'balanced_weight',
                       'integer_weight'):
        if weight_col not in seed_weights_df:
            continue
        weighted = incidence.multiply(seed_weights_df[weight_col], axis="index")
        summary['%s_%s' % (top_geography, weight_col)] = weighted.sum(axis=0)

    sub_weight_cols = ['balanced_weight', 'integer_weight']
    for sub_geography in sub_geographies:

        sub_weights = get_weight_table(sub_geography)
        if sub_weights is None:
            # no weight table has been created for this geography
            continue

        sub_weights = sub_weights[sub_weights[top_geography] == top_id]

        # collapse the zone-level rows to one row of summed weights per household
        hh_id_col = setting('household_id_col')
        sub_weights = sub_weights[[hh_id_col] + sub_weight_cols]
        sub_weights = sub_weights.groupby(hh_id_col).sum()

        for weight_col in sub_weight_cols:
            weighted = incidence.multiply(sub_weights[weight_col], axis="index")
            summary['%s_%s' % (sub_geography, weight_col)] = weighted.sum(axis=0)

    return summary
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the balanced seed-level weights, one seed zone at a time.

    Adds an 'integer_weight' column to the seed-level weight table.

    Parameters
    ----------
    settings : dict-like providing 'seed_geography' and 'total_hh_control'
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    # the seed control table columns must line up with the control spec targets
    assert (seed_controls_df.columns == control_cols).all()

    # total household control column name, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    def integerize_seed_zone(seed_id):
        # integerize the balanced weights of the households in a single seed zone
        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        zone_incidence = incidence_df[incidence_df[seed_geography] == seed_id]
        in_zone = seed_weights_df[seed_geography] == seed_id
        zone_float_weights = seed_weights_df.loc[in_zone, 'balanced_weight']

        zone_integer_weights, _ = do_integerizing(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=zone_incidence[control_cols],
            float_weights=zone_float_weights,
            total_hh_control_col=total_hh_control_col
        )
        return zone_integer_weights

    # integerize every seed zone, then stack the per-zone results
    per_zone_weights = [
        integerize_seed_zone(seed_id)
        for seed_id in crosswalk_df[seed_geography].unique()
    ]
    integer_seed_weights = pd.concat(per_zone_weights)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_seed_weights)
예제 #3
0
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance each seed zone against all seed-level controls.

    Adds a 'balanced_weight' column to the seed-level weight table. When the
    control spec has no controls at the first (meta) geography, the existing
    'preliminary_balanced_weight' column is reused as 'balanced_weight'
    (unless 'balanced_weight' is already present) and no balancing is run.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    Raises
    ------
    RuntimeError
        if balancing does not converge for some seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # if there are no meta controls, then balanced_weight is simply preliminary_balanced_weight
    geographies = settings['geographies']
    if not (control_spec.geography == geographies[0]).any():
        logger.warning(
            "no need for final_seed_balancing because no meta controls")
        seed_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' not in seed_weights_df:
            final_seed_weights = seed_weights_df['preliminary_balanced_weight']
            inject.add_column(seed_weight_table_name, 'balanced_weight',
                              final_seed_weights)
        return

    # we use all control_spec rows, so no need to filter on geography as for initial_seed_balancing
    seed_controls_df = get_control_table(seed_geography)
    assert (seed_controls_df.columns == control_spec.target).all()

    # total household control column name, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # NOTE(review): relaxation factors are collected per seed zone below but
    # are not consumed anywhere in this function
    relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist())

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # fixed: this message previously named initial_seed_balancing
        logger.info("final_seed_balancing seed id %s" % seed_id)

        # slice incidence rows for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "final_seed_balancing for seed_id %s did not converge" %
                seed_id)

        weight_list.append(weights_df['final'])

        relaxation_factors[seed_id] = controls_df['relaxation_factor']

    # bulk concat all seed level results
    final_seed_weights = pd.concat(weight_list)

    inject.add_column(seed_weight_table_name, 'balanced_weight',
                      final_seed_weights)
예제 #4
0
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of one geographic level, grouped by parent zone.

    The target level comes from the 'geography' step arg; its parent is the
    level listed immediately before it in the 'geographies' setting. Zones are
    processed seed zone by seed zone and, within each seed zone, parent zone
    by parent zone (skipping parent ids absent from the parent control table).

    Registers a weight table for the target geography, a sparse variant
    holding only rows with integer_weight > 0, and, when the geography is
    listed under the 'trace_geography' setting, a trace table for that zone.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # position of the target level within the geography hierarchy
    level = geographies.index(geography)
    parent_geography = geographies[level - 1]
    sub_geographies = geographies[level:]
    parent_geographies = geographies[:level]

    total_hh_control_col = setting('total_hh_control')
    household_id_col = settings.get('household_id_col')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    zone_weight_frames = []

    # the incidence table is siloed by seed geography, so handle each seed zone in turn
    for seed_id in crosswalk_df[seed_geography].unique():

        # rows of the incidence and crosswalk tables belonging to this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # a seed zone must sit inside exactly one meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # parent zones in this seed zone (just one when the parent is the seed
        # geography), limited to those present in the parent control table
        candidate_ids = seed_crosswalk_df[parent_geography].unique()
        parent_ids = parent_controls_df.index.intersection(candidate_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            parent_rows = weights_df[weights_df[parent_geography] == parent_id]
            parent_rows = parent_rows.set_index(household_id_col)

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            use_float = setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True)
            weight_col = 'balanced_weight' if use_float else 'integer_weight'
            initial_weights = parent_rows[weight_col]

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            in_parent = crosswalk_df[parent_geography] == parent_id
            higher_ids = crosswalk_df.loc[in_parent, parent_geographies].max(axis=0)
            for geo_col in higher_ids.index:
                zone_weights_df[geo_col] = higher_ids[geo_col]

            zone_weight_frames.append(zone_weights_df)

    integer_weights_df = pd.concat(zone_weight_frames)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    # sparse variant keeps only the rows that received a positive integer weight
    positive = integer_weights_df['integer_weight'] > 0
    inject.add_table(weight_table_name(geography, sparse=True),
                     integer_weights_df[positive])

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        traced = integer_weights_df[integer_weights_df[geography] == trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), traced)
예제 #5
0
def summarize(crosswalk, incidence_table, control_spec):
    """
    Emit summary tables of controls and weights for all geographic levels.

    Every table is handed to out_table: one control summary per meta zone,
    then per sub-seed geography an aggregate-to-seed table and two
    summarize_geography tables, and finally a household-level 'hh_weights'
    table. Sub-seed geographies without a weight table are skipped.

    Parameters
    ----------
    crosswalk : pipeline table
    incidence_table : pipeline table
    control_spec : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()

    geographies = setting('geographies')
    seed_geography = setting('seed_geography')
    meta_geography = geographies[0]
    # the levels listed after the seed geography
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    household_id_col = setting('household_id_col')

    # one control summary table per meta zone
    for meta_id in crosswalk_df[meta_geography].unique():
        out_table(
            '%s_%s' % (meta_geography, meta_id),
            meta_summary(incidence_df, control_spec, meta_geography, meta_id, sub_geographies))

    hh_weights_summary = pd.DataFrame(index=incidence_df.index)

    # add seed level summaries
    seed_weights_df = get_weight_table(seed_geography)
    for weight_col in ('balanced_weight', 'integer_weight'):
        hh_weights_summary['%s_%s' % (seed_geography, weight_col)] = \
            seed_weights_df[weight_col]

    hh_weight_cols = [household_id_col, 'balanced_weight', 'integer_weight']
    # name of the incidence table index, used as a grouping column below
    hh_id_col = incidence_df.index.name

    for geography in sub_geographies:

        weights_df = get_weight_table(geography)
        if weights_df is None:
            continue

        # total weight per household across all zones of this geography
        hh_weights = weights_df[hh_weight_cols].groupby([household_id_col]).sum()
        for weight_col in ('balanced_weight', 'integer_weight'):
            hh_weights_summary['%s_%s' % (geography, weight_col)] = \
                hh_weights[weight_col]

        # aggregate to seed level
        seed_level_weights = \
            weights_df.groupby([seed_geography, hh_id_col], as_index=False).sum()
        seed_level_weights = seed_level_weights.set_index(hh_id_col)
        seed_level_weights = \
            seed_level_weights[[seed_geography, 'balanced_weight', 'integer_weight']]

        # attach sample weights and seed-level weights for side-by-side comparison
        seed_level_weights['sample_weight'] = incidence_df['sample_weight']
        for weight_col in ('preliminary_balanced_weight', 'balanced_weight',
                           'integer_weight'):
            seed_level_weights['%s_%s' % (seed_geography, weight_col)] = \
                seed_weights_df[weight_col]

        out_table('%s_aggregate' % (geography, ), seed_level_weights)

        by_seed = summarize_geography(seed_geography, 'integer_weight',
                                      crosswalk_df, weights_df, incidence_df)
        out_table('%s_%s' % (geography, seed_geography), by_seed)

        by_geography = summarize_geography(geography, 'integer_weight',
                                           crosswalk_df, weights_df, incidence_df)
        out_table('%s' % (geography, ), by_geography)

    out_table('hh_weights', hh_weights_summary)
def integerize_final_seed_weights(settings, crosswalk, control_spec,
                                  incidence_table):
    """
    Integerize the 'balanced_weight' of the seed-level weight table, seed zone
    by seed zone, using all control spec targets.

    Adds integer_weight column to seed-level weight table

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    # seed control columns are expected in the same order as the spec targets
    assert (seed_controls_df.columns == control_cols).all()

    # total household control column name, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    # one series of integer weights per seed zone
    integerized = []

    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # households (incidence rows) of this seed zone
        incidence_mask = incidence_df[seed_geography] == seed_id
        seed_incidence = incidence_df[incidence_mask]

        # their balanced float weights
        weights_mask = seed_weights_df[seed_geography] == seed_id
        balanced_seed_weights = seed_weights_df.loc[weights_mask, 'balanced_weight']

        result = do_integerizing(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col)

        # do_integerizing yields (integer_weights, status); only the weights are kept
        integer_weights, _status = result
        integerized.append(integer_weights)

    # bulk concat all seed level results
    integer_seed_weights = pd.concat(integerized)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight',
                      integer_seed_weights)
예제 #7
0
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of the 'geography' step arg level,
    one parent zone at a time within each seed zone.

    Registers a weight table for that geography, a sparse variant restricted
    to rows with integer_weight > 0, and a trace table when the geography
    appears under the 'trace_geography' setting.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # split the geography hierarchy around the target level
    geography_pos = geographies.index(geography)
    parent_geography = geographies[geography_pos - 1]
    sub_geographies = geographies[geography_pos:]
    parent_geographies = geographies[:geography_pos]

    total_hh_control_col = settings.get('total_hh_control')
    hh_id_col = settings.get('household_id_col')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    results = []

    for seed_id in crosswalk_df[seed_geography].unique():

        # restrict incidence and crosswalk to the current seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # each seed zone is expected to fall within a single meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        for parent_id in seed_crosswalk_df[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            # parent-level weights of this parent zone, indexed by household id
            parent_weights_df = weights_df[weights_df[parent_geography] == parent_id]
            parent_weights_df = parent_weights_df.set_index(hh_id_col)

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = parent_weights_df['balanced_weight']
            else:
                initial_weights = parent_weights_df['integer_weight']

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_mask = crosswalk_df[parent_geography] == parent_id
            ancestor_ids = crosswalk_df.loc[parent_mask, parent_geographies].max(axis=0)
            for ancestor_col in ancestor_ids.index:
                zone_weights_df[ancestor_col] = ancestor_ids[ancestor_col]

            results.append(zone_weights_df)

    integer_weights_df = pd.concat(results)

    table_name = weight_table_name(geography)
    inject.add_table(table_name, integer_weights_df)

    # sparse table: drop rows whose integer weight is not positive
    sparse_df = integer_weights_df[integer_weights_df['integer_weight'] > 0]
    inject.add_table(weight_table_name(geography, sparse=True), sparse_df)

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        trace_df = integer_weights_df[integer_weights_df[geography] == trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), trace_df)
예제 #8
0
def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize every zone of the lowest geography directly from
    the seed-level balanced weights.

    For each low zone, the balanced weights of its seed zone are scaled by the
    ratio of the low zone's total household control to the seed zone's, then
    balanced against the low-geography controls and integerized.

    Registers a weight table for the lowest geography plus a sparse variant
    holding only rows with integer_weight > 0.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    Raises
    ------
    RuntimeError
        if balancing does not converge for some low zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # fixed: this message previously named initial_seed_balancing
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # initial seed weights in series indexed by hh id
        seed_weights_df = all_seed_weights_df[
            all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[
            seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id,
                                           low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't make sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[
                low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df[
                'balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" %
                        (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" %
                                   trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            # NOTE(review): balancing above uses low_control_spec, but this call
            # passes the full control_spec and unfiltered incidence - confirm
            # that is what do_integerizing expects
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info(
                "Total balanced weights for %s = %s" %
                (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s" %
                        (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries:
    # look up the crosswalk row of each weight row's low zone, then glue the
    # two frames together positionally (both carry a fresh 0..n-1 index)
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography), low_weights_df)
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0])
예제 #9
0
def expand_households():
    """
    Expand the sparse low-geography weight table into one row per household
    instance and register it as the 'expanded_household_ids' table.

    Each weight row is repeated integer_weight times. When the
    GROUP_BY_INCIDENCE_SIGNATURE setting is truthy, the household id column
    of the weight table holds group ids; a concrete household id is then
    drawn at random for every expanded row, with probability proportional to
    sample_weight within its group, and the group_id and integer_weight
    columns are dropped.

    Step args
    ---------
    append : bool, default False
        concatenate the new rows onto the existing expanded_household_ids table
    replace : bool, default False
        as append, but first drop existing rows whose low-geography id occurs
        among the new rows

    Raises
    ------
    AssertionError
        if both append and replace are specified
    """

    geographies = setting('geographies')
    household_id_col = setting('household_id_col')

    low_geography = geographies[-1]

    # only one we really need is low_geography
    seed_geography = setting('seed_geography')
    geography_cols = geographies[geographies.index(seed_geography):]

    weights = get_weight_table(low_geography, sparse=True)
    weights = weights[geography_cols + [household_id_col, 'integer_weight']]

    # - expand weights table by integer_weight, so there is one row per desired hh
    # (.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0)
    weight_cols = weights.columns.values
    weights_np = np.repeat(weights.values,
                           weights.integer_weight.values,
                           axis=0)
    expanded_weights = pd.DataFrame(data=weights_np, columns=weight_cols)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):

        # the household_id_col is really the group_id
        expanded_weights.rename(columns={household_id_col: 'group_id'},
                                inplace=True)

        # the original incidence table with one row per hh, with index hh_id
        household_groups = pipeline.get_table('household_groups')
        household_groups = household_groups[[
            household_id_col, 'group_id', 'sample_weight'
        ]]

        # for each group, the hh_ids and their sample_weights (as relative probabilities)
        # keyed by group_id, so group ids need not be contiguous 0..n-1
        group_hh_probs = {}
        for group_id, df in household_groups.groupby('group_id'):
            hh_ids = list(df[household_id_col])
            probs = list(df.sample_weight / df.sample_weight.sum())
            group_hh_probs[group_id] = (hh_ids, probs)

        # FIXME - should sample without replacement?
        # now make a hh_id choice for each group_id in expanded_weights
        def chooser(group_id):
            hh_ids, hh_probs = group_hh_probs[group_id]
            return np.random.choice(hh_ids, p=hh_probs)

        # convert_dtype is deprecated in Series.apply and True was its default
        expanded_weights[household_id_col] = \
            expanded_weights.group_id.apply(chooser)

        # FIXME - omit in production?
        del expanded_weights['group_id']
        del expanded_weights['integer_weight']

    append = inject.get_step_arg('append', False)
    replace = inject.get_step_arg('replace', False)
    assert not (
        append and
        replace), "can't specify both append and replace for expand_households"

    if append or replace:
        t = inject.get_table('expanded_household_ids').to_frame()
        prev_hhs = len(t.index)
        added_hhs = len(expanded_weights.index)

        if replace:
            # FIXME - should really get from crosswalk table?
            # drop previously expanded households in the zones being re-populated
            low_ids_to_replace = expanded_weights[low_geography].unique()
            t = t[~t[low_geography].isin(low_ids_to_replace)]

        expanded_weights = pd.concat([t, expanded_weights], ignore_index=True)

        dropped_hhs = prev_hhs - len(t.index)
        final_hhs = len(expanded_weights.index)
        op = 'append' if append else 'replace'
        logger.info(
            "expand_households op: %s prev hh count %s dropped %s added %s final %s"
            % (op, prev_hhs, dropped_hhs, added_hhs, final_hhs))

    inject.add_table('expanded_household_ids', expanded_weights)
예제 #10
0
def meta_control_factoring(settings, control_spec, incidence_table):
    """
    Apply simple factoring to summed household fractional weights based on original
    meta control values relative to summed household fractional weights by meta zone.

    The resulting factored meta control weights will be new meta controls, to be
    appended to the original controls, for final balancing.

    The seed-level control table is replaced in the pipeline with the original
    seed controls plus the new seed-level meta controls, with columns ordered
    as in the control spec. Nothing is done when the control spec has no
    targets at the meta (first) geography.

    Parameters
    ----------
    settings : dict-like providing 'geographies' and 'seed_geography'
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # FIXME - if there is only one seed zone in the meta zone, just copy meta control values?

    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # - if there are no meta controls, then we don't have to do anything
    if not (control_spec.geography == meta_geography).any():
        # logger.warn is a deprecated alias (removed in Python 3.13)
        logger.warning("meta_control_factoring: no meta targets so nothing to do")
        return

    meta_controls_df = get_control_table(meta_geography)
    dump_table("meta_controls_df", meta_controls_df)

    # slice control_spec to select only the rows for meta level controls
    meta_controls_spec = control_spec[control_spec.geography == meta_geography]
    meta_control_targets = meta_controls_spec['target']

    logger.info("meta_control_factoring %s targets" % len(meta_control_targets))

    dump_table("meta_controls_spec", meta_controls_spec)
    dump_table("meta_control_targets", meta_control_targets)

    # seed level weights of all households (rows aligned with incidence_df rows)
    seed_weights_df = get_weight_table(seed_geography)
    assert len(incidence_df.index) == len(seed_weights_df.index)

    # expand person weights by incidence (incidence will simply be 1 for household targets)
    hh_level_weights = incidence_df[[seed_geography, meta_geography]].copy()
    for target in meta_control_targets:
        hh_level_weights[target] = \
            incidence_df[target] * seed_weights_df['preliminary_balanced_weight']

    dump_table("hh_level_weights", hh_level_weights)

    # weights of meta targets at seed level
    factored_seed_weights = \
        hh_level_weights.groupby([seed_geography, meta_geography], as_index=False).sum()
    factored_seed_weights.set_index(seed_geography, inplace=True)
    dump_table("factored_seed_weights", factored_seed_weights)

    # weights of meta targets summed from seed level to meta level
    factored_meta_weights = factored_seed_weights.groupby(meta_geography, as_index=True).sum()
    dump_table("factored_meta_weights", factored_meta_weights)

    # only the meta level controls from meta_controls table
    meta_controls_df = meta_controls_df[meta_control_targets]
    dump_table("meta_controls_df", meta_controls_df)

    # compute the scaling factors to be applied to the seed-level totals:
    # ratio of each meta control to the summed weights of its meta zone
    meta_factors = pd.DataFrame(index=meta_controls_df.index)
    for target in meta_control_targets:
        meta_factors[target] = meta_controls_df[target] / factored_meta_weights[target]
    dump_table("meta_factors", meta_factors)

    # compute seed-level controls from meta-level controls
    seed_level_meta_controls = pd.DataFrame(index=factored_seed_weights.index)
    for target in meta_control_targets:
        # meta level scaling_factor for this meta_control,
        # looked up through each seed zone's meta zone id
        scaling_factor = factored_seed_weights[meta_geography].map(meta_factors[target])
        # scale the seed_level_meta_controls by meta_level scaling_factor
        seed_level_meta_controls[target] = factored_seed_weights[target] * scaling_factor
        # FIXME - why round scaled factored seed_weights to int prior to final seed balancing?
        seed_level_meta_controls[target] = seed_level_meta_controls[target].round().astype(int)
    dump_table("seed_level_meta_controls", seed_level_meta_controls)

    # create final balancing controls
    # add newly created seed_level_meta_controls to the existing set of seed level controls

    seed_controls_df = get_control_table(seed_geography)
    assert len(seed_controls_df.index) == len(seed_level_meta_controls.index)
    seed_controls_df = pd.concat([seed_controls_df, seed_level_meta_controls], axis=1)

    # ensure columns are in right order for orca-extended table
    seed_controls_df = seed_controls_df[control_spec.target]
    assert (seed_controls_df.columns == control_spec.target).all()

    dump_table("seed_controls_df", seed_controls_df)

    pipeline.replace_table(control_table_name(seed_geography), seed_controls_df)