Exemplo n.º 1
0
def add_age_categories(df, year):
    """Add five age-band population columns to the zone table.

    A seed matrix (zone age shares multiplied by each zone's ``totpop``)
    is passed to ``simple_ipf`` together with column marginals taken from
    the regional controls for ``year`` and row marginals taken from
    ``df.totpop``.  Each fitted row is then rounded and run through
    ``round_series_match_target`` with the zone's ``totpop`` as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Zone table containing a ``totpop`` column.  Presumably indexed
        and ordered like ``zone_forecast_inputs()`` - confirm with caller.
    year : label
        Row label used to look up the regional controls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``AGE0004``, ``AGE0519``, ``AGE2044``,
        ``AGE4564`` and ``AGE65P`` columns assigned in place.
    """
    zfi = zone_forecast_inputs()
    rc = regional_controls()

    agecols = ["age0004", "age0519", "age2044", "age4564", "age65p"]
    # share columns in the zone inputs carry the same names with "sh_"
    sharecols = ["sh_" + col for col in agecols]

    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # 1.0; .values is its equivalent.  The explicit copy guarantees a
    # private, writable array for the in-place edits further down.
    seed_matrix = zfi[sharecols].mul(df.totpop, axis='index').values.copy()

    row_marginals = df.totpop.values
    col_marginals = rc[agecols].loc[year].values

    # rescale the regional age controls against the zone population total
    target = df.totpop.sum()
    col_marginals = scale_by_target(pd.Series(col_marginals),
                                    target).round().astype('int')

    # replace exact zeros with a small positive seed (presumably so the
    # fitting can still allocate people to those cells), then blank out
    # every row whose zone has no population at all
    seed_matrix[seed_matrix == 0] = .1
    seed_matrix[row_marginals == 0, :] = 0

    mat = simple_ipf(seed_matrix, col_marginals, row_marginals)
    agedf = pd.DataFrame(mat)
    agedf.columns = [col.upper() for col in agecols]
    agedf.index = zfi.index

    # round zone by zone, using the zone's total population as the target
    for ind, row in agedf.iterrows():
        zone_total = df.totpop.loc[ind]
        row = row.round()
        agedf.loc[ind] = round_series_match_target(row, zone_total, 0)

    for col in agedf.columns:
        df[col] = agedf[col]

    return df
Exemplo n.º 2
0
def add_age_categories(df, year):
    """Add five age-band population columns to the zone table.

    A seed matrix (zone age shares multiplied by each zone's ``totpop``)
    is passed to ``simple_ipf`` together with column marginals taken from
    the regional controls for ``year`` and row marginals taken from
    ``df.totpop``.  Each fitted row is then rounded and run through
    ``round_series_match_target`` with the zone's ``totpop`` as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Zone table containing a ``totpop`` column.  Presumably indexed
        and ordered like ``zone_forecast_inputs()`` - confirm with caller.
    year : label
        Row label used to look up the regional controls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``AGE0004``, ``AGE0519``, ``AGE2044``,
        ``AGE4564`` and ``AGE65P`` columns assigned in place.
    """
    zfi = zone_forecast_inputs()
    rc = regional_controls()

    agecols = ["age0004", "age0519", "age2044", "age4564", "age65p"]
    # share columns in the zone inputs carry the same names with "sh_"
    sharecols = ["sh_" + col for col in agecols]

    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # 1.0; .values is its equivalent.  The explicit copy guarantees a
    # private, writable array for the in-place edits further down.
    seed_matrix = zfi[sharecols].mul(df.totpop, axis='index').values.copy()

    row_marginals = df.totpop.values
    col_marginals = rc[agecols].loc[year].values

    # rescale the regional age controls against the zone population total
    target = df.totpop.sum()
    col_marginals = scale_by_target(pd.Series(col_marginals),
                                    target).round().astype('int')

    # replace exact zeros with a small positive seed (presumably so the
    # fitting can still allocate people to those cells), then blank out
    # every row whose zone has no population at all
    seed_matrix[seed_matrix == 0] = .1
    seed_matrix[row_marginals == 0, :] = 0

    mat = simple_ipf(seed_matrix, col_marginals, row_marginals)
    agedf = pd.DataFrame(mat)
    agedf.columns = [col.upper() for col in agecols]
    agedf.index = zfi.index

    # round zone by zone, using the zone's total population as the target
    for ind, row in agedf.iterrows():
        zone_total = df.totpop.loc[ind]
        row = row.round()
        agedf.loc[ind] = round_series_match_target(row, zone_total, 0)

    for col in agedf.columns:
        df[col] = agedf[col]

    return df
Exemplo n.º 3
0
def add_employment(df, year):
    """Estimate employed residents per zone and store them in ``empres``.

    The employment share of a zone is a weighted sum of its household
    income-quartile shares plus the zonal residual from
    ``zone_forecast_inputs()``, clipped to the range [0.3, 0.7].  The
    share times ``totpop`` is scaled and rounded against the regional
    ``empres`` control for ``year`` and finally capped at ``totpop``.

    Returns the same ``df`` object with the ``empres`` column assigned.
    Raises ``AssertionError`` if any zone still ends up with more
    employed residents than residents.
    """
    quartile_cols = ["hhincq1", "hhincq2", "hhincq3", "hhincq4"]
    hh_counts = df[quartile_cols]
    hh_totals = hh_counts.sum(axis=1)
    shares = hh_counts.divide(hh_totals, axis="index")

    zfi = zone_forecast_inputs()

    # per-quartile employment weights, accumulated left to right, with
    # the zonal residual added last
    rate = 0.46381 * shares["hhincq1"]
    rate = rate + 0.49361 * shares["hhincq2"]
    rate = rate + 0.56938 * shares["hhincq3"]
    rate = rate + 0.29818 * shares["hhincq4"]
    rate = rate + zfi.zonal_emp_sh_resid10

    # No zone is expected to have more than 70% of its people employed.
    # The cap also helps keep employed residents below total population
    # after scaling; if the assertion at the bottom ever fires, lowering
    # the .7 slightly is the suggested remedy.
    rate = rate.fillna(0).clip(.3, .7)

    unscaled = rate * df.totpop

    rc = regional_controls()
    control_total = rc.empres.loc[year]

    scaled = scale_by_target(unscaled, control_total)
    df["empres"] = round_series_match_target(scaled, control_total, 0)

    # cap at total population - overshoots are rare by now, and this
    # should be enough for the check below to pass
    df["empres"] = df[["empres", "totpop"]].min(axis=1)

    # employed residents must not exceed total residents
    assert (df.empres <= df.totpop).all()

    return df
Exemplo n.º 4
0
def add_employment(df, year):
    """Estimate employed residents per zone and store them in ``empres``.

    The employment share of a zone is a weighted sum of its household
    income-quartile shares plus the zonal residual from
    ``zone_forecast_inputs()``, clipped to the range [0.3, 0.7].  The
    share times ``totpop`` is scaled and rounded against the regional
    ``empres`` control for ``year`` and finally capped at ``totpop``.

    Returns the same ``df`` object with the ``empres`` column assigned.
    Raises ``AssertionError`` if any zone still ends up with more
    employed residents than residents.
    """
    quartile_cols = ["hhincq1", "hhincq2", "hhincq3", "hhincq4"]
    hh_counts = df[quartile_cols]
    hh_totals = hh_counts.sum(axis=1)
    shares = hh_counts.divide(hh_totals, axis="index")

    zfi = zone_forecast_inputs()

    # per-quartile employment weights, accumulated left to right, with
    # the zonal residual added last
    rate = 0.46381 * shares["hhincq1"]
    rate = rate + 0.49361 * shares["hhincq2"]
    rate = rate + 0.56938 * shares["hhincq3"]
    rate = rate + 0.29818 * shares["hhincq4"]
    rate = rate + zfi.zonal_emp_sh_resid10

    # No zone is expected to have more than 70% of its people employed.
    # The cap also helps keep employed residents below total population
    # after scaling; if the assertion at the bottom ever fires, lowering
    # the .7 slightly is the suggested remedy.
    rate = rate.fillna(0).clip(.3, .7)

    unscaled = rate * df.totpop

    rc = regional_controls()
    control_total = rc.empres.loc[year]

    scaled = scale_by_target(unscaled, control_total)
    df["empres"] = round_series_match_target(scaled, control_total, 0)

    # cap at total population - overshoots are rare by now, and this
    # should be enough for the check below to pass
    df["empres"] = df[["empres", "totpop"]].min(axis=1)

    # employed residents must not exceed total residents
    assert (df.empres <= df.totpop).all()

    return df
Exemplo n.º 5
0
def _proportional_jobs_model(
    target_ratio,  # ratio of jobs of this sector to households
    sector,        # empsix sector
    groupby_col,   # ratio will be matched at this level of geog
    hh_df,
    jobs_df,
    locations_series,
    target_jobs=None  # precomputed targets; derived from the ratio if None
):
    """Assign unplaced jobs of one sector to locations, by geography.

    For every value of ``groupby_col`` the number of additional jobs
    needed is ``target_jobs`` minus the current count of ``sector`` jobs.
    Unplaced jobs of that sector (``building_id == -1``) are then matched
    to locations drawn by ``groupby_random_choice``.

    Parameters
    ----------
    target_ratio : number
        Jobs-per-household ratio; only used when ``target_jobs`` is None.
    sector : str
        Value of ``jobs_df.empsix`` to work on.
    groupby_col : str
        Column present in both ``hh_df`` and ``jobs_df`` that defines the
        geography at which the ratio is matched.
    hh_df, jobs_df : pandas.DataFrame
        Household and job tables.  ``jobs_df`` needs ``empsix``,
        ``building_id`` and ``groupby_col`` columns.
    locations_series : pandas.Series
        Passed to ``groupby_random_choice``; presumably maps candidate
        location ids to their ``groupby_col`` value - confirm with caller.
    target_jobs : pandas.Series, optional
        Target job counts indexed by ``groupby_col`` value.

    Returns
    -------
    pandas.Series
        Indexed by the ids of the jobs being placed, holding the index
        labels of the chosen locations.  Empty when there is nothing to
        place.
    """
    if target_jobs is None:
        # compute it if not passed
        target_jobs = hh_df[groupby_col].value_counts() * target_ratio
        target_jobs = target_jobs.astype('int')

    current_jobs = jobs_df[
        jobs_df.empsix == sector][groupby_col].value_counts()
    # NOTE(review): geographies absent from current_jobs become NaN here
    # and are removed by the "> 0" filter - confirm that is intended
    need_more_jobs = target_jobs - current_jobs
    need_more_jobs = need_more_jobs[need_more_jobs > 0]
    need_more_jobs_total = int(need_more_jobs.sum())

    available_jobs = \
        jobs_df.query("empsix == '%s' and building_id == -1" % sector)

    # single-argument print(...) emits the same text on Python 2 and 3
    print("Need more jobs total: %d" % need_more_jobs_total)
    print("Available jobs: %d" % len(available_jobs))

    if len(available_jobs) == 0:
        # corner case
        return pd.Series()

    if len(available_jobs) >= need_more_jobs_total:

        # have enough jobs to assign, truncate available jobs
        available_jobs = available_jobs.head(need_more_jobs_total)

    else:

        # don't have enough jobs - random sample locations to partially
        # match the need (won't succeed in matching the entire need)
        need_more_jobs = round_series_match_target(
            need_more_jobs, len(available_jobs), 0)
        need_more_jobs_total = need_more_jobs.sum()

    assert need_more_jobs_total == len(available_jobs)

    if need_more_jobs_total <= 0:
        return pd.Series()

    print("Need more jobs\n%s" % (need_more_jobs,))

    choices = groupby_random_choice(locations_series, need_more_jobs)

    # choose random locations within jurises to match need_more_jobs totals
    return pd.Series(choices.index, available_jobs.index)
Exemplo n.º 6
0
def add_population(df, year):
    """Compute household population per zone and store it in ``hhpop``.

    The regional ``totpop`` control for ``year`` minus the summed group
    quarters population (``df.gqpop``) is the target.  Households times
    mean household size gives the raw estimate, which is passed through
    ``scale_by_target`` and ``round_series_match_target`` before being
    written to ``df``.  Missing values in the result become 0.

    Returns the same ``df`` object.
    """
    controls = regional_controls()
    hhpop_target = controls.totpop.loc[year] - df.gqpop.sum()

    zone_inputs = zone_forecast_inputs()
    raw_hhpop = df.tothh * zone_inputs.meanhhsize

    # third argument is forwarded as-is; presumably a cap on how far any
    # zone may be scaled - confirm against scale_by_target
    scaled_hhpop = scale_by_target(raw_hhpop, hhpop_target, .15)

    # assign first, then fill: NaNs introduced by the column assignment
    # are zeroed as well
    df["hhpop"] = round_series_match_target(scaled_hhpop, hhpop_target, 0)
    df["hhpop"] = df["hhpop"].fillna(0)
    return df
Exemplo n.º 7
0
def add_population(df, year):
    """Compute household population per zone and store it in ``hhpop``.

    The regional ``totpop`` control for ``year`` minus the summed group
    quarters population (``df.gqpop``) is the target.  Households times
    mean household size gives the raw estimate, which is passed through
    ``scale_by_target`` and ``round_series_match_target`` before being
    written to ``df``.  Missing values in the result become 0.

    Returns the same ``df`` object.
    """
    controls = regional_controls()
    hhpop_target = controls.totpop.loc[year] - df.gqpop.sum()

    zone_inputs = zone_forecast_inputs()
    raw_hhpop = df.tothh * zone_inputs.meanhhsize

    # third argument is forwarded as-is; presumably a cap on how far any
    # zone may be scaled - confirm against scale_by_target
    scaled_hhpop = scale_by_target(raw_hhpop, hhpop_target, .15)

    # assign first, then fill: NaNs introduced by the column assignment
    # are zeroed as well
    df["hhpop"] = round_series_match_target(scaled_hhpop, hhpop_target, 0)
    df["hhpop"] = df["hhpop"].fillna(0)
    return df
def add_age_categories(df, year):
    """Add five age-band population columns to the zone table.

    A seed matrix (zone age shares multiplied by ``totpop`` of the zones
    that also appear in ``zone_forecast_inputs()``) is passed to
    ``simple_ipf`` as a DataFrame, together with column marginals taken
    from the regional controls for ``year`` and row marginals taken from
    ``df.totpop``.  Each fitted row is then rounded and run through
    ``round_series_match_target`` with the zone's ``totpop`` as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Zone table containing a ``totpop`` column.
    year : label
        Row label used to look up the regional controls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``AGE0004``, ``AGE0519``, ``AGE2044``,
        ``AGE4564`` and ``AGE65P`` columns assigned in place.
    """
    zfi = zone_forecast_inputs()
    rc = regional_controls()

    agecols = ["age0004", "age0519", "age2044", "age4564", "age65p"]
    # share columns in the zone inputs carry the same names with "sh_"
    sharecols = ["sh_" + col for col in agecols]

    # only multiply by zones that exist in zfi, to prevent errors on
    # non-matching data
    matched_totpop = df[df.index.isin(zfi.index)].totpop

    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # 1.0; .values is its equivalent.  The explicit copy guarantees a
    # private, writable array for the in-place edits further down.
    seed_matrix = zfi[sharecols].mul(
        matched_totpop, axis='index').values.copy()

    # NOTE(review): the row marginals use every row of df while the seed
    # uses only matched zones - the lengths agree only if all zones match
    row_marginals = df.totpop.values
    col_marginals = rc[agecols].loc[year].values

    # rescale the regional age controls against the zone population total
    target = df.totpop.sum()
    col_marginals = scale_by_target(pd.Series(col_marginals),
                                    target).round().astype('int')

    # replace exact zeros with a small positive seed (presumably so the
    # fitting can still allocate people to those cells), then blank out
    # every row whose zone has no population at all
    seed_matrix[seed_matrix == 0] = .1
    seed_matrix[row_marginals == 0, :] = 0

    # simple_ipf had problems with a bare ndarray, so hand it a DataFrame
    seed_matrix = pd.DataFrame(seed_matrix)

    mat = simple_ipf(seed_matrix, col_marginals, row_marginals)
    agedf = pd.DataFrame(mat)
    agedf.columns = [col.upper() for col in agecols]
    agedf.index = zfi.index

    # round zone by zone, using the zone's total population as the target
    for ind, row in agedf.iterrows():
        zone_total = df.totpop.loc[ind]
        row = row.round()
        agedf.loc[ind] = round_series_match_target(row, zone_total, 0)

    for col in agedf.columns:
        df[col] = agedf[col]

    return df
Exemplo n.º 9
0
def _proportional_jobs_model(
    target_ratio,  # ratio of jobs of this sector to households
    sector,        # empsix sector
    groupby_col,   # ratio will be matched at this level of geog
    hh_df,
    jobs_df,
    locations_series,
    target_jobs=None  # precomputed targets; derived from the ratio if None
):
    """Assign unplaced jobs of one sector to locations, by geography.

    For every value of ``groupby_col`` the number of additional jobs
    needed is ``target_jobs`` minus the current count of ``sector`` jobs.
    Demand in geographies that have no entry in ``locations_series`` is
    dropped; the rest is matched to locations drawn with replacement by
    ``groupby_random_choice`` and paired with unplaced jobs of that
    sector (``building_id == -1``).

    Parameters
    ----------
    target_ratio : number
        Jobs-per-household ratio; only used when ``target_jobs`` is None.
    sector : str
        Value of ``jobs_df.empsix`` to work on.
    groupby_col : str
        Column present in both ``hh_df`` and ``jobs_df`` that defines the
        geography at which the ratio is matched.
    hh_df, jobs_df : pandas.DataFrame
        Household and job tables.  ``jobs_df`` needs ``empsix``,
        ``building_id`` and ``groupby_col`` columns.
    locations_series : pandas.Series
        Values are compared with the geographies in need; presumably the
        index holds candidate location ids - confirm with caller.
    target_jobs : pandas.Series, optional
        Target job counts indexed by ``groupby_col`` value.

    Returns
    -------
    pandas.Series
        Indexed by the ids of the jobs being placed, holding the index
        labels of the chosen locations.  Empty when there is nothing to
        place.
    """
    if target_jobs is None:
        # compute it if not passed
        target_jobs = hh_df[groupby_col].value_counts() * target_ratio
        target_jobs = target_jobs.astype('int')

    current_jobs = jobs_df[
        jobs_df.empsix == sector][groupby_col].value_counts()
    # NOTE(review): geographies absent from current_jobs become NaN here
    # and are removed by the "> 0" filter - confirm that is intended
    need_more_jobs = target_jobs - current_jobs
    need_more_jobs = need_more_jobs[need_more_jobs > 0]
    need_more_jobs_total = int(need_more_jobs.sum())

    available_jobs = \
        jobs_df.query("empsix == '%s' and building_id == -1" % sector)

    # single-argument print(...) emits the same text on Python 2 and 3
    print("Need more jobs total: %d" % need_more_jobs_total)
    print("Available jobs: %d" % len(available_jobs))

    if len(available_jobs) == 0:
        # corner case
        return pd.Series()

    if len(available_jobs) >= need_more_jobs_total:

        # have enough jobs to assign, truncate available jobs
        available_jobs = available_jobs.head(need_more_jobs_total)

    else:

        # don't have enough jobs - random sample locations to partially
        # match the need (won't succeed in matching the entire need)
        need_more_jobs = round_series_match_target(
            need_more_jobs, len(available_jobs), 0)
        need_more_jobs_total = need_more_jobs.sum()

    assert need_more_jobs_total == len(available_jobs)

    if need_more_jobs_total <= 0:
        return pd.Series()

    print("Need more jobs\n%s" % (need_more_jobs,))

    # demand minus the number of locations per geography; positive values
    # are reported only, they do not change the assignment
    excess = need_more_jobs.sub(locations_series.value_counts(), fill_value=0)
    print("Excess demand\n%s" % (excess[excess > 0],))

    # there's an issue with groupby_random_choice where it can't choose from
    # a set of locations that don't exist - e.g. we have 2 jobs in a certain
    # city but no locations to put them in.  we need to drop this demand
    drop = need_more_jobs.index.difference(locations_series.unique())
    print("We don't have any locations for these locations:\n%s" % (drop,))
    need_more_jobs = need_more_jobs.drop(drop)

    # choose random locations within jurises to match need_more_jobs totals
    choices = groupby_random_choice(locations_series, need_more_jobs,
                                    replace=True)

    # these might not be the same length after dropping a few lines above
    available_jobs = available_jobs.head(len(choices))

    return pd.Series(choices.index, available_jobs.index)
Exemplo n.º 10
0
def _proportional_jobs_model(
    target_ratio,  # ratio of jobs of this sector to households
    sector,        # empsix sector
    groupby_col,   # ratio will be matched at this level of geog
    hh_df,
    jobs_df,
    locations_series,
    target_jobs=None  # precomputed targets; derived from the ratio if None
):
    """Assign unplaced jobs of one sector to locations, by geography.

    For every value of ``groupby_col`` the number of additional jobs
    needed is ``target_jobs`` minus the current count of ``sector`` jobs.
    Demand in geographies that have no entry in ``locations_series`` is
    dropped; the rest is matched to locations drawn with replacement by
    ``groupby_random_choice`` and paired with unplaced jobs of that
    sector (``building_id == -1``).

    Parameters
    ----------
    target_ratio : number
        Jobs-per-household ratio; only used when ``target_jobs`` is None.
    sector : str
        Value of ``jobs_df.empsix`` to work on.
    groupby_col : str
        Column present in both ``hh_df`` and ``jobs_df`` that defines the
        geography at which the ratio is matched.
    hh_df, jobs_df : pandas.DataFrame
        Household and job tables.  ``jobs_df`` needs ``empsix``,
        ``building_id`` and ``groupby_col`` columns.
    locations_series : pandas.Series
        Values are compared with the geographies in need; presumably the
        index holds candidate location ids - confirm with caller.
    target_jobs : pandas.Series, optional
        Target job counts indexed by ``groupby_col`` value.

    Returns
    -------
    pandas.Series
        Indexed by the ids of the jobs being placed, holding the index
        labels of the chosen locations.  Empty when there is nothing to
        place.
    """
    if target_jobs is None:
        # compute it if not passed
        target_jobs = hh_df[groupby_col].value_counts() * target_ratio
        target_jobs = target_jobs.astype('int')

    current_jobs = jobs_df[
        jobs_df.empsix == sector][groupby_col].value_counts()
    # NOTE(review): geographies absent from current_jobs become NaN here
    # and are removed by the "> 0" filter - confirm that is intended
    need_more_jobs = target_jobs - current_jobs
    need_more_jobs = need_more_jobs[need_more_jobs > 0]
    need_more_jobs_total = int(need_more_jobs.sum())

    available_jobs = \
        jobs_df.query("empsix == '%s' and building_id == -1" % sector)

    # single-argument print(...) emits the same text on Python 2 and 3
    print("Need more jobs total: %d" % need_more_jobs_total)
    print("Available jobs: %d" % len(available_jobs))

    if len(available_jobs) == 0:
        # corner case
        return pd.Series()

    if len(available_jobs) >= need_more_jobs_total:

        # have enough jobs to assign, truncate available jobs
        available_jobs = available_jobs.head(need_more_jobs_total)

    else:

        # don't have enough jobs - random sample locations to partially
        # match the need (won't succeed in matching the entire need)
        need_more_jobs = round_series_match_target(
            need_more_jobs, len(available_jobs), 0)
        need_more_jobs_total = need_more_jobs.sum()

    assert need_more_jobs_total == len(available_jobs)

    if need_more_jobs_total <= 0:
        return pd.Series()

    print("Need more jobs\n%s" % (need_more_jobs,))

    # demand minus the number of locations per geography; positive values
    # are reported only, they do not change the assignment
    excess = need_more_jobs.sub(locations_series.value_counts(), fill_value=0)
    print("Excess demand\n%s" % (excess[excess > 0],))

    # there's an issue with groupby_random_choice where it can't choose from
    # a set of locations that don't exist - e.g. we have 2 jobs in a certain
    # city but no locations to put them in.  we need to drop this demand
    drop = need_more_jobs.index.difference(locations_series.unique())
    print("We don't have any locations for these locations:\n%s" % (drop,))
    need_more_jobs = need_more_jobs.drop(drop)

    # choose random locations within jurises to match need_more_jobs totals
    choices = groupby_random_choice(locations_series, need_more_jobs,
                                    replace=True)

    # these might not be the same length after dropping a few lines above
    available_jobs = available_jobs.head(len(choices))

    return pd.Series(choices.index, available_jobs.index)