Exemplo n.º 1
0
def test_webapp_synthpops_calls(n=default_n,
                                location='seattle_metro',
                                state_location='Washington',
                                country_location='usa',
                                sheet_name='United States of America'):
    """
    Call a sequence of SynthPops data functions and print contact matrix shapes.

    Reads the age bracket distribution and census age brackets for the given
    location, loads the contact matrices for ``sheet_name``, adds a combined
    matrix under the key ``'M'``, prints the shape of every matrix and finally
    requests ``n`` ages through ``sp.get_age_n``.

    Args:
        n                : number of ages requested from sp.get_age_n (cast to int)
        location         : name of the location
        state_location   : name of the state the location is in
        country_location : name of the country the location is in
        sheet_name       : sheet name passed to sp.get_contact_matrix_dic

    Returns:
        None.
    """
    datadir = sp.datadir

    sp.read_age_bracket_distr(datadir,
                              location=location,
                              state_location=state_location,
                              country_location=country_location)
    sp.get_census_age_brackets(datadir,
                               state_location=state_location,
                               country_location=country_location)
    num_agebrackets = 16

    n_contacts_dic = {'H': 4.11, 'S': 11.41, 'W': 8.07, 'C': 7}

    contact_matrix_dic = sp.get_contact_matrix_dic(datadir,
                                                   sheet_name=sheet_name)
    contact_matrix_dic['M'] = sp.combine_matrices(contact_matrix_dic,
                                                  n_contacts_dic,
                                                  num_agebrackets)

    for matrix in contact_matrix_dic.values():
        print(matrix.shape)

    # Use the caller-supplied n; previously the module default was passed
    # here, so the ``n`` argument had no effect on the call.
    sp.get_age_n(datadir,
                 n=int(n),
                 location=location,
                 state_location=state_location,
                 country_location=country_location)

    return
Exemplo n.º 2
0
def test_n_single_ages(n_people=1e4,
                       location='seattle_metro',
                       state_location='Washington',
                       country_location='usa'):
    """
    Sample an age and sex one person at a time, ``n_people`` times.

    Args:
        n_people         : number of individual draws to make (cast to int)
        location         : name of the location
        state_location   : name of the state the location is in
        country_location : name of the country the location is in

    Returns:
        None.
    """
    sc.heading('Running single ages')
    sp.validate()
    datadir = sp.datadir

    # The two distribution readers take the same positional location arguments.
    location_args = (datadir, location, state_location, country_location)
    age_bracket_distr = sp.read_age_bracket_distr(*location_args)
    gender_fraction_by_age = sp.read_gender_fraction_by_age_bracket(
        *location_args)
    brackets_path = sp.get_census_age_brackets_path(datadir, state_location,
                                                    country_location)
    age_brackets = sp.get_age_brackets_from_df(brackets_path)

    sampler_args = (gender_fraction_by_age, age_bracket_distr, age_brackets)

    # Single draw, printed as a quick sanity check.
    first_age, first_sex = sp.get_age_sex(*sampler_args)
    print(first_age, first_sex)

    # Repeated single draws.
    total_draws = int(n_people)
    ages = []
    sexes = []
    for _ in range(total_draws):
        age, sex = sp.get_age_sex(*sampler_args)
        ages.append(age)
        sexes.append(sex)

    return
Exemplo n.º 3
0
 def get_seattle_age_brackets(self):
     """
     Read the age bracket distribution for the location stored on this instance.

     Returns:
         The result of ``sp.read_age_bracket_distr`` called with ``sp.datadir``
         and this instance's ``d_location``, ``d_state_location`` and
         ``d_country_location``.
     """
     return sp.read_age_bracket_distr(
         sp.datadir,
         location=self.d_location,
         state_location=self.d_state_location,
         country_location=self.d_country_location)
Exemplo n.º 4
0
def test_Spokane():
    """
    Test that a Spokane population can be created with the basic SynthPops API.

    Also reads the 'seattle_metro' age bracket distribution and checks that it
    has 20 entries, then resets the location defaults.

    Returns:
        The population returned by sp.make_population.
    """
    sp.logger.info("Test that a Spokane population can be created with the basic SynthPops API.")
    pop = sp.make_population(**pars)

    location_kwargs = dict(country_location='usa',
                           state_location='Washington',
                           location='seattle_metro')
    age_distr = sp.read_age_bracket_distr(sp.datadir, **location_kwargs)
    # will remove if this passes in github actions test
    assert len(age_distr) == 20, f'Check failed, len(age_distr): {len(age_distr)}'

    # Reset default values after this test is complete.
    sp.set_location_defaults('defaults')
    return pop
def test_Spokane():
    """
    Test that a Spokane population can be created with the basic SynthPops API.

    Builds an sp.Pop from the module-level ``pars``, reads the age bracket
    distribution using the population's ``loc_pars`` and checks that it has
    20 entries, then resets the location defaults.

    Returns:
        The sp.Pop instance that was created.
    """
    sp.logger.info("Test that a Spokane population can be created with the basic SynthPops API.")

    pop = sp.Pop(**pars)
    age_dist = sp.read_age_bracket_distr(**pop.loc_pars)
    # will remove if this passes in github actions test
    assert len(age_dist) == 20, f'Check failed, len(age_dist): {len(age_dist)}'

    # Reset default values after this test is complete.
    sp.set_location_defaults('defaults')
    return pop
def test_age_distribution_used():
    """
    Test that the age distribution used in sp.Pop.generate() is the expected one for the location specified.

    The distribution read with the population's ``loc_pars`` must have
    ``sp.settings.nbrackets`` entries.

    Returns:
        The sp.Pop instance that was created.
    """
    sp.logger.info("Test that the age distribution used in sp.Pop.generate() are the expected age distributions. \nThis should be binned to the default number of age brackets (cfg.nbrackets).")

    pop = sp.Pop(**pars)
    loc_pars = pop.loc_pars
    age_dist = sp.read_age_bracket_distr(**loc_pars)

    # Read the expected bracket count once, from the same attribute the check
    # uses, so the messages report the value that was actually compared.
    nbrackets = sp.settings.nbrackets
    assert len(age_dist) == nbrackets, f'Check failed, len(age_dist): {len(age_dist)} does not match sp.settings.nbrackets: {nbrackets}.'
    print(f'Check passed, len(age_dist): {len(age_dist)} == sp.settings.nbrackets: {nbrackets}.')

    return pop
Exemplo n.º 7
0
def test_multiple_ages(n_people=1e4, location='seattle_metro', state_location='Washington', country_location='usa'):
    """
    Sample ages and sexes for ``n_people`` in one call and print how many were returned.

    Args:
        n_people         : number of people passed to sp.get_age_sex_n
        location         : name of the location
        state_location   : name of the state the location is in
        country_location : name of the country the location is in

    Returns:
        None.
    """
    sc.heading('Running multiple ages')

    datadir = sp.datadir

    # The two distribution readers take the same positional location arguments.
    location_args = (datadir, location, state_location, country_location)
    age_bracket_distr = sp.read_age_bracket_distr(*location_args)
    gender_fraction_by_age = sp.read_gender_fraction_by_age_bracket(*location_args)
    age_brackets = sp.get_age_brackets_from_df(
        sp.get_census_age_brackets_path(datadir, state_location, country_location))

    sampled = sp.get_age_sex_n(gender_fraction_by_age, age_bracket_distr, age_brackets, n_people)
    ages, sexes = sampled
    print(len(ages), len(sexes))

    return
Exemplo n.º 8
0
def test_all(location='seattle_metro', state_location='Washington', country_location='usa', sheet_name='United States of America'):
    """
    Run all tests.

    Samples an age and sex, then samples contact ages for a second sampled
    individual twice: once with the base weights and once with a reduced
    number of contacts and a lowered school weight.

    Args:
        location         : name of the location
        state_location   : name of the state the location is in
        country_location : name of the country the location is in
        sheet_name       : sheet name passed to sp.get_contact_matrix_dic

    Returns:
        None.
    """
    sc.heading('Running all tests')

    sp.validate()  # Validate that data files can be found
    dropbox_path = sp.datadir

    location_args = (dropbox_path, location, state_location, country_location)
    age_bracket_distr = sp.read_age_bracket_distr(*location_args)
    gender_fraction_by_age = sp.read_gender_fraction_by_age_bracket(*location_args)
    age_brackets = sp.get_age_brackets_from_df(
        sp.get_census_age_brackets_path(dropbox_path, state_location, country_location))
    age_by_brackets_dic = sp.get_age_by_brackets_dic(age_brackets)

    def draw_age_sex():
        # One (age, sex) draw from the loaded distributions.
        return sp.get_age_sex(gender_fraction_by_age, age_bracket_distr, age_brackets)

    def draw_contact_ages(count, person_age, weights):
        # Contact ages for one person under the given layer weights.
        return sp.sample_n_contact_ages(count, person_age, age_brackets, age_by_brackets_dic, age_mixing_matrix_dic, weights)

    # ## Test selecting an age and sex for an individual ###
    a, s = draw_age_sex()
    print(a, s)

    # ## Test age mixing matrix ###
    # flu-like weights. calibrated to empirical diary survey data.
    weights_dic = {'H': 4.11, 'S': 11.41, 'W': 8.07, 'C': 2.79}
    age_mixing_matrix_dic = sp.get_contact_matrix_dic(dropbox_path, sheet_name)

    # ## Test sampling contacts based on age ###
    # sample an age (and sex) from the seattle metro distribution
    age, sex = draw_age_sex()

    n_contacts = 30
    print(draw_contact_ages(n_contacts, age, weights_dic))

    # shut down schools
    no_schools_weights = sc.dcp(weights_dic)
    # research shows that even with school closure, kids still have some contact with their friends from school.
    no_schools_weights['S'] = 0.1

    f_reduced_contacts_students = 0.5
    f_reduced_contacts_nonstudents = 0.2

    # People under 20 use the student reduction, everyone else the non-student one.
    reduction = f_reduced_contacts_students if age < 20 else f_reduced_contacts_nonstudents
    n_reduced_contacts = int(n_contacts * (1 - reduction))

    print(draw_contact_ages(n_reduced_contacts, age, no_schools_weights))

    return
Exemplo n.º 9
0
def test_basic_api():
    """
    Basic SynthPops test.

    Creates a population from the module-level ``pars`` and checks the length
    of the 'seattle_metro' age bracket distribution. The population is then
    either saved to ``outfile`` (when ``regenerate`` is set or the file does
    not exist) or compared for equality with the object loaded from it.

    Returns:
        The population returned by sp.make_population.
    """
    sp.logger.info('Testing basic API')

    pop = sp.make_population(**pars)

    location_kwargs = dict(country_location='usa',
                           state_location='Washington',
                           location='seattle_metro')
    age_distr = sp.read_age_bracket_distr(sp.datadir, **location_kwargs)
    # will remove if this passes in github actions test
    assert len(age_distr) == 20, f'Check failed, len(age_distr): {len(age_distr)}'

    # Compare against the stored baseline only when one exists and no
    # regeneration was requested; otherwise write a new baseline.
    if not regenerate and os.path.exists(outfile):
        print('Checking...')
        pop2 = sc.loadobj(outfile)
        print(len(pop), len(pop2))
        assert pop == pop2, 'Check failed'
        print('Check passed')
        return pop

    print('Saving...')
    sc.saveobj(outfile, pop)
    return pop
Exemplo n.º 10
0
def test_age_distribution_used():
    """
    Test that the age distribution used in sp.Pop.generate() is the expected one for the location specified.

    The distribution read for the location in the module-level ``pars`` must
    have ``sp.config.nbrackets`` entries.

    Returns:
        The population returned by sp.make_population.
    """
    sp.logger.info("Test that the age distribution used in sp.Pop.generate() are the expected age distributions. \nThis should be binned to the default number of age brackets (cfg.nbrackets).")

    pop = sp.make_population(**pars)

    # Forward only the location-related entries of pars to the reader.
    location_keys = ('location', 'state_location', 'country_location')
    location_kwargs = {key: pars[key] for key in location_keys}
    age_distr = sp.read_age_bracket_distr(sp.datadir, **location_kwargs)

    assert len(age_distr) == sp.config.nbrackets, f'Check failed, len(age_distr_1): {len(age_distr)} does not match sp.config.nbrackets: {sp.config.nbrackets}.'
    print(f'Check passed, len(age_distr_1): {len(age_distr)} == sp.config.nbrackets: {sp.config.nbrackets}.')

    return pop
Exemplo n.º 11
0
def check_employment_age_distribution(pop,
                                      n,
                                      datadir,
                                      figdir,
                                      location=None,
                                      state_location=None,
                                      country_location=None,
                                      file_path=None,
                                      use_default=False,
                                      test_prefix="",
                                      skip_stat_check=False,
                                      do_close=True):
    """
    Check the population's employment-by-age distribution against the reference data.

    Plots employed and unemployed counts by age, compares the actual employment
    rates with the reference rates (optionally with a statistical test), and
    finally compares the expected and actual number of employed people in total.

    Args:
        pop              : population dictionary
        n                : population size
        datadir          : root data directory where the reference data resides
        figdir           : directory where result files are saved; an "employment" subdirectory is used
        location         : name of the location
        state_location   : name of the state the location is in
        country_location : name of the country the location is in
        file_path        : file path to user specified data, forwarded to sp.get_employment_rates
                           and sp.read_age_bracket_distr
        use_default      : if True, try to first use the other parameters to find data specific to the location
                           under study, otherwise returns default data drawing from Seattle, Washington.
                           (forwarded unchanged to the sp data readers)
        test_prefix      : appended to the plot titles
        skip_stat_check  : skip the statistics check for distribution
        do_close         : close the image immediately if set to True

    Returns:
        None.

    Plots will be saved to figdir if provided
    """
    figdir = os.path.join(figdir, "employment")
    # reference data: employment rates, census age brackets and age bracket distribution
    er = sp.get_employment_rates(datadir=datadir,
                                 location=location,
                                 state_location=state_location,
                                 country_location=country_location,
                                 file_path=file_path,
                                 use_default=use_default)
    brackets = sp.get_census_age_brackets(datadir=datadir,
                                          state_location=state_location,
                                          country_location=country_location)
    # lookup used below as ageindex[age] -> bracket key
    ageindex = sp.get_age_by_brackets_dic(brackets)
    age_dist = sp.read_age_bracket_distr(datadir=datadir,
                                         location=location,
                                         state_location=state_location,
                                         country_location=country_location,
                                         file_path=file_path,
                                         use_default=use_default)
    # counting the actual population by age with employment including teachers and staffs
    actual_employed_age_dist, actual_unemployed_age_dist = \
        utilities.get_ids_count_by_param(pop,
                                         condition_name=['wpid', 'sc_teacher', 'sc_staff'],
                                         param='age')
    # plot employed counts, ordered by age key
    utilities.plot_array([
        actual_employed_age_dist[k] for k in sorted(actual_employed_age_dist)
    ],
                         datadir=figdir,
                         names=[k for k in sorted(actual_employed_age_dist)],
                         expect_label='employed by age count',
                         xlabel_rotation=90,
                         testprefix="employeed count by age " + test_prefix)
    # plot unemployed counts, ordered by age key
    utilities.plot_array([
        actual_unemployed_age_dist[k]
        for k in sorted(actual_unemployed_age_dist)
    ],
                         datadir=figdir,
                         names=[k for k in sorted(actual_unemployed_age_dist)],
                         expect_label='unemployed by age count',
                         xlabel_rotation=90,
                         testprefix="unemployed count by age " + test_prefix)

    # align the actual rates with the keys (and key order) of the reference
    # rates; ages missing from the actual data get a rate of 0
    sorted_actual_employed_rate = {}
    actual_employed_rate = utilities.calc_rate(actual_employed_age_dist,
                                               actual_unemployed_age_dist)
    for i in er.keys():
        if i in actual_employed_rate:
            sorted_actual_employed_rate[i] = actual_employed_rate[i]
        else:
            sorted_actual_employed_rate[i] = 0
    actual_values = np.array(list(sorted_actual_employed_rate.values()))
    expected_values = np.array(list(er.values()))
    if not skip_stat_check:
        utilities.statistic_test(expected_values,
                                 actual_values,
                                 test="x",
                                 comments="employment rate distribution check")
    # for plotting, prepend one zero per age below the smallest key of er
    # (the original note says ages under 16) for better display
    # NOTE(review): assumes the keys of er are integer ages -- confirm
    filled_count = min(er.keys())
    expected_values = np.insert(expected_values, 0, np.zeros(filled_count))
    actual_values = np.insert(actual_values, 0, np.zeros(filled_count))
    names = [i for i in range(0, max(er.keys()) + 1)]
    # somehow double stacks for age 100
    # names is not passed to this plot call (names=None); it is only used to
    # drive the per-bracket loop below
    utilities.plot_array(
        expected_values,
        actual_values,
        names=None,
        datadir=figdir,
        testprefix="employment rate distribution " + test_prefix,
        do_close=do_close,
    )

    # check if total employment match
    expected_employed_brackets = {k: 0 for k in brackets}
    actual_employed_brackets = {k: 0 for k in brackets}
    # expected_values is indexed by the values in names here
    # NOTE(review): this relies on the zero padding above making index == age,
    # which presumably requires the keys of er to be contiguous -- confirm
    for i in names:
        expected_employed_brackets[ageindex[i]] += expected_values[i]
        if i in actual_employed_age_dist:
            actual_employed_brackets[
                ageindex[i]] += actual_employed_age_dist[i]
    # turn the summed rates into an expected head count per bracket:
    # (sum of rates / bracket size) * bracket share of population * n
    for i in expected_employed_brackets:
        expected_employed_brackets[i] = expected_employed_brackets[i] / len(
            brackets[i]) * age_dist[i] * n

    expected_total = np.array(list(expected_employed_brackets.values()))
    actual_total = np.array(list(actual_employed_brackets.values()))
    utilities.plot_array(expected_total,
                         actual_total,
                         names=brackets.keys(),
                         datadir=figdir,
                         testprefix="employment total " + test_prefix,
                         do_close=do_close)
    # compare the rounded overall totals
    expected_etotal = np.round(np.sum(expected_total))
    actual_etotal = np.round(np.sum(actual_total))
    utilities.check_error_percentage(n,
                                     expected_etotal,
                                     actual_etotal,
                                     name="employee")
Exemplo n.º 12
0
def check_age_distribution(pop,
                           n,
                           datadir,
                           figdir,
                           location=None,
                           state_location=None,
                           country_location=None,
                           file_path=None,
                           use_default=False,
                           test_prefix="test",
                           skip_stat_check=False,
                           do_close=True):
    """
    Compare the expected age bracket distribution with the one found in the population.

    The expected (normalized) distribution is read from the reference data, the
    actual one is derived from ``pop``; both are plotted and, unless skipped,
    passed to a statistical test.

    Args:
        pop              : population dictionary
        n                : population size (not used by this check)
        datadir          : root data directory where the reference data resides
        figdir           : directory where result files are saved; an "age_distribution" subdirectory is used
        location         : name of the location
        state_location   : name of the state the location is in
        country_location : name of the country the location is in
        file_path        : file path to user specified data, forwarded to sp.read_age_bracket_distr
        use_default      : forwarded to sp.read_age_bracket_distr
        test_prefix      : appended to the plot title
        skip_stat_check  : skip the statistics check for distribution
        do_close         : close the image immediately if set to True

    Returns:
        None.
    """
    plot_dir = os.path.join(figdir, "age_distribution")

    # Arguments shared by both reference-data readers.
    shared_kwargs = dict(datadir=datadir,
                         state_location=state_location,
                         country_location=country_location)
    age_dist = sp.read_age_bracket_distr(location=location,
                                         file_path=file_path,
                                         use_default=use_default,
                                         **shared_kwargs)
    brackets = sp.get_census_age_brackets(**shared_kwargs)

    # Normalized values are compared (not counts scaled by n).
    expected_values = np.array(list(age_dist.values()))
    actual_values = utilities.get_age_distribution_from_pop(pop, brackets)

    # Label each bracket with its first age.
    bracket_labels = np.array([ages[0] for ages in brackets.values()])
    plot_title = "age_distribution_" + test_prefix
    utilities.plot_array(expected_values,
                         actual_values,
                         bracket_labels,
                         plot_dir,
                         plot_title,
                         do_close=do_close)

    if skip_stat_check:
        return
    utilities.statistic_test(expected_values,
                             actual_values,
                             test="x",
                             comments="age distribution check")
Exemplo n.º 13
0
def check_enrollment_distribution(pop,
                                  n,
                                  datadir,
                                  figdir,
                                  location=None,
                                  state_location=None,
                                  country_location=None,
                                  file_path=None,
                                  use_default=False,
                                  test_prefix="test",
                                  skip_stat_check=False,
                                  do_close=True,
                                  plot_only=False,
                                  school_type=None):
    """
    Compare the expected enrollment-by-age distribution with the actual one.

    Plots the enrollment by school type, by age and by age bracket. Unless
    ``plot_only`` is set, it also writes the expected and actual per-age values
    to CSV files, asserts that ages with zero expected enrollment have zero
    actual enrollment (and related range checks), and optionally runs a
    statistical test on the enrollment age pools.

    Args:
        pop              : population dictionary
        n                : population size
        datadir          : root data directory where the reference data resides
        figdir           : directory where result files are saved; an "enrollment" subdirectory is used
        location         : name of the location
        state_location   : name of the state
        country_location : name of the country the state_location is in
        file_path        : file path to user specified data, forwarded to sp.get_school_enrollment_rates
                           and sp.read_age_bracket_distr
        use_default      : if True, try to first use the other parameters to find data specific to the location
                           under study, otherwise returns default data drawing from Seattle, Washington.
                           (forwarded unchanged to the sp data readers)
        test_prefix      : appended to the plot titles and used as prefix of the CSV file names
        skip_stat_check  : skip the statistics check for distribution
        do_close         : close the image immediately if set to True
        plot_only        : plot only without doing any data checks
        school_type      : list of school types e.g. ['pk', 'es', 'ms', 'hs', 'uv']

    Returns:
        None.

    Raises:
        AssertionError: if one of the enrollment count checks fails (only when plot_only is False).

    Plots will be saved to figdir if provided. The files "<test_prefix>_expected.csv"
    and "<test_prefix>_actual.csv" are written to the parent directory of datadir.
    """
    # reference data: enrollment rates, age bracket distribution and census age brackets
    expected_dist = sp.get_school_enrollment_rates(
        datadir=datadir,
        location=location,
        state_location=state_location,
        country_location=country_location,
        file_path=file_path,
        use_default=use_default)
    age_dist = sp.read_age_bracket_distr(datadir=datadir,
                                         location=location,
                                         state_location=state_location,
                                         country_location=country_location,
                                         file_path=file_path,
                                         use_default=use_default)
    brackets = sp.get_census_age_brackets(datadir=datadir,
                                          state_location=state_location,
                                          country_location=country_location)

    figdir = os.path.join(figdir, "enrollment")
    # get actual school enrollment by age
    # one counter per requested school type, each covering ages 0..100
    # NOTE(review): the loop variable `sc` would shadow a module imported under
    # the same name inside this function -- confirm against the file's imports
    if school_type is not None:
        actual_per_school_type_dict = dict.fromkeys(school_type)
        for sc in school_type:
            actual_per_school_type_dict[sc] = dict.fromkeys(
                list(range(0, 101)), 0)
    else:
        actual_per_school_type_dict = {}
    actual_pool = []
    actual_dist = dict.fromkeys(list(range(0, 101)), 0)
    # a person is counted as enrolled when both "scid" and "sc_student" are not None
    for p in pop.values():
        if p["scid"] is not None and p["sc_student"] is not None:
            for sc in actual_per_school_type_dict.keys():
                if p["sc_type"] == sc:
                    actual_per_school_type_dict[sc][p["age"]] += 1
            actual_dist[p["age"]] += 1
            actual_pool.append(p["age"])

    # plot total school enrollment and individual age distribution
    # the overall counts are stored under "all" so the loop below plots them too
    actual_per_school_type_dict["all"] = actual_dist
    if school_type is not None:
        utilities.plot_array([
            sum(actual_per_school_type_dict[i].values())
            for i in actual_per_school_type_dict.keys()
        ],
                             names=actual_per_school_type_dict.keys(),
                             datadir=figdir,
                             testprefix="enrollment_by_school_type\n" +
                             test_prefix,
                             expect_label="enrollment",
                             value_text=True,
                             do_close=do_close)
    for k in actual_per_school_type_dict:
        utilities.plot_array(actual_per_school_type_dict[k].values(),
                             datadir=figdir,
                             testprefix=f"enrollment_by_age {k}\n" +
                             test_prefix,
                             expect_label="enrollment by age bucket",
                             do_close=do_close)

    actual_age_dist = utilities.get_age_distribution_from_pop(pop, brackets)
    # adjust expected enrollment percentage by age brackets
    # per-bracket accumulators, keyed by bracket index
    expected_combined_dist = dict.fromkeys(list(range(0, len(brackets))), 0)
    adjusted_expected_combined_dist = dict.fromkeys(
        list(range(0, len(brackets))), 0)
    actual_combined_dist = dict.fromkeys(list(range(0, len(brackets))), 0)

    # per-age expected counts for ages 0..100
    scaled_dist = dict.fromkeys(list(range(0, 101)), 0)
    adjusted_scaled_dist = dict.fromkeys(list(range(0, 101)), 0)
    # expected count for age j in bracket i:
    #   enrollment rate * n * bracket share / number of ages in the bracket
    # "scaled" uses the reference bracket share (age_dist), "adjusted" uses the
    # share found in the generated population (actual_age_dist)
    for i in age_dist:
        for j in brackets[i]:
            scaled_dist[j] = (expected_dist[j] * n * age_dist[i]) / len(
                brackets[i])
            adjusted_scaled_dist[j] = (expected_dist[j] * n *
                                       actual_age_dist[i]) / len(brackets[i])
            expected_combined_dist[i] += scaled_dist[j]
            adjusted_expected_combined_dist[i] += adjusted_scaled_dist[j]
            actual_combined_dist[i] += actual_dist[j]

    # construct expected pool adjusted based on expected age distribution
    # (each age is repeated int(expected count) times; this pool is only
    # referenced by the commented-out histogram code further down)
    expected_pool = []
    for key in scaled_dist:
        for i in range(0, int(scaled_dist[key])):
            expected_pool.append(key)

    # construct expected pool adjusted based on the actual age distribution
    adjusted_expected_pool = []
    for key in adjusted_scaled_dist:
        for i in range(0, int(adjusted_scaled_dist[key])):
            adjusted_expected_pool.append(key)

    print(f"total enrollment expected :{int(sum(scaled_dist.values()))}")
    print(
        f"total enrollment expected (adjusted) :{int(sum(adjusted_scaled_dist.values()))}"
    )
    print(f"total enrollment actual :{sum(actual_dist.values())}")

    # make sure results are sorted by key
    # scaled_dist_dist = dict(sorted(scaled_dist.items()))
    actual_dist = dict(sorted(actual_dist.items()))

    # per-age arrays
    expected_values = np.array(list(scaled_dist.values()))
    adjusted_expected_values = np.array(list(adjusted_scaled_dist.values()))
    actual_values = np.array(list(actual_dist.values()))

    # per-bracket arrays
    expected_combined_values = np.array(list(expected_combined_dist.values()))
    adjusted_expected_combined_values = np.array(
        list(adjusted_expected_combined_dist.values()))
    actual_combined_values = np.array(list(actual_combined_dist.values()))

    # per-age plots: expected vs actual, then adjusted expected vs actual
    utilities.plot_array(expected_values,
                         actual_values,
                         None,
                         figdir,
                         "enrollment_" + test_prefix,
                         do_close=do_close)
    utilities.plot_array(adjusted_expected_values,
                         actual_values,
                         None,
                         figdir,
                         "adjusted enrollment_" + test_prefix,
                         do_close=do_close)

    # per-bracket plots, labelled with the first age of each bracket
    utilities.plot_array(expected_combined_values,
                         actual_combined_values,
                         np.array([i[0] for i in brackets.values()]),
                         figdir,
                         "enrollment by age bin" + test_prefix,
                         do_close=do_close)
    utilities.plot_array(adjusted_expected_combined_values,
                         actual_combined_values,
                         np.array([i[0] for i in brackets.values()]),
                         figdir,
                         "adjusted enrollment by age bin" + test_prefix,
                         do_close=do_close)
    if plot_only:
        return
    # dump the per-age expected and actual values next to the data directory
    np.savetxt(os.path.join(os.path.dirname(datadir),
                            f"{test_prefix}_expected.csv"),
               expected_values,
               delimiter=",")
    np.savetxt(os.path.join(os.path.dirname(datadir),
                            f"{test_prefix}_actual.csv"),
               actual_values,
               delimiter=",")

    # check for expected 0 count bins
    # if expected enrollment is 0, actual enrollment must be 0
    # if the expected enrollment is greater than threshold, actual enrollment should not be zero
    # here we use tentative threshold 9 meaning if we expected 10+ enrollment and actually
    # generate 0, we should investigate why
    threshold = 9
    assert np.sum(actual_values[expected_values == 0]) == 0, \
        f"expected enrollment should be 0 for these age bins: " \
        f"{str(np.where((expected_values == 0) & (actual_values != 0)))}"

    assert len(actual_values[np.where((expected_values > threshold) & (actual_values == 0))]) == 0, \
        f"actual enrollment should not be 0 for these age bins: " \
        f"{str(np.where((expected_values > threshold) & (actual_values == 0)))}"

    # if expected bin count is less than threshold, use range check to allow some buffer
    # this is usually observed in smaller population in that expected count is small
    # so we allow actual observations to be 0 and up to the expected value plus threshold

    # i selects the ages whose expected count is in (0, threshold]
    i = np.where((expected_values <= threshold) & (expected_values > 0))
    u = expected_values[i] + threshold  # upper bound
    l = np.zeros(len(expected_values[i]))  # lower bound can be 0
    # every selected actual value must lie within [l, u]
    assert (sum(l <= actual_values[i]) == len(actual_values[i]) and sum(actual_values[i] <= u) == len(
        actual_values[i])), \
        f"results show too much difference:\n" \
        f"expected:{expected_values[i]} \n actual:{actual_values[i]} \n" \
        f"please check these age bins: {i}"

    # check if pool looks right
    # h, bins = np.histogram(np.array(expected_pool), bins=100)
    # h, bins = np.histogram(np.array(actual_pool), bins=100)
    # plt.bar(bins[:-1],h,width=1)
    # plt.show()

    # the statistical test compares the adjusted expected pool with the actual pool
    if not skip_stat_check:
        utilities.statistic_test(adjusted_expected_pool,
                                 actual_pool,
                                 test="ks",
                                 comments="enrollment distribution check")