Exemplo n.º 1
0
def test_missing_table_list(data_dir):
    """Reading an input table with no input_table_list in settings must raise."""

    injected_settings = inject.get_injectable('settings')
    assert isinstance(injected_settings, dict)

    with pytest.raises(AssertionError) as err:
        input.read_input_table('households')

    assert 'no input_table_list found' in str(err.value)
Exemplo n.º 2
0
def test_missing_filename(seed_households, data_dir):
    """An input_table_list entry without a filename must raise a clear error."""

    settings_yaml = """
        input_table_list:
          - tablename: households
            index_col: household_id
            rename_columns:
              HHID: household_id
    """

    # install settings parsed from the inline yaml above
    inject.add_injectable('settings', yaml.load(settings_yaml, Loader=yaml.SafeLoader))

    with pytest.raises(AssertionError) as err:
        input.read_input_table('households')

    assert 'no input file provided' in str(err.value)
Exemplo n.º 3
0
def test_create_input_store(seed_households, data_dir):
    """With create_input_store enabled, reading the csv also writes an h5 copy to output_dir."""

    settings_yaml = """
        create_input_store: True
        input_table_list:
          - tablename: households
            h5_tablename: seed_households
            filename: households.csv
            index_col: household_id
            rename_columns:
              HHID: household_id
    """

    inject.add_injectable('settings', yaml.load(settings_yaml, Loader=yaml.SafeLoader))

    # write the seed data where the reader expects to find it
    csv_path = os.path.join(data_dir, 'households.csv')
    seed_households.to_csv(csv_path, index=False)
    assert os.path.isfile(csv_path)

    households_df = input.read_input_table('households')
    assert households_df.index.name == 'household_id'

    # the reader should have mirrored the input into an h5 store
    store_path = os.path.join(inject.get_injectable('output_dir'),
                              'input_data.h5')
    assert os.path.exists(store_path)

    round_tripped = pd.read_hdf(store_path, 'seed_households')
    assert round_tripped.equals(seed_households)
Exemplo n.º 4
0
def accessibility(land_use):
    """
    If 'accessibility' is in input_tables list, then read it in,
    otherwise create skeleton table with same index as landuse.

    This allows loading of pre-computed accessibility table, which is particularly useful
    for single-process small household sample runs when there are many zones in landuse

    skeleton table only required if multiprocessing wants to slice accessibility,
    otherwise it will simply be replaced when accessibility model is run
    """

    # optional input table: None if not listed in input_table_list
    accessibility_df = read_input_table("accessibility", required=False)

    if accessibility_df is None:
        # placeholder with landuse's index so multiprocessing can slice it
        accessibility_df = pd.DataFrame(index=land_use.index)
        logger.info("created placeholder accessibility table %s" %
                    (accessibility_df.shape, ))
    else:
        # loaded table must cover exactly the same zones as land_use
        assert accessibility_df.sort_index().index.equals(land_use.to_frame().sort_index().index), \
            "loaded accessibility table index does not match index of land_use table"
        # BUG FIX: previously logged "loaded land_use" for the accessibility table
        logger.info("loaded accessibility %s" % (accessibility_df.shape, ))

    # replace table function with dataframe
    inject.add_table('accessibility', accessibility_df)

    return accessibility_df
Exemplo n.º 5
0
def read_raw_persons(households):
    """Load the raw persons input table, filtered to sampled households when sliced."""

    persons_df = read_input_table("persons")

    sliced = inject.get_injectable('households_sliced', False)
    if sliced:
        # drop persons whose household was not sampled
        persons_df = persons_df[persons_df.household_id.isin(households.index)]

    return persons_df
Exemplo n.º 6
0
def land_use():
    """Load the land_use input table and register it as a pipeline table."""

    land_use_df = read_input_table("land_use")

    logger.info("loaded land_use %s" % (land_use_df.shape, ))

    # replace table function with dataframe
    inject.add_table('land_use', land_use_df)

    return land_use_df
Exemplo n.º 7
0
def initialize_tours(network_los, households, persons, trace_hh_id):
    """
    Load the tours input table, slice it to the sampled persons, annotate it,
    patch/validate tour ids, and register it with the pipeline, rng channels,
    and tracing.
    """

    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    # NOTE(review): both operands of this 'or' read the same injectable, so the
    # second test is redundant — presumably one was meant to check a different
    # condition (e.g. an override household list); confirm against the households step.
    slice_happened = \
        inject.get_injectable('households_sample_size', 0) > 0 \
        or inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s" % (tours.shape,))
        # keep all persons in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    # optionally skip tour_id patching (e.g. if input tour ids are already canonical)
    skip_patch_tour_ids = model_settings.get('skip_patch_tour_ids', False)
    if skip_patch_tour_ids:
        pass
    else:
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    # register tours as a reproducible-random-number channel
    pipeline.get_rn_generator().add_channel('tours', tours)

    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    # sanity check: every tour must reference a known person
    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        # NOTE(review): 'out of {len(persons)}' looks like it should be the tour
        # count, and the Series below wraps the full boolean-mask index rather
        # than only the offending tours — confirm intended diagnostics.
        logger.error(f"{tours_without_persons.sum()} tours out of {len(persons)} without persons\n"
                     f"{pd.Series({'person_id': tours_without_persons.index.values})}")
        raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours,
                         label='initialize_tours',
                         warn_if_empty=True)
Exemplo n.º 8
0
def zone_data():
    """
    Pipeline table containing zone info. Specify with 'input_table_list'
    in settings.yaml. Must contain columns for at least zone id, latitude,
    and longitude.

    """
    zone_df = read_input_table('zone_data')

    logger.info('loaded zone data %s' % (zone_df.shape,))

    # replace table function with dataframe
    inject.add_table('zone_data', zone_df)

    return zone_df
Exemplo n.º 9
0
def land_use():
    """Load the land_use input table, sorted into canonical index order, and register it."""

    land_use_df = read_input_table("land_use")

    # try to make life easy for everybody by keeping everything in canonical order
    # but as long as coalesce_pipeline doesn't sort tables it coalesces, it might not stay in order
    # so even though we do this, anyone downstream who depends on it, should look out for themselves...
    if not land_use_df.index.is_monotonic_increasing:
        land_use_df = land_use_df.sort_index()

    logger.info("loaded land_use %s" % (land_use_df.shape, ))

    # replace table function with dataframe
    inject.add_table('land_use', land_use_df)

    return land_use_df
Exemplo n.º 10
0
def test_hdf_reader1(seed_households, data_dir):
    """read_input_table should load an hdf5 file and apply index_col/rename_columns."""

    settings_yaml = """
        input_table_list:
          - tablename: households
            filename: households.h5
            index_col: household_id
            rename_columns:
              HHID: household_id
    """

    inject.add_injectable('settings', yaml.load(settings_yaml, Loader=yaml.SafeLoader))

    # write the seed data as hdf5 where the reader expects it
    h5_path = os.path.join(data_dir, 'households.h5')
    seed_households.to_hdf(h5_path, key='households', mode='w')
    assert os.path.isfile(h5_path)

    households_df = input.read_input_table('households')

    assert households_df.index.name == 'household_id'
Exemplo n.º 11
0
def test_csv_reader(seed_households, data_dir):
    """read_input_table should load a csv and set the configured index column."""

    # NOTE: uses 'column_map' here where sibling tests use 'rename_columns' —
    # presumably exercising the legacy/alias spelling; confirm against input.py
    settings_yaml = """
        input_table_list:
          - tablename: households
            filename: households.csv
            index_col: household_id
            column_map:
              HHID: household_id
    """

    inject.add_injectable('settings', yaml.load(settings_yaml, Loader=yaml.SafeLoader))

    # write the seed data as csv where the reader expects it
    csv_path = os.path.join(data_dir, 'households.csv')
    seed_households.to_csv(csv_path, index=False)
    assert os.path.isfile(csv_path)

    households_df = input.read_input_table('households')

    assert households_df.index.name == 'household_id'
Exemplo n.º 12
0
def households(households_sample_size, override_hh_ids, trace_hh_id):
    """
    Load the households input table, optionally slice it (override list,
    single traced household, or repeatable random sample), add a chunk_id
    column, and register the result with the pipeline, rng, and tracing.
    """

    df_full = read_input_table("households")
    households_sliced = False

    logger.info("full household list contains %s households" % df_full.shape[0])

    # only using households listed in override_hh_ids
    if override_hh_ids is not None:

        # trace_hh_id will not be used if it is not in list of override_hh_ids
        logger.info("override household list containing %s households" % len(override_hh_ids))

        df = df_full[df_full.index.isin(override_hh_ids)]
        households_sliced = True

        # warn if some requested override ids were missing from the store
        if df.shape[0] < len(override_hh_ids):
            logger.info("found %s of %s households in override household list" %
                        (df.shape[0], len(override_hh_ids)))

        if df.shape[0] == 0:
            raise RuntimeError('No override households found in store')

    # if we are tracing hh exclusively
    elif trace_hh_id and households_sample_size == 1:

        # df contains only trace_hh (or empty if not in full store)
        df = tracing.slice_ids(df_full, trace_hh_id)
        households_sliced = True

    # if we need a subset of full store
    elif households_sample_size > 0 and df_full.shape[0] > households_sample_size:

        logger.info("sampling %s of %s households" % (households_sample_size, df_full.shape[0]))

        """
        Because random seed is set differently for each step, sampling of households using
        Random.global_rng would sample differently depending upon which step it was called from.
        We use a one-off rng seeded with the pseudo step name 'sample_households' to provide
        repeatable sampling no matter when the table is loaded.

        Note that the external_rng is also seeded with base_seed so the sample will (rightly) change
        if the pipeline rng's base_seed is changed
        """

        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        # sample without replacement by positional take of random row indices
        df = df_full.take(prng.choice(len(df_full), size=households_sample_size, replace=False))
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (df.index[0], trace_hh_id))
            df_hh = df_full.loc[[trace_hh_id]]
            df = pd.concat([df_hh, df[1:]])

    else:
        # no slicing requested (or sample size covers the whole store)
        df = df_full

    # flag used downstream (e.g. by the persons table) to know whether to filter
    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (df.shape,))

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in df.columns
    df['chunk_id'] = pd.Series(list(range(len(df))), df.index)

    # replace table function with dataframe
    inject.add_table('households', df)

    # register households as a reproducible-random-number channel
    pipeline.get_rn_generator().add_channel('households', df)

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
        tracing.trace_df(df, "raw.households", warn_if_empty=True)

    return df