Example #1
def accessibility(land_use):
    """
    If 'accessibility' is in the input_tables list, then read it in,
    otherwise create a skeleton table with the same index as land_use.

    This allows loading of a pre-computed accessibility table, which is particularly useful
    for single-process small household sample runs when there are many zones in land_use.

    The skeleton table is only required if multiprocessing wants to slice accessibility;
    otherwise it will simply be replaced when the accessibility model is run.
    """

    accessibility_df = read_input_table("accessibility", required=False)

    if accessibility_df is None:
        accessibility_df = pd.DataFrame(index=land_use.index)
        logger.info("created placeholder accessibility table %s" %
                    (accessibility_df.shape, ))
    else:
        assert accessibility_df.sort_index().index.equals(land_use.to_frame().sort_index().index), \
            "loaded accessibility table index does not match index of land_use table"
        logger.info("loaded accessibility %s" % (accessibility_df.shape, ))

    # replace table function with dataframe
    inject.add_table('accessibility', accessibility_df)

    return accessibility_df
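
# A minimal sketch (not from the source) of the placeholder behaviour above: when no
# pre-computed table is available, the accessibility table starts as an empty frame
# that shares the land_use index, so multiprocessing can still slice it by zone.
import pandas as pd

land_use_index = pd.Index([100, 101, 102], name='TAZ')  # illustrative zone ids
placeholder = pd.DataFrame(index=land_use_index)         # no columns yet, same zones as land_use
assert placeholder.shape == (3, 0)
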
def test_vts():

    inject.add_injectable("settings", {})

    # note: need 0 duration tour on one end of day to guarantee at least one available tour
    alts = pd.DataFrame({
        "start": [1, 1, 2, 3],
        "end": [1, 4, 5, 6]
    })
    alts['duration'] = alts.end - alts.start
    inject.add_injectable("tdd_alts", alts)

    current_tour_person_ids = pd.Series(['b', 'c'],
                                        index=['d', 'e'])

    previous_tour_by_personid = pd.Series([2, 2, 1],
                                          index=['a', 'b', 'c'])

    prev_tour_attrs = get_previous_tour_by_tourid(current_tour_person_ids,
                                                  previous_tour_by_personid,
                                                  alts)

    pdt.assert_series_equal(
        prev_tour_attrs.start_previous,
        pd.Series([2, 1], index=['d', 'e'], name='start_previous'))

    pdt.assert_series_equal(
        prev_tour_attrs.end_previous,
        pd.Series([5, 4], index=['d', 'e'], name='end_previous'))

    tours = pd.DataFrame({
        "person_id": [1, 1, 2, 3, 3],
        "tour_num": [1, 2, 1, 1, 2],
        "tour_type": ['x', 'x', 'x', 'x', 'x']
    })

    persons = pd.DataFrame({
        "income": [20, 30, 25]
    }, index=[1, 2, 3])

    inject.add_table('persons', persons)

    spec = pd.DataFrame({"Coefficient": [1.2]},
                        index=["income"])
    spec.index.name = "Expression"
    segment_col = None  # no segmentation of model_spec

    inject.add_injectable("check_for_variability", True)

    tdd_choices, timetable = vectorize_tour_scheduling(
        tours, persons, alts, spec, segment_col,
        model_settings={},
        chunk_size=0, trace_label='test_vts')

    # FIXME - dead reckoning regression
    # there's no real logic here - this is just what came out of the monte carlo
    # note that the result comes out ordered by the nth trips and not ordered
    # by the trip index.  shrug?
    expected = [2, 2, 2, 0, 0]
    assert (tdd_choices.tdd.values == expected).all()
Example #3
def step2():

    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    table2 = pd.DataFrame({'column1': [10, 20, 30]})
    inject.add_table(table_name, table2)
Example #4
def person_windows(persons, tdd_alts):

    df = tt.create_timetable_windows(persons, tdd_alts)

    inject.add_table('person_windows', df)

    return df
def atwork_subtour_destination_sample(tours,
                                      persons_merged,
                                      atwork_subtour_destination_sample_spec,
                                      skim_dict,
                                      destination_size_terms,
                                      chunk_size,
                                      trace_hh_id):

    trace_label = 'atwork_subtour_location_sample'
    model_settings = inject.get_injectable('atwork_subtour_destination_settings')

    persons_merged = persons_merged.to_frame()

    tours = tours.to_frame()
    tours = tours[tours.tour_category == 'subtour']

    # merge persons into tours
    choosers = pd.merge(tours, persons_merged, left_on='person_id', right_index=True)

    alternatives = destination_size_terms.to_frame()

    constants = config.get_model_constants(model_settings)

    sample_size = model_settings["SAMPLE_SIZE"]
    alt_col_name = model_settings["ALT_COL_NAME"]
    chooser_col_name = 'workplace_taz'

    logger.info("Running atwork_subtour_location_sample with %d persons" % len(choosers))

    # create wrapper with keys for this lookup - in this case there is a workplace_taz
    # in the choosers and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap(chooser_col_name, 'TAZ')

    locals_d = {
        'skims': skims
    }
    if constants is not None:
        locals_d.update(constants)

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    choosers = choosers[chooser_columns]

    choices = interaction_sample(
        choosers,
        alternatives,
        sample_size=sample_size,
        alt_col_name=alt_col_name,
        spec=atwork_subtour_destination_sample_spec,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label)

    choices['person_id'] = choosers.person_id
    choices['workplace_taz'] = choosers.workplace_taz

    inject.add_table('atwork_subtour_destination_sample', choices)
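
# A hedged illustration (not from the source) of how the 'skims' wrapper set up above is
# typically referenced from an interaction spec: '@' expressions are evaluated with the
# wrapper in scope, so a row can look up the workplace_taz -> candidate TAZ skim directly.
# The skim name 'DIST' and the coefficient are illustrative assumptions.
import pandas as pd

example_spec = pd.DataFrame(
    {'coefficient': [-0.05]},
    index=["@skims['DIST']"])   # distance between the chooser's workplace_taz and each alternative TAZ
example_spec.index.name = 'Expression'
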
Example #6
def land_use(store):

    df = store["land_use/taz_data"]

    logger.info("loaded land_use %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('land_use', df)

    return df
Example #7
def create_households(trace_hh_id):

    df = pd.DataFrame({
        'household_id': [1, 2, 3],
        'home_zone_id': [100, 100, 101]
    })
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    tracing.register_traceable_table('households', df)
Example #8
def land_use():

    df = read_input_table("land_use_taz")

    logger.info("loaded land_use %s" % (df.shape,))

    df.index.name = 'TAZ'

    # replace table function with dataframe
    inject.add_table('land_use', df)

    return df
Example #9
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that give the
    input file name and the name of the pipeline table, along with keys that allow
    pre-processing steps to be specified.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial run has completed,
    in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = config.setting(table_list_name)

    assert table_list is not None, "no table list '%s' found in settings." % table_list_name

    logger.info('Using table list: %s' % table_list)

    for table_info in table_list:

        tablename = table_info.get('tablename')
        df = input.read_from_table_info(table_info)
        logger.info('registering table %s' % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
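
# A hedged example (file and column names are illustrative, not from the source) of one
# entry in the input_table_list that input_pre_processor() iterates over; the keys are
# the ones documented in the docstring table above.
example_table_info = {
    'tablename': 'households',                 # name of the pipeline table to create
    'filename': 'households.csv',              # csv file to read from data_dir
    'index_col': 'household_id',               # column to set as the dataframe index
    'column_map': {'HHID': 'household_id'},    # rename input columns from_name: to_name
    'drop_columns': ['unused_flag'],           # columns to drop after reading
}
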
Example #10
def create_controls_table(settings, configs_dir):
    expression_file_path = os.path.join(configs_dir, settings['controls_expression_file'])
    spec = read_spec(expression_file_path)
    df_list = []
    for county in settings['counties']:
        df = get_acs_data(county, spec, settings)
        df_list.append(df)
    acs_table = pd.concat(df_list)
    acs_table.reset_index(inplace=True)
    inject.add_table('all_acs', acs_table)
    controls_table = create_controls(spec)
    inject.add_table('combined_acs', controls_table)

    print('done')
Example #11
def zone_data():
    """
    Pipeline table containing zone info. Specify with 'input_table_list'
    in settings.yaml. Must contain columns for at least zone id, latitude,
    and longitude.

    """
    df = read_input_table('zone_data')

    logger.info('loaded zone data %s' % (df.shape,))

    # replace table function with dataframe
    inject.add_table('zone_data', df)

    return df
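
# A minimal sketch (column names illustrative) of the zone data this loader expects per
# the docstring above: at least a zone id plus latitude and longitude.
import pandas as pd

example_zone_data = pd.DataFrame({
    'zone_id': [1, 2],
    'latitude': [45.52, 45.53],
    'longitude': [-122.68, -122.66],
}).set_index('zone_id')
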
Example #12
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    return df
Example #13
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape,))

    df.index.name = 'person_id'

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    return df
Example #14
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    tracing.register_traceable_table('persons', df)
    if trace_hh_id:
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    logger.debug(
        f"{len(df.household_id.unique())} unique household_ids in persons")
    logger.debug(
        f"{len(households.index.unique())} unique household_ids in households")
    assert not households.index.duplicated().any()
    assert not df.index.duplicated().any()

    persons_without_households = ~df.household_id.isin(households.index)
    if persons_without_households.any():
        logger.error(
            f"{persons_without_households.sum()} persons out of {len(persons)} without households\n"
            f"{pd.Series({'person_id': persons_without_households.index.values})}"
        )
        raise RuntimeError(
            f"{persons_without_households.sum()} persons with bad household_id"
        )

    households_without_persons = df.groupby('household_id').size().reindex(
        households.index).isnull()
    if households_without_persons.any():
        logger.error(
            f"{households_without_persons.sum()} households out of {len(households.index)} without  persons\n"
            f"{pd.Series({'household_id': households_without_persons.index.values})}"
        )
        raise RuntimeError(
            f"{households_without_persons.sum()} households with no persons")

    return df
Example #15
def create_sample_data():

    FIPS_NH = 33
    FIPS_VT = 50
    FIPS_ME = 23
    STATE_FIPS_SAMPLE = [FIPS_ME, FIPS_NH, FIPS_VT]

    data_dir = setting('data_dir', inject.get_injectable('data_dir'))
    input_tables = setting('input_tables')

    # - corresp_taz_fips
    table_info = input_tables['corresp_taz_fips']
    data_file_name = table_info['filename']
    data_file_path = os.path.join(data_dir, table_info['filename'])
    FIPS_NUMA = pd.read_csv(data_file_path, comment='#')
    # slice by state FIPS
    FIPS_NUMA = FIPS_NUMA[FIPS_NUMA['StateFIPS'].isin(STATE_FIPS_SAMPLE)]
    inject.add_table(os.path.splitext(data_file_name)[0], FIPS_NUMA)

    # - corresp_taz_faf4
    table_info = input_tables['corresp_taz_faf4']
    data_file_name = table_info['filename']
    data_file_path = os.path.join(data_dir, data_file_name)
    FAF_NUMA = pd.read_csv(data_file_path, comment='#')
    # slice by taz list from FIPS_NUMA
    FAF_NUMA = FAF_NUMA[FAF_NUMA['TAZ'].isin(FIPS_NUMA.TAZ)]
    inject.add_table(os.path.splitext(data_file_name)[0], FAF_NUMA)

    # - cbp
    table_info = input_tables['cbp']
    data_file_name = table_info['filename']
    data_file_path = os.path.join(data_dir, data_file_name)
    cbp = pd.read_csv(data_file_path, comment='#')
    # slice by state FIPS
    cbp = cbp[cbp['fipstate'].isin(STATE_FIPS_SAMPLE)]
    inject.add_table(os.path.splitext(data_file_name)[0], cbp)
Example #16
def iterate_location_choice(
        model_settings,
        persons_merged, persons, households,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id, locutor,
        trace_label):
    """
    iterate run_location_choice updating shadow pricing until convergence criteria satisfied
    or max_iterations reached.

    (If use_shadow_pricing not enabled, then just iterate once)

    Parameters
    ----------
    model_settings : dict
    persons_merged : injected table
    persons : injected table
    skim_dict : skim.SkimDict
    skim_stack : skim.SkimStack
    chunk_size : int
    trace_hh_id : int
    locutor : bool
        whether this process is the privileged logger of shadow_pricing when multiprocessing
    trace_label : str

    Returns
    -------
    adds choice column model_settings['DEST_CHOICE_COLUMN_NAME'] and annotations to persons table
    """

    # column containing segment id
    chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

    # boolean to filter out persons not needing location modeling (e.g. is_worker, is_student)
    chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME']

    persons_merged_df = persons_merged.to_frame()

    persons_merged_df = persons_merged_df[persons_merged_df[chooser_filter_column]]

    spc = shadow_pricing.load_shadow_price_calculator(model_settings)
    max_iterations = spc.max_iterations

    logging.debug("%s max_iterations: %s" % (trace_label, max_iterations))

    choices = None
    for iteration in range(1, max_iterations + 1):

        if spc.use_shadow_pricing and iteration > 1:
            spc.update_shadow_prices()

        choices = run_location_choice(
            persons_merged_df,
            skim_dict, skim_stack,
            spc,
            model_settings,
            chunk_size, trace_hh_id,
            trace_label=tracing.extend_trace_label(trace_label, 'i%s' % iteration))

        choices_df = choices.to_frame('dest_choice')
        choices_df['segment_id'] = \
            persons_merged_df[chooser_segment_column].reindex(choices_df.index)

        spc.set_choices(choices_df)

        if locutor:
            spc.write_trace_files(iteration)

        if spc.use_shadow_pricing and spc.check_fit(iteration):
            logging.info("%s converged after iteration %s" % (trace_label, iteration,))
            break

    # - shadow price table
    if locutor:
        if spc.use_shadow_pricing and 'SHADOW_PRICE_TABLE' in model_settings:
            inject.add_table(model_settings['SHADOW_PRICE_TABLE'], spc.shadow_prices)
        if 'MODELED_SIZE_TABLE' in model_settings:
            inject.add_table(model_settings['MODELED_SIZE_TABLE'], spc.modeled_size)

    dest_choice_column_name = model_settings['DEST_CHOICE_COLUMN_NAME']
    tracing.print_summary(dest_choice_column_name, choices, value_counts=True)

    persons_df = persons.to_frame()

    # We only chose school locations for the subset of persons who go to school
    # so we backfill the empty choices with -1 to code as no school location
    NO_DEST_TAZ = -1
    persons_df[dest_choice_column_name] = \
        choices.reindex(persons_df.index).fillna(NO_DEST_TAZ).astype(int)

    # - annotate persons table
    if 'annotate_persons' in model_settings:
        expressions.assign_columns(
            df=persons_df,
            model_settings=model_settings.get('annotate_persons'),
            trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

        pipeline.replace_table("persons", persons_df)

        if trace_hh_id:
            tracing.trace_df(persons_df,
                             label=trace_label,
                             warn_if_empty=True)

    # - annotate households table
    if 'annotate_households' in model_settings:

        households_df = households.to_frame()
        expressions.assign_columns(
            df=households_df,
            model_settings=model_settings.get('annotate_households'),
            trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
        pipeline.replace_table("households", households_df)

        if trace_hh_id:
            tracing.trace_df(households_df,
                             label=trace_label,
                             warn_if_empty=True)

    return persons_df
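
# A hedged sketch of the model_settings keys that iterate_location_choice() reads above;
# the values are illustrative (a school-location style setup), not taken from the source.
example_model_settings = {
    'CHOOSER_SEGMENT_COLUMN_NAME': 'school_segment',
    'CHOOSER_FILTER_COLUMN_NAME': 'is_student',
    'DEST_CHOICE_COLUMN_NAME': 'school_taz',
    'SHADOW_PRICE_TABLE': 'school_shadow_prices',   # optional; written only when locutor
    'MODELED_SIZE_TABLE': 'school_modeled_size',    # optional
    # optional 'annotate_persons' / 'annotate_households' entries trigger the annotation steps
}
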
Example #17
def households(households_sample_size, override_hh_ids, trace_hh_id):

    df_full = read_input_table("households")
    households_sliced = False

    logger.info("full household list contains %s households" % df_full.shape[0])

    # only using households listed in override_hh_ids
    if override_hh_ids is not None:

        # trace_hh_id will not be used if it is not in the list of override_hh_ids
        logger.info("override household list contains %s households" % len(override_hh_ids))

        df = df_full[df_full.index.isin(override_hh_ids)]
        households_sliced = True

        if df.shape[0] < len(override_hh_ids):
            logger.info("found %s of %s households in override household list" %
                        (df.shape[0], len(override_hh_ids)))

        if df.shape[0] == 0:
            raise RuntimeError('No override households found in store')

    # if we are tracing hh exclusively
    elif trace_hh_id and households_sample_size == 1:

        # df contains only trace_hh (or empty if not in full store)
        df = tracing.slice_ids(df_full, trace_hh_id)
        households_sliced = True

    # if we need a subset of full store
    elif households_sample_size > 0 and df_full.shape[0] > households_sample_size:

        logger.info("sampling %s of %s households" % (households_sample_size, df_full.shape[0]))

        """
        Because random seed is set differently for each step, sampling of households using
        Random.global_rng would sample differently depending upon which step it was called from.
        We use a one-off rng seeded with the pseudo step name 'sample_households' to provide
        repeatable sampling no matter when the table is loaded.

        Note that the external_rng is also seeded with base_seed so the sample will (rightly) change
        if the pipeline rng's base_seed is changed
        """

        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        df = df_full.take(prng.choice(len(df_full), size=households_sample_size, replace=False))
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (df.index[0], trace_hh_id))
            df_hh = df_full.loc[[trace_hh_id]]
            df = pd.concat([df_hh, df[1:]])

    else:
        df = df_full

    # flag whether the households table was sliced
    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (df.shape,))

    df.index.name = 'household_id'

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in df.columns
    df['chunk_id'] = pd.Series(list(range(len(df))), df.index)

    # replace table function with dataframe
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
        tracing.trace_df(df, "raw.households", warn_if_empty=True)

    return df
Example #18
def add_size_tables():
    """
    inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace)

    Size tables are pandas dataframes with location counts for model_selector by zone and segment
    tour_destination_size_terms

    if using shadow pricing, we scale size_table counts to sample population
    (in which case, they have to be created while single-process)

    Scaling is problematic as it breaks household result replicability across sample sizes.
    It also changes the magnitude of the size terms so if they are used as utilities in
    expression files, their importance will diminish relative to other utilities as the sample
    size decreases.

    Scaling makes most sense for a full sample in conjunction with shadow pricing, where
    shadow prices can be adjusted iteratively to bring modelled counts into line with desired
    (size table) counts.
    """

    use_shadow_pricing = bool(config.setting('use_shadow_pricing'))

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')
    shadow_pricing_models = shadow_settings['shadow_pricing_models']

    # probably ought not scale if not shadow_pricing (breaks partial sample replicability)
    # but this allows compatibility with existing CTRAMP behavior...
    scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False)

    if shadow_pricing_models is None:
        logger.warning('shadow_pricing_models list not found in shadow_pricing settings')
        return

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    # since these are scaled to model size, they have to be created while single-process

    for model_selector, model_name in iteritems(shadow_pricing_models):

        model_settings = config.read_model_settings(model_name)

        assert model_selector == model_settings['MODEL_SELECTOR']

        segment_ids = model_settings['SEGMENT_IDS']
        chooser_table_name = model_settings['CHOOSER_TABLE_NAME']
        chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

        choosers_df = inject.get_table(chooser_table_name).to_frame()
        if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings:
            choosers_df = \
                choosers_df[choosers_df[model_settings['CHOOSER_FILTER_COLUMN_NAME']] != 0]

        # - raw_desired_size
        land_use = inject.get_table('land_use')
        size_terms = inject.get_injectable('size_terms')
        raw_size = tour_destination_size_terms(land_use, size_terms, model_selector)
        assert set(raw_size.columns) == set(segment_ids.keys())

        if use_shadow_pricing or scale_size_table:

            inject.add_table('raw_' + size_table_name(model_selector), raw_size)

            # - scale size_table counts to sample population
            # scaled_size = zone_size * (total_segment_modeled / total_segment_desired)

            # segment scale factor (modeled / desired) keyed by segment_name
            segment_scale_factors = {}
            for c in raw_size:
                # total desired destination size for this segment, summed across zones
                segment_desired_size = raw_size[c].astype(np.float64).sum()

                # number of synthetic population choosers in segment
                segment_chooser_count = \
                    (choosers_df[chooser_segment_column] == segment_ids[c]).sum()

                segment_scale_factors[c] = \
                    segment_chooser_count / np.maximum(segment_desired_size, 1)

                logger.info("add_desired_size_tables %s segment %s "
                            "desired %s modeled %s scale_factor %s" %
                            (chooser_table_name, c,
                             segment_desired_size,
                             segment_chooser_count,
                             segment_scale_factors[c]))

            # FIXME - should we be rounding?
            scaled_size = (raw_size * segment_scale_factors).round()
        else:
            scaled_size = raw_size

        inject.add_table(size_table_name(model_selector), scaled_size)
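
# A toy illustration (numbers invented) of the scaling performed above:
# scaled_size = zone_size * (segment choosers modeled / segment desired size).
import numpy as np
import pandas as pd

raw = pd.Series([400.0, 600.0])                            # desired size by zone for one segment
segment_chooser_count = 100                                 # choosers in the sampled population
scale = segment_chooser_count / np.maximum(raw.sum(), 1)    # 100 / 1000 = 0.1
scaled = (raw * scale).round()                              # -> 40.0, 60.0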