Example #1
def choose_intermediate_trip_purpose(trips, probs_spec, trace_hh_id, trace_label):
    """
    choose purpose for intermediate trips based on probs_spec
    which assigns relative weights (summing to 1) to the possible purpose choices

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    probs_join_cols = ['primary_purpose', 'outbound', 'person_type']
    non_purpose_cols = probs_join_cols + ['depart_range_start', 'depart_range_end']
    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index('trip_id')

    chunk.log_df(trace_label, 'choosers', choosers)

    # select the matching depart range (this should result in exactly one chooser row per trip)
    choosers = choosers[(choosers.start >= choosers['depart_range_start']) & (
                choosers.start <= choosers['depart_range_end'])]

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == num_trips

    choices, rands = logit.make_choices(
        choosers[purpose_cols],
        trace_label=trace_label, trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    choices = choices.map(pd.Series(purpose_cols))
    return choices
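
# A minimal standalone sketch (toy data, not ActivitySim's logit API) of the two
# idioms above: rows of relative weights are normalized to probabilities, an
# inverse-CDF draw picks a column per row (what logit.make_choices does
# internally), and the 0-based positions are mapped back to purpose labels
# with choices.map(pd.Series(purpose_cols)).
import numpy as np
import pandas as pd

purpose_cols = ['work', 'shopping', 'eatout']
probs = pd.DataFrame([[2.0, 1.0, 1.0],
                      [0.0, 3.0, 1.0]], columns=purpose_cols)

# normalize each row so the weights sum to 1
probs = probs.div(probs.sum(axis=1), axis=0)

# inverse-CDF draw: first column whose cumulative prob exceeds the rand
rands = np.array([0.6, 0.9])
positions = pd.Series(
    np.argmax(probs.values.cumsum(axis=1) > rands[:, None], axis=1),
    index=probs.index)

# map 0-based column positions back to purpose labels
choices = positions.map(pd.Series(purpose_cols))
print(choices.tolist())  # ['shopping', 'eatout']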
Example #2
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id,
                       trace_label):
    """
    extend tour counts based on a probability table

    counts can only be extended if the original count is between 1 and 4,
    and a tour type can only be extended if its count is at the max possible
    (e.g. 2 for escort, 1 otherwise), so escort might be increased to 3 or 4
    and other tour types might be increased to 2 or 3

    Parameters
    ----------
    persons: pandas dataframe
        (need this for join columns)
    tour_counts: pandas dataframe
        one row per person, one column per tour_type
    alternatives
        alternatives from nmtv interaction_simulate
        only need this to know max possible frequency for a tour type
    trace_hh_id
    trace_label

    Returns
    -------
    extended tour_counts


    tour_counts looks like this:
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0

    """

    assert tour_counts.index.name == persons.index.name

    PROBABILITY_COLUMNS = ['0_tours', '1_tours', '2_tours']
    JOIN_COLUMNS = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    TOUR_TYPE_COL = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[JOIN_COLUMNS]

    # only extend if there are 1 - 4 non_mandatory tours to start with
    extend_tour_counts = tour_counts.sum(axis=1).between(1, 4)
    if not extend_tour_counts.any():
        logger.info(
            "extend_tour_counts - no persons eligible for tour_count extension"
        )
        return tour_counts

    have_trace_targets = trace_hh_id and tracing.has_trace_targets(
        extend_tour_counts)

    for i, tour_type in enumerate(alternatives.columns):

        i_tour_type = i + 1  # (probs_spec nonmandatory_tour_type column is 1-based)
        tour_type_trace_label = tracing.extend_trace_label(
            trace_label, tour_type)

        # - only extend tour if frequency is max possible frequency for this tour type
        tour_type_is_maxed = \
            extend_tour_counts & (tour_counts[tour_type] == alternatives[tour_type].max())
        maxed_tour_count_idx = tour_counts.index[tour_type_is_maxed]

        if len(maxed_tour_count_idx) == 0:
            continue

        # - get extension probs for tour_type
        choosers = pd.merge(
            persons.loc[maxed_tour_count_idx],
            probs_spec[probs_spec[TOUR_TYPE_COL] == i_tour_type],
            on=JOIN_COLUMNS,
            how='left').set_index(maxed_tour_count_idx)
        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(choosers[PROBABILITY_COLUMNS],
                                            trace_label=tour_type_trace_label,
                                            trace_choosers=choosers)

        # - extend tour_count (0-based prob alternative choice equals magnitude of extension)
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if have_trace_targets:
            tracing.trace_df(choices,
                             tracing.extend_trace_label(
                                 tour_type_trace_label, 'choices'),
                             columns=[None, 'choice'])
            tracing.trace_df(rands,
                             tracing.extend_trace_label(
                                 tour_type_trace_label, 'rands'),
                             columns=[None, 'rand'])

    return tour_counts
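
# A small illustrative sketch (toy data, not the model's extension_probs spec)
# of the eligibility tests above: only persons whose total non-mandatory tour
# count is between 1 and 4 are candidates, and a tour type is only extended
# when its count equals the max observed in the alternatives table.
import pandas as pd

tour_counts = pd.DataFrame({'escort': [2, 0, 5], 'shopping': [0, 1, 0]},
                           index=pd.Index([1, 2, 3], name='person_id'))
alternatives = pd.DataFrame({'escort': [0, 1, 2], 'shopping': [0, 1, 1]})

eligible = tour_counts.sum(axis=1).between(1, 4)  # persons 1 and 2
maxed = eligible & (tour_counts['escort'] == alternatives['escort'].max())
print(tour_counts.index[maxed].tolist())  # [1] - only person 1 has escort at the max (2)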
Example #3
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los,
                       alt_dest_col_name, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ.
    Choose the MAZ probabilistically (proportionally by size_term) from the set of MAZ zones in the parent TAZ.

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    """

    if len(taz_sample) == 0:
        # it can happen that all trips have no viable destinations (and so are dropped from the sample)
        # in which case we can just return the empty taz_sample, since it has the same columns
        return taz_sample.copy()

    # we had to use alt_dest_col_name as specified in model_settings for interaction_sample
    # because expressions reference it to look up size_terms by trip purpose
    DEST_MAZ = alt_dest_col_name
    DEST_TAZ = f"{alt_dest_col_name}_TAZ"

    taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True)

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label,
                                                 'choose_MAZ_for_TAZ')

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_sample'),
                         transpose=False)

    # print(f"taz_sample\n{taz_sample}")
    #            alt_dest_TAZ      prob  pick_count
    # trip_id
    # 4343721              12  0.000054           1
    # 4343721              20  0.001864           2

    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(
        taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #         trip_id  alt_dest_TAZ      prob
    # 0       4343721            12  0.000054
    # 1       4343721            20  0.001864
    # 2       4343721            20  0.001864

    # print(f"MAZ_size_terms\n{MAZ_size_terms.df}")
    #           work  escort  shopping  eatout  othmaint  social  othdiscr   univ
    # alt_dest
    # 2         31.0   9.930     0.042   0.258     0.560   0.520    10.856  0.042
    # 3          0.0   3.277     0.029   0.000     0.029   0.029     7.308  0.029
    # 4          0.0   1.879     0.023   0.000     0.023   0.023     5.796  0.023

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'trip_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(
        index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(
        chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index,
                                                     taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)

    maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].rename(columns={
        'TAZ': DEST_TAZ,
        'MAZ': DEST_MAZ
    })
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         maz_taz,
                         how='left',
                         on=DEST_TAZ).set_index('index')

    purpose = maz_sizes['trip_id'].map(
        trips.purpose)  # size term varies by purpose
    maz_sizes['size_term'] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose)

    # print(f"maz_sizes\n{maz_sizes}")
    #          trip_id  alt_dest_TAZ  alt_dest  size_term
    # index
    # 0        4343721            12      3445      0.019
    # 0        4343721            12     11583      0.017
    # 0        4343721            12     21142      0.020

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes,
                                                        slicer='trip_id')
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(
                             trace_label, 'maz_sizes'),
                         transpose=False)

    # number of candidate MAZs for each taz_choices row
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values
    # print(maz_counts)

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()
    # print(f"max_maz_count {max_maz_count}")

    # offsets of the first and last candidate rows for each TAZ choice in sparse maz_sizes
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy size term to insert
    # (we want to insert dummy size terms at the END of the list of candidate sizes)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0)
    padded_maz_sizes = padded_maz_sizes.reshape(-1, max_maz_count)

    # prob array with one row per TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(
        chooser_df, n=taz_sample_size).reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is an array with the chosen alternative represented as a column index in probs,
    # an integer between zero and max_maz_count - 1
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes[DEST_MAZ].take(positions +
                                                     first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]),
                                        positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices,
                                                          slicer='trip_id')
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[['trip_id', DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets,
                                  max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        padded_maz_sizes = np.insert(trace_maz_sizes[DEST_MAZ].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_probs'),
                         transpose=False)

    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ
                                       ]).agg(prob=('prob', 'max'),
                                              pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
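
# A standalone numpy sketch of the ragged-to-rectangular padding used above,
# with toy counts (not model data): candidate size terms for two TAZ picks are
# padded with zeros at the end of each pick's run, normalized to probs, and a
# column is chosen per row by argmax over (cumulative prob - rand) > 0.
import numpy as np

sizes = np.array([3., 1., 2., 4.])  # 3 candidates for pick A, 1 for pick B
maz_counts = np.array([3, 1])
max_maz_count = maz_counts.max()

last_row_offsets = maz_counts.cumsum()                       # [3, 4]
first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)   # [0, 3]
inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)  # [4, 4]

padded = np.insert(sizes, inserts, 0.0).reshape(-1, max_maz_count)
# [[3. 1. 2.]
#  [4. 0. 0.]]
maz_probs = padded / padded.sum(axis=1, keepdims=True)

rands = np.array([[0.8], [0.2]])
positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)
assert (positions < maz_counts).all()  # a zero pad position is never chosen

# recover flat offsets back into the sparse candidate array
print(positions + first_row_offsets)   # [2 3]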
Example #4
def choose_intermediate_trip_purpose(trips, probs_spec, estimator,
                                     probs_join_cols, use_depart_time,
                                     trace_hh_id, trace_label):
    """
    choose purpose for intermediate trips based on probs_spec
    which assigns relative weights (summing to 1) to the possible purpose choices

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    non_purpose_cols = probs_join_cols.copy()
    if use_depart_time:
        non_purpose_cols += ['depart_range_start', 'depart_range_end']
    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = \
        probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(),
                        probs_spec,
                        on=probs_join_cols,
                        how='left').set_index('trip_id')
    chunk.log_df(trace_label, 'choosers', choosers)

    if use_depart_time:

        # select the matching depart range (this should result in exactly one chooser row per trip)
        chooser_probs = \
            (choosers.start >= choosers['depart_range_start']) & (choosers.start <= choosers['depart_range_end'])

        # if we failed to match a row in probs_spec
        if chooser_probs.sum() < num_trips:

            # this can happen if the spec has no row matching a trip's probs_join_cols values
            missing_trip_ids = trips.index[
                ~trips.index.isin(choosers.index[chooser_probs])].values
            unmatched_choosers = choosers[choosers.index.isin(
                missing_trip_ids)]
            unmatched_choosers = unmatched_choosers[['person_id', 'start'] +
                                                    non_purpose_cols]

            # join to persons for better diagnostics
            persons = inject.get_table('persons').to_frame()
            persons_cols = [
                'age', 'is_worker', 'is_student', 'is_gradeschool',
                'is_highschool', 'is_university'
            ]
            unmatched_choosers = pd.merge(unmatched_choosers,
                                          persons[[
                                              col for col in persons_cols
                                              if col in persons.columns
                                          ]],
                                          left_on='person_id',
                                          right_index=True,
                                          how='left')

            file_name = '%s.UNMATCHED_PROBS' % trace_label
            logger.error(
                "%s %s of %s intermediate trips could not be matched to probs based on join columns  %s"
                % (trace_label, len(unmatched_choosers), len(choosers),
                   probs_join_cols))
            logger.info("Writing %s unmatched choosers to %s" % (
                len(unmatched_choosers),
                file_name,
            ))
            tracing.write_csv(unmatched_choosers,
                              file_name=file_name,
                              transpose=False)
            raise RuntimeError(
                "Some trips could not be matched to probs based on join columns %s."
                % probs_join_cols)

        # select the matching depart range (this should result in exactly one chooser row per trip)
        choosers = choosers[chooser_probs]

    # choosers should now match trips row for row
    assert choosers.index.identical(trips.index)

    if estimator:
        probs_cols = list(probs_spec.columns)
        estimator.write_table(choosers[probs_cols], 'probs', append=True)

    choices, rands = logit.make_choices(choosers[purpose_cols],
                                        trace_label=trace_label,
                                        trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices,
                         '%s.choices' % trace_label,
                         columns=[None, 'trip_purpose'])
        tracing.trace_df(rands,
                         '%s.rands' % trace_label,
                         columns=[None, 'rand'])

    choices = choices.map(pd.Series(purpose_cols))
    return choices
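
# A toy sketch of the depart-range match above (made-up trip 101, not model
# data): the left join can yield several probs rows per trip, one per depart
# range, and exactly one survives the start-within-range filter.
import pandas as pd

choosers = pd.DataFrame({
    'start': [7, 7],
    'depart_range_start': [5, 8],
    'depart_range_end': [7, 10],
}, index=pd.Index([101, 101], name='trip_id'))

in_range = ((choosers.start >= choosers['depart_range_start']) &
            (choosers.start <= choosers['depart_range_end']))
choosers = choosers[in_range]
assert choosers.index.is_unique  # one row per trip after filtering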
Example #5
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ.
    Choose the MAZ probabilistically (proportionally by size_term) from the set of MAZ zones in the parent TAZ.

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <DEST_TAZ>, prob, pick_count
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with duplicated index <chooser_id_col> and columns: <DEST_MAZ>, prob, pick_count
    """

    # print(f"taz_sample\n{taz_sample}")
    #           dest_TAZ      prob  pick_count  person_id
    # tour_id
    # 542963          18  0.004778           1      13243
    # 542963          53  0.004224           2      13243
    # 542963          59  0.008628           1      13243

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label,
                                                 'choose_MAZ_for_TAZ')

        CHOOSER_ID = taz_sample.index.name  # 'tour_id' for tour destination, 'person_id' for location choice
        assert CHOOSER_ID is not None

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_sample'),
                         transpose=False)

    # redupe taz_sample[[DEST_TAZ, 'prob']] using pick_count to repeat rows
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(
        taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #        tour_id  dest_TAZ  TAZ_prob
    # 0       542963        18  0.004778
    # 1       542963        53  0.004224
    # 2       542963        53  0.004224
    # 3       542963        59  0.008628

    # print(f"MAZ_size_terms\n{MAZ_size_terms}")
    #       zone_id  dest_TAZ  size_term
    # 0        6097         2      7.420
    # 1       16421         2      9.646
    # 2       24251         2     10.904

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'person_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(
        index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(
        chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index,
                                                     taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    # maz_sizes.index is the integer offset into taz_choices of the taz for which the maz_size row is a candidate
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         MAZ_size_terms,
                         how='left',
                         on=DEST_TAZ).set_index('index')

    #         tour_id  dest_TAZ  zone_id  size_term
    # index
    # 0        542963        18      498     12.130
    # 0        542963        18     7696     18.550
    # 0        542963        18    15431      8.678
    # 0        542963        18    21429     29.938
    # 1        542963        53    17563     34.252

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term]

        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes,
                                                        slicer=CHOOSER_ID)
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(
                             trace_label, 'maz_sizes'),
                         transpose=False)

    # number of candidate MAZs for each taz_choices row
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last candidate rows for each TAZ choice in sparse maz_sizes
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy size term to insert
    # (we want to insert dummy size terms at the END of the list of candidate sizes)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts,
                                 0.0).reshape(-1, max_maz_count)

    # prob array with one row per TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(chooser_df,
                                                      n=taz_sample_size)
    rands = rands.reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is an array with the chosen alternative represented as a column index in probs,
    # an integer between zero and max_maz_count - 1
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes['zone_id'].take(positions +
                                                      first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]),
                                        positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices,
                                                          slicer=CHOOSER_ID)
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[[CHOOSER_ID, DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets,
                                  max_maz_count - trace_maz_counts)

        # trace dest_maz_alts ('zone_id' holds the candidate MAZ ids here)
        padded_maz_sizes = np.insert(trace_maz_sizes['zone_id'].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_probs'),
                         transpose=False)

    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ
                                       ]).agg(prob=('prob', 'max'),
                                              pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
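
# A minimal sketch of the pick_count "redupe" step above, using the toy rows
# from the comments (not model data): each sampled TAZ row is repeated
# pick_count times so that every repeat gets its own MAZ draw.
import pandas as pd

taz_sample = pd.DataFrame({'dest_TAZ': [18, 53], 'prob': [0.004778, 0.004224],
                           'pick_count': [1, 2]},
                          index=pd.Index([542963, 542963], name='tour_id'))

taz_choices = taz_sample[['dest_TAZ', 'prob']].reset_index(drop=False)
taz_choices = taz_choices.reindex(
    taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
print(len(taz_choices))  # 3: TAZ 18 once, TAZ 53 twice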
Example #6
def schedule_nth_trips(trips, probs_spec, model_settings, first_trip_in_leg,
                       report_failed_trips, trace_hh_id, trace_label):
    """
    We join each trip with the appropriate row in probs_spec by joining on probs_join_cols,
    which should exist in both the trips and probs_spec dataframes.

    Parameters
    ----------
    trips: pd.DataFrame
    probs_spec: pd.DataFrame
        Dataframe of probs for choice of depart times and join columns to match them with trips.
        Depart column names are irrelevant; they are position dependent:
        the time period a column represents is its position + depart_alt_base
    model_settings: dict
        DEPART_ALT_BASE: int to add to a probs column position to get the time period it represents.
        e.g. DEPART_ALT_BASE = 5 means the first column (column 0) represents 5 am
    report_failed_trips : bool
    trace_hh_id
    trace_label

    Returns
    -------
    choices: pd.Series
        time period depart choices, one per trip (except for trips with zero probs)
    """

    depart_alt_base = model_settings.get('DEPART_ALT_BASE')

    probs_cols = [c for c in probs_spec.columns if c not in PROBS_JOIN_COLUMNS]

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(),
                        probs_spec,
                        on=PROBS_JOIN_COLUMNS,
                        how='left').set_index('trip_id')
    chunk.log_df(trace_label, "choosers", choosers)

    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == len(trips.index)

    # zero out probs outside earliest-latest window
    chooser_probs = clip_probs(trips, choosers[probs_cols], model_settings)
    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if first_trip_in_leg:
        # probs should sum to 1 unless all zero
        chooser_probs = chooser_probs.div(chooser_probs.sum(axis=1),
                                          axis=0).fillna(0)

    # probs should sum to 1 with residual probs resulting in choice of 'fail'
    chooser_probs['fail'] = 1 - chooser_probs.sum(axis=1).clip(0, 1)
    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label)

    choices, rands = logit.make_choices(chooser_probs,
                                        trace_label=trace_label,
                                        trace_choosers=choosers)

    chunk.log_df(trace_label, "choices", choices)
    chunk.log_df(trace_label, "rands", rands)

    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(choices,
                         '%s.choices' % trace_label,
                         columns=[None, 'depart'])
        tracing.trace_df(rands,
                         '%s.rands' % trace_label,
                         columns=[None, 'rand'])

    # convert alt choice index to depart time (setting failed choices to -1)
    failed = (choices == chooser_probs.columns.get_loc('fail'))
    choices = (choices + depart_alt_base).where(~failed, -1)

    chunk.log_df(trace_label, "failed", failed)

    # report failed trips while we have the best diagnostic info
    if report_failed_trips and failed.any():
        report_bad_choices(bad_row_map=failed,
                           df=choosers,
                           filename='failed_choosers',
                           trace_label=trace_label,
                           trace_choosers=None)

    # trace before removing failures
    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(choices,
                         '%s.choices' % trace_label,
                         columns=[None, 'depart'])
        tracing.trace_df(rands,
                         '%s.rands' % trace_label,
                         columns=[None, 'rand'])

    # remove any failed choices
    if failed.any():
        choices = choices[~failed]

    assert (choices >= trips.earliest[~failed]).all()
    assert (choices <= trips.latest[~failed]).all()

    return choices
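
# A small sketch of the residual-probability 'fail' alternative above (toy
# probs, depart_alt_base assumed to be 5): mass removed by clipping is routed
# to an explicit 'fail' column so every row still sums to 1 for
# logit.make_choices, and fail choices are then flagged and set to -1.
import pandas as pd

chooser_probs = pd.DataFrame({'5am': [0.2, 0.0], '6am': [0.3, 0.0]})
chooser_probs['fail'] = 1 - chooser_probs.sum(axis=1).clip(0, 1)
#    5am  6am  fail
# 0  0.2  0.3   0.5
# 1  0.0  0.0   1.0

choices = pd.Series([2, 0])  # suppose row 0 chose 'fail', row 1 chose '5am'
failed = choices == chooser_probs.columns.get_loc('fail')
choices = (choices + 5).where(~failed, -1)  # alt position + depart_alt_base
print(choices.tolist())  # [-1, 5]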
Example #7
def choose_tour_leg_pattern(trip_segment,
                            patterns, spec,
                            trace_label='trace_label'):
    alternatives = generate_alternatives(trip_segment, STOP_TIME_DURATION).sort_index()
    have_trace_targets = tracing.has_trace_targets(trip_segment)

    if have_trace_targets:
        tracing.trace_df(trip_segment, tracing.extend_trace_label(trace_label, 'choosers'))
        tracing.trace_df(alternatives, tracing.extend_trace_label(trace_label, 'alternatives'),
                         transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    # - join choosers and alts
    # in vanilla interaction_simulate interaction_df is cross join of choosers and alternatives
    # interaction_df = logit.interaction_dataset(choosers, alternatives, sample_size)
    # here, alternatives is sparsely repeated once for each (non-dup) sample
    # we expect alternatives to have the same index as choosers (but with duplicate index values)
    # so we just need to left join alternatives with choosers
    assert alternatives.index.name == trip_segment.index.name

    interaction_df = alternatives.join(trip_segment, how='left', rsuffix='_chooser')

    chunk.log_df(trace_label, 'interaction_df', interaction_df)

    if have_trace_targets:
        trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, trip_segment)

        tracing.trace_df(interaction_df,
                         tracing.extend_trace_label(trace_label, 'interaction_df'),
                         transpose=False)
    else:
        trace_rows = trace_ids = None

    interaction_utilities, trace_eval_results \
        = interaction_simulate.eval_interaction_utilities(spec, interaction_df, None, trace_label, trace_rows,
                                                          estimator=None)

    interaction_utilities = pd.concat([interaction_df[STOP_TIME_DURATION], interaction_utilities], axis=1)
    chunk.log_df(trace_label, 'interaction_utilities', interaction_utilities)

    interaction_utilities = pd.merge(interaction_utilities.reset_index(),
                                     patterns[patterns[TRIP_ID].isin(interaction_utilities.index)],
                                     on=[TRIP_ID, STOP_TIME_DURATION], how='left')

    if have_trace_targets:
        tracing.trace_interaction_eval_results(trace_eval_results, trace_ids,
                                               tracing.extend_trace_label(trace_label, 'eval'))

        tracing.trace_df(interaction_utilities,
                         tracing.extend_trace_label(trace_label, 'interaction_utilities'),
                         transpose=False)

    del interaction_df
    chunk.log_df(trace_label, 'interaction_df', None)

    interaction_utilities = interaction_utilities.groupby([TOUR_ID, OUTBOUND, PATTERN_ID],
                                                          as_index=False)[['utility']].sum()

    interaction_utilities[TOUR_LEG_ID] = \
        interaction_utilities.apply(generate_tour_leg_id, axis=1)

    tour_choosers = interaction_utilities.set_index(TOUR_LEG_ID)
    interaction_utilities = tour_choosers[['utility']].copy()

    # reshape utilities (one utility column and one row per row in model_design)
    # to a dataframe with one row per chooser and one column per alternative
    # interaction_utilities is sparse because duplicate sampled alternatives were dropped
    # so we need to pad with dummy utilities so low that they are never chosen

    # number of samples per chooser
    sample_counts = interaction_utilities.groupby(interaction_utilities.index).size().values
    chunk.log_df(trace_label, 'sample_counts', sample_counts)

    # max number of alternatives for any chooser
    max_sample_count = sample_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = sample_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts)

    del sample_counts
    chunk.log_df(trace_label, 'sample_counts', None)

    # insert the zero-prob utilities to pad each alternative set to same size
    padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999)
    del inserts

    del interaction_utilities
    chunk.log_df(trace_label, 'interaction_utilities', None)

    # reshape to array with one row per chooser, one column per alternative
    padded_utilities = padded_utilities.reshape(-1, max_sample_count)
    chunk.log_df(trace_label, 'padded_utilities', padded_utilities)

    # convert to a dataframe with one row per chooser and one column per alternative
    utilities_df = pd.DataFrame(
        padded_utilities,
        index=tour_choosers.index.unique())
    chunk.log_df(trace_label, 'utilities_df', utilities_df)

    del padded_utilities
    chunk.log_df(trace_label, 'padded_utilities', None)

    if have_trace_targets:
        tracing.trace_df(utilities_df, tracing.extend_trace_label(trace_label, 'utilities'),
                         column_labels=['alternative', 'utility'])

    # convert to probabilities (utilities exponentiated and normalized to probs)
    # probs is same shape as utilities, one row per chooser and one column for alternative
    probs = logit.utils_to_probs(utilities_df,
                                 trace_label=trace_label, trace_choosers=trip_segment)

    chunk.log_df(trace_label, 'probs', probs)

    del utilities_df
    chunk.log_df(trace_label, 'utilities_df', None)

    if have_trace_targets:
        tracing.trace_df(probs, tracing.extend_trace_label(trace_label, 'probs'),
                         column_labels=['alternative', 'probability'])

    # make choices
    # positions is a series with the chosen alternative represented as a column index in probs,
    # an integer less than the number of alternatives in the alternative sample
    positions, rands = \
        logit.make_choices(probs, trace_label=trace_label, trace_choosers=trip_segment)

    chunk.log_df(trace_label, 'positions', positions)
    chunk.log_df(trace_label, 'rands', rands)

    del probs
    chunk.log_df(trace_label, 'probs', None)

    # shouldn't have chosen any of the dummy pad utilities
    assert positions.max() < max_sample_count

    # need to get from an integer offset into the alternative sample to the alternative index
    # that is, we want the index value of the row that is offset by <position> rows into the
    # tranche of this choosers alternatives created by cross join of alternatives and choosers

    # resulting pandas Series has one element per chooser row and is in same order as choosers
    choices = tour_choosers[PATTERN_ID].take(positions + first_row_offsets)

    chunk.log_df(trace_label, 'choices', choices)

    if have_trace_targets:
        tracing.trace_df(choices, tracing.extend_trace_label(trace_label, 'choices'),
                         columns=[None, PATTERN_ID])
        tracing.trace_df(rands, tracing.extend_trace_label(trace_label, 'rands'),
                         columns=[None, 'rand'])

    return choices
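
# An illustrative sketch (plain numpy stand-in for logit.utils_to_probs, toy
# utilities) of why the -999 pad utilities above are effectively never chosen:
# after exponentiation and row normalization their probability is ~0.
import numpy as np

padded_utilities = np.array([[1.0, 0.5, -999.0],
                             [0.2, -999.0, -999.0]])
exp_u = np.exp(padded_utilities)
probs = exp_u / exp_u.sum(axis=1, keepdims=True)
print(probs.round(3))
# [[0.622 0.378 0.   ]
#  [1.    0.    0.   ]]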
Example #8
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label):
    """
    extend tour counts based on a probability table

    counts can only be extended if the original count is between 1 and 4,
    and a tour type can only be extended if its count is at the max possible
    (e.g. 2 for escort, 1 otherwise), so escort might be increased to 3 or 4
    and other tour types might be increased to 2 or 3

    Parameters
    ----------
    persons: pandas dataframe
        (need this for join columns)
    tour_counts: pandas dataframe
        one row per person, one column per tour_type
    alternatives
        alternatives from nmtv interaction_simulate
        only need this to know max possible frequency for a tour type
    trace_hh_id
    trace_label

    Returns
    -------
    extended tour_counts


    tour_counts looks like this:
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0

    """

    assert tour_counts.index.name == persons.index.name

    PROBABILITY_COLUMNS = ['0_tours', '1_tours', '2_tours']
    JOIN_COLUMNS = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    TOUR_TYPE_COL = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[JOIN_COLUMNS]

    # only extend if there are 1 - 4 non_mandatory tours to start with
    extend_tour_counts = tour_counts.sum(axis=1).between(1, 4)
    if not extend_tour_counts.any():
        return tour_counts

    have_trace_targets = trace_hh_id and tracing.has_trace_targets(extend_tour_counts)

    for i, tour_type in enumerate(alternatives.columns):

        i_tour_type = i + 1  # (probs_spec nonmandatory_tour_type column is 1-based)
        tour_type_trace_label = tracing.extend_trace_label(trace_label, tour_type)

        # - only extend tour if frequency is max possible frequency for this tour type
        tour_type_is_maxed = \
            extend_tour_counts & (tour_counts[tour_type] == alternatives[tour_type].max())
        maxed_tour_count_idx = tour_counts.index[tour_type_is_maxed]

        if len(maxed_tour_count_idx) == 0:
            continue

        # - get extension probs for tour_type
        choosers = pd.merge(
            persons.loc[maxed_tour_count_idx],
            probs_spec[probs_spec[TOUR_TYPE_COL] == i_tour_type],
            on=JOIN_COLUMNS,
            how='left'
        ).set_index(maxed_tour_count_idx)
        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(
            choosers[PROBABILITY_COLUMNS],
            trace_label=tour_type_trace_label,
            trace_choosers=choosers)

        # - extend tour_count (0-based prob alternative choice equals magnitude of extension)
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if have_trace_targets:
            tracing.trace_df(choices,
                             tracing.extend_trace_label(tour_type_trace_label, 'choices'),
                             columns=[None, 'choice'])
            tracing.trace_df(rands,
                             tracing.extend_trace_label(tour_type_trace_label, 'rands'),
                             columns=[None, 'rand'])

    return tour_counts
Example #9
def make_scheduling_choices(
        choosers_df, scheduling_mode,
        probs_spec, probs_join_cols,
        depart_alt_base,
        first_trip_in_leg,
        report_failed_trips, trace_hh_id, trace_label,
        trace_choice_col_name='depart',
        clip_earliest_latest=True):
    """
    We join each chooser with the appropriate row in probs_spec by joining on probs_join_cols,
    which should exist in both the choosers and probs_spec dataframes.

    Parameters
    ----------
    choosers_df: pd.DataFrame
    scheduling_mode: str
        Either 'departure' or 'stop_duration' depending on whether the probability
        lookup table is keyed on departure period or stop duration.
    probs_spec: pd.DataFrame
        Dataframe of probs for choice of depart times and join columns to match them with choosers.
        Depart column names are irrelevant; they are position dependent:
        the time period a column represents is its position + depart_alt_base
    depart_alt_base: int
        int to add to probs column index to get time period it represents.
        e.g. depart_alt_base = 5 means first column (column 0) represents 5 am
    report_failed_trips : bool
    trace_hh_id
    trace_label

    Returns
    -------
    choices: pd.Series
        time period depart choices, one per trip (except for trips with zero probs)
    """

    choosers = pd.merge(choosers_df.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index(choosers_df.index.name)
    chunk.log_df(trace_label, "choosers", choosers)

    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

    # different pre-processing is required based on the scheduling mode
    chooser_probs = _preprocess_scheduling_probs(
        scheduling_mode, choosers_df, choosers, probs_spec,
        probs_join_cols, clip_earliest_latest, depart_alt_base, first_trip_in_leg)

    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label)

    raw_choices, rands = logit.make_choices(chooser_probs, trace_label=trace_label, trace_choosers=choosers)

    chunk.log_df(trace_label, "choices", raw_choices)
    chunk.log_df(trace_label, "rands", rands)

    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(raw_choices, '%s.choices' % trace_label, columns=[None, trace_choice_col_name])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # different post-processing is required based on the scheduling mode
    choices, failed = _postprocess_scheduling_choices(
        scheduling_mode, depart_alt_base, raw_choices, chooser_probs.columns, choosers_df)

    chunk.log_df(trace_label, "failed", failed)

    # report failed trips while we have the best diagnostic info
    if report_failed_trips and failed.any():
        _report_bad_choices(
            bad_row_map=failed,
            df=choosers,
            filename='failed_choosers',
            trace_label=trace_label,
            trace_choosers=None)

    # trace before removing failures
    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, trace_choice_col_name])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # remove any failed choices
    if failed.any():
        choices = choices[~failed]

    if all([check_col in choosers_df.columns for check_col in ['earliest', 'latest']]):
        assert (choices >= choosers_df.earliest[~failed]).all()
        assert (choices <= choosers_df.latest[~failed]).all()

    return choices
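
# A toy stand-in for the final window check above (made-up bounds, not model
# data): surviving depart choices must lie within each non-failed chooser's
# [earliest, latest] window, relying on pandas index alignment.
import pandas as pd

choosers_df = pd.DataFrame({'earliest': [5, 6], 'latest': [9, 8]})
failed = pd.Series([False, True])
choices = pd.Series([7])  # choice for the one non-failed chooser (index 0)
assert (choices >= choosers_df.earliest[~failed]).all()
assert (choices <= choosers_df.latest[~failed]).all()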