Exemplo n.º 1
0
def make_choices(probs, trace_label=None, trace_choosers=None):
    """
    Make choices for each chooser from among a set of alternatives.

    Parameters
    ----------
    probs : pandas.DataFrame
        Rows for choosers and columns for the alternatives from which they
        are choosing. Values are expected to be valid probabilities across
        each row, e.g. they should sum to 1.

    trace_label : str, optional
        Label used to tag tracing/reporting output.

    trace_choosers : pandas.DataFrame, optional
        the choosers df (for interaction_simulate) to facilitate the reporting of hh_id
        by report_bad_choices because it can't deduce hh_id from the interaction_dataset
        which is indexed on index values from alternatives df

    Returns
    -------
    choices : pandas.Series
        Maps chooser IDs (from `probs` index) to a choice, where the choice
        is an index into the columns of `probs`.

    rands : pandas.Series
        The random numbers used to make the choices (for debugging, tracing)

    """
    trace_label = tracing.extend_trace_label(trace_label, 'make_choices')

    # probs should sum to 1 across each row (within tolerance)
    # scalar comparison broadcasts; no need to materialize np.ones arrays
    BAD_PROB_THRESHOLD = 0.001
    bad_probs = (probs.sum(axis=1) - 1.0).abs() > BAD_PROB_THRESHOLD

    if bad_probs.any():

        report_bad_choices(bad_probs,
                           probs,
                           tracing.extend_trace_label(trace_label,
                                                      'bad_probs'),
                           msg="probabilities do not add up to 1",
                           trace_choosers=trace_choosers)

    rands = pipeline.get_rn_generator().random_for_df(probs)

    # FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # pandas 1.0 - .values is the drop-in replacement here
    probs_arr = probs.values.cumsum(axis=1) - rands

    # index of the first column whose cumulative probability exceeds the rand
    choices = np.argmax(probs_arr > 0.0, axis=1)

    choices = pd.Series(choices, index=probs.index)

    rands = pd.Series(np.asanyarray(rands).flatten(), index=probs.index)

    return choices, rands
Exemplo n.º 2
0
def interaction_dataset(choosers, alternatives, sample_size=None):
    """
    Combine choosers and alternatives into one table for the purposes
    of creating interaction variables and/or sampling alternatives.

    Parameters
    ----------
    choosers : pandas.DataFrame
    alternatives : pandas.DataFrame
    sample_size : int, optional
        If sampling from alternatives for each chooser, this is
        how many to sample.

    Returns
    -------
    interacted : pandas.DataFrame
        Merged choosers and alternatives with data repeated either
        len(alternatives) or `sample_size` times.

    """
    # both indexes must be unique for the positional sampling and the
    # left_on/right_index merge below to be well-defined
    if not choosers.index.is_unique:
        raise RuntimeError("ERROR: choosers index is not unique, "
                           "sample will not work correctly")
    if not alternatives.index.is_unique:
        raise RuntimeError("ERROR: alternatives index is not unique, "
                           "sample will not work correctly")

    numchoosers = len(choosers)
    numalts = len(alternatives)
    sample_size = sample_size or numalts

    # FIXME - is this faster or just dumb?
    alts_idx = np.arange(numalts)

    if sample_size < numalts:
        # sample without replacement per chooser, via the pipeline's
        # reproducible random-number channel
        sample = pipeline.get_rn_generator().choice_for_df(choosers,
                                                           alts_idx,
                                                           sample_size,
                                                           replace=False)
    else:
        # no sampling: every chooser sees the full alternative set
        sample = np.tile(alts_idx, numchoosers)

    # FIX: copy before adding a column - take() may return a view-like
    # result and chained assignment on it is unsafe (SettingWithCopy);
    # the other interaction_dataset implementation in this file already
    # does this
    alts_sample = alternatives.take(sample).copy()
    alts_sample['chooser_idx'] = np.repeat(choosers.index.values, sample_size)

    # duplicate chooser columns get an '_r' suffix on the alternatives side
    alts_sample = pd.merge(alts_sample,
                           choosers,
                           left_on='chooser_idx',
                           right_index=True,
                           suffixes=('', '_r'))

    return alts_sample
Exemplo n.º 3
0
def interaction_dataset(choosers, alternatives, sample_size=None):
    """
    Combine choosers and alternatives into one table for the purposes
    of creating interaction variables and/or sampling alternatives.

    Any duplicate column names in alternatives table will be renamed with an '_r' suffix.
    (e.g. TAZ field in alternatives will appear as TAZ_r so that it can be targeted in a skim)

    Parameters
    ----------
    choosers : pandas.DataFrame
    alternatives : pandas.DataFrame
    sample_size : int, optional
        If sampling from alternatives for each chooser, this is
        how many to sample.

    Returns
    -------
    alts_sample : pandas.DataFrame
        Merged choosers and alternatives with data repeated either
        len(alternatives) or `sample_size` times.

    """
    # unique indexes are required both for positional sampling and for the
    # chooser_idx join (manual or pd.merge) below
    if not choosers.index.is_unique:
        raise RuntimeError("ERROR: choosers index is not unique, "
                           "sample will not work correctly")
    if not alternatives.index.is_unique:
        raise RuntimeError("ERROR: alternatives index is not unique, "
                           "sample will not work correctly")

    numchoosers = len(choosers)
    numalts = len(alternatives)
    # default: cross-join every chooser with every alternative
    sample_size = sample_size or numalts

    # FIXME - is this faster or just dumb?
    alts_idx = np.arange(numalts)

    if sample_size < numalts:
        # sample alternative positions without replacement per chooser,
        # via the pipeline's reproducible random-number channel
        sample = pipeline.get_rn_generator().choice_for_df(choosers,
                                                           alts_idx,
                                                           sample_size,
                                                           replace=False)
    else:
        # no sampling: repeat the full alternative set once per chooser
        sample = np.tile(alts_idx, numchoosers)

    # copy so the column assignment below doesn't hit a view (SettingWithCopy)
    alts_sample = alternatives.take(sample).copy()
    # chooser_idx maps each repeated alternative row back to its chooser
    alts_sample['chooser_idx'] = np.repeat(choosers.index.values, sample_size)

    logger.debug(
        "interaction_dataset pre-merge choosers %s alternatives %s alts_sample %s"
        % (choosers.shape, alternatives.shape, alts_sample.shape))

    # manual column-by-column join instead of pd.merge, to avoid the pandas
    # overflow bug documented in the else branch below
    AVOID_PD_MERGE = True
    if AVOID_PD_MERGE:

        # replicate each chooser column across its sample_size alternative
        # rows; rename with '_r' suffix on collision (mirrors pd.merge's
        # suffixes=('', '_r') behavior)
        for c in choosers.columns:
            c_alts = ('%s_r' % c) if c in alts_sample.columns else c
            alts_sample[c_alts] = np.repeat(choosers[c].values, sample_size)

    else:

        # FIXME - merge throws error trying to merge df with two many rows - may be a pandas bug?
        # this sets limits to max chunk size  - might work to merge in chunks and join
        # no pressing as there is currently no obvious performance gain to larger chunk size
        # DEBUG - merge choosers (564016, 4) alternatives (1443, 16) alts_sample (813875088, 17)
        #
        #   File "..\pandas\core\internals.py", line 5573, in is_na
        #     for i in range(0, total_len, chunk_len):
        # OverflowError: Python int too large to convert to C long

        alts_sample = pd.merge(alts_sample,
                               choosers,
                               left_on='chooser_idx',
                               right_index=True,
                               suffixes=('', '_r'))

    logger.debug("interaction_dataset merged alts_sample %s" %
                 (alts_sample.shape, ))

    return alts_sample
def make_sample_choices(
        choosers, probs, interaction_utilities,
        sample_size, alternative_count, alt_col_name,
        trace_label):
    """
    Make `sample_size` choices for each chooser from its sampled alternatives.

    Parameters
    ----------
    choosers : pandas.DataFrame
        one row per chooser; its index labels the output rows
    probs : pandas DataFrame
        one row per chooser and one column per alternative
    interaction_utilities : pandas.DataFrame
        dataframe with len(choosers) * alternative_count rows and one utility
        column; its index supplies the alternative ids returned in the result
    sample_size : int
        number of samples/choices to make per chooser
    alternative_count : int
        number of alternatives per chooser (columns of `probs`)
    alt_col_name : str
        name of the chosen-alternative column in the result
    trace_label : str
        label for tracing/reporting output

    Returns
    -------
    choices_df : pandas.DataFrame
        one row per (chooser, sample) with columns [alt_col_name, 'rand',
        'prob', choosers.index.name]
    """

    assert isinstance(probs, pd.DataFrame)
    assert probs.shape == (len(choosers), alternative_count)

    assert isinstance(interaction_utilities, pd.DataFrame)
    assert interaction_utilities.shape == (len(choosers)*alternative_count, 1)

    t0 = tracing.print_elapsed_time()

    # probs should sum to 1 across each row (within tolerance);
    # scalar comparison broadcasts, no np.ones arrays needed
    BAD_PROB_THRESHOLD = 0.001
    bad_probs = (probs.sum(axis=1) - 1.0).abs() > BAD_PROB_THRESHOLD

    if bad_probs.any():
        # NOTE(review): the doubled attribute chain
        # logit.report_bad_choices.report_bad_choices is suspicious -
        # elsewhere in this file report_bad_choices is called directly;
        # verify against the logit module - kept as-is here
        logit.report_bad_choices.report_bad_choices(
            bad_probs, probs,
            tracing.extend_trace_label(trace_label, 'bad_probs'),
            msg="probabilities do not add up to 1",
            trace_choosers=choosers)

    t0 = tracing.print_elapsed_time("make_choices bad_probs", t0, debug=True)

    # FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # pandas 1.0 - .values is the drop-in replacement
    cum_probs_arr = probs.values.cumsum(axis=1)
    t0 = tracing.print_elapsed_time("make_choices cum_probs_arr", t0, debug=True)

    # alt probs in convenient layout to return prob of chose alternative
    # (same layout as cum_probs_arr and interaction_utilities)
    alt_probs_array = probs.values.flatten()

    # get sample_size rands for each chooser
    # transform as we iterate over alternatives
    # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr
    # i.e rands[i] is a 2-D array of one alt choice rand for each chooser
    rands = pipeline.get_rn_generator().random_for_df(probs, n=sample_size)
    rands = rands.T.reshape(sample_size, -1, 1)
    t0 = tracing.print_elapsed_time("make_choices random_for_df", t0, debug=True)

    # the alternative value chosen
    # FIX: allocate with dtype directly instead of np.empty(...).astype(int),
    # which allocated a float array and then copied it
    choices_array = np.empty([sample_size, len(choosers)], dtype=int)

    # the probability of the chosen alternative
    choice_probs_array = np.empty([sample_size, len(choosers)])

    # FIXME - do this all at once rather than iterate?
    for i in range(sample_size):

        # FIXME - do this in numpy, not pandas?

        # rands for this alt in broadcastable shape
        r = rands[i]

        # position of first occurrence of positive value
        positions = np.argmax(cum_probs_arr > r, axis=1)

        # FIXME - leave positions as numpy array, not pandas series?

        # positions is series with the chosen alternative represented as a column index in probs
        # which is an integer between zero and num alternatives in the alternative sample
        positions = pd.Series(positions, index=probs.index)

        # need to get from an integer offset into the alternative sample to the alternative index
        # that is, we want the index value of the row that is offset by <position> rows into the
        # tranche of this choosers alternatives created by cross join of alternatives and choosers

        # offsets is the offset into model_design df of first row of chooser alternatives
        offsets = np.arange(len(positions)) * alternative_count

        # resulting pandas Int64Index has one element per chooser and is in same order as choosers
        choices_array[i] = interaction_utilities.index.take(positions + offsets)

        choice_probs_array[i] = np.take(alt_probs_array, positions + offsets)

    # explode to one row per chooser.index, alt_TAZ
    # order='F' interleaves so all samples for a chooser are adjacent,
    # matching the np.repeat of the chooser index
    choices_df = pd.DataFrame(
        {alt_col_name: choices_array.flatten(order='F'),
         'rand': rands.flatten(order='F'),
         'prob': choice_probs_array.flatten(order='F'),
         choosers.index.name: np.repeat(np.asanyarray(choosers.index), sample_size)
         })

    return choices_df