def make_choices(probs, trace_label=None, trace_choosers=None):
    """
    Make choices for each chooser from among a set of alternatives.

    Parameters
    ----------
    probs : pandas.DataFrame
        Rows for choosers and columns for the alternatives from which they
        are choosing. Values are expected to be valid probabilities across
        each row, e.g. they should sum to 1.
    trace_label : str, optional
        Label used to tag tracing output.
    trace_choosers : pandas.DataFrame, optional
        the choosers df (for interaction_simulate) to facilitate the reporting of hh_id
        by report_bad_choices because it can't deduce hh_id from the interaction_dataset
        which is indexed on index values from alternatives df

    Returns
    -------
    choices : pandas.Series
        Maps chooser IDs (from `probs` index) to a choice, where the choice
        is an index into the columns of `probs`.
    rands : pandas.Series
        The random numbers used to make the choices (for debugging, tracing)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'make_choices')

    # probs should sum to 1 across each row
    BAD_PROB_THRESHOLD = 0.001
    # equivalent to the original |row_sum - 1| > threshold test, without
    # building two throwaway np.ones arrays
    bad_probs = (probs.sum(axis=1) - 1.0).abs() > BAD_PROB_THRESHOLD

    if bad_probs.any():
        report_bad_choices(bad_probs, probs,
                           tracing.extend_trace_label(trace_label, 'bad_probs'),
                           msg="probabilities do not add up to 1",
                           trace_choosers=trace_choosers)

    rands = pipeline.get_rn_generator().random_for_df(probs)

    # FIX: DataFrame.as_matrix() was deprecated and removed in pandas 1.0;
    # .values is the drop-in replacement
    probs_arr = probs.values.cumsum(axis=1) - rands

    # index of first alternative whose cumulative probability exceeds the rand
    choices = np.argmax(probs_arr > 0.0, axis=1)

    choices = pd.Series(choices, index=probs.index)

    rands = pd.Series(np.asanyarray(rands).flatten(), index=probs.index)

    return choices, rands
def interaction_dataset(choosers, alternatives, sample_size=None):
    """
    Combine choosers and alternatives into one table for the purposes of
    creating interaction variables and/or sampling alternatives.

    Parameters
    ----------
    choosers : pandas.DataFrame
    alternatives : pandas.DataFrame
    sample_size : int, optional
        If sampling from alternatives for each chooser, this is
        how many to sample.

    Returns
    -------
    interacted : pandas.DataFrame
        Merged choosers and alternatives with data repeated either
        len(alternatives) or `sample_size` times.

    Raises
    ------
    RuntimeError
        If either input index is not unique (sampling would be incorrect).
    """

    if not choosers.index.is_unique:
        raise RuntimeError("ERROR: choosers index is not unique, "
                           "sample will not work correctly")
    if not alternatives.index.is_unique:
        raise RuntimeError("ERROR: alternatives index is not unique, "
                           "sample will not work correctly")

    numchoosers = len(choosers)
    numalts = len(alternatives)
    sample_size = sample_size or numalts

    # FIXME - is this faster or just dumb?
    alts_idx = np.arange(numalts)

    if sample_size < numalts:
        sample = pipeline.get_rn_generator().choice_for_df(choosers,
                                                           alts_idx, sample_size, replace=False)
    else:
        sample = np.tile(alts_idx, numchoosers)

    # FIX: copy the take() result so the column assignment below doesn't
    # mutate a view (avoids SettingWithCopyWarning / chained assignment)
    alts_sample = alternatives.take(sample).copy()
    alts_sample['chooser_idx'] = np.repeat(choosers.index.values, sample_size)

    # choosers' columns colliding with alternatives' get the '_r' suffix
    alts_sample = pd.merge(
        alts_sample, choosers,
        left_on='chooser_idx', right_index=True,
        suffixes=('', '_r'))

    return alts_sample
def interaction_dataset(choosers, alternatives, sample_size=None):
    """
    Combine choosers and alternatives into one table for the purposes of
    creating interaction variables and/or sampling alternatives.

    Any chooser column whose name duplicates a column in the alternatives
    table will be renamed with an '_r' suffix: the alternatives' values keep
    the original name and the choosers' values appear with the suffix
    (e.g. a TAZ field present in both appears as TAZ and TAZ_r so each
    can be targeted in a skim).

    Parameters
    ----------
    choosers : pandas.DataFrame
    alternatives : pandas.DataFrame
    sample_size : int, optional
        If sampling from alternatives for each chooser, this is
        how many to sample.

    Returns
    -------
    alts_sample : pandas.DataFrame
        Merged choosers and alternatives with data repeated either
        len(alternatives) or `sample_size` times.

    Raises
    ------
    RuntimeError
        If either input index is not unique (sampling would be incorrect).
    """

    if not choosers.index.is_unique:
        raise RuntimeError("ERROR: choosers index is not unique, "
                           "sample will not work correctly")
    if not alternatives.index.is_unique:
        raise RuntimeError("ERROR: alternatives index is not unique, "
                           "sample will not work correctly")

    numchoosers = len(choosers)
    numalts = len(alternatives)
    sample_size = sample_size or numalts

    # FIXME - is this faster or just dumb?
    alts_idx = np.arange(numalts)

    if sample_size < numalts:
        sample = pipeline.get_rn_generator().choice_for_df(choosers,
                                                           alts_idx, sample_size, replace=False)
    else:
        sample = np.tile(alts_idx, numchoosers)

    # copy so the column assignments below don't mutate a view of alternatives
    alts_sample = alternatives.take(sample).copy()
    alts_sample['chooser_idx'] = np.repeat(choosers.index.values, sample_size)

    logger.debug(
        "interaction_dataset pre-merge choosers %s alternatives %s alts_sample %s" %
        (choosers.shape, alternatives.shape, alts_sample.shape))

    # FIX: removed the constant-condition AVOID_PD_MERGE flag and its dead
    # pd.merge branch; we always broadcast chooser columns by hand because
    # pd.merge throws trying to merge a df with too many rows - may be a pandas bug?
    # this sets limits to max chunk size - might work to merge in chunks and join
    # no pressing as there is currently no obvious performance gain to larger chunk size
    # DEBUG - merge choosers (564016, 4) alternatives (1443, 16) alts_sample (813875088, 17)
    #   File "..\pandas\core\internals.py", line 5573, in is_na
    #     for i in range(0, total_len, chunk_len):
    #   OverflowError: Python int too large to convert to C long
    # the np.repeat works because alts_sample rows are already grouped by chooser
    for c in choosers.columns:
        c_alts = ('%s_r' % c) if c in alts_sample.columns else c
        alts_sample[c_alts] = np.repeat(choosers[c].values, sample_size)

    logger.debug("interaction_dataset merged alts_sample %s" % (alts_sample.shape, ))

    return alts_sample
def make_sample_choices(
        choosers, probs, interaction_utilities,
        sample_size, alternative_count, alt_col_name,
        trace_label):
    """
    Make `sample_size` choices for each chooser from among its alternatives.

    Parameters
    ----------
    choosers : pandas.DataFrame
        one row per chooser
    probs : pandas DataFrame
        one row per chooser and one column per alternative
    interaction_utilities : pandas.DataFrame
        dataframe with len(interaction_df) rows and one utility column
    sample_size : int
        number of samples/choices to make
    alternative_count : int
        number of alternatives per chooser (number of columns of `probs`)
    alt_col_name : str
        name to give the chosen-alternative column in the result
    trace_label : str
        label used to tag tracing output

    Returns
    -------
    choices_df : pandas.DataFrame
        one row per (chooser, sample) with columns: `alt_col_name` (the
        chosen alternative), 'rand', 'prob', and the choosers' index name.
    """

    assert isinstance(probs, pd.DataFrame)
    assert probs.shape == (len(choosers), alternative_count)

    assert isinstance(interaction_utilities, pd.DataFrame)
    assert interaction_utilities.shape == (len(choosers)*alternative_count, 1)

    t0 = tracing.print_elapsed_time()

    # probs should sum to 1 across each row
    BAD_PROB_THRESHOLD = 0.001
    bad_probs = (probs.sum(axis=1) - 1.0).abs() > BAD_PROB_THRESHOLD

    if bad_probs.any():
        # FIX: report_bad_choices is a function in the logit module; the original
        # logit.report_bad_choices.report_bad_choices(...) accessed an attribute
        # of the function itself and would raise AttributeError
        logit.report_bad_choices(
            bad_probs, probs,
            tracing.extend_trace_label(trace_label, 'bad_probs'),
            msg="probabilities do not add up to 1",
            trace_choosers=choosers)

    t0 = tracing.print_elapsed_time("make_choices bad_probs", t0, debug=True)

    # FIX: DataFrame.as_matrix() was deprecated and removed in pandas 1.0; use .values
    cum_probs_arr = probs.values.cumsum(axis=1)
    t0 = tracing.print_elapsed_time("make_choices cum_probs_arr", t0, debug=True)

    # alt probs in convenient layout to return prob of chose alternative
    # (same layout as cum_probs_arr and interaction_utilities)
    alt_probs_array = probs.values.flatten()

    # get sample_size rands for each chooser
    # transform as we iterate over alternatives
    # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr
    # i.e rands[i] is a 2-D array of one alt choice rand for each chooser
    rands = pipeline.get_rn_generator().random_for_df(probs, n=sample_size)
    rands = rands.T.reshape(sample_size, -1, 1)
    t0 = tracing.print_elapsed_time("make_choices random_for_df", t0, debug=True)

    # the alternative value chosen
    # FIX: allocate with the target dtype up front instead of empty().astype(int),
    # which allocated the array twice
    choices_array = np.empty([sample_size, len(choosers)], dtype=int)

    # the probability of the chosen alternative
    choice_probs_array = np.empty([sample_size, len(choosers)])

    # FIXME - do this all at once rather than iterate?
    for i in range(sample_size):

        # FIXME - do this in numpy, not pandas?

        # rands for this alt in broadcastable shape
        r = rands[i]

        # position of first occurrence of positive value
        positions = np.argmax(cum_probs_arr > r, axis=1)

        # FIXME - leave positions as numpy array, not pandas series?
        # positions is series with the chosen alternative represented as a column index in probs
        # which is an integer between zero and num alternatives in the alternative sample
        positions = pd.Series(positions, index=probs.index)

        # need to get from an integer offset into the alternative sample to the alternative index
        # that is, we want the index value of the row that is offset by <position> rows into the
        # tranche of this choosers alternatives created by cross join of alternatives and choosers

        # offsets is the offset into model_design df of first row of chooser alternatives
        offsets = np.arange(len(positions)) * alternative_count

        # resulting pandas Int64Index has one element per chooser and is in same order as choosers
        choices_array[i] = interaction_utilities.index.take(positions + offsets)

        choice_probs_array[i] = np.take(alt_probs_array, positions + offsets)

    # explode to one row per chooser.index, alt_TAZ
    choices_df = pd.DataFrame(
        {alt_col_name: choices_array.flatten(order='F'),
         'rand': rands.flatten(order='F'),
         'prob': choice_probs_array.flatten(order='F'),
         choosers.index.name: np.repeat(np.asanyarray(choosers.index), sample_size)
         })

    return choices_df