Example #1
def create_state_space_class(optim_paras, options):
    """Create the state space of the model."""
    core, indexer = _create_core_and_indexer(optim_paras, options)
    dense_grid = _create_dense_state_space_grid(optim_paras)

    # Downcast after calculations or be aware of silent integer overflows.
    core = compute_covariates(core, options["covariates_core"])
    core = core.apply(downcast_to_smallest_dtype)
    dense = _create_dense_state_space_covariates(dense_grid, optim_paras,
                                                 options)

    base_draws_sol = create_base_draws(
        (options["n_periods"], options["solution_draws"],
         len(optim_paras["choices"])),
        next(options["solution_seed_startup"]),
        options["monte_carlo_sequence"],
    )

    if dense:
        state_space = _MultiDimStateSpace(core, indexer, base_draws_sol,
                                          optim_paras, options, dense)
    else:
        state_space = _SingleDimStateSpace(core, indexer, base_draws_sol,
                                           optim_paras, options)

    return state_space
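
A minimal sketch (not library code) of the silent overflow that the downcasting comment warns about; pd.to_numeric(..., downcast="integer") stands in for downcast_to_smallest_dtype:

import numpy as np
import pandas as pd

s = pd.Series([200, 150, 100])

# Downcasting first: the values fit into int16, but their squares do not,
# so the multiplication wraps around silently.
small = pd.to_numeric(s, downcast="integer")  # dtype int16
wrapped = small**2                            # 200**2 = 40000 overflows int16

# Calculating first: the default int64 holds the result; downcast afterwards.
correct = pd.to_numeric(s**2, downcast="integer")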
Example #2
def _create_dense_period_choice(
    core, dense, core_key_to_core_indices, core_key_to_complex, optim_paras, options
):
    """Create dense period choice parts of the state space.

    We loop over all dense combinations and calculate the choice restrictions for each
    particular dense state space. This information allows us to compile a dict that
    maps a combination of period, choice_set and dense_index to a core_key.

    Note that we do not allow choice restrictions that interact between core and dense
    covariates. To support them, this function would have to be rewritten to return
    explicit state space positions instead of core indices.

    Returns
    -------
    dense_period_choice : dict
        d: (period, choice_set, dense_index) -> core_key

    """
    if not dense:
        for key, complex_ in core_key_to_complex.items():
            dump_objects(
                core.loc[core_key_to_core_indices[key]], "states", complex_, options
            )
        dense_period_choice = {k: i for i, k in core_key_to_complex.items()}
    else:
        choices = [f"_{choice}" for choice in optim_paras["choices"]]
        dense_period_choice = {}
        for dense_idx, (_, dense_vec) in enumerate(dense.items()):
            states = core.copy().assign(**dense_vec)
            states = compute_covariates(states, options["covariates_all"])

            states = create_is_inadmissible(states, optim_paras, options)
            for core_idx, indices in core_key_to_core_indices.items():
                df = states.copy().loc[indices].assign(**dense_vec)
                df[choices] = ~df[choices]
                grouper = df.groupby(choices).groups
                if len(grouper) != 1:
                    raise ValueError(
                        "Choice restrictions cannot interact between core and dense "
                        "information such that heterogeneous choice sets within a "
                        "period are created. Use penalties in the utility functions "
                        "for that."
                    )
                period_choice = {
                    (core_key_to_complex[core_idx][0], idx, dense_idx): core_idx
                    for idx in grouper
                }

                dense_period_choice.update(period_choice)
                idx = next(iter(grouper))
                dump_objects(
                    df,
                    "states",
                    (core_key_to_complex[core_idx][0], idx, dense_idx),
                    options,
                )

    return dense_period_choice
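
To make the returned mapping concrete, here is a hypothetical instance with invented keys and values, following the (period, choice_set, dense_index) -> core_key pattern from the docstring:

dense_period_choice = {
    (0, (True, True), 0): 0,   # period 0, both choices admissible, dense state 0
    (0, (True, True), 1): 0,   # the same core_key can serve several dense states
    (1, (True, False), 0): 1,  # period 1, second choice restricted
}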
Example #3
def _sample_characteristic(states_df, options, level_dict, use_keys):
    """Sample characteristic of individuals.

    The function is used to sample the values of one state space characteristic, say
    experience. The keys of ``level_dict`` are the possible starting values of
    experience. The values of the dictionary are :class:`pandas.Series` whose index are
    covariate names and the values are the parameter values.

    ``states_df`` is used to generate all possible covariates with the existing
    information.

    For each level, the dot product of parameters and covariates determines the value
    ``z``. The softmax function converts the level-specific ``z``-values to
    probabilities. The probabilities are used to sample the characteristic.

    Parameters
    ----------
    states_df : pandas.DataFrame
        Contains the state of each individual.
    options : dict
        Options of the model.
    level_dict : dict
        A dictionary where the keys are the values distributed according to the
        probability mass function. The values are a :class:`pandas.Series` with
        covariate names as the index and parameter values.
    use_keys : bool
        Indicator for whether the keys of the level dict should be used as the
        variable's values or whether numeric codes should be used instead, for
        example, to assign numbers to choices.

    Returns
    -------
    characteristic : numpy.ndarray
        Array with shape (n_individuals,) containing sampled values.

    """
    # Generate covariates.
    all_data = compute_covariates(states_df,
                                  options["covariates_all"],
                                  check_nans=True,
                                  raise_errors=False)

    # Calculate the dot product of covariates and parameters for each level.
    z = np.column_stack(
        [pandas_dot(all_data, level_dict[level]) for level in level_dict]
    )

    # Calculate probabilities with the softmax function.
    probabilities = softmax(z, axis=1)

    np.random.seed(next(options["simulation_seed_iteration"]))

    choices = level_dict if use_keys else len(level_dict)
    characteristic = _random_choice(choices, probabilities)

    return characteristic
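
The mechanics of the function, one dot product per level, a softmax across levels, then sampling, can be sketched with plain numpy. All data below (the two levels, the parameters, the covariates) is invented for illustration, and scipy.special.softmax plays the role of the softmax used above:

import numpy as np
from scipy.special import softmax

covariates = np.array([[1.0, 0.5], [1.0, 2.0]])  # two individuals, two covariates
level_params = {10: np.array([0.2, 0.1]), 12: np.array([0.0, 0.3])}

# One z-value per individual and level, stacked to (n_individuals, n_levels).
z = np.column_stack([covariates @ beta for beta in level_params.values()])
probabilities = softmax(z, axis=1)

# Sample one level per individual according to its probability row.
rng = np.random.default_rng(0)
levels = np.array(list(level_params))
characteristic = levels[[rng.choice(len(levels), p=p) for p in probabilities]]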
Example #4
def _compute_x_beta_for_type_probabilities(df, optim_paras, options):
    """Compute the vector dot product of type covariates and type coefficients."""
    for type_ in range(optim_paras["n_types"]):
        first_observations = df.copy().assign(type=type_)
        relevant_covariates = identify_necessary_covariates(
            optim_paras["type_prob"][type_].index, options["covariates_all"]
        )
        first_observations = compute_covariates(first_observations, relevant_covariates)

        labels = optim_paras["type_prob"][type_].index
        df[type_] = np.dot(
            first_observations[labels].to_numpy(dtype=COVARIATES_DOT_PRODUCT_DTYPE),
            optim_paras["type_prob"][type_],
        )

    return df[range(optim_paras["n_types"])]
Example #5
def _create_dense_state_space_covariates(dense_grid, optim_paras, options):
    if dense_grid:
        columns = create_dense_state_space_columns(optim_paras)

        df = pd.DataFrame(data=dense_grid,
                          columns=columns).set_index(columns, drop=False)

        covariates = compute_covariates(df, options["covariates_dense"])
        covariates = covariates.apply(downcast_to_smallest_dtype)
        covariates = covariates.to_dict(orient="index")
        covariates = convert_dictionary_keys_to_dense_indices(covariates)

    else:
        covariates = False

    return covariates
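
Assuming convert_dictionary_keys_to_dense_indices turns the index labels of the DataFrame into tuples (an assumption about a helper not shown here), the returned structure looks roughly like this hypothetical example with invented covariate names:

covariates = {
    (0,): {"exp_edu": 10, "is_adult": True},
    (1,): {"exp_edu": 15, "is_adult": True},
}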
Example #6
def _compute_x_beta_for_type_probabilities(df, optim_paras, options):
    """Compute the vector dot product of type covariates and type coefficients.

    For each individual, compute as many vector dot products as there are types. The
    scalars are later passed to a softmax function to compute the type probabilities,
    i.e. the probability of each individual being of a certain type.

    """
    for type_ in range(optim_paras["n_types"]):
        first_observations = df.copy().assign(type=type_)
        relevant_covariates = identify_necessary_covariates(
            optim_paras["type_prob"][type_].index, options["covariates_all"])
        first_observations = compute_covariates(first_observations,
                                                relevant_covariates)

        df[type_] = pandas_dot(first_observations,
                               optim_paras["type_prob"][type_])

    return df[range(optim_paras["n_types"])]
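
Comparing this version with Example #4 suggests that pandas_dot selects the columns named in the parameter index and forms the dot product. A minimal sketch of that computation with invented labels and values:

import pandas as pd

params = pd.Series({"constant": 0.5, "has_degree": -0.2})
first_observations = pd.DataFrame({"constant": [1, 1], "has_degree": [1, 0]})

# Equivalent to the explicit numpy version in Example #4.
x_beta = first_observations[params.index].to_numpy() @ params.to_numpy()
# array([0.3, 0.5])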
Example #7
def create_state_space_class(optim_paras, options):
    """Create the state space of the model."""
    prepare_cache_directory(options)
    core = _create_core_state_space(optim_paras, options)
    dense_grid = _create_dense_state_space_grid(optim_paras)

    # Downcast after calculations or be aware of silent integer overflows.
    core = compute_covariates(core, options["covariates_core"])
    core = core.apply(downcast_to_smallest_dtype)
    dense = _create_dense_state_space_covariates(dense_grid, optim_paras,
                                                 options)

    core_period_choice = _create_core_period_choice(core, optim_paras, options)

    core_key_to_complex = dict(enumerate(core_period_choice))
    core_key_to_core_indices = {
        i: core_period_choice[complex_]
        for i, complex_ in core_key_to_complex.items()
    }

    indexer = _create_indexer(core, core_key_to_core_indices, optim_paras)

    dense_period_choice = _create_dense_period_choice(
        core, dense, core_key_to_core_indices, core_key_to_complex,
        optim_paras, options)

    state_space = StateSpace(
        core,
        indexer,
        dense,
        dense_period_choice,
        core_key_to_complex,
        core_key_to_core_indices,
        optim_paras,
        options,
    )

    return state_space
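
Since core_key_to_complex is built with enumerate, core keys are consecutive integers, each standing for one (period, choice_set) combination. A hypothetical instance of the two lookups with invented values:

core_key_to_complex = {0: (0, (True, True)), 1: (1, (True, False))}
core_key_to_core_indices = {0: [0, 1, 2], 1: [3, 4]}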
Example #8
def states(self):
    """Combine the core state space with this object's dense covariates."""
    states = self.core.copy().assign(**self.dense_covariates)
    states = compute_covariates(states, self.mixed_covariates)
    return states
Example #9
def _process_estimation_data(df, state_space, optim_paras, options):
    """Process estimation data.

    All necessary objects for :func:`_internal_log_like_obs` dependent on the data are
    produced.

    Some objects have to be repeated for each type which is a desirable format for the
    estimation where every observations is weighted by type probabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame which contains the data used for estimation. The DataFrame
        contains individual identifiers, periods, experiences, lagged choices, choices
        in current period, the wage and other observed data.
    indexer : numpy.ndarray
        Indexer for the core state space.
    optim_paras : dict
    options : dict

    Returns
    -------
    choices : numpy.ndarray
        Array with shape (n_observations, n_types) where information is only repeated
        over the second axis.
    idx_indiv_first_obs : numpy.ndarray
        Array with shape (n_individuals,) containing indices for the first observations
        of each individual.
    indices : numpy.ndarray
        Array with shape (n_observations, n_types) containing indices for states which
        correspond to observations.
    log_wages_observed : numpy.ndarray
        Array with shape (n_observations, n_types) containing clipped log wages.
    type_covariates : numpy.ndarray
        Array with shape (n_individuals, n_type_covariates) containing covariates to
        predict probabilities for each type.

    """
    col_dtype = generate_column_dtype_dict_for_estimation(optim_paras)

    df = (
        df.sort_index()[list(col_dtype)[2:]]
        .rename(columns=rename_labels_to_internal)
        .rename_axis(index=rename_labels_to_internal)
    )
    df = convert_labeled_variables_to_codes(df, optim_paras)

    # Get the indices of the states in the state space which correspond to all
    # observations. The resulting array has shape (n_observations,).
    n_periods = int(df.index.get_level_values("period").max() + 1)
    indices = []
    core_columns = create_core_state_space_columns(optim_paras)

    for period in range(n_periods):
        period_df = df.query("period == @period")
        period_core = tuple(period_df[col].to_numpy() for col in core_columns)
        period_indices = state_space.indexer[period][period_core]
        indices.append(period_indices)

    indices = np.concatenate(indices)

    # The indices are sorted in period-individual pairs whereas the estimation needs
    # individual-period pairs. Reorder them.
    indices_to_reorder = (
        df.sort_values(["period", "identifier"])
        .assign(__index__=np.arange(df.shape[0]))
        .sort_values(["identifier", "period"])["__index__"]
        .to_numpy()
    )
    df["index"] = indices[indices_to_reorder]

    # Add indices of child states to the DataFrame.
    children = pd.DataFrame(
        data=state_space.indices_of_child_states[df["index"].to_numpy()],
        index=df.index,
        columns=[f"child_index_{c}" for c in optim_paras["choices"]],
    )
    df = pd.concat([df, children], axis="columns")

    # For the estimation, log wages are needed with shape (n_observations, n_types).
    df["log_wage"] = np.log(np.clip(df.wage.to_numpy(), 1 / MAX_FLOAT, MAX_FLOAT))
    df = df.drop(columns="wage")

    # For the type covariates, we only need the first observation of each individual.
    if optim_paras["n_types"] >= 2:
        initial_states = df.query("period == 0").copy()
        type_covariates = compute_covariates(
            initial_states, options["covariates_core"], raise_errors=False
        )
        type_covariates = type_covariates.apply(downcast_to_smallest_dtype)
    else:
        type_covariates = None

    return df, type_covariates
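
The double sort that produces indices_to_reorder is the subtle step: values computed in period-individual order are mapped back to the DataFrame's individual-period order by tracking positional indices through two sorts. A toy sketch with invented data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"identifier": [0, 0, 1, 1], "period": [0, 1, 0, 1]})
# Values computed while looping over periods, i.e. in (period, identifier) order.
values_in_period_order = np.array([10, 30, 20, 40])

order = (
    df.sort_values(["period", "identifier"])
    .assign(__index__=np.arange(len(df)))
    .sort_values(["identifier", "period"])["__index__"]
    .to_numpy()
)
df["value"] = values_in_period_order[order]  # [10, 20, 30, 40]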
Example #10
def _process_estimation_data(df, state_space, optim_paras, options):
    """Process estimation data.

    All necessary objects for :func:`_internal_log_like_obs` dependent on the data are
    produced.

    Some objects have to be repeated for each type which is a desirable format for the
    estimation where every observations is weighted by type probabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame which contains the data used for estimation. The DataFrame
        contains individual identifiers, periods, experiences, lagged choices, choices
        in current period, the wage and other observed data.
    indexer : numpy.ndarray
        Indexer for the core state space.
    optim_paras : dict
    options : dict

    Returns
    -------
    choices : numpy.ndarray
        Array with shape (n_observations, n_types) where information is only repeated
        over the second axis.
    idx_indiv_first_obs : numpy.ndarray
        Array with shape (n_individuals,) containing indices for the first observations
        of each individual.
    indices : numpy.ndarray
        Array with shape (n_observations, n_types) containing indices for states which
        correspond to observations.
    log_wages_observed : numpy.ndarray
        Array with shape (n_observations, n_types) containing clipped log wages.
    type_covariates : numpy.ndarray
        Array with shape (n_individuals, n_type_covariates) containing covariates to
        predict probabilities for each type.

    """
    n_types = optim_paras["n_types"]
    col_dtype = generate_column_dtype_dict_for_estimation(optim_paras)

    df = (
        df.sort_index()[list(col_dtype)[2:]]
        .rename(columns=rename_labels_to_internal)
        .rename_axis(index=rename_labels_to_internal)
    )
    df = convert_labeled_variables_to_codes(df, optim_paras)

    # Duplicate observations for each type.
    if n_types >= 2:
        df = pd.concat([df.copy().assign(type=i) for i in range(n_types)])

    df["dense_key"], df["core_index"] = map_observations_to_states(
        df, state_space, optim_paras)

    # For the estimation, log wages are needed with shape (n_observations, n_types).
    df["log_wage"] = np.log(
        np.clip(df.wage.to_numpy(), 1 / MAX_FLOAT, MAX_FLOAT))
    df = df.drop(columns="wage")

    # For the type covariates, we only need the first observation of each individual.
    if n_types >= 2:
        initial_states = df.query("period == 0").copy()
        type_covariates = compute_covariates(initial_states,
                                             options["covariates_core"],
                                             raise_errors=False)
        type_covariates = type_covariates.apply(downcast_to_smallest_dtype)
    else:
        type_covariates = None

    return df, type_covariates
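
The duplication step at the top of the function can be seen in isolation in this toy example with invented data: every observation is repeated once per type so that the likelihood can later weight each copy by its type probability.

import pandas as pd

df = pd.DataFrame({"identifier": [0, 1], "wage": [9.5, 12.0]})
n_types = 2

stacked = pd.concat([df.copy().assign(type=i) for i in range(n_types)])
# Four rows: each individual appears once with type 0 and once with type 1.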