예제 #1
0
파일: formulation.py 프로젝트: yk0817/pyblp
    def _build_ids(self, data: Mapping) -> Array:
        """Build the matrix of IDs to be absorbed from a mapping of variable names to arrays.

        One column is produced per absorbed term. Raises a patsy.PatsyError if data for an absorbed variable cannot be
        loaded or if any factor of an absorbed term is not categorical.
        """

        # load each absorbed variable as a flat array, pointing load failures at the whole absorb formula
        flattened: Data = {}
        for name in self._absorbed_names:
            try:
                flattened[name] = np.asarray(data[name]).flatten()
            except Exception as error:
                assert self._absorb is not None
                location = patsy.origin.Origin(self._absorb, 0, len(self._absorb))
                raise patsy.PatsyError(f"Failed to load data for '{name}'.", location) from error

        # each absorbed term contributes a single column that combines the arrays behind its factors
        columns: List[Array] = []
        for term in self._absorbed_terms:
            design = design_matrix([term], flattened)
            parts: List[Array] = []
            for factor, info in design.factor_infos.items():
                if info.type == 'categorical':
                    parts.append(flattened[parse_expression(factor.name()).name])
                    continue
                raise patsy.PatsyError("Only categorical variables can be absorbed.", factor.origin)
            columns.append(interact_ids(*parts))

        return np.column_stack(columns)
예제 #2
0
    def _build_ids(self, data):
        """Convert a mapping from variable names to arrays into the designed matrix of IDs to be absorbed.

        Each absorbed term contributes one object-typed column. A term with a single factor stores that factor's values
        directly, and a term with multiple factors stores row-wise tuples of its factors' values.

        Raises a patsy.PatsyError if data for an absorbed variable cannot be loaded or if any factor of an absorbed
        term is not categorical.
        """

        # normalize the data
        data_mapping = {}
        for name in self._absorbed_names:
            try:
                data_mapping[name] = np.asarray(data[name])
            except Exception as exception:
                origin = patsy.origin.Origin(self._absorb, 0, len(self._absorb))
                raise patsy.PatsyError(f"Failed to load data for '{name}'.", origin) from exception

        # build columns of absorbed IDs
        ids_columns = []
        for term in self._absorbed_terms:
            factor_columns = []
            term_design = design_matrix([term], data_mapping)
            for factor, info in term_design.factor_infos.items():
                if info.type != 'categorical':
                    raise patsy.PatsyError("Only categorical variables can be absorbed.", factor.origin)
                symbol = parse_expression(factor.name())
                factor_columns.append(data_mapping[symbol.name])

            # store interactions as tuples (the builtin object type is used because the np.object alias was deprecated
            #   and later removed from NumPy)
            column = factor_columns[0].astype(object)
            if len(factor_columns) > 1:
                column[:] = list(zip(*factor_columns))
            ids_columns.append(column)

        # build the matrix of IDs
        return np.column_stack(ids_columns)
예제 #3
0
def parse_terms(formula: str) -> List[patsy.desc.Term]:
    """Return the right-hand side patsy terms of a formula string, rejecting any formula with a left-hand side."""
    model = patsy.highlevel.ModelDesc.from_formula(formula)
    if not model.lhs_termlist:
        return model.rhs_termlist

    # point the error at everything up to and including the tilde, or at the whole formula if there is no tilde
    tilde = formula.find('~')
    stop = len(formula) if tilde < 0 else tilde + 1
    raise patsy.PatsyError("Formulas should not have left-hand sides.", patsy.origin.Origin(formula, 0, stop))
예제 #4
0
def parse_term_expression(term: patsy.desc.Term) -> sp.Expr:
    """Return the product of the SymPy expressions parsed from the factors of a patsy term.

    A term without factors gives the integer one. Any failure while parsing or multiplying a factor is re-raised as a
    patsy.PatsyError that points at that factor's origin.
    """
    product = sp.Integer(1)
    for factor in term.factors:
        try:
            product = product * parse_expression(factor.name())
        except Exception as error:
            raise patsy.PatsyError("Failed to parse a term.", factor.origin) from error
    return product
예제 #5
0
    def __init__(
            self, formula: str, absorb: Optional[str] = None,
            absorb_method: Optional[Union[str, Iteration]] = None) -> None:
        """Parse the formula into patsy terms and SymPy expressions. In the process, validate it as much as possible
        without any data.

        The formula is a right-hand side patsy formula. The optional absorb formula is parsed in the same way with its
        intercept removed. If there are any absorbed terms, intercepts are dropped from the formula's terms.

        A TypeError is raised if formula or absorb is not a string, or if absorb_method is not None, 'simple',
        'memory', 'speed', or an Iteration instance. A patsy.PatsyError is raised if the formula has no terms or more
        than one constant term, or if absorb has any constant terms.
        """

        # validate the formulas
        if not isinstance(formula, str):
            raise TypeError("formula must be a str.")
        if absorb is not None and not isinstance(absorb, str):
            raise TypeError("absorb must be a None or a str.")

        # parse the formulas into patsy terms
        self._formula = formula
        self._absorb = absorb
        self._terms = parse_terms(formula)
        self._absorbed_terms: List[patsy.desc.Term] = []
        if absorb is not None:
            self._absorbed_terms = parse_terms(f'{absorb} - 1')

        # ignore intercepts if there are any absorbed terms and check that there is at least one term
        if self._absorbed_terms:
            self._terms = [t for t in self._terms if t != patsy.desc.INTERCEPT]
        if not self._terms:
            raise patsy.PatsyError("formula has no terms.", patsy.origin.Origin(formula, 0, len(formula)))

        # parse the terms into SymPy expressions and extract variable names
        self._expressions = [parse_term_expression(t) for t in self._terms]
        self._absorbed_expressions = [parse_term_expression(t) for t in self._absorbed_terms]
        self._names = {str(s) for e in self._expressions for s in e.free_symbols}
        self._absorbed_names = {str(s) for e in self._absorbed_expressions for s in e.free_symbols}
        if sum(not e.free_symbols for e in self._expressions) > 1:
            origin = patsy.origin.Origin(formula, 0, len(formula))
            raise patsy.PatsyError("formula should have at most one constant term.", origin)
        if self._absorbed_expressions and any(not e.free_symbols for e in self._absorbed_expressions):
            assert absorb is not None
            origin = patsy.origin.Origin(absorb, 0, len(absorb))
            raise patsy.PatsyError("absorb should not have any constant terms.", origin)

        # configure fixed effect absorption (named methods are checked by type before the set lookup so that
        #   unhashable arguments such as lists get the intended error message instead of a hashing failure)
        is_named_method = isinstance(absorb_method, str) and absorb_method in {'simple', 'memory', 'speed'}
        if absorb_method is not None and not is_named_method and not isinstance(absorb_method, Iteration):
            raise TypeError("absorb_method must be None, 'simple', 'memory', 'speed', or an Iteration instance.")
        self._absorb_method = absorb_method
예제 #6
0
    def _build_matrix(self, data: Mapping) -> Tuple[Array, List['ColumnFormulation'], Data]:
        """Design the matrix for this formulation from a mapping of variable names to arrays.

        Returns the designed matrix, a list with one column formulation per column of the matrix, and a mapping from
        variable names to the arrays of data underlying the matrix. In this mapping, continuous variables are left
        unchanged and categorical variables are replaced by indicators.

        Raises a patsy.PatsyError if data for any variable in the formula cannot be loaded.
        """

        # load each variable as a flat array, pointing load failures at the whole formula
        loaded: Data = {}
        for name in self._names:
            try:
                loaded[name] = np.asarray(data[name]).flatten()
            except Exception as error:
                location = patsy.origin.Origin(self._formula, 0, len(self._formula))
                raise patsy.PatsyError(f"Failed to load data for '{name}'.", location) from error

        # a formula without any variables still needs one column to represent the size of the data
        if not loaded:
            loaded = {'': np.zeros(extract_size(data))}

        # when there are absorbed terms, a leading intercept gets Patsy to use reduced coding
        absorbing = bool(self._absorbed_terms)
        designed_terms = [patsy.desc.INTERCEPT] + self._terms if absorbing else self._terms
        design = design_matrix(designed_terms, loaded)

        # record which designed columns to keep and formulate each one (an intercept is skipped if it was only there
        #   to get Patsy to use reduced coding)
        kept_indices: List[int] = []
        formulations: List[ColumnFormulation] = []
        for term, expression in zip(self._terms, self._expressions):
            is_intercept = term == patsy.desc.INTERCEPT
            if is_intercept and absorbing:
                continue
            columns = design.term_slices[term]
            for index in range(columns.start, columns.stop):
                kept_indices.append(index)
                label = '1' if is_intercept else design.column_names[index]
                formulations.append(ColumnFormulation(label, expression))

        # map each variable name that appears in at least one kept column to its loaded array
        underlying: Data = {
            symbol.name: loaded.get(symbol.name)
            for formulation in formulations
            for symbol in formulation.expression.free_symbols
        }

        # replace entries that match indicator columns designed from categorical variables
        for factor, info in design.factor_infos.items():
            if info.type != 'categorical':
                continue
            indicator_design = design_matrix([patsy.desc.Term([factor])], loaded)
            indicators = build_matrix(indicator_design, loaded)
            for column_name, indicator in zip(indicator_design.column_names, indicators.T):
                key = CategoricalTreatment.parse_full_symbol(column_name).name
                if key in underlying:
                    underlying[key] = indicator

        # build the full matrix and keep only the recorded columns
        designed = build_matrix(design, loaded)
        return designed[:, kept_indices], formulations, underlying
예제 #7
0
    def __init__(self, formula, absorb=None, iteration=None):
        """Validate the formulation as far as possible without data while parsing it into patsy terms and SymPy
        expressions.

        A TypeError is raised for arguments of the wrong type. A patsy.PatsyError is raised if the formula has no terms
        or more than one constant term, or if absorb has any constant terms.
        """

        # both formulas have to be strings, although absorb is optional
        if not isinstance(formula, str):
            raise TypeError("formula must be a string.")
        if not (absorb is None or isinstance(absorb, str)):
            raise TypeError("absorb must be a None or a string.")

        # parse patsy terms, removing the intercept from any absorb formula
        self._formula = formula
        self._absorb = absorb
        self._terms = parse_terms(formula)
        if absorb is None:
            self._absorbed_terms = []
        else:
            self._absorbed_terms = parse_terms(f'{absorb} - 1')
        if not self._terms:
            whole_formula = patsy.origin.Origin(formula, 0, len(formula))
            raise patsy.PatsyError("formula has no terms.", whole_formula)

        # convert terms into SymPy expressions and collect the names of their free symbols
        self._expressions = [parse_term_expression(term) for term in self._terms]
        self._absorbed_expressions = [parse_term_expression(term) for term in self._absorbed_terms]
        self._names = {str(symbol) for expression in self._expressions for symbol in expression.free_symbols}
        self._absorbed_names = {
            str(symbol) for expression in self._absorbed_expressions for symbol in expression.free_symbols
        }

        # an expression without free symbols is a constant term
        constant_count = sum(1 for expression in self._expressions if not expression.free_symbols)
        if constant_count > 1:
            whole_formula = patsy.origin.Origin(formula, 0, len(formula))
            raise patsy.PatsyError("formula should have at most one constant term.", whole_formula)
        if any(not expression.free_symbols for expression in self._absorbed_expressions):
            whole_absorb = patsy.origin.Origin(absorb, 0, len(absorb))
            raise patsy.PatsyError("absorb should not have any constant terms.", whole_absorb)

        # fall back to the default iteration configuration when none is given
        if iteration is None:
            iteration = Iteration('simple', {'tol': 1e-14})
        if not isinstance(iteration, Iteration):
            raise TypeError("iteration must be None or an Iteration instance.")
        self._iteration = iteration
    def to_problem(
            self, supply_shifter_formulation: Optional[Formulation] = None,
            demand_shifter_formulation: Optional[Formulation] = None, product_data: Optional[Mapping] = None) -> (
            'OptimalInstrumentProblem'):
        r"""Re-create the problem with estimated feasible optimal instruments.

        The re-created problem will be exactly the same, except that instruments will be replaced with estimated
        feasible optimal instruments.

        .. note::

           Most of the explanation here is only important if a supply side was estimated.

        The optimal excluded demand-side instruments consist of the following:

            1. Estimated optimal demand-side instruments for :math:`\theta`, :math:`Z_D^\text{opt}`, excluding columns
               of instruments for any parameters on exogenous linear characteristics that were not concentrated out, but
               rather included in :math:`\theta` by :meth:`Problem.solve`.

            2. Optimal instruments for any linear demand-side parameters on endogenous product characteristics,
               :math:`\alpha`, which were concentrated out and hence not included in :math:`\theta`. These optimal
               instruments are simply an integral of the endogenous product characteristics, :math:`X_1^\text{en}`, over
               the joint density of :math:`\xi` and :math:`\omega`. It is only possible to concentrate out
               :math:`\alpha` when there isn't a supply side, so the approximation of these optimal instruments is
               simply :math:`X_1^\text{en}` evaluated at the constant vector of expected prices, :math:`E[p \mid Z]`,
               specified in :meth:`ProblemResults.compute_optimal_instruments`.

            3. If a supply side was estimated, any supply shifters, which are by default formulated by
               :attr:`OptimalInstrumentResults.supply_shifter_formulation`: all characteristics in :math:`X_3^\text{ex}`
               not in :math:`X_1^\text{ex}`.

        Similarly, if a supply side was estimated, the optimal excluded supply-side instruments consist of the
        following:

            1. Estimated optimal supply-side instruments for :math:`\theta`, :math:`Z_S^\text{opt}`, excluding columns
               of instruments for any parameters on exogenous linear characteristics that were not concentrated out, but
               rather included in :math:`\theta` by :meth:`Problem.solve`.

            2. Optimal instruments for any linear supply-side parameters on endogenous product characteristics,
               :math:`\gamma^\text{en}`, which were concentrated out and hence not included in :math:`\theta`. This
               is only relevant if ``shares`` were included in the formulation for :math:`X_3` in :class:`Problem`.
               The corresponding optimal instruments are simply an integral of the endogenous product characteristics,
               :math:`X_3^\text{en}`, over the joint density of :math:`\xi` and :math:`\omega`. The approximation of
               these optimal instruments is simply :math:`X_3^\text{en}` evaluated at the market shares that arise under
               the constant vector of expected prices, :math:`E[p \mid Z]`, specified in
               :meth:`ProblemResults.compute_optimal_instruments`.

            3. If a supply side was estimated, any demand shifters, which are by default formulated by
               :attr:`OptimalInstrumentResults.demand_shifter_formulation`: all characteristics in :math:`X_1^\text{ex}`
               not in :math:`X_3^\text{ex}`.

        As usual, the excluded demand-side instruments will be supplemented with :math:`X_1^\text{ex}` and the excluded
        supply-side instruments will be supplemented with :math:`X_3^\text{ex}`. The same fixed effects configured in
        :class:`Problem` will be absorbed.

        .. warning::

           If a supply side was estimated, the addition of supply- and demand-shifters may create collinearity issues.
           Make sure to check that shifters and other product characteristics are not collinear.

        Parameters
        ----------
        supply_shifter_formulation : `Formulation, optional`
            :class:`Formulation` configuration for supply shifters to be included in the set of optimal demand-side
            instruments. This is only used if a supply side was estimated. Intercepts will be ignored. By default,
            :attr:`OptimalInstrumentResults.supply_shifter_formulation` is used.
        demand_shifter_formulation : `Formulation, optional`
            :class:`Formulation` configuration for demand shifters to be included in the set of optimal supply-side
            instruments. This is only used if a supply side was estimated. Intercepts will be ignored. By default,
            :attr:`OptimalInstrumentResults.demand_shifter_formulation` is used.
        product_data : `structured array-like, optional`
            Product data used instead of what was saved from ``product_data`` when initializing the original
            :class:`Problem`. This may need to be specified if either the supply or demand shifter formulation contains
            some term that was not stored into memory, such as a categorical variable or a mathematical expression.

        Returns
        -------
        `OptimalInstrumentProblem`
            :class:`OptimalInstrumentProblem`, which is a :class:`Problem` updated to use the estimated optimal
            instruments.

        Examples
        --------
            - :doc:`Tutorial </tutorial>`

        """

        # either use the stored variables as product data or any provided data
        if product_data is None:
            product_data = self.problem_results.problem.products

        # configure or validate the supply shifter formulation (a formulation without any variable names is discarded
        #   and any other provided formulation is rebuilt without its intercept)
        if self.problem_results.problem.K3 == 0:
            if supply_shifter_formulation is not None:
                raise TypeError("A supply side was not estimated, so supply_shifter_formulation should be None.")
        elif supply_shifter_formulation is None:
            supply_shifter_formulation = self.supply_shifter_formulation
        elif not isinstance(supply_shifter_formulation, Formulation):
            raise TypeError("supply_shifter_formulation must be None or a Formulation instance.")
        elif supply_shifter_formulation._names:
            supply_shifter_formulation = Formulation(f'{supply_shifter_formulation._formula} - 1')
        else:
            supply_shifter_formulation = None

        # configure or validate the demand shifter formulation in the same way (the guarding condition is the absence
        #   of a supply side, which is what the error message reports)
        if self.problem_results.problem.K3 == 0:
            if demand_shifter_formulation is not None:
                raise TypeError("A supply side was not estimated, so demand_shifter_formulation should be None.")
        elif demand_shifter_formulation is None:
            demand_shifter_formulation = self.demand_shifter_formulation
        elif not isinstance(demand_shifter_formulation, Formulation):
            raise TypeError("demand_shifter_formulation must be None or a Formulation instance.")
        elif demand_shifter_formulation._names:
            demand_shifter_formulation = Formulation(f'{demand_shifter_formulation._formula} - 1')
        else:
            demand_shifter_formulation = None

        # identify which parameters in theta are on exogenous linear characteristics (those with neither prices nor
        #   shares in their product formulations)
        dropped_index = np.zeros(self.problem_results._parameters.P, np.bool_)
        for p, parameter in enumerate(self.problem_results._parameters.unfixed):
            if isinstance(parameter, LinearCoefficient):
                names = parameter.get_product_formulation(self.problem_results.problem).names
                if 'prices' not in names and 'shares' not in names:
                    dropped_index[p] = True

        # build excluded demand-side instruments
        demand_instruments = self.demand_instruments[:, ~dropped_index]
        if self.problem_results._parameters.eliminated_alpha_index.any():
            demand_instruments = np.c_[
                demand_instruments,
                self.problem_results.problem._compute_true_X1(
                    {'prices': self.expected_prices},
                    self.problem_results._parameters.eliminated_alpha_index.flatten()
                )
            ]
        if supply_shifter_formulation is not None:
            try:
                demand_instruments = np.c_[
                    demand_instruments, supply_shifter_formulation._build_matrix(product_data)[0]
                ]
            except patsy.PatsyError as exception:
                message = (
                    "Failed to construct supply shifters from their formulation. You may need to specify "
                    "product_data if not all variables in the formulation were saved when initializing the problem."
                )
                raise patsy.PatsyError(message) from exception

        # build excluded supply-side instruments
        if self.problem_results.problem.K3 == 0:
            supply_instruments = self.supply_instruments
        else:
            supply_instruments = self.supply_instruments[:, ~dropped_index]
            if self.problem_results._parameters.eliminated_endogenous_gamma_index.any():
                supply_instruments = np.c_[
                    supply_instruments,
                    self.problem_results.problem._compute_true_X3(
                        {'shares': self.expected_shares},
                        self.problem_results._parameters.eliminated_endogenous_gamma_index.flatten()
                    )
                ]
            if demand_shifter_formulation is not None:
                try:
                    supply_instruments = np.c_[
                        supply_instruments, demand_shifter_formulation._build_matrix(product_data)[0]
                    ]
                except patsy.PatsyError as exception:
                    message = (
                        "Failed to construct demand shifters from their formulation. You may need to specify "
                        "product_data if not all variables in the formulation were saved when initializing the problem."
                    )
                    raise patsy.PatsyError(message) from exception

        # initialize the problem
        from ..economies.problem import OptimalInstrumentProblem  # noqa
        return OptimalInstrumentProblem(self.problem_results.problem, demand_instruments, supply_instruments)
예제 #9
0
def _parse_terms(formula: str) -> List[patsy.desc.Term]:
    """Return the right-hand side patsy terms of a formula string, rejecting any formula with a left-hand side."""
    model = patsy.highlevel.ModelDesc.from_formula(formula)
    if not model.lhs_termlist:
        return model.rhs_termlist
    raise patsy.PatsyError("Formulae should not have left-hand sides.")