def _build_ids(self, data: Mapping) -> Array:
    """Build the matrix of IDs to be absorbed from a mapping of variable names to arrays.

    Every name in ``self._absorbed_names`` is looked up in ``data`` and flattened. Each term in
    ``self._absorbed_terms`` then contributes one column: the data of its factors, all of which must be categorical,
    are combined by ``interact_ids``. The per-term columns are stacked side by side.

    A ``patsy.PatsyError`` is raised if a variable cannot be loaded or if a factor is not categorical.
    """
    # flatten the data underlying each absorbed variable
    flattened: Data = {}
    for variable in self._absorbed_names:
        try:
            flattened[variable] = np.asarray(data[variable]).flatten()
        except Exception as error:
            assert self._absorb is not None
            span = patsy.origin.Origin(self._absorb, 0, len(self._absorb))
            raise patsy.PatsyError(f"Failed to load data for '{variable}'.", span) from error

    # each absorbed term becomes one column of IDs
    columns: List[Array] = []
    for absorbed_term in self._absorbed_terms:
        design = design_matrix([absorbed_term], flattened)
        components: List[Array] = []
        for factor, factor_info in design.factor_infos.items():
            if factor_info.type != 'categorical':
                raise patsy.PatsyError("Only categorical variables can be absorbed.", factor.origin)
            components.append(flattened[parse_expression(factor.name()).name])
        columns.append(interact_ids(*components))

    return np.column_stack(columns)
def _build_ids(self, data):
    """Convert a mapping from variable names to arrays into the designed matrix of IDs to be absorbed.

    Parameters
    ----------
    data : mapping
        Mapping from variable names to array-like data. Every name in ``self._absorbed_names`` is looked up in it.

    Returns
    -------
    ndarray
        The result of ``np.column_stack`` applied to one object-dtype column per term in ``self._absorbed_terms``.
        A term with a single factor contributes that factor's data; a term with multiple factors contributes tuples
        that pair up the data of its factors.

    Raises
    ------
    patsy.PatsyError
        If the data for an absorbed variable cannot be loaded, or if a factor of an absorbed term is not
        categorical.

    """
    # normalize the data
    data_mapping = {}
    for name in self._absorbed_names:
        try:
            data_mapping[name] = np.asarray(data[name])
        except Exception as exception:
            origin = patsy.origin.Origin(self._absorb, 0, len(self._absorb))
            raise patsy.PatsyError(f"Failed to load data for '{name}'.", origin) from exception

    # build columns of absorbed IDs
    ids_columns = []
    for term in self._absorbed_terms:
        factor_columns = []
        term_design = design_matrix([term], data_mapping)
        for factor, info in term_design.factor_infos.items():
            if info.type != 'categorical':
                raise patsy.PatsyError("Only categorical variables can be absorbed.", factor.origin)
            symbol = parse_expression(factor.name())
            factor_columns.append(data_mapping[symbol.name])

        # store interactions as tuples (the builtin object type is used because the np.object alias was deprecated
        # in NumPy 1.20 and removed in NumPy 1.24, where accessing it raises an AttributeError)
        column = factor_columns[0].astype(object)
        if len(factor_columns) > 1:
            column[:] = list(zip(*factor_columns))
        ids_columns.append(column)

    # build the matrix of IDs
    return np.column_stack(ids_columns)
def parse_terms(formula: str) -> List[patsy.desc.Term]:
    """Parse a formula string into its right-hand side patsy terms.

    A formula with any left-hand side terms is rejected with a ``patsy.PatsyError`` whose origin spans the formula
    from its start through the first tilde (or the entire formula if it contains no tilde).
    """
    model = patsy.highlevel.ModelDesc.from_formula(formula)
    if not model.lhs_termlist:
        return model.rhs_termlist

    # point the error at the left-hand side of the formula
    if '~' in formula:
        stop = formula.index('~') + 1
    else:
        stop = len(formula)
    left_hand_side = patsy.origin.Origin(formula, 0, stop)
    raise patsy.PatsyError("Formulas should not have left-hand sides.", left_hand_side)
def parse_term_expression(term: patsy.desc.Term) -> sp.Expr:
    """Return the product of the SymPy expressions parsed from the factors of a patsy term.

    The product starts at one, so a term without factors yields ``sp.Integer(1)``. Any failure while parsing or
    multiplying a factor is re-raised as a ``patsy.PatsyError`` that points at the offending factor.
    """
    product = sp.Integer(1)
    for factor in term.factors:
        try:
            product *= parse_expression(factor.name())
        except Exception as error:
            raise patsy.PatsyError("Failed to parse a term.", factor.origin) from error
    return product
def __init__(
        self, formula: str, absorb: Optional[str] = None,
        absorb_method: Optional[Union[str, Iteration]] = None) -> None:
    """Parse the formula and any absorbed formula into patsy terms and SymPy expressions.

    As much validation as possible is done here, before any data is available: argument types are checked, the
    formula must have at least one term and at most one constant term, the absorbed formula must have no constant
    terms, and the absorption method must be a recognized name or an Iteration instance.
    """
    # only strings can be parsed as formulas
    if not isinstance(formula, str):
        raise TypeError("formula must be a str.")
    if not (absorb is None or isinstance(absorb, str)):
        raise TypeError("absorb must be a None or a str.")

    # keep the raw formulas alongside their parsed patsy terms
    self._formula = formula
    self._absorb = absorb
    self._terms = parse_terms(formula)
    self._absorbed_terms: List[patsy.desc.Term] = []
    if absorb is not None:
        self._absorbed_terms = parse_terms(f'{absorb} - 1')

    # intercepts are ignored whenever something is absorbed, but at least one term has to remain
    if self._absorbed_terms:
        self._terms = [term for term in self._terms if term != patsy.desc.INTERCEPT]
    if not self._terms:
        raise patsy.PatsyError("formula has no terms.", patsy.origin.Origin(formula, 0, len(formula)))

    # translate terms into SymPy expressions and collect the names of the variables that they reference
    self._expressions = [parse_term_expression(term) for term in self._terms]
    self._absorbed_expressions = [parse_term_expression(term) for term in self._absorbed_terms]
    self._names = {str(symbol) for expression in self._expressions for symbol in expression.free_symbols}
    self._absorbed_names = {
        str(symbol) for expression in self._absorbed_expressions for symbol in expression.free_symbols
    }

    # expressions without free symbols are constants
    constant_count = sum(1 for expression in self._expressions if not expression.free_symbols)
    if constant_count > 1:
        whole_formula = patsy.origin.Origin(formula, 0, len(formula))
        raise patsy.PatsyError("formula should have at most one constant term.", whole_formula)
    if any(not expression.free_symbols for expression in self._absorbed_expressions):
        assert absorb is not None
        whole_absorb = patsy.origin.Origin(absorb, 0, len(absorb))
        raise patsy.PatsyError("absorb should not have any constant terms.", whole_absorb)

    # configure fixed effect absorption
    is_named_method = absorb_method in {None, 'simple', 'memory', 'speed'}
    if not is_named_method and not isinstance(absorb_method, Iteration):
        raise TypeError("absorb_method must be None, 'simple', 'memory', 'speed', or an Iteration instance.")
    self._absorb_method = absorb_method
def _build_matrix(self, data: Mapping) -> Tuple[Array, List['ColumnFormulation'], Data]:
    """Design a matrix from a mapping of variable names to arrays.

    Three objects are returned: the designed matrix, a list of column formulations describing each of its columns,
    and a mapping from variable names to the arrays of data underlying the matrix. The underlying data consist of
    unchanged continuous variables and of indicators constructed from categorical variables.
    """
    # flatten the data underlying each variable in the formula
    flattened: Data = {}
    for variable in self._names:
        try:
            flattened[variable] = np.asarray(data[variable]).flatten()
        except Exception as error:
            span = patsy.origin.Origin(self._formula, 0, len(self._formula))
            raise patsy.PatsyError(f"Failed to load data for '{variable}'.", span) from error

    # a formula without variables still needs one column so that the size of the data is represented
    if not flattened:
        flattened = {'': np.zeros(extract_size(data))}

    # prepending an intercept term when there are absorbed terms gets Patsy to use reduced coding
    designed_terms = [patsy.desc.INTERCEPT] + self._terms if self._absorbed_terms else self._terms
    design = design_matrix(designed_terms, flattened)

    # record which designed columns to keep and formulate each of them (an intercept is skipped if there are
    # absorbed terms, since it was only there to get Patsy to use reduced coding)
    kept_indices: List[int] = []
    formulations: List[ColumnFormulation] = []
    for term, expression in zip(self._terms, self._expressions):
        is_intercept = term == patsy.desc.INTERCEPT
        if is_intercept and self._absorbed_terms:
            continue
        columns = design.term_slices[term]
        for index in range(columns.start, columns.stop):
            kept_indices.append(index)
            label = '1' if is_intercept else design.column_names[index]
            formulations.append(ColumnFormulation(label, expression))

    # map the name of each symbol appearing in a kept column to its array (None if there is no such variable)
    underlying: Data = {
        symbol.name: flattened.get(symbol.name)
        for formulation in formulations
        for symbol in formulation.expression.free_symbols
    }

    # replace entries for symbols that correspond to categorical indicators with the indicators themselves
    for factor, factor_info in design.factor_infos.items():
        if factor_info.type != 'categorical':
            continue
        indicator_design = design_matrix([patsy.desc.Term([factor])], flattened)
        indicators = build_matrix(indicator_design, flattened)
        for column_name, indicator in zip(indicator_design.column_names, indicators.T):
            symbol = CategoricalTreatment.parse_full_symbol(column_name)
            if symbol.name in underlying:
                underlying[symbol.name] = indicator

    # build the full matrix and keep only the recorded columns
    full_matrix = build_matrix(design, flattened)
    return full_matrix[:, kept_indices], formulations, underlying
def __init__(self, formula, absorb=None, iteration=None):
    """Parse the formula and any absorbed formula into patsy terms and SymPy expressions.

    As much validation as possible is done here, before any data is available: argument types are checked, the
    formula must have at least one term and at most one constant term, and the absorbed formula must have no
    constant terms. When no iteration configuration is given, a 'simple' one with a tolerance of 1e-14 is used.
    """
    # only strings can be parsed as formulas
    if not isinstance(formula, str):
        raise TypeError("formula must be a string.")
    if not (absorb is None or isinstance(absorb, str)):
        raise TypeError("absorb must be a None or a string.")

    # keep the raw formulas alongside their parsed patsy terms
    self._formula = formula
    self._absorb = absorb
    self._terms = parse_terms(formula)
    if absorb is None:
        self._absorbed_terms = []
    else:
        self._absorbed_terms = parse_terms(f'{absorb} - 1')
    if not self._terms:
        raise patsy.PatsyError("formula has no terms.", patsy.origin.Origin(formula, 0, len(formula)))

    # translate terms into SymPy expressions and collect the names of the variables that they reference
    self._expressions = [parse_term_expression(term) for term in self._terms]
    self._absorbed_expressions = [parse_term_expression(term) for term in self._absorbed_terms]
    self._names = {str(symbol) for expression in self._expressions for symbol in expression.free_symbols}
    self._absorbed_names = {
        str(symbol) for expression in self._absorbed_expressions for symbol in expression.free_symbols
    }

    # expressions without free symbols are constants
    constant_count = sum(1 for expression in self._expressions if not expression.free_symbols)
    if constant_count > 1:
        whole_formula = patsy.origin.Origin(formula, 0, len(formula))
        raise patsy.PatsyError("formula should have at most one constant term.", whole_formula)
    if any(not expression.free_symbols for expression in self._absorbed_expressions):
        whole_absorb = patsy.origin.Origin(absorb, 0, len(absorb))
        raise patsy.PatsyError("absorb should not have any constant terms.", whole_absorb)

    # configure demeaning iteration, falling back to a default configuration
    demeaning = Iteration('simple', {'tol': 1e-14}) if iteration is None else iteration
    if not isinstance(demeaning, Iteration):
        raise TypeError("iteration must be None or an Iteration instance.")
    self._iteration = demeaning
def to_problem(
        self, supply_shifter_formulation: Optional[Formulation] = None,
        demand_shifter_formulation: Optional[Formulation] = None, product_data: Optional[Mapping] = None) -> (
        'OptimalInstrumentProblem'):
    r"""Re-create the problem with estimated feasible optimal instruments.

    The re-created problem will be exactly the same, except that instruments will be replaced with estimated
    feasible optimal instruments.

    .. note::

       Most of the explanation here is only important if a supply side was estimated.

    The optimal excluded demand-side instruments consist of the following:

        1. Estimated optimal demand-side instruments for :math:`\theta`, :math:`Z_D^\text{opt}`, excluding columns
           of instruments for any parameters on exogenous linear characteristics that were not concentrated out,
           but rather included in :math:`\theta` by :meth:`Problem.solve`.

        2. Optimal instruments for any linear demand-side parameters on endogenous product characteristics,
           :math:`\alpha`, which were concentrated out and hence not included in :math:`\theta`. These optimal
           instruments are simply an integral of the endogenous product characteristics, :math:`X_1^\text{en}`,
           over the joint density of :math:`\xi` and :math:`\omega`. It is only possible to concentrate out
           :math:`\alpha` when there isn't a supply side, so the approximation of these optimal instruments is
           simply :math:`X_1^\text{en}` evaluated at the constant vector of expected prices,
           :math:`E[p \mid Z]`, specified in :meth:`ProblemResults.compute_optimal_instruments`.

        3. If a supply side was estimated, any supply shifters, which are by default formulated by
           :attr:`OptimalInstrumentResults.supply_shifter_formulation`: all characteristics in
           :math:`X_3^\text{ex}` not in :math:`X_1^\text{ex}`.

    Similarly, if a supply side was estimated, the optimal excluded supply-side instruments consist of the
    following:

        1. Estimated optimal supply-side instruments for :math:`\theta`, :math:`Z_S^\text{opt}`, excluding columns
           of instruments for any parameters on exogenous linear characteristics that were not concentrated out,
           but rather included in :math:`\theta` by :meth:`Problem.solve`.

        2. Optimal instruments for any linear supply-side parameters on endogenous product characteristics,
           :math:`\gamma^\text{en}`, which were concentrated out and hence not included in :math:`\theta`. This
           is only relevant if ``shares`` were included in the formulation for :math:`X_3` in :class:`Problem`.
           The corresponding optimal instruments are simply an integral of the endogenous product
           characteristics, :math:`X_3^\text{en}`, over the joint density of :math:`\xi` and :math:`\omega`. The
           approximation of these optimal instruments is simply :math:`X_3^\text{en}` evaluated at the market
           shares that arise under the constant vector of expected prices, :math:`E[p \mid Z]`, specified in
           :meth:`ProblemResults.compute_optimal_instruments`.

        3. If a supply side was estimated, any demand shifters, which are by default formulated by
           :attr:`OptimalInstrumentResults.demand_shifter_formulation`: all characteristics in
           :math:`X_1^\text{ex}` not in :math:`X_3^\text{ex}`.

    As usual, the excluded demand-side instruments will be supplemented with :math:`X_1^\text{ex}` and the excluded
    supply-side instruments will be supplemented with :math:`X_3^\text{ex}`. The same fixed effects configured in
    :class:`Problem` will be absorbed.

    .. warning::

       If a supply side was estimated, the addition of supply- and demand-shifters may create collinearity issues.
       Make sure to check that shifters and other product characteristics are not collinear.

    Parameters
    ----------
    supply_shifter_formulation : `Formulation, optional`
        :class:`Formulation` configuration for supply shifters to be included in the set of optimal demand-side
        instruments. This is only used if a supply side was estimated. Intercepts will be ignored. By default,
        :attr:`OptimalInstrumentResults.supply_shifter_formulation` is used.
    demand_shifter_formulation : `Formulation, optional`
        :class:`Formulation` configuration for demand shifters to be included in the set of optimal supply-side
        instruments. This is only used if a supply side was estimated. Intercepts will be ignored. By default,
        :attr:`OptimalInstrumentResults.demand_shifter_formulation` is used.
    product_data : `structured array-like, optional`
        Product data used instead of what was saved from ``product_data`` when initializing the original
        :class:`Problem`. This may need to be specified if either the supply or demand shifter formulation
        contains some term that was not stored into memory, such as a categorical variable or a mathematical
        expression.

    Returns
    -------
    `OptimalInstrumentProblem`
        :class:`OptimalInstrumentProblem`, which is a :class:`Problem` updated to use the estimated optimal
        instruments.

    Raises
    ------
    TypeError
        If a shifter formulation is specified even though a supply side was not estimated, or if a specified
        shifter formulation is not a :class:`Formulation` instance.
    patsy.PatsyError
        If supply or demand shifters cannot be constructed from their formulation and the product data.

    Examples
    --------
        - :doc:`Tutorial </tutorial>`

    """

    # either use the stored variables as product data or any provided data
    if product_data is None:
        product_data = self.problem_results.problem.products

    # configure or validate the supply shifter formulation (K3 == 0 means that a supply side was not estimated)
    if self.problem_results.problem.K3 == 0:
        if supply_shifter_formulation is not None:
            raise TypeError("A supply side was not estimated, so supply_shifter_formulation should be None.")
    elif supply_shifter_formulation is None:
        supply_shifter_formulation = self.supply_shifter_formulation
    elif not isinstance(supply_shifter_formulation, Formulation):
        raise TypeError("supply_shifter_formulation must be None or a Formulation instance.")
    elif supply_shifter_formulation._names:
        # intercepts are ignored by re-creating the formulation without one
        supply_shifter_formulation = Formulation(f'{supply_shifter_formulation._formula} - 1')
    else:
        # a formulation that references no variables contributes no shifters
        supply_shifter_formulation = None

    # configure or validate the demand shifter formulation (demand shifters are also only used when a supply side
    # was estimated, so the error for K3 == 0 refers to the missing supply side)
    if self.problem_results.problem.K3 == 0:
        if demand_shifter_formulation is not None:
            raise TypeError("A supply side was not estimated, so demand_shifter_formulation should be None.")
    elif demand_shifter_formulation is None:
        demand_shifter_formulation = self.demand_shifter_formulation
    elif not isinstance(demand_shifter_formulation, Formulation):
        raise TypeError("demand_shifter_formulation must be None or a Formulation instance.")
    elif demand_shifter_formulation._names:
        # intercepts are ignored by re-creating the formulation without one
        demand_shifter_formulation = Formulation(f'{demand_shifter_formulation._formula} - 1')
    else:
        # a formulation that references no variables contributes no shifters
        demand_shifter_formulation = None

    # identify which parameters in theta are exogenous linear characteristics
    dropped_index = np.zeros(self.problem_results._parameters.P, np.bool_)
    for p, parameter in enumerate(self.problem_results._parameters.unfixed):
        if isinstance(parameter, LinearCoefficient):
            names = parameter.get_product_formulation(self.problem_results.problem).names
            if 'prices' not in names and 'shares' not in names:
                dropped_index[p] = True

    # build excluded demand-side instruments
    demand_instruments = self.demand_instruments[:, ~dropped_index]
    if self.problem_results._parameters.eliminated_alpha_index.any():
        demand_instruments = np.c_[
            demand_instruments,
            self.problem_results.problem._compute_true_X1(
                {'prices': self.expected_prices},
                self.problem_results._parameters.eliminated_alpha_index.flatten()
            )
        ]
    if supply_shifter_formulation is not None:
        try:
            demand_instruments = np.c_[
                demand_instruments, supply_shifter_formulation._build_matrix(product_data)[0]
            ]
        except patsy.PatsyError as exception:
            message = (
                "Failed to construct supply shifters from their formulation. You may need to specify "
                "product_data if not all variables in the formulation were saved when initializing the problem."
            )
            raise patsy.PatsyError(message) from exception

    # build excluded supply-side instruments
    if self.problem_results.problem.K3 == 0:
        supply_instruments = self.supply_instruments
    else:
        supply_instruments = self.supply_instruments[:, ~dropped_index]
        if self.problem_results._parameters.eliminated_endogenous_gamma_index.any():
            supply_instruments = np.c_[
                supply_instruments,
                self.problem_results.problem._compute_true_X3(
                    {'shares': self.expected_shares},
                    self.problem_results._parameters.eliminated_endogenous_gamma_index.flatten()
                )
            ]
        if demand_shifter_formulation is not None:
            try:
                supply_instruments = np.c_[
                    supply_instruments, demand_shifter_formulation._build_matrix(product_data)[0]
                ]
            except patsy.PatsyError as exception:
                message = (
                    "Failed to construct demand shifters from their formulation. You may need to specify "
                    "product_data if not all variables in the formulation were saved when initializing the problem."
                )
                raise patsy.PatsyError(message) from exception

    # initialize the problem
    from ..economies.problem import OptimalInstrumentProblem  # noqa
    return OptimalInstrumentProblem(self.problem_results.problem, demand_instruments, supply_instruments)
def _parse_terms(formula: str) -> List[patsy.desc.Term]:
    """Parse a formula string into its right-hand side patsy terms, rejecting any left-hand side terms."""
    model = patsy.highlevel.ModelDesc.from_formula(formula)
    if not model.lhs_termlist:
        return model.rhs_termlist
    raise patsy.PatsyError("Formulae should not have left-hand sides.")