def test_factor_evaluation_edge_cases(self, materializer):
    """Exercise kind resolution and the failure paths of `_evaluate_factor`."""
    # An explicitly categorical factor is reported as categorical.
    categorical_factor = Factor('a', eval_method='lookup', kind='categorical')
    ev_factor = materializer._evaluate_factor(
        categorical_factor,
        ModelSpec([]),
        drop_rows=set(),
    )
    assert ev_factor.kind.value == 'categorical'

    # Any other disagreement between declared and detected kind must fail.
    materializer.factor_cache = {}
    with pytest.raises(FactorEncodingError):
        numerical_factor = Factor('A', eval_method='lookup', kind='numerical')
        materializer._evaluate_factor(
            numerical_factor,
            ModelSpec([]),
            drop_rows=[],
        )

    # A kind already recorded in the spec's encoder state must be respected;
    # a conflicting evaluation is an error.
    materializer.factor_cache = {}
    with pytest.raises(FactorEncodingError):
        conflicting_factor = Factor('a', eval_method='lookup', kind='numerical')
        spec_with_state = ModelSpec([], encoder_state={'a': ('categorical', {})})
        materializer._evaluate_factor(
            conflicting_factor,
            spec_with_state,
            drop_rows=[],
        )

    # A factor built without an evaluation method cannot be evaluated.
    materializer.factor_cache = {}
    with pytest.raises(FactorEvaluationError):
        assert materializer._evaluate_factor(
            Factor('a'),
            ModelSpec([]),
            drop_rows=set(),
        )
def test_to_terms(self):
    """A bare factor converts into exactly one term holding only itself."""
    converted = Factor('a').to_terms()
    assert len(converted) == 1

    only_term = next(iter(converted))
    term_factors = only_term.factors
    assert len(term_factors) == 1
    assert next(iter(term_factors)) == Factor('a')
def test_attributes(self):
    """Check the default and explicitly supplied `kind` / `eval_method`."""
    # Defaults when nothing is specified.
    default_factor = Factor('a')
    assert default_factor.kind is Factor.Kind.UNKNOWN
    assert default_factor.eval_method is Factor.EvalMethod.UNKNOWN

    # String arguments are converted to the matching enum members.
    constant_factor = Factor('a', kind='constant')
    assert constant_factor.kind is Factor.Kind.CONSTANT

    lookup_factor = Factor('a', eval_method='lookup')
    assert lookup_factor.eval_method is Factor.EvalMethod.LOOKUP
def test_sort(self):
    """Factors order by name; comparing with a non-factor raises TypeError."""
    first, second, third = (Factor(name) for name in ('a', 'b', 'c'))

    assert first < second
    assert second < third
    assert first < third

    with pytest.raises(TypeError):
        first < 1
def evaled_factors(self):
    """Return one categorical and one numerical evaluated factor."""
    categorical = EvaluatedFactor(
        Factor('A'),
        pandas.Series([1, 2, 3, 4]),
        kind='categorical',
        spans_intercept=True,
    )
    numerical = EvaluatedFactor(
        Factor('b'),
        pandas.Series([1, 2, 3, 4]),
        kind='numerical',
        spans_intercept=False,
    )
    return [categorical, numerical]
def differentiate_term(term, vars, use_sympy=False):
    """
    Differentiate `term` with respect to each variable in `vars`, in order.

    For every variable, the factors whose symbols include that variable are
    replaced by the result of `_differentiate_factors`. If no factor involves
    the variable, a term containing only the literal factor `0` is returned
    immediately. If no factors remain at the end, a term containing only the
    literal factor `1` is returned.

    Args:
        term: The term to differentiate; its `factors` attribute is read.
        vars: The variables to differentiate by, applied sequentially.
        use_sympy: Forwarded to `_factor_symbols` and
            `_differentiate_factors`.

    Returns:
        A new `Term` built from the resulting factors.
    """
    remaining = term.factors
    for var in vars:
        touched = {
            factor
            for factor in remaining
            if var in _factor_symbols(factor, use_sympy=use_sympy)
        }
        if not touched:
            # Nothing depends on `var`, so the derivative vanishes.
            return Term({Factor('0', eval_method='literal')})

        untouched = remaining.difference(touched)
        remaining = untouched.union(
            _differentiate_factors(touched, var, use_sympy=use_sympy)
        )

    if remaining:
        return Term(remaining)
    return Term({Factor('1', eval_method='literal')})
def test_equality(self):
    """Equality depends on the expression only, not on kind or eval method."""
    base = Factor('a')

    # Comparison against plain (non-factor) values.
    assert base == 'a'
    assert base != 1

    # Differing kinds do not affect equality.
    constant = Factor('a', kind='constant')
    numerical = Factor('a', kind='numerical')
    assert constant == numerical

    # Differing evaluation methods do not affect equality either.
    literal = Factor('a', eval_method='literal')
    lookup = Factor('a', eval_method='lookup')
    assert literal == lookup
def _differentiate_factors(factors, var, use_sympy=False): if use_sympy: expr = sympy.S('(' + ') * ('.join(factor.expr for factor in factors) + ')').diff(var) eval_method = 'python' else: assert len(factors) == 1 expr = 1 eval_method = next(iter(factors)).eval_method if expr == 1: return set() return {Factor(f'({str(expr)})', eval_method=eval_method)}
def test_encoding_edge_cases(self, materializer):
    """Check encoding of constant factors and of dictionary-valued factors."""
    # A constant factor is expected to produce a column named after the
    # factor, filled with the constant value.
    constant = EvaluatedFactor(
        Factor("10", eval_method='literal', kind='constant'),
        values=10,
        kind='constant',
    )
    encoded_constant = materializer._encode_evaled_factor(
        factor=constant,
        spec=ModelSpec([]),
        drop_rows=[],
    )
    assert list(encoded_constant['10']) == [10, 10, 10]

    # Dictionary values are expected to be flattened into `<factor>[<key>]`
    # columns.
    numerical = EvaluatedFactor(
        Factor("A", eval_method='python', kind='numerical'),
        values={
            'a': [1, 2, 3],
            'b': [4, 5, 6],
            '__metadata__': None,
        },
        kind='numerical',
    )
    encoded_numerical = materializer._encode_evaled_factor(
        factor=numerical,
        spec=ModelSpec([]),
        drop_rows=[],
    )
    assert list(encoded_numerical['A[a]']) == [1, 2, 3]

    # Categorical dictionary values are expected to yield one column per
    # level underneath each key.
    categorical = EvaluatedFactor(
        Factor("B", eval_method='python', kind='categorical'),
        values={'a': ['a', 'b', 'c']},
        kind='categorical',
    )
    encoded_categorical = materializer._encode_evaled_factor(
        factor=categorical,
        spec=ModelSpec([]),
        drop_rows=[],
    )
    assert list(encoded_categorical) == ['B[a][T.a]', 'B[a][T.b]', 'B[a][T.c]']
def _evaluate_factor(self, factor, spec, drop_rows): if factor.expr not in self.factor_cache: if factor.eval_method.value == 'lookup': value = self._lookup(factor.expr) elif factor.eval_method.value == 'python': value = self._evaluate(factor.expr, factor.metadata, spec) elif factor.eval_method.value == 'literal': value = EvaluatedFactor(factor, self._evaluate(factor.expr, factor.metadata, spec), kind='constant') else: raise FactorEvaluationError( f"Evaluation method {factor.eval_method.value} not recognised for factor {factor.expr}." ) if not isinstance(value, EvaluatedFactor): if isinstance(value, dict) and '__kind__' in value: kind = value['__kind__'] spans_intercept = value.get('__spans_intercept__', False) elif self._is_categorical(value): kind = 'categorical' spans_intercept = True else: kind = 'numerical' spans_intercept = False if factor.kind is not Factor.Kind.UNKNOWN and factor.kind.value != kind: if factor.kind.value == 'categorical': kind = factor.kind.value else: raise FactorEncodingError( f"Factor is expecting to be of kind '{factor.kind.value}' but is actually of kind '{kind}'." ) if factor.expr in spec.encoder_state and Factor.Kind( kind) is not spec.encoder_state[factor.expr][0]: raise FactorEncodingError( f"Factor kind `{kind}` does not match model specification of `{spec.encoder_state[factor.expr][0]}`." ) value = EvaluatedFactor( factor=factor, values=value, kind=kind, spans_intercept=spans_intercept, ) self._check_for_nulls(factor.expr, value.values, spec.na_action, drop_rows) self.factor_cache[factor.expr] = value return self.factor_cache[factor.expr]
def _differentiate_factors(factors, var, use_sympy=False): if use_sympy: try: import sympy expr = sympy.S('(' + ') * ('.join(factor.expr for factor in factors) + ')').diff(var) eval_method = 'python' except ImportError: # pragma: no cover raise ImportError( "`sympy` is not available. Install it using `pip install formulaic[calculus]` or `pip install sympy`." ) else: assert len(factors) == 1 expr = 1 eval_method = next(iter(factors)).eval_method if expr == 1: return set() return {Factor(f'({str(expr)})', eval_method=eval_method)}
def scoped_factor(self):
    """Return a scoped factor wrapping the factor `a`."""
    wrapped = Factor('a')
    return ScopedFactor(wrapped)
def test_sort(self, scoped_factor, scoped_factor_reduced):
    """Check ordering between scoped factors and against non-factors."""
    # The reduced variant sorts ahead of the unreduced one.
    assert scoped_factor_reduced < scoped_factor

    # A scoped factor over `b` sorts after the fixture's scoped factor.
    later = ScopedFactor(Factor('b'))
    assert scoped_factor < later

    # Comparison against an unrelated type is rejected.
    with pytest.raises(TypeError):
        scoped_factor < 1
def scoped_factor_reduced(self):
    """Return a scoped factor wrapping the factor `a`, flagged as reduced."""
    wrapped = Factor('a')
    return ScopedFactor(wrapped, reduced=True)
def test_hash(self):
    """A factor hashes to the same value as its expression string."""
    expression = 'a'
    factor_hash = hash(Factor(expression))
    assert factor_hash == hash(expression)
def term1(self):
    """Return a term built from the factors `c` and `b`, in that order."""
    members = [Factor('c'), Factor('b')]
    return Term(members)
def term2(self):
    """Return a term built from the factors `c` and `d`, in that order."""
    members = [Factor('c'), Factor('d')]
    return Term(members)
def factor_lookup(self):
    """Return a factor for the expression `a` that is evaluated by lookup."""
    # `lookup` is an evaluation method rather than a kind, so it has to be
    # supplied through `eval_method`.
    return Factor('a', eval_method='lookup')
def test_equality(self, ev_factor):
    """Check equality against another evaluated factor and a plain string."""
    # The fixture is expected to equal a numerical evaluated factor over `a`.
    counterpart = EvaluatedFactor(Factor('a'), [4, 5, 6], kind='numerical')
    assert ev_factor == counterpart

    # An evaluated factor is not equal to a bare string.
    assert ev_factor != 'a'
def test_repr(self):
    """The repr of a factor is its bare expression."""
    rendered = repr(Factor('a'))
    assert rendered == 'a'
def factor_unknown(self):
    """Return a factor for `unknown` with no kind or eval method given."""
    unspecified = Factor('unknown')
    return unspecified
def test_repr(self, ev_factor):
    """An evaluated factor's repr matches that of the plain factor `a`."""
    expected = repr(Factor('a'))
    assert repr(ev_factor) == expected
def kind(self, kind):
    """
    Set the kind, converting `kind` to a `Factor.Kind`.

    Raises:
        ValueError: If `kind` is falsy or equal to `'unknown'`.
    """
    if kind and kind != 'unknown':
        self._kind = Factor.Kind(kind)
        return
    raise ValueError(
        "`EvaluatedFactor` instances must have a known kind.")
def test_sort(self, ev_factor):
    """Check ordering between evaluated factors and against non-factors."""
    # An evaluated factor over `b` sorts after the fixture.
    later = EvaluatedFactor(Factor('b'), [4, 5, 6], kind='numerical')
    assert ev_factor < later

    # Comparison against an unrelated type is rejected.
    with pytest.raises(TypeError):
        ev_factor < 1
def factor_literal(self):
    """Return a factor for the quoted string expression, evaluated literally."""
    # `literal` is an evaluation method rather than a kind, so it has to be
    # supplied through `eval_method`.
    return Factor('"string"', eval_method='literal')