def _tokenize_constraint(string, variable_names):
    """Split a linear-constraint string into a list of Tokens.

    Variable names are matched literally; longer names are tried first so
    that a name which is a proper prefix of another cannot shadow it.
    Raises PatsyError (with an Origin pointing at the first bad character)
    if any part of the input is unrecognizable.
    """
    # Sort longest-first so the regex alternation prefers long matches.
    names_longest_first = sorted(variable_names, key=len, reverse=True)
    lexicon = [
        (r"\(", _token_maker(Token.LPAREN, string)),
        (r"\)", _token_maker(Token.RPAREN, string)),
        # "__OP__" is resolved to the operator's own text by the token maker.
        ("|".join(re.escape(op.token_type) for op in _ops),
         _token_maker("__OP__", string)),
        ("|".join(re.escape(name) for name in names_longest_first),
         _token_maker("VARIABLE", string)),
        (r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?",
         _token_maker("NUMBER", string)),
        # Whitespace is skipped (no token produced).
        (r"\s+", None),
    ]
    tokens, leftover = re.Scanner(lexicon).scan(string)
    if leftover:
        # The scanner stops at the first character it cannot match; point
        # the error at exactly that character.
        offset = len(string) - len(leftover)
        raise PatsyError("unrecognized token in constraint",
                         Origin(string, offset, offset + 1))
    return tokens
def test__tokenize_constraint():
    """Check token types, origins, and extras produced by _tokenize_constraint."""
    code = "2 * (a + b) = q"
    tokens = _tokenize_constraint(code, ["a", "b", "q"])
    expecteds = [("NUMBER", 0, 1, "2"),
                 ("*", 2, 3, "*"),
                 (Token.LPAREN, 4, 5, "("),
                 ("VARIABLE", 5, 6, "a"),
                 ("+", 7, 8, "+"),
                 ("VARIABLE", 9, 10, "b"),
                 (Token.RPAREN, 10, 11, ")"),
                 ("=", 12, 13, "="),
                 ("VARIABLE", 14, 15, "q")]
    # Fix: assert the counts match -- a bare zip() would silently ignore
    # missing or extra tokens.
    assert len(tokens) == len(expecteds)
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == Origin(code, expected[1], expected[2])
        assert got.extra == expected[3]

    # Fix: use pytest instead of the unmaintained nose, consistent with the
    # other tests in this file.
    import pytest
    pytest.raises(PatsyError, _tokenize_constraint, "1 + @b", ["b"])
    # Shouldn't raise an error:
    _tokenize_constraint("1 + @b", ["@b"])

    # Check we aren't confused by names which are proper prefixes of other
    # names:
    for names in (["a", "aa"], ["aa", "a"]):
        tokens = _tokenize_constraint("a aa a", names)
        assert len(tokens) == 3
        assert [t.extra for t in tokens] == ["a", "aa", "a"]

    # Check that embedding ops and numbers inside a variable name works
    tokens = _tokenize_constraint("2 * a[1,1],", ["a[1,1]"])
    assert len(tokens) == 4
    assert [t.type for t in tokens] == ["NUMBER", "*", "VARIABLE", ","]
    assert [t.extra for t in tokens] == ["2", "*", "a[1,1]", ","]
def make_token(scanner, token_string):
    # Scanner callback: convert a regex match into a Token.
    #
    # NOTE(review): `type` and `string` are free variables here -- this
    # reads as the inner function of a closure factory (presumably
    # `_token_maker(type, string)`); the enclosing def is outside this
    # view, so confirm there. As written standalone, `type` would resolve
    # to the builtin.
    if type == "__OP__":
        # "__OP__" is a placeholder: operator tokens use their own matched
        # text (e.g. "+", "=") as the token type.
        actual_type = token_string
    else:
        actual_type = type
    # scanner.match.span() is the (start, end) offset of this match within
    # `string`, recorded as the token's Origin for error reporting.
    return Token(actual_type,
                 Origin(string, *scanner.match.span()),
                 token_string)
def python_tokenize(code):
    """Tokenize a Python expression, yielding (type, string, Origin) triples.

    Comments and tokenizer errors raise PatsyError with an Origin locating
    the problem; an end-of-stream inside a multi-line construct (e.g. an
    unclosed paren) simply ends the iteration, since callers handle that
    themselves.
    """
    # Since formulas can only contain Python expressions, and Python
    # expressions cannot meaningfully contain newlines, we'll just remove all
    # the newlines up front to avoid any complications:
    code = code.replace("\n", " ").strip()
    it = tokenize.generate_tokens(StringIO(code).readline)
    try:
        # Fix: the original bound the 5th tuple element (the token's source
        # line) to `code`, shadowing the parameter. After newline removal
        # the input is a single logical line, so the two values coincide;
        # use a throwaway name and the parameter for clarity.
        for pytype, string, (_, start), (_, end), _line in it:
            if pytype == tokenize.ENDMARKER:
                break
            origin = Origin(code, start, end)
            # No NL tokens should appear once newlines have been stripped.
            assert pytype != tokenize.NL
            if pytype == tokenize.NEWLINE:
                # Synthetic end-of-input NEWLINE carries no text.
                assert string == ""
                continue
            if pytype == tokenize.ERRORTOKEN:
                raise PatsyError("error tokenizing input "
                                 "(maybe an unclosed string?)",
                                 origin)
            if pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (pytype, string, origin)
        else: # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised iff the tokenizer thinks that there is
        # some sort of multi-line construct in progress (e.g., an
        # unclosed parentheses, which in Python lets a virtual line
        # continue past the end of the physical line), and it hits the
        # end of the source text. We have our own error handling for
        # such cases, so just treat this as an end-of-stream.
        #
        # Just in case someone adds some other error case:
        assert e.args[0].startswith("EOF in multi-line")
        return
def test_parse_origin():
    """Every node of the parse tree carries an Origin covering its source span."""
    formula = "a ~ b + c"
    tree = parse_formula(formula)
    # (node, start offset, end offset) for each node we care about.
    cases = [
        (tree, 0, 9),
        (tree.token, 2, 3),
        (tree.args[0], 0, 1),
        (tree.args[1], 4, 9),
        (tree.args[1].token, 6, 7),
        (tree.args[1].args[0], 4, 5),
        (tree.args[1].args[1], 8, 9),
    ]
    for node, start, end in cases:
        assert node.origin == Origin(formula, start, end)
def test_evalfactor_reraise():
    # This will produce a PatsyError, but buried inside the factor evaluation,
    # so the original code has no way to give it an appropriate origin=
    # attribute. EvalFactor should notice this, and add a useful origin:
    def raise_patsy_error(x):
        raise PatsyError("WHEEEEEE")
    formula = "raise_patsy_error(X) + Y"
    try:
        dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
    # Fix: `except PatsyError, e:` is Python 2 syntax and a SyntaxError on
    # Python 3.
    except PatsyError as e:
        # The re-raised error should point at the factor's span in the formula.
        assert e.origin == Origin(formula, 0, formula.index(" "))
    else:
        # Fix: originally missing -- the test must fail if no error is raised
        # (matches the pattern used by the sibling re-raise test).
        assert False
def test_NAAction_raise():
    """NAAction(on_NA="raise") passes NA-free data through and errors on NAs."""
    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    # (fix: removed unused local `in_idx = np.arange(2)`)
    in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]), np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
        assert False
    except PatsyError as e:
        # The error must point at the array that actually contained the NA.
        assert e.origin is o2
def test_infix_parse():
    """Parse `a + -b * (c + d)` and verify the resulting tree shape."""
    ops = [Operator("+", 2, 10),
           Operator("*", 2, 20),
           Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    mock_origin = Origin("asdf", 2, 3)
    # a + -b * (c + d), as (token type, token text) pairs:
    spec = [("ATOM1", "a"), ("+", "+"), ("-", "-"), ("ATOM2", "b"),
            ("*", "*"), (Token.LPAREN, "("), ("ATOM1", "c"), ("+", "+"),
            ("ATOM2", "d"), (Token.RPAREN, ")")]
    tokens = [Token(ttype, mock_origin, text) for ttype, text in spec]
    tree = infix_parse(tokens, ops, atomic)

    def check(node, expected_type, expected_extra):
        assert node.type == expected_type
        assert node.token.extra == expected_extra

    check(tree, "+", "+")
    check(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    check(tree.args[1], "*", "*")
    unary_minus = tree.args[1].args[0]
    check(unary_minus, "-", "-")
    assert len(unary_minus.args) == 1
    check(unary_minus.args[0], "ATOM2", "b")
    paren_sum = tree.args[1].args[1]
    check(paren_sum, "+", "+")
    check(paren_sum.args[0], "ATOM1", "c")
    check(paren_sum.args[1], "ATOM2", "d")

    import pytest
    # No ternary ops
    pytest.raises(ValueError,
                  infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"])

    # smoke test just to make sure there are no egregious bugs in 'trace'
    infix_parse(tokens, ops, atomic, trace=True)
# NOTE(review): the statements before test_dmatrix_NA_action read as the tail
# of a test function whose `def` line is outside this view; only the Python 2
# `except X, e` syntax (a SyntaxError on Python 3) has been fixed.
formula = "raise_patsy_error(X) + Y"
try:
    dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
except PatsyError as e:  # fix: was Python 2 `except PatsyError, e:`
    assert e.origin == Origin(formula, 0, formula.index(" "))
else:
    assert False
# This will produce a KeyError, which on Python 3 we can do wrap without
# destroying the traceback, so we do so. On Python 2 we let the original
# exception escape.
try:
    dmatrix("1 + x[1]", {"x": {}})
except Exception as e:  # fix: was Python 2 `except Exception, e:`
    if sys.version_info[0] >= 3:
        assert isinstance(e, PatsyError)
        assert e.origin == Origin("1 + x[1]", 4, 8)
    else:
        assert isinstance(e, KeyError)
else:
    assert False

def test_dmatrix_NA_action():
    """NA handling in dmatrix: default drop, explicit raise, and dmatrices."""
    data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}
    # Default NA_action="drop": rows 0 and 3 each contain an NA and vanish.
    mat = dmatrix("x + y", data=data)
    assert np.array_equal(mat, [[1, 2, 20], [1, 3, 30]])
    assert_raises(PatsyError, dmatrix, "x + y", data=data, NA_action="raise")
    lmat, rmat = dmatrices("y ~ x", data=data)
    assert np.array_equal(lmat, [[20], [30]])
def __init__(self):
    # Attach a distinctive Origin so tests can verify that this exact
    # object is propagated unchanged.
    # You should check this using 'is', not '=='
    from patsy.origin import Origin
    self.origin = Origin("MOCK", 1, 2)
def test_python_tokenize():
    """Check python_tokenize output triples and its error cases."""
    # A full expression: every token gets an Origin covering its exact span.
    code = "a + (foo * -1)"
    tokens = list(python_tokenize(code))
    expected = [(tokenize.NAME, "a", Origin(code, 0, 1)),
                (tokenize.OP, "+", Origin(code, 2, 3)),
                (tokenize.OP, "(", Origin(code, 4, 5)),
                (tokenize.NAME, "foo", Origin(code, 5, 8)),
                (tokenize.OP, "*", Origin(code, 9, 10)),
                (tokenize.OP, "-", Origin(code, 11, 12)),
                (tokenize.NUMBER, "1", Origin(code, 12, 13)),
                (tokenize.OP, ")", Origin(code, 13, 14))]
    assert tokens == expected

    # An unclosed paren just ends the stream (the tokenizer's TokenError is
    # swallowed); the caller is responsible for complaining about it.
    code2 = "a + (b"
    tokens2 = list(python_tokenize(code2))
    expected2 = [(tokenize.NAME, "a", Origin(code2, 0, 1)),
                 (tokenize.OP, "+", Origin(code2, 2, 3)),
                 (tokenize.OP, "(", Origin(code2, 4, 5)),
                 (tokenize.NAME, "b", Origin(code2, 5, 6))]
    assert tokens2 == expected2

    # Fix: `import pytest` appeared twice; import once.
    import pytest
    # Comments are rejected:
    pytest.raises(PatsyError, list, python_tokenize("a b # c"))
    # So are unclosed strings:
    pytest.raises(PatsyError, list, python_tokenize("a b \"c"))
def test__tokenize_formula():
    """Formula tokenization: types, spans, and extras for a mixed formula."""
    code = "y ~ a + (foo(b,c + 2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    # Each entry: (token type, start offset, end offset, extra payload);
    # operators and parens carry no extra.
    spec = [
        ("PYTHON_EXPR", 0, 1, "y"),
        ("~", 2, 3, None),
        ("PYTHON_EXPR", 4, 5, "a"),
        ("+", 6, 7, None),
        (Token.LPAREN, 8, 9, None),
        ("PYTHON_EXPR", 9, 23, "foo(b, c + 2)"),
        (Token.RPAREN, 23, 24, None),
        ("+", 25, 26, None),
        ("-", 27, 28, None),
        ("ONE", 28, 29, "1"),
        ("+", 30, 31, None),
        ("ZERO", 32, 33, "0"),
        ("+", 34, 35, None),
        ("NUMBER", 36, 38, "10"),
    ]
    for token, (ttype, start, end, extra) in zip(tokens, spec):
        assert isinstance(token, Token)
        assert token.type == ttype
        assert token.origin == Origin(code, start, end)
        assert token.extra == extra
def test_formula_factor_origin():
    """Factors parsed from a formula remember their source spans."""
    from patsy.origin import Origin
    formula = "a + b"
    desc = ModelDesc.from_formula(formula)
    # (index into rhs_termlist, expected start, expected end); term 0 is the
    # intercept, so "a" and "b" are terms 1 and 2.
    for term_idx, start, end in [(1, 0, 1), (2, 4, 5)]:
        factor = desc.rhs_termlist[term_idx].factors[0]
        assert factor.origin == Origin(formula, start, end)