Example #1
def test_formula_factor_origin():
    from patsy.origin import Origin
    from patsy.desc import ModelDesc
    from patsy.eval import EvalEnvironment

    desc = ModelDesc.from_formula("a + b", EvalEnvironment.capture(0))
    assert (desc.rhs_termlist[1].factors[0].origin
            == Origin("a + b", 0, 1))
    assert (desc.rhs_termlist[2].factors[0].origin
            == Origin("a + b", 4, 5))
Example #2
def test_evalfactor_reraise():
    import sys
    from patsy import dmatrix, PatsyError
    from patsy.origin import Origin

    # This will produce a PatsyError, but buried inside the factor evaluation,
    # so the original code has no way to give it an appropriate origin=
    # attribute. EvalFactor should notice this, and add a useful origin:
    def raise_patsy_error(x):
        raise PatsyError("WHEEEEEE")

    formula = "raise_patsy_error(X) + Y"
    try:
        dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
    except PatsyError as e:
        assert e.origin == Origin(formula, 0, formula.index(" "))
    else:
        assert False
    # This will produce a KeyError, which on Python 3 we can wrap without
    # destroying the traceback, so we do so. On Python 2 we let the original
    # exception escape.
    try:
        dmatrix("1 + x[1]", {"x": {}})
    except Exception as e:
        if sys.version_info[0] >= 3:
            assert isinstance(e, PatsyError)
            assert e.origin == Origin("1 + x[1]", 4, 8)
        else:
            assert isinstance(e, KeyError)
    else:
        assert False
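A sketch of the wrapping the second half of this test expects (the helper name here is made up; the PatsyError(message, origin) constructor is the same one used throughout these examples, and "raise ... from" keeps the original traceback chained on Python 3):

from patsy import PatsyError

def call_and_wrap(func, origin, *args):
    # Sketch: run factor code; any non-Patsy exception that escapes is
    # re-raised as a PatsyError carrying the factor's origin.
    try:
        return func(*args)
    except PatsyError:
        raise  # already carries (or will be given) an origin upstream
    except Exception as e:
        raise PatsyError("error evaluating factor: %s" % (e,), origin) from e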
Example #3
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python " "expression", Origin.combine(origins))
Example #4
def test__tokenize_constraint():
    from patsy import PatsyError
    from patsy.origin import Origin
    from patsy.infix_parser import Token
    from patsy.constraint import _tokenize_constraint

    code = "2 * (a + b) = q"
    tokens = _tokenize_constraint(code, ["a", "b", "q"])
    expecteds = [("NUMBER", 0, 1, "2"),
                 ("*", 2, 3, "*"),
                 (Token.LPAREN, 4, 5, "("),
                 ("VARIABLE", 5, 6, "a"),
                 ("+", 7, 8, "+"),
                 ("VARIABLE", 9, 10, "b"),
                 (Token.RPAREN, 10, 11, ")"),
                 ("=", 12, 13, "="),
                 ("VARIABLE", 14, 15, "q")]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == Origin(code, expected[1], expected[2])
        assert got.extra == expected[3]

    from nose.tools import assert_raises
    assert_raises(PatsyError, _tokenize_constraint, "1 + @b", ["b"])
    # Shouldn't raise an error:
    _tokenize_constraint("1 + @b", ["@b"])

    # Check we aren't confused by names which are proper prefixes of other
    # names:
    for names in (["a", "aa"], ["aa", "a"]):
        tokens = _tokenize_constraint("a aa a", names)
        assert len(tokens) == 3
        assert [t.extra for t in tokens] == ["a", "aa", "a"]

    # Check that embedding ops and numbers inside a variable name works
    tokens = _tokenize_constraint("2 * a[1,1],", ["a[1,1]"])
    assert len(tokens) == 4
    assert [t.type for t in tokens] == ["NUMBER", "*", "VARIABLE", ","]
    assert [t.extra for t in tokens] == ["2", "*", "a[1,1]", ","]
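The proper-prefix test above works because _tokenize_constraint sorts variable names longest-first before joining them into one alternation (see Example #7). Python's regex alternation is eager and left-to-right, so the ordering matters:

import re

# With "a" listed first, "aa" would be split into two "a" tokens:
assert re.match("a|aa", "aa").group() == "a"
# Longest-first ordering lets the full name win:
assert re.match("aa|a", "aa").group() == "aa"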
Example #5
def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print "Found close-paren"
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine([c.op_stack[-1].token,
                                   c.noun_stack[-1].token,
                                   token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
    elif token.type in c.binary_ops:
        if c.trace:
            print "Found binary operator %r" % (token.type)
        stackop = _StackOperator(c.binary_ops[token.type], token)
        while (c.op_stack
               and stackop.op.precedence <= c.op_stack[-1].op.precedence):
            _run_op(c)
        if c.trace:
            print "Pushing binary operator %r" % (token.type)
        c.op_stack.append(stackop)
        return True
    else:
        raise PatsyError("expected an operator, not '%s'"
                            % (token.origin.relevant_code(),),
                            token)
Example #6
def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine(
            [c.op_stack[-1].token, c.noun_stack[-1].token, token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
    elif token.type in c.binary_ops:
        if c.trace:
            print("Found binary operator %r" % (token.type))
        stackop = _StackOperator(c.binary_ops[token.type], token)
        while (c.op_stack
               and stackop.op.precedence <= c.op_stack[-1].op.precedence):
            _run_op(c)
        if c.trace:
            print("Pushing binary operator %r" % (token.type))
        c.op_stack.append(stackop)
        return True
    else:
        raise PatsyError("expected an operator, not '%s'"
                         % (token.origin.relevant_code(),),
                         token)
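Note the <= in the precedence comparison: an incoming binary operator of equal precedence forces the stacked one to reduce first, which makes binary operators left-associative. A quick check through the public infix_parse API from Example #16 (the dummy origin is arbitrary):

from patsy.origin import Origin
from patsy.infix_parser import Token, Operator, infix_parse

mock = Origin("a - b - c", 0, 1)
tokens = [Token("ATOM", mock, "a"), Token("-", mock, "-"),
          Token("ATOM", mock, "b"), Token("-", mock, "-"),
          Token("ATOM", mock, "c")]
tree = infix_parse(tokens, [Operator("-", 2, 10)], ["ATOM"])
# Left-associative: parsed as (a - b) - c, not a - (b - c):
assert tree.args[0].type == "-"
assert tree.args[1].type == "ATOM"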
Example #7
import re
from re import Scanner  # assumed: re.Scanner, the undocumented scanner, matches the scan() usage below

def _tokenize_constraint(string, variable_names):
    lparen_re = r"\("
    rparen_re = r"\)"
    op_re = "|".join([re.escape(op.token_type) for op in _ops])
    num_re = r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"
    whitespace_re = r"\s+"

    # Prefer long matches:
    variable_names = sorted(variable_names, key=len, reverse=True)
    variable_re = "|".join([re.escape(n) for n in variable_names])

    lexicon = [
        (lparen_re, _token_maker(Token.LPAREN, string)),
        (rparen_re, _token_maker(Token.RPAREN, string)),
        (op_re, _token_maker("__OP__", string)),
        (variable_re, _token_maker("VARIABLE", string)),
        (num_re, _token_maker("NUMBER", string)),
        (whitespace_re, None),
        ]

    scanner = Scanner(lexicon)
    tokens, leftover = scanner.scan(string)
    if leftover:
        offset = len(string) - len(leftover)
        raise PatsyError("unrecognized token in constraint",
                            Origin(string, offset, offset + 1))

    return tokens
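Scanner here is presumably re.Scanner, the undocumented scanner shipped with Python's re module, which matches the usage above: each lexicon entry pairs a pattern with a callback invoked as callback(scanner, matched_text) (None silently drops the match, as for whitespace), and scan() returns the token list plus any unmatched tail:

import re

scanner = re.Scanner([
    (r"[0-9]+", lambda s, tok: ("NUMBER", tok)),
    (r"[a-z]+", lambda s, tok: ("NAME", tok)),
    (r"\s+", None),  # matched but discarded
])
tokens, leftover = scanner.scan("2 apples @")
assert tokens == [("NUMBER", "2"), ("NAME", "apples")]
assert leftover == "@"  # scanning stops at the first unmatched character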
Example #8
def make_token(scanner, token_string):
    # Note: 'type' and 'string' are free variables here -- this is a
    # closure; see the reconstruction of the enclosing factory below.
    if type == "__OP__":
        actual_type = token_string
    else:
        actual_type = type
    return Token(actual_type, Origin(string, *scanner.match.span()),
                 token_string)
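On its own, make_token has two free variables, type and string. The enclosing factory (the _token_maker referenced in Example #7) presumably has this shape:

def _token_maker(type, string):
    # 'type' is either a fixed token type or the "__OP__" placeholder;
    # 'string' is the full source text, kept so every token's Origin can
    # point back into it.
    def make_token(scanner, token_string):
        ...  # body as in the excerpt above
    return make_token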
Example #9
import tokenize
from io import StringIO  # assumed import; patsy itself uses a py2/py3 StringIO shim

def python_tokenize(code):
    # Since formulas can only contain Python expressions, and Python
    # expressions cannot meaningfully contain newlines, we'll just remove all
    # the newlines up front to avoid any complications:
    code = code.replace("\n", " ").strip()
    it = tokenize.generate_tokens(StringIO(code).readline)
    try:
        # The fifth element (the logical source line) rebinds 'code'; after
        # newline removal it is identical to the original argument.
        for (pytype, string, (_, start), (_, end), code) in it:
            if pytype == tokenize.ENDMARKER:
                break
            origin = Origin(code, start, end)
            assert pytype != tokenize.NL
            if pytype == tokenize.NEWLINE:
                assert string == ""
                continue
            if pytype == tokenize.ERRORTOKEN:
                raise PatsyError("error tokenizing input "
                                 "(maybe an unclosed string?)",
                                 origin)
            if pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (pytype, string, origin)
        else: # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised iff the tokenizer thinks that there is
        # some sort of multi-line construct in progress (e.g., an
        # unclosed parentheses, which in Python lets a virtual line
        # continue past the end of the physical line), and it hits the
        # end of the source text. We have our own error handling for
        # such cases, so just treat this as an end-of-stream.
        #
        # Just in case someone adds some other error case:
        assert e.args[0].startswith("EOF in multi-line")
        return
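For reference, the 5-tuples generate_tokens yields look like this. Because python_tokenize strips newlines first, every row coordinate is 1 and only the column offsets are kept, and the fifth element (the logical line) is just the input string again:

import tokenize
from io import StringIO

for tok in tokenize.generate_tokens(StringIO("a + b").readline):
    print(tok)
# e.g. TokenInfo(type=1 (NAME), string='a', start=(1, 0), end=(1, 1), line='a + b')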
Example #10
def test_parse_origin():
    from patsy.origin import Origin
    from patsy.parse_formula import parse_formula

    tree = parse_formula("a ~ b + c")
    assert tree.origin == Origin("a ~ b + c", 0, 9)
    assert tree.token.origin == Origin("a ~ b + c", 2, 3)
    assert tree.args[0].origin == Origin("a ~ b + c", 0, 1)
    assert tree.args[1].origin == Origin("a ~ b + c", 4, 9)
    assert tree.args[1].token.origin == Origin("a ~ b + c", 6, 7)
    assert tree.args[1].args[0].origin == Origin("a ~ b + c", 4, 5)
    assert tree.args[1].args[1].origin == Origin("a ~ b + c", 8, 9)
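Each assertion can be read off by slicing the formula with the origin's span:

code = "a ~ b + c"
assert code[4:9] == "b + c"  # tree.args[1].origin: the whole right-hand side
assert code[6:7] == "+"      # tree.args[1].token.origin: its operator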
Example #11
def _run_op(c):
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    if c.trace:
        print("Reducing %r (%r)" % (stackop.op.token_type, args))
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token] + args))
    c.noun_stack.append(node)
Example #12
def test_evalfactor_reraise():
    # This will produce a PatsyError, but buried inside the factor evaluation,
    # so the original code has no way to give it an appropriate origin=
    # attribute. EvalFactor should notice this, and add a useful origin:
    def raise_patsy_error(x):
        raise PatsyError("WHEEEEEE")

    formula = "raise_patsy_error(X) + Y"
    try:
        dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
    except PatsyError as e:
        assert e.origin == Origin(formula, 0, formula.index(" "))
Example #13
def _run_op(c):
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    if c.trace:
        print "Reducing %r (%r)" % (stackop.op.token_type, args)
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token] + args))
    c.noun_stack.append(node)
Example #14
def test_NAAction_raise():
    import numpy as np
    from patsy import PatsyError
    from patsy.missing import NAAction

    action = NAAction(on_NA="raise")

    # no-NA just passes through:
    in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1, 2])]
    is_NAs = [np.asarray([False, False])] * 2
    got_arrs = action.handle_NA(in_arrs, is_NAs, [None, None])
    assert np.array_equal(got_arrs[0], in_arrs[0])
    assert np.array_equal(got_arrs[1], in_arrs[1])

    from patsy.origin import Origin
    o1 = Origin("asdf", 0, 1)
    o2 = Origin("asdf", 2, 3)

    # NA raises an error with a correct origin
    in_arrs = [np.asarray([1.1, 1.2]), np.asarray([1.0, np.nan])]
    is_NAs = [np.asarray([False, False]), np.asarray([False, True])]
    try:
        action.handle_NA(in_arrs, is_NAs, [o1, o2])
        assert False
    except PatsyError as e:
        assert e.origin is o2
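For contrast, a sketch of the default policy (assuming on_NA defaults to "drop"): rows flagged in any is_NA mask are removed rather than reported:

import numpy as np
from patsy.missing import NAAction

action = NAAction()  # on_NA="drop" by default
arrs = [np.asarray([1.1, 1.2]), np.asarray([1.0, np.nan])]
is_NAs = [np.asarray([False, False]), np.asarray([False, True])]
got = action.handle_NA(arrs, is_NAs, [None, None])
# Row 1 is flagged in the second mask, so only row 0 survives:
assert np.array_equal(got[0], [1.1])
assert np.array_equal(got[1], [1.0])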
Example #15
def _read_python_expr(it, end_tokens):
    # Read out a full python expression, stopping when we hit an
    # unnested end token.
    pytypes = []
    token_strings = []
    origins = []
    bracket_level = 0
    for pytype, token_string, origin in it:
        assert bracket_level >= 0
        if bracket_level == 0 and token_string in end_tokens:
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            bracket_level += 1
        if token_string in (")", "]", "}"):
            bracket_level -= 1
        if bracket_level < 0:
            raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # Either we found an end_token, or we hit the end of the string
    if bracket_level == 0:
        expr_text = pretty_untokenize(zip(pytypes, token_strings))
        if expr_text == "0":
            token_type = "ZERO"
        elif expr_text == "1":
            token_type = "ONE"
        elif _is_a(int, expr_text) or _is_a(float, expr_text):
            token_type = "NUMBER"
        else:
            token_type = "PYTHON_EXPR"
        return Token(token_type, Origin.combine(origins), extra=expr_text)
    else:
        raise PatsyError("unclosed bracket in embedded Python "
                            "expression",
                            Origin.combine(origins))
Example #16
def test_infix_parse():
    from patsy.origin import Origin
    from patsy.infix_parser import Token, Operator, infix_parse

    ops = [Operator("+", 2, 10), Operator("*", 2, 20), Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    # a + -b * (c + d)
    mock_origin = Origin("asdf", 2, 3)
    tokens = [
        Token("ATOM1", mock_origin, "a"),
        Token("+", mock_origin, "+"),
        Token("-", mock_origin, "-"),
        Token("ATOM2", mock_origin, "b"),
        Token("*", mock_origin, "*"),
        Token(Token.LPAREN, mock_origin, "("),
        Token("ATOM1", mock_origin, "c"),
        Token("+", mock_origin, "+"),
        Token("ATOM2", mock_origin, "d"),
        Token(Token.RPAREN, mock_origin, ")")
    ]
    tree = infix_parse(tokens, ops, atomic)

    def te(tree, type, extra):
        assert tree.type == type
        assert tree.token.extra == extra

    te(tree, "+", "+")
    te(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    te(tree.args[1], "*", "*")
    te(tree.args[1].args[0], "-", "-")
    assert len(tree.args[1].args[0].args) == 1
    te(tree.args[1].args[0].args[0], "ATOM2", "b")
    te(tree.args[1].args[1], "+", "+")
    te(tree.args[1].args[1].args[0], "ATOM1", "c")
    te(tree.args[1].args[1].args[1], "ATOM2", "d")

    import pytest
    # No ternary ops
    pytest.raises(ValueError, infix_parse, [], [Operator("+", 3, 10)],
                  ["ATOMIC"])

    # smoke test just to make sure there are no egregious bugs in 'trace'
    infix_parse(tokens, ops, atomic, trace=True)
Example #17
    formula = "raise_patsy_error(X) + Y"
    try:
        dmatrix(formula, {"X": [1, 2, 3], "Y": [4, 5, 6]})
    except PatsyError as e:
        assert e.origin == Origin(formula, 0, formula.index(" "))
    else:
        assert False
    # This will produce a KeyError, which on Python 3 we can wrap without
    # destroying the traceback, so we do so. On Python 2 we let the original
    # exception escape.
    try:
        dmatrix("1 + x[1]", {"x": {}})
    except Exception as e:
        if sys.version_info[0] >= 3:
            assert isinstance(e, PatsyError)
            assert e.origin == Origin("1 + x[1]", 4, 8)
        else:
            assert isinstance(e, KeyError)
    else:
        assert False


def test_dmatrix_NA_action():
    import numpy as np
    from nose.tools import assert_raises
    from patsy import dmatrix, dmatrices, PatsyError

    data = {"x": [1, 2, 3, np.nan], "y": [np.nan, 20, 30, 40]}

    mat = dmatrix("x + y", data=data)
    assert np.array_equal(mat, [[1, 2, 20], [1, 3, 30]])
    assert_raises(PatsyError, dmatrix, "x + y", data=data, NA_action="raise")

    lmat, rmat = dmatrices("y ~ x", data=data)
    assert np.array_equal(lmat, [[20], [30]])
Example #18
class MockFactor:  # enclosing class name assumed; only __init__ appears in the original excerpt
    def __init__(self):
        # You should check this using 'is', not '=='
        from patsy.origin import Origin
        self.origin = Origin("MOCK", 1, 2)
Example #19
def test_python_tokenize():
    import tokenize
    from patsy import PatsyError
    from patsy.origin import Origin
    from patsy.tokens import python_tokenize

    code = "a + (foo * -1)"
    tokens = list(python_tokenize(code))
    expected = [(tokenize.NAME, "a", Origin(code, 0, 1)),
                (tokenize.OP, "+", Origin(code, 2, 3)),
                (tokenize.OP, "(", Origin(code, 4, 5)),
                (tokenize.NAME, "foo", Origin(code, 5, 8)),
                (tokenize.OP, "*", Origin(code, 9, 10)),
                (tokenize.OP, "-", Origin(code, 11, 12)),
                (tokenize.NUMBER, "1", Origin(code, 12, 13)),
                (tokenize.OP, ")", Origin(code, 13, 14))]
    assert tokens == expected

    code2 = "a + (b"
    tokens2 = list(python_tokenize(code2))
    expected2 = [(tokenize.NAME, "a", Origin(code2, 0, 1)),
                 (tokenize.OP, "+", Origin(code2, 2, 3)),
                 (tokenize.OP, "(", Origin(code2, 4, 5)),
                 (tokenize.NAME, "b", Origin(code2, 5, 6))]
    assert tokens2 == expected2

    import pytest
    pytest.raises(PatsyError, list, python_tokenize("a b # c"))

    pytest.raises(PatsyError, list, python_tokenize("a b \"c"))
Example #20
def test__tokenize_formula():
    from patsy.origin import Origin
    from patsy.infix_parser import Token
    from patsy.parse_formula import _tokenize_formula

    code = "y ~ a + (foo(b,c +   2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"),
                 ("~", Origin(code, 2, 3), None),
                 ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
                 ("+", Origin(code, 6, 7), None),
                 (Token.LPAREN, Origin(code, 8, 9), None),
                 ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
                 (Token.RPAREN, Origin(code, 23, 24), None),
                 ("+", Origin(code, 25, 26), None),
                 ("-", Origin(code, 27, 28), None),
                 ("ONE", Origin(code, 28, 29), "1"),
                 ("+", Origin(code, 30, 31), None),
                 ("ZERO", Origin(code, 32, 33), "0"),
                 ("+", Origin(code, 34, 35), None),
                 ("NUMBER", Origin(code, 36, 38), "10"),
                 ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]
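Note how "foo(b,c +   2)" comes back as "foo(b, c + 2)": the PYTHON_EXPR token's extra text is rebuilt by pretty_untokenize (see Example #3), which re-renders a (pytype, string) stream with normalized spacing. A quick check, assuming both helpers live in patsy.tokens:

from patsy.tokens import python_tokenize, pretty_untokenize

triples = list(python_tokenize("foo(b,c +   2)"))
assert pretty_untokenize((t, s) for (t, s, _) in triples) == "foo(b, c + 2)"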