Example #1
 def test_package_name_like_string(self):
     reference = TokenizeQuery()('foo.bar')
     assert [
         Token(None, "foo.bar", "foo.bar"),
         Token(None, "foo", "foo"),
         Token(None, "bar", "bar")
     ] == reference
Example #2
    def test_eval(self):
        entries = [Environment(
            {"ground_truth": "y = x + 1"},
            set(["ground_truth"])
        )]
        dataset = ListDataset(entries)
        d = get_samples(dataset, MockParser())
        aencoder = ActionSequenceEncoder(d, 0)
        action_sequence = GroundTruthToActionSequence(MockParser())(
            "y = x + 1"
        )
        transform = AddPreviousActions(aencoder)
        prev_action_tensor = transform(
            reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
            action_sequence=action_sequence,
            train=False
        )

        assert np.array_equal(
            [
                [2, -1, -1], [3, -1, -1], [4, -1, -1], [-1, 1, -1],
                [1, -1, -1], [5, -1, -1], [-1, 2, -1], [1, -1, -1],
                [4, -1, -1], [-1, 3, -1], [1, -1, -1], [6, -1, -1],
                [-1, 4, -1], [1, -1, -1]
            ],
            prev_action_tensor.numpy()
        )
Example #3
    def test_eval(self):
        entries = [Environment(
            {"text_query": "foo bar", "ground_truth": "y = x + 1"},
            set(["ground_truth"])
        )]
        dataset = ListDataset(entries)
        d = get_samples(dataset, MockParser())
        aencoder = ActionSequenceEncoder(d, 0)
        action_sequence = GroundTruthToActionSequence(MockParser())(
            "y = x + 1"
        )
        transform = AddActions(aencoder)
        action_tensor = transform(
            reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
            action_sequence=action_sequence,
            train=False
        )

        assert np.array_equal(
            [
                [2, 2, 0], [4, 3, 1], [6, 4, 2], [6, 4, 2], [5, 3, 1],
                [6, 5, 5], [6, 5, 5], [5, 5, 5], [6, 4, 8], [6, 4, 8],
                [5, 5, 5], [9, 6, 11], [9, 6, 11], [-1, -1, -1]
            ],
            action_tensor.numpy()
        )
Example #4
    def test_decode(self):
        funcdef = ExpandTreeRule(
            NodeType("def", NodeConstraint.Node, False),
            [("name", NodeType("value", NodeConstraint.Token, True)),
             ("body", NodeType("expr", NodeConstraint.Node, True))])
        expr = ExpandTreeRule(
            NodeType("expr", NodeConstraint.Node, False),
            [("op", NodeType("value", NodeConstraint.Token, True)),
             ("arg0", NodeType("value", NodeConstraint.Token, True)),
             ("arg1", NodeType("value", NodeConstraint.Token, True))])

        encoder = ActionSequenceEncoder(
            Samples([funcdef, expr], [
                NodeType("def", NodeConstraint.Node, False),
                NodeType("value", NodeConstraint.Token, True),
                NodeType("expr", NodeConstraint.Node, False)
            ], [("", "f")]), 0)
        action_sequence = ActionSequence()
        action_sequence.eval(ApplyRule(funcdef))
        action_sequence.eval(GenerateToken("", "f"))
        action_sequence.eval(GenerateToken("", "1"))
        action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

        expected_action_sequence = ActionSequence()
        expected_action_sequence.eval(ApplyRule(funcdef))
        expected_action_sequence.eval(GenerateToken("", "f"))
        expected_action_sequence.eval(GenerateToken("", "1"))
        expected_action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

        result = encoder.decode(
            encoder.encode_action(action_sequence,
                                  [Token(None, "1", "1")])[:-1, 1:],
            [Token(None, "1", "1")])
        assert \
            expected_action_sequence.action_sequence == result.action_sequence
Example #5
 def test_subtokens(self):
     reference = TokenizeQuery()('foo.bar')
     assert [Token(None, "SUB_START", ""),
             Token(None, "foo", "foo"),
             Token(None, ".", "."),
             Token(None, "bar", "bar"),
             Token(None, "SUB_END", "")] == reference
Example #6
    def test_encode_action(self):
        funcdef = ExpandTreeRule(
            NodeType("def", NodeConstraint.Node, False),
            [("name", NodeType("value", NodeConstraint.Token, True)),
             ("body", NodeType("expr", NodeConstraint.Node, True))])
        expr = ExpandTreeRule(
            NodeType("expr", NodeConstraint.Node, False),
            [("op", NodeType("value", NodeConstraint.Token, True)),
             ("arg0", NodeType("value", NodeConstraint.Token, True)),
             ("arg1", NodeType("value", NodeConstraint.Token, True))])

        encoder = ActionSequenceEncoder(
            Samples([funcdef, expr], [
                NodeType("def", NodeConstraint.Node, False),
                NodeType("value", NodeConstraint.Token, True),
                NodeType("expr", NodeConstraint.Node, True)
            ], [("", "f"), ("", "2")]), 0)
        action_sequence = ActionSequence()
        action_sequence.eval(ApplyRule(funcdef))
        action_sequence.eval(GenerateToken("", "f"))
        action_sequence.eval(GenerateToken("", "1"))
        action_sequence.eval(GenerateToken("", "2"))
        action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))
        action = encoder.encode_action(
            action_sequence,
            [Token("", "1", "1"), Token("", "2", "2")])

        assert np.array_equal([[-1, 2, -1, -1], [2, -1, 1, -1], [2, -1, -1, 0],
                               [2, -1, 2, 1], [2, 1, -1, -1], [3, -1, -1, -1]],
                              action.numpy())
Example #7
 def test_happy_path(self):
     lexer = Lexer()
     assert lexer.tokenize_with_offset("int a = 0;") == [
         (0, Token("INT", "int", "int")), (4, Token("ID", "a", "a")),
         (6, Token("EQUALS", "=", "=")),
         (8, Token("INT_CONST_OCT", "0", "0")), (9, Token("SEMI", ";", ";"))
     ]
Example #8
 def test_simple_case(self):
     entries = [Environment(
         {"ground_truth": "y = x + 1"},
         set(["ground_truth"])
     )]
     dataset = ListDataset(entries)
     d = get_samples(dataset, MockParser())
     aencoder = ActionSequenceEncoder(d, 0)
     action_sequence = GroundTruthToActionSequence(MockParser())(
         ground_truth="y = x + 1"
     )
     transform = EncodeActionSequence(aencoder)
     ground_truth = transform(
         action_sequence=action_sequence,
         reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
     )
     assert np.array_equal(
         [
             [3, -1, -1], [4, -1, -1], [-1, 1, -1], [1, -1, -1],
             [5, -1, -1], [-1, 2, -1], [1, -1, -1], [4, -1, -1],
             [-1, 3, -1], [1, -1, -1], [6, -1, -1], [-1, 4, -1],
             [1, -1, -1]
         ],
         ground_truth.numpy()
     )
Example #9
File: parser.py  Project: nashid/mlprogram
 def unparse(self, code: AST) -> Optional[diffAST]:
     assert isinstance(code, Node)
     fields = {field.name: field.value for field in code.fields}
     if code.get_type_name() == "Diff":
         deltas = [
             self.unparse(delta)
             for delta in cast(List[AST], fields["deltas"])
         ]
         if None in deltas:
             return None
         return Diff(cast(List[Delta], deltas))
     elif code.get_type_name() == "Insert":
         value = self.lexer.untokenize([
             Token(None,
                   cast(Leaf, v).value,
                   cast(Leaf, v).value)
             for v in cast(List[AST], fields["value"])
         ])
         if value is None:
             return None
         return Insert(cast(Leaf, fields["line_number"]).value, value)
     elif code.get_type_name() == "Remove":
         return Remove(cast(Leaf, fields["line_number"]).value)
     elif code.get_type_name() == "Replace":
         value = self.lexer.untokenize([
             Token(None,
                   cast(Leaf, v).value,
                   cast(Leaf, v).value)
             for v in cast(List[AST], fields["value"])
         ])
         if value is None:
             return None
         return Replace(cast(Leaf, fields["line_number"]).value, value)
     raise AssertionError(f"invalid node type_name: {code.get_type_name()}")
Example #10
 def test_eval(self):
     entries = [Environment(
         {"text_query": "ab test", "ground_truth": "y = x + 1"},
         set(["ground_truth"])
     )]
     dataset = ListDataset(entries)
     d = get_samples(dataset, MockParserWithoutVariadicArgs())
     aencoder = ActionSequenceEncoder(d, 0)
     action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())(
         "y = x + 1"
     )
     transform = AddQueryForTreeGenDecoder(aencoder, 3)
     query = transform(
         reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
         action_sequence=action_sequence,
         train=False
     )
     assert np.array_equal(
         [
             [-1, -1, -1], [2, -1, -1], [3, 2, -1], [4, 3, 2],
             [3, 2, -1], [5, 3, 2], [5, 3, 2], [4, 5, 3],
             [5, 3, 2], [6, 5, 3]
         ],
         query.numpy()
     )
Example #11
 def test_eval(self):
     entries = [Environment(
         {"text_query": "ab test", "ground_truth": "y = x + 1"},
         set(["ground_truth"])
     )]
     dataset = ListDataset(entries)
     d = get_samples(dataset, MockParserWithoutVariadicArgs())
     aencoder = ActionSequenceEncoder(d, 0)
     action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())(
         "y = x + 1"
     )
     transform = AddActionSequenceAsTree(aencoder)
     matrix, depth = transform(
         reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
         action_sequence=action_sequence,
         train=False
     )
     assert np.array_equal(
         [0, 1, 2, 3, 2, 3, 3, 4, 3, 4],
         depth.numpy()
     )
     assert np.array_equal(
         [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
          [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 1, 1, 0, 1, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
         matrix.numpy()
     )
Example #12
 def test_n_dependent(self):
     entries = [Environment(
         {"text_query": "ab test", "ground_truth": "y = x + 1"},
         set(["ground_truth"])
     )]
     dataset = ListDataset(entries)
     d = get_samples(dataset, MockParserWithoutVariadicArgs())
     aencoder = ActionSequenceEncoder(d, 0)
     action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())(
         "y = x + 1"
     )
     transform = AddPreviousActionRules(aencoder, 2, n_dependent=3)
     prev_rule_action = transform(
         reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
         action_sequence=action_sequence,
         train=False,
     )
     assert np.array_equal(
         [
             # str -> "y"
             [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]],
             # Number -> number
             [[8, -1, -1], [9, -1, -1], [-1, -1, -1]],
             [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]],
         ],
         prev_rule_action.numpy()
     )
Example #13
 def test_invalid_program(self):
     lexer = Lexer()
     assert lexer.tokenize_with_offset("int a = 0") == [
         (0, Token("INT", "int", "int")),
         (4, Token("ID", "a", "a")),
         (6, Token("EQUALS", "=", "=")),
         (8, Token("INT_CONST_OCT", "0", "0")),
     ]
Example #14
 def test_int_placeholder(self):
     lexer = Lexer()
     assert lexer.tokenize_with_offset("int a = 1;") == [
         (0, Token("name", "___name@0___", "int")),
         (4, Token("name", "___name@1___", "a")),
         (6, Token("op", "=", "=")),
         (8, Token("number", "___number@0___", "1")),
         (9, Token("op", ";", ";"))
     ]
Example #15
 def test_float_placeholder(self):
     lexer = Lexer()
     assert lexer.tokenize_with_offset("float a = 1.0;") == [
         (0, Token("name", "___name@0___", "float")),
         (6, Token("name", "___name@1___", "a")),
         (8, Token("op", "=", "=")),
         (10, Token("number", "___number@0___", "1.0")),
         (13, Token("op", ";", ";"))
     ]
Example #16
def test_encode():
    encoder = EncodeQuery(128)
    out = encoder(Environment(
        states={"reference": [Token(None, "print", "print"),
                              Token(None, "___var@1___", "`x`")]}
    ))
    assert out.states["input_ids"].shape == (128, 50)
    assert out.states["segment_ids"].shape == (128,)
    assert out.states["input_mask"].shape == (128,)
    assert np.all(out.states["segment_ids"].numpy() == 0)
    assert np.all(out.states["input_mask"][:4].numpy() == 1)
    assert np.all(out.states["input_mask"][4:].numpy() == 0)
Example #17
 def test_quoted_string(self):
     reference = TokenizeQuery()('"quoted string" test')
     assert [
         Token(None, "####0####", 'quoted string'),
         Token(None, "test", "test")
     ] == reference
     reference = TokenizeQuery()('"quoted string" "quoted string" test')
     assert [
         Token(None, '####0####', "quoted string"),
         Token(None, "####0####", 'quoted string'),
         Token(None, "test", "test")
     ] == reference
Example #18
def test_extractor():
    encoder = EncodeQuery(128)
    extractor = Extractor()
    out = encoder(Environment(
        states={"reference": [Token(None, "print", "print"),
                              Token(None, "___var@1___", "`x`")]}
    ))
    out.states["input_ids"] = out.states["input_ids"].unsqueeze(0)
    out.states["segment_ids"] = out.states["segment_ids"].unsqueeze(0)
    out.states["input_mask"] = out.states["input_mask"].unsqueeze(0)
    out = extractor(out)
    assert out.states["reference_features"].data.shape == (128, 1, 768)
    assert not out.states["reference_features"].data.requires_grad
    assert np.all(out.states["reference_features"].mask[:4].numpy() == 1)
    assert np.all(out.states["reference_features"].mask[4:].numpy() == 0)
Example #19
    def test_encode_invalid_sequence(self):
        funcdef = ExpandTreeRule(
            NodeType("def", NodeConstraint.Node, False),
            [("name", NodeType("value", NodeConstraint.Token, True)),
             ("body", NodeType("expr", NodeConstraint.Node, True))])
        expr = ExpandTreeRule(
            NodeType("expr", NodeConstraint.Node, False),
            [("op", NodeType("value", NodeConstraint.Token, False)),
             ("arg0", NodeType("value", NodeConstraint.Token, True)),
             ("arg1", NodeType("value", NodeConstraint.Token, True))])

        encoder = ActionSequenceEncoder(
            Samples([funcdef, expr], [
                NodeType("def", NodeConstraint.Node, False),
                NodeType("value", NodeConstraint.Token, True),
                NodeType("expr", NodeConstraint.Node, True)
            ], [("", "f")]), 0)
        action_sequence = ActionSequence()
        action_sequence.eval(ApplyRule(funcdef))
        action_sequence.eval(GenerateToken("", "f"))
        action_sequence.eval(GenerateToken("", "1"))
        action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))

        assert encoder.encode_action(action_sequence,
                                     [Token("", "2", "2")]) is None
Example #20
    def test_encode_empty_sequence(self):
        funcdef = ExpandTreeRule(
            NodeType("def", NodeConstraint.Node, False),
            [("name", NodeType("value", NodeConstraint.Token, False)),
             ("body", NodeType("expr", NodeConstraint.Node, True))])
        expr = ExpandTreeRule(
            NodeType("expr", NodeConstraint.Node, False),
            [("op", NodeType("value", NodeConstraint.Token, False)),
             ("arg0", NodeType("value", NodeConstraint.Token, False)),
             ("arg1", NodeType("value", NodeConstraint.Token, False))])

        encoder = ActionSequenceEncoder(
            Samples([funcdef, expr], [
                NodeType("def", NodeConstraint.Node, False),
                NodeType("value", NodeConstraint.Token, False),
                NodeType("expr", NodeConstraint.Node, False)
            ], [("", "f")]), 0)
        action_sequence = ActionSequence()
        action = encoder.encode_action(action_sequence, [Token("", "1", "1")])
        parent = encoder.encode_parent(action_sequence)
        d, m = encoder.encode_tree(action_sequence)

        assert np.array_equal([[-1, -1, -1, -1]], action.numpy())
        assert np.array_equal([[-1, -1, -1, -1]], parent.numpy())
        assert np.array_equal(np.zeros((0, )), d.numpy())
        assert np.array_equal(np.zeros((0, 0)), m.numpy())
Example #21
 def test_ast_set_sample(self):
     asts = ["c0", "c1", "c2"]
     sampler = SequentialProgramSampler(MockSynthesizer(asts),
                                        transform_input, Collate(),
                                        MockEncoder(), MockExpander(),
                                        MockInterpreter())
     zero = SamplerState(0, sampler.initialize([(None, None)]))
     samples = list(sampler.batch_k_samples([zero], [3]))
     samples.sort(key=lambda x: -x.state.score)
     assert 3 == len(samples)
     assert samples[0] == DuplicatedSamplerState(
         SamplerState(
             1,
             Environment({
                 "test_cases": [(None, None)],
                 "reference": [Token(None, str(asts[0]), str(asts[0]))],
                 "variables": [["#" + str(asts[0])]],
                 "interpreter_state":
                 BatchedState({str(asts[0]): None},
                              {str(asts[0]): ["#" + str(asts[0])]},
                              [str(asts[0])], ["#" + str(asts[0])])
             })), 1)
     assert DuplicatedSamplerState(
         SamplerState(
             0.5,
             Environment({
                 "test_cases": [(None, None)],
                 "reference": [Token(None, str(asts[1]), str(asts[1]))],
                 "variables": [["#" + str(asts[1])]],
                 "interpreter_state":
                 BatchedState({str(asts[1]): None},
                              {str(asts[1]): ["#" + str(asts[1])]},
                              [str(asts[1])], ["#" + str(asts[1])])
             })), 1) == samples[1]
     assert DuplicatedSamplerState(
         SamplerState(
             1.0 / 3,
             Environment({
                 "test_cases": [(None, None)],
                 "reference": [Token(None, str(asts[2]), str(asts[2]))],
                 "variables": [["#" + str(asts[2])]],
                 "interpreter_state":
                 BatchedState({str(asts[2]): None},
                              {str(asts[2]): ["#" + str(asts[2])]},
                              [str(asts[2])], ["#" + str(asts[2])])
             })), 1) == samples[2]
Example #22
    def test_encode_each_action(self):
        funcdef = ExpandTreeRule(
            NodeType("def", NodeConstraint.Node, False),
            [("name", NodeType("value", NodeConstraint.Token, True)),
             ("body", NodeType("expr", NodeConstraint.Node, True))])
        expr = ExpandTreeRule(
            NodeType("expr", NodeConstraint.Node, False),
            [("constant", NodeType("value", NodeConstraint.Token, True))])

        encoder = ActionSequenceEncoder(
            Samples([funcdef, expr], [
                NodeType("def", NodeConstraint.Node, False),
                NodeType("value", NodeConstraint.Token, True),
                NodeType("expr", NodeConstraint.Node, False),
                NodeType("expr", NodeConstraint.Node, True)
            ], [("", "f"), ("", "2")]), 0)
        action_sequence = ActionSequence()
        action_sequence.eval(ApplyRule(funcdef))
        action_sequence.eval(GenerateToken("", "f"))
        action_sequence.eval(GenerateToken("", "1"))
        action_sequence.eval(GenerateToken("", "2"))
        action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))
        action_sequence.eval(ApplyRule(expr))
        action_sequence.eval(GenerateToken("", "f"))
        action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))
        action_sequence.eval(ApplyRule(CloseVariadicFieldRule()))
        action = encoder.encode_each_action(
            action_sequence,
            [Token("", "1", "1"), Token("", "2", "2")], 1)

        assert np.array_equal(
            np.array(
                [
                    [[1, -1, -1], [2, -1, -1]],  # funcdef
                    [[-1, -1, -1], [-1, 1, -1]],  # f
                    [[-1, -1, -1], [-1, -1, 0]],  # 1
                    [[-1, -1, -1], [-1, 2, 1]],  # 2
                    [[-1, -1, -1], [-1, -1, -1]],  # CloseVariadicField
                    [[3, -1, -1], [2, -1, -1]],  # expr
                    [[-1, -1, -1], [-1, 1, -1]],  # f
                    [[-1, -1, -1], [-1, -1, -1]],  # CloseVariadicField
                    [[-1, -1, -1], [-1, -1, -1]]  # CloseVariadicField
                ],
                dtype=np.int64),  # np.long was removed in NumPy 1.24+
            action.numpy())
Example #23
 def test_impossible_case(self):
     entries = [Environment(
         {"ground_truth": "y = x + 1"},
         set(["ground_truth"])
     )]
     dataset = ListDataset(entries)
     d = get_samples(dataset, MockParser())
     d.tokens = [("", "y"), ("", "1")]
     aencoder = ActionSequenceEncoder(d, 0)
     action_sequence = GroundTruthToActionSequence(MockParser())(
         ground_truth="y = x + 1"
     )
     transform = EncodeActionSequence(aencoder)
     with pytest.raises(RuntimeError):
         transform(
             reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
             action_sequence=action_sequence,
         )
Example #24
 def tokenize_with_offset(self, value):
     if value == "\n":
         return None
     retval = []
     offset = 0
     while True:
         next_sp = value.find(" ", offset)
         next_nl = value.find("\n", offset)
         if next_sp == -1 and next_nl == -1:
             retval.append(
                 (offset, Token(None, value[offset:], value[offset:])))
             break
         next_sp = next_sp if next_sp >= 0 else len(value) + 1
         next_nl = next_nl if next_nl >= 0 else len(value) + 1
         split_at = min(next_sp, next_nl)  # renamed to avoid shadowing builtin next()
         v = value[offset:split_at]
         retval.append((offset, Token(None, v, v)))
         offset = split_at + 1
     return retval
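A minimal usage sketch for the whitespace tokenizer above. WhitespaceLexer is a hypothetical name for a class carrying this method; Token is the same type used throughout these examples:

    lexer = WhitespaceLexer()  # hypothetical wrapper; the method above is unchanged
    # Splits on spaces and newlines, pairing each token with its start offset.
    assert lexer.tokenize_with_offset("x = 1") == [
        (0, Token(None, "x", "x")),
        (2, Token(None, "=", "=")),
        (4, Token(None, "1", "1"))
    ]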
Example #25
 def test_multiple_lines(self):
     lexer = Lexer()
     assert lexer.tokenize_with_offset("int a = 0\nint b;") == [
         (0, Token("INT", "int", "int")), (4, Token("ID", "a", "a")),
         (6, Token("EQUALS", "=", "=")),
         (8, Token("INT_CONST_OCT", "0", "0")),
         (10, Token("INT", "int", "int")), (14, Token("ID", "b", "b")),
         (15, Token("SEMI", ";", ";"))
     ]
Example #26
    def __call__(self, env):
        # Encode for CharacterBERT
        env.states["reference"] = \
            [Token(None, "[CLS]", "[CLS]")] + env.states["reference"] + \
            [Token(None, "[SEP]", "[SEP]")]
        tokens = [token.raw_value for token in env.states["reference"]]
        segment_ids = [0] * len(tokens)
        input_ids = self.indexer.as_padded_tensor(
            [tokens], maxlen=self.max_query_length)[0]
        input_mask = [1] * len(tokens)

        # Zero pad
        padding_length = self.max_query_length - len(input_mask)
        input_mask += [0] * padding_length
        segment_ids += [0] * padding_length

        env.states["input_ids"] = input_ids
        env.states["input_mask"] = torch.tensor(input_mask)
        env.states["segment_ids"] = torch.tensor(segmend_ids)
        return env
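A usage sketch for this encoder, mirroring test_encode in Example #16. It assumes this __call__ belongs to EncodeQuery, constructed with max_query_length=128 and a CharacterBERT indexer:

    encoder = EncodeQuery(128)
    out = encoder(Environment(
        states={"reference": [Token(None, "print", "print")]}
    ))
    # out.states now carries "input_ids", "input_mask", and "segment_ids";
    # the mask is 1 for [CLS], the reference tokens, and [SEP], 0 for padding.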
Example #27
 def test_eval(self):
     entries = [Environment(
         {"text_query": "ab test", "ground_truth": "y = x + 1"},
         set(["ground_truth"])
     )]
     dataset = ListDataset(entries)
     d = get_samples(dataset, MockParserWithoutVariadicArgs())
     aencoder = ActionSequenceEncoder(d, 0)
     action_sequence = GroundTruthToActionSequence(MockParserWithoutVariadicArgs())(
         "y = x + 1"
     )
     transform = AddPreviousActionRules(aencoder, 2)
     prev_rule_action = transform(
         reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
         action_sequence=action_sequence,
         train=False
     )
     assert np.array_equal(
         [
             # None -> Root
             [[1, -1, -1], [2, -1, -1], [-1, -1, -1]],
             # Assign -> Name, expr
             [[3, -1, -1], [4, -1, -1], [5, -1, -1]],
             # Name -> str
             [[4, -1, -1], [6, -1, -1], [-1, -1, -1]],
             # str -> "x"
             [[-1, -1, -1], [-1, 1, -1], [-1, -1, -1]],
             # Op -> str, expr, expr
             [[7, -1, -1], [6, -1, -1], [5, -1, -1]],
             # str -> "+"
             [[-1, -1, -1], [-1, 2, -1], [-1, -1, -1]],
             # Name -> str
             [[4, -1, -1], [6, -1, -1], [-1, -1, -1]],
             # str -> "y"
             [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]],
             # Number -> number
             [[8, -1, -1], [9, -1, -1], [-1, -1, -1]],
             [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]],
         ],
         prev_rule_action.numpy()
     )
Example #28
    def tokenize_with_offset(self, code: str) \
            -> Optional[List[Tuple[int, Token[str, str]]]]:
        code = code.replace("\r", "")

        lexer = CLexer(logger.warning, lambda: None, lambda: None,
                       lambda x: False)
        lexer.build(optimize=False)
        lexer.input(code)
        tokens: List[LexToken] = list(iter(lexer.token, None))

        return [(token.lexpos, Token(token.type, token.value, token.value))
                for token in tokens]
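A usage sketch for this method, copied from test_happy_path in Example #7 (assuming the surrounding Lexer class exposes it):

    lexer = Lexer()
    # Each tuple pairs a character offset with a Token typed by the C lexer.
    assert lexer.tokenize_with_offset("int a = 0;") == [
        (0, Token("INT", "int", "int")), (4, Token("ID", "a", "a")),
        (6, Token("EQUALS", "=", "=")),
        (8, Token("INT_CONST_OCT", "0", "0")), (9, Token("SEMI", ";", ";"))
    ]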
Example #29
 def test_id_placeholder(self):
     lexer = Lexer()
     assert lexer.tokenize_with_offset("int a;a;b;") == [
         (0, Token("name", "___name@0___", "int")),
         (4, Token("name", "___name@1___", "a")),
         (5, Token("op", ";", ";")),
         (6, Token("name", "___name@1___", "a")),
         (7, Token("op", ";", ";")),
         (8, Token("name", "___name@2___", "b")),
         (9, Token("op", ";", ";"))
     ]
Example #30
    def __call__(self, query: str) -> List[Token]:
        """
        Tokenize query

        Parameters
        ----------
        query: str

        Returns
        -------
        List[Token]
        """

        # Preprocess annotation
        def placeholder(id: int) -> str:
            return f"####{id}####"

        # Replace quoted string literals with placeholders
        mappings: Dict[str, str] = {}
        word_to_placeholder: Dict[str, str] = {}
        literal = r'\'\\\'\'|\"[^\"]+\"|\'[^\']+\'|`[^`]+`|"""[^"]+"""'
        while True:
            m = re.search(literal, query)
            if m is None:
                break

            w = m.group(0)[1:len(m.group(0)) - 1]
            if str(w) in word_to_placeholder:
                p = word_to_placeholder[str(w)]
            else:
                p = placeholder(len(mappings))
            query = query[:m.start()] + p + query[m.end():]

            assert "####" not in w
            mappings[p] = str(w)
            word_to_placeholder[str(w)] = p

        reference = []
        for word in tokenizer.tokenize(query):
            if word in mappings:
                reference.append(Token[str, str](None, word, mappings[word]))
            else:
                reference.append(Token[str, str](None, word, word))

            vars = list(filter(lambda x: len(x) > 0,
                               word.split('.')))  # split by '.'
            if len(vars) > 1:
                for v in vars:
                    reference.append(Token(None, v, v))
        return reference
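A usage sketch for this tokenizer, taken from Examples #1 and #17: dotted words are kept whole and additionally split into their parts, while quoted literals are replaced by ####N#### placeholders whose raw value keeps the original text:

    reference = TokenizeQuery()('foo.bar')
    assert reference == [
        Token(None, "foo.bar", "foo.bar"),
        Token(None, "foo", "foo"),
        Token(None, "bar", "bar")
    ]
    reference = TokenizeQuery()('"quoted string" test')
    assert reference == [
        Token(None, "####0####", "quoted string"),
        Token(None, "test", "test")
    ]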