Example #1
 def test_no_root_nodes(self):
     tokenlist = TokenList([
         OrderedDict([('id', 1), ('form', 'To'), ('head', 1)]),
         OrderedDict([('id', 2), ('form', 'appear'), ('head', 2)]),
     ])
     with self.assertRaises(ParseException):
         tokenlist.to_tree()
Example #2
 def test_nested_filtering(self):
     tokenlist = TokenList([
         {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
         {"form": "quick", "feats": Token([('Degree', 'Pos')])},
         {"form": "brown", "feats": Token([('Degree', 'Pos')])},
         {"form": "fox", "feats": Token([('Number', 'Sing')])},
     ])
     self.assertEqual(
         tokenlist.filter(feats__Degree="Pos").filter(form="brown"),
         TokenList([
             {"form": "brown", "feats": Token([('Degree', 'Pos')])},
         ])
     )
     self.assertEqual(
         tokenlist.filter(form="brown").filter(feats__Degree="Pos"),
         TokenList([
             {"form": "brown", "feats": Token([('Degree', 'Pos')])},
         ])
     )
     self.assertEqual(
         tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(),
         TokenList([
             {"form": "brown", "feats": Token([('Degree', 'Pos')])},
         ])
     )
     self.assertEqual(
         tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(id=0),
         TokenList([])
     )
Example #3
 def test_multiple_root_nodes(self):
     tokenlist = TokenList([
         Token([('id', 1), ('form', 'To'), ('head', 0)]),
         Token([('id', 2), ('form', 'appear'), ('head', 1)]),
         Token([('id', 4), ('form', 'EMNLP'), ('head', 0)]),
         Token([('id', 5), ('form', '2014'), ('head', 4)]),
         Token([('id', 6), ('form', 'Yay!'), ('head', 0)]),
     ])
     tree = TokenTree(
         token=Token([("id", 0), ("form", "_"), ("deprel", "root")]),
         children=[
             TokenTree(
                 token=Token([("id", 1), ("form", "To"), ("head", 0)]),
                 children=[TokenTree(
                     token=Token([("id", 2), ("form", "appear"), ("head", 1)]),
                     children=[]
                 )]
             ),
             TokenTree(
                 token=Token([("id", 4), ("form", "EMNLP"), ("head", 0)]),
                 children=[TokenTree(
                     token=Token([("id", 5), ("form", "2014"), ("head", 4)]),
                     children=[]
                 )]
             ),
             TokenTree(
                 token=Token([("id", 6), ("form", "Yay!"), ("head", 0)]),
                 children=[]
             ),
         ]
     )
     self.assertTreeEqual(tokenlist.to_tree(), tree)
Example #4
 def test_multiple_root_nodes(self):
     tokenlist = TokenList([
         OrderedDict([('id', 1), ('form', 'To'), ('head', 0)]),
         OrderedDict([('id', 2), ('form', 'appear'), ('head', 1)]),
         OrderedDict([('id', 4), ('form', 'EMNLP'), ('head', 0)]),
         OrderedDict([('id', 5), ('form', '2014'), ('head', 4)]),
     ])
     with self.assertRaises(ParseException):
         tokenlist.to_tree()
Example #5
 def test_extend_with_dict_list(self):
     tokenlist = TokenList([{"id": 1}])
     tokenlist.extend([{"id": 2}, {"id": 3}])
     self.assertEqual(tokenlist, TokenList([{
         "id": 1
     }, {
         "id": 2
     }, {
         "id": 3
     }]))
Example #6
    def test_parse_tree_and_serialize(self):
        from tests.fixtures import TESTCASES

        for testcase in TESTCASES:
            data = parse(testcase)
            testcase_without_range_and_elided = TokenList(
                [token for token in data[0] if isinstance(token["id"], int)],
                metadata=data[0].metadata)
            self.assertEqual(
                parse_tree(testcase)[0].serialize(),
                testcase_without_range_and_elided.serialize())
Example #7
 def test_clear(self):
     tokenlist = TokenList([{
         "id": 1
     }, {
         "id": 2
     }, {
         "id": 3
     }], {"meta": "data"})
     tokenlist.clear()
     self.assertEqual(len(tokenlist.tokens), 0)
     self.assertEqual(tokenlist.metadata, None)
Example #8
 def test_copy(self):
     tokenlist1 = TokenList([{
         "id": 1
     }, {
         "id": 2
     }, {
         "id": 3
     }], {"meta": "data"})
     tokenlist2 = tokenlist1.copy()
     self.assertIsNot(tokenlist1, tokenlist2)
     self.assertEqual(tokenlist1, tokenlist2)
Example #9
 def test_simple_tree(self):
     tokenlist = TokenList([
         Token([("id", 2), ("form", "dog"), ("head", 0)]),
         Token([("id", 1), ("form", "a"), ("head", 2)]),
     ])
     tree = TokenTree(token=Token([("id", 2), ("form", "dog"),
                                   ("head", 0)]),
                      children=[
                          TokenTree(token=Token([("id", 1), ("form", "a"),
                                                 ("head", 2)]),
                                    children=[])
                      ])
     self.assertTreeEqual(tokenlist.to_tree(), tree)
Example #10
 def test_to_tree(self):
     tokenlist = TokenList([
         OrderedDict([("id", 2), ("form", "dog"), ("head", 0)]),
         OrderedDict([("id", 1), ("form", "a"), ("head", 2)]),
     ])
     tree = TokenTree(
         token=OrderedDict([("id", 2), ("form", "dog"), ("head", 0)]),
         children=[TokenTree(
             token=OrderedDict([("id", 1), ("form", "a"), ("head", 2)]),
             children=[]
         )]
     )
     self.assertEqual(tokenlist.to_tree(), tree)
Example #11
 def test_removes_negative_nodes(self):
     tokenlist = TokenList([
         Token([("id", 2), ("form", "dog"), ("head", 0)]),
         Token([("id", 1), ("form", "a"), ("head", 2)]),
         Token([("id", 3), ("form", "😍"), ("head", -1)]),
     ])
     tree = TokenTree(token=Token([("id", 2), ("form", "dog"),
                                   ("head", 0)]),
                      children=[
                          TokenTree(token=Token([("id", 1), ("form", "a"),
                                                 ("head", 2)]),
                                    children=[])
                      ])
     self.assertTreeEqual(tokenlist.to_tree(), tree)
Example #12
    def test_remove(self):
        tokenlist = TokenList([{"id": 1}, {"id": 2}])
        tokenlist.remove(Token({"id": 1}))
        self.assertEqual(tokenlist, TokenList([{"id": 2}]))

        tokenlist.remove({"id": 2})
        self.assertEqual(tokenlist, TokenList())
Example #13
    def test_insert(self):
        tokenlist = TokenList()
        tokenlist.insert(0, Token({"id": 1}))
        self.assertEqual(tokenlist, TokenList([{"id": 1}]))

        tokenlist.insert(1, {"id": 2})
        self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
        self.assertEqual(type(tokenlist[1]), Token)
Example #14
    def test_append(self):
        tokenlist = TokenList()
        tokenlist.append(Token({"id": 1}))
        self.assertEqual(tokenlist, TokenList([{"id": 1}]))

        tokenlist.append({"id": 2})
        self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
        self.assertEqual(type(tokenlist[1]), Token)
Example #15
File: parse.py  Project: daemon/vizbert
def compute_mst(distance_matrix: torch.Tensor,
                tokens: TokenList,
                ignore_punct=True) -> TokenTree:
    # Prim's algorithm: start from an arbitrary token, then repeatedly attach
    # the open token that is closest (per distance_matrix) to the tree built so far.
    open_set = set([id_wrap(x) for x in tokens.copy()])
    closed_set = set()
    if ignore_punct:
        open_set = open_set - set(
            list(filter(lambda x: is_punctuation(x.obj['form']), open_set)))
    treenodes = {}
    root = None
    while open_set:
        if not closed_set:
            token = open_set.pop().obj
            treenodes[token['id']] = root = TokenTree(token, [])
            closed_set.add(id_wrap(token))
            continue
        grow_node_from = None
        grow_node_to = None
        grow_dist = np.inf
        for onode in open_set:
            onode = onode.obj
            for cnode in closed_set:
                cnode = cnode.obj
                dist = distance_matrix[onode['id'] - 1, cnode['id'] - 1]
                if dist < grow_dist:
                    grow_dist = dist
                    grow_node_from = cnode
                    grow_node_to = onode
        treenodes[grow_node_to['id']] = node = TokenTree(grow_node_to, [])
        treenodes[grow_node_from['id']].children.append(node)
        closed_set.add(id_wrap(grow_node_to))
        open_set.remove(id_wrap(grow_node_to))
    return root
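A hypothetical call sketch for compute_mst above, assuming the function and its project helpers (id_wrap, is_punctuation) are importable; the sentence is parsed with conllu's public parse, and a random symmetric torch matrix stands in for a real probe-derived distance matrix.

import torch
from conllu import parse

# Two-token sentence; in practice the distances would come from a trained probe.
sentence = parse("1\tThe\t_\t_\t_\t_\t2\tdet\t_\t_\n"
                 "2\tdog\t_\t_\t_\t_\t0\troot\t_\t_\n")[0]
n = len(sentence)
distances = torch.rand(n, n)
distances = (distances + distances.t()) / 2  # symmetric placeholder distances

tree = compute_mst(distances, sentence, ignore_punct=True)
print(tree.token["form"])  # form of the token picked as the tree root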
Example #16
 def test_extend_tokenlist_no_metadata_with_list(self):
     tokenlist1 = TokenList([{"id": 1}, {"id": 2}, {"id": 3}])
     tokenlist2 = [{"id": 4}, {"id": 5}, {"id": 6}]
     tokenlist1.extend(tokenlist2)
     tokenlist3 = TokenList([{
         "id": 1
     }, {
         "id": 2
     }, {
         "id": 3
     }, {
         "id": 4
     }, {
         "id": 5
     }, {
         "id": 6
     }])
     self.assertEqual(tokenlist1, tokenlist3)
Example #17
def parse_single(data, fields=None, field_parsers=None):
    '''
    Parse a CoNLL file that contains a single sentence, read from an opened
    file object.
    '''
    content = data.read()
    return [
        TokenList(*parse_token_and_metadata(
            content, fields=fields, field_parsers=field_parsers))
    ]
Example #18
    def test_metadata(self):
        data = dedent("""\
            # data = meta
            # meta = data
            1\tdog

        """)
        tokenlist = TokenList(*parse_token_and_metadata(data))
        self.assertEqual(serialize(tokenlist), data)
Example #19
def parse(data, fields=None, field_parsers=None):
    '''
    Parse a multi-sentence CoNLL file from an opened file object. Files known
    to contain only a single sentence can be handled by parse_single instead.
    '''
    return [
        TokenList(*parse_token_and_metadata(
            sentence, fields=fields, field_parsers=field_parsers))
        # Sentences in CoNLL-U are separated by blank lines.
        for sentence in data.read().split("\n\n") if sentence
    ]
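A usage sketch for this file-based variant; the path is a placeholder and the file is assumed to hold blank-line-separated sentences.

# Hypothetical usage of the parse() variant above; "corpus.conll" is a placeholder path.
with open("corpus.conll", encoding="utf-8") as data_file:
    sentences = parse(data_file)

for tokenlist in sentences:
    print(len(tokenlist), "tokens,", tokenlist.metadata)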
Example #20
 def test_and_filtering(self):
     tokenlist = TokenList([
         {
             "id": 1,
             "form": "a",
             "field": "x"
         },
         {
             "id": 2,
             "form": "dog",
             "field": "x"
         },
         {
             "id": 3,
             "form": "dog",
             "field": "y"
         },
     ])
     self.assertEqual(tokenlist.filter(field="x", id=2),
                      TokenList([
                          {
                              "id": 2,
                              "form": "dog",
                              "field": "x"
                          },
                      ]))
     self.assertEqual(tokenlist.filter(field="x", id=3), TokenList([]))
Example #21
 def test_extend_tokenlist_and_merge_metadata(self):
     tokenlist4 = TokenList([{
         "id": 1
     }, {
         "id": 2
     }, {
         "id": 3
     }], {"meta1": "data1"})
     tokenlist5 = TokenList([{
         "id": 4
     }, {
         "id": 5
     }, {
         "id": 6
     }], {"meta2": "data2"})
     tokenlist4.extend(tokenlist5)
     tokenlist6 = TokenList([{
         "id": 1
     }, {
         "id": 2
     }, {
         "id": 3
     }, {
         "id": 4
     }, {
         "id": 5
     }, {
         "id": 6
     }], {
         "meta1": "data1",
         "meta2": "data2"
     })
     self.assertEqual(tokenlist4, tokenlist6)
Example #22
def parse_incr(in_file,
               fields=None,
               field_parsers=None,
               metadata_parsers=None):
    if not fields:
        fields = parse_conllu_plus_fields(in_file,
                                          metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(
            *parse_token_and_metadata(sentence,
                                      fields=fields,
                                      field_parsers=field_parsers,
                                      metadata_parsers=metadata_parsers))
Example #23
    def test_eq(self):
        metadata = {"meta": "data"}

        tokenlist1 = TokenList([{"id": 1}])
        tokenlist1.metadata = metadata
        tokenlist2 = TokenList([{"id": 1}])
        self.assertNotEqual(tokenlist1, tokenlist2)

        tokenlist2.metadata = metadata
        self.assertEqual(tokenlist1, tokenlist2)
Example #24
 def test_eq(self):
     self.assertEqual(TokenList([{"id": 1}]), TokenList([{"id": 1}]))
     self.assertNotEqual(TokenList([{
         "id": 1
     }], metadata={"meta": "data"}), TokenList([{
         "id": 1
     }]))
     self.assertEqual(TokenList([{
         "id": 1
     }], metadata={"meta": "data"}),
                      TokenList([{
                          "id": 1
                      }], metadata={"meta": "data"}))
     self.assertEqual(TokenList([{"id": 1}]), [{"id": 1}])
Example #25
def parse_incr(in_file,
               fields=None,
               field_parsers=None,
               metadata_parsers=None):
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError(
            "Invalid file, 'parse_incr' needs an opened file as input")

    if not fields:
        fields = parse_conllu_plus_fields(in_file,
                                          metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(
            *parse_token_and_metadata(sentence,
                                      fields=fields,
                                      field_parsers=field_parsers,
                                      metadata_parsers=metadata_parsers))
Example #26
    def test_lambda_deep_filtering(self):
        tokenlist = TokenList([
            Token({
                'id': (1, '-', 2),
                'feats': None
            }),
            Token({
                'id': 1,
                'feats': {
                    'Case': 'Nom',
                    'Number': 'Sing'
                }
            }),
            Token({
                'id': 2,
                'feats': {
                    'Mood': 'Ind',
                    'Number': 'Sing'
                }
            })
        ])

        self.assertEqual(
            tokenlist.filter(feats__Mood=lambda x: x == 'Ind'),
            TokenList(
                [Token({
                    'id': 2,
                    'feats': {
                        'Mood': 'Ind',
                        'Number': 'Sing'
                    }
                })]))

        self.assertEqual(
            tokenlist.filter(feats__Number=lambda x: x == 'Sing'),
            TokenList([
                Token({
                    'id': 1,
                    'feats': {
                        'Case': 'Nom',
                        'Number': 'Sing'
                    }
                }),
                Token({
                    'id': 2,
                    'feats': {
                        'Mood': 'Ind',
                        'Number': 'Sing'
                    }
                })
            ]))
Example #27
 def test_basic_filtering(self):
     tokenlist = TokenList([
         {"id": 1, "form": "a", "field": "x"},
         {"id": 2, "form": "dog", "field": "x"},
     ])
     self.assertEqual(
         tokenlist.filter(id=0),
         TokenList([])
     )
     self.assertEqual(
         tokenlist.filter(id=1),
         TokenList([{"id": 1, "form": "a", "field": "x"}])
     )
     self.assertEqual(
         tokenlist.filter(),
         tokenlist
     )
     self.assertEqual(
         tokenlist.filter(field="x"),
         tokenlist
     )
Example #28
def parse_incr(
    in_file: T.TextIO,
    fields: T.Optional[T.Sequence[str]] = None,
    field_parsers: T.Dict[str, _FieldParserType] = None,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
) -> T.Iterator[TokenList]:
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError(
            "Invalid file, 'parse_incr' needs an opened file as input")

    if not fields:
        fields = parse_conllu_plus_fields(in_file,
                                          metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(
            *parse_token_and_metadata(sentence,
                                      fields=fields,
                                      field_parsers=field_parsers,
                                      metadata_parsers=metadata_parsers))
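Because parse_incr yields one TokenList per sentence, it can stream a large treebank without reading it all into memory; a minimal sketch against the public conllu API (the file path is illustrative):

from conllu import parse_incr

with open("treebank.conllu", encoding="utf-8") as data_file:
    for tokenlist in parse_incr(data_file):
        # metadata carries the sentence's comment lines, e.g. sent_id and text.
        print(tokenlist.metadata.get("sent_id"), len(tokenlist))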
Example #29
def parse_token_and_metadata(
    data: str,
    fields: T.Optional[T.Sequence[str]] = None,
    field_parsers: T.Optional[T.Dict[str, _FieldParserType]] = None,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
) -> TokenList:
    if not data:
        raise ParseException(
            "Can't create TokenList, no data sent to constructor.")

    fields = fields or DEFAULT_FIELDS

    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS.copy()
    elif sorted(field_parsers.keys()) != sorted(fields):
        new_field_parsers = DEFAULT_FIELD_PARSERS.copy()
        new_field_parsers.update(field_parsers)
        field_parsers = new_field_parsers

    tokens = []
    metadata = Metadata()

    for line in data.split('\n'):
        line = line.strip()

        if not line:
            continue

        if line.startswith('#'):
            pairs = parse_comment_line(line, metadata_parsers=metadata_parsers)
            for key, value in pairs:
                metadata[key] = value
        else:
            tokens.append(parse_line(line, fields, field_parsers))

    return TokenList(tokens, metadata, default_fields=fields)
Example #30
    def test_lambda_basic_filtering(self):
        tokenlist = TokenList([
            Token({'id': (1, '-', 2), 'form': "It's", 'lemma': '_', 'feats': None}),
            Token({'id': 1, 'form': 'It', 'lemma': 'it'}),
            Token({'id': 2, 'form': "'s", 'lemma': 'be'})
        ])

        self.assertEqual(
            tokenlist.filter(id=lambda x: type(x) is int),
            TokenList([
                Token({'id': 1, 'form': 'It', 'lemma': 'it'}),
                Token({'id': 2, 'form': "'s", 'lemma': 'be'})
            ])
        )
        self.assertEqual(
            tokenlist.filter(lemma=lambda x: x.startswith('b')),
            TokenList([
                Token({'id': 2, 'form': "'s", 'lemma': 'be'})
            ])
        )