Example #1
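Parses CoNLL-2009 data by passing a custom fields tuple to parse() and overriding the feats field parser to split the positional feature column on "|".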
    def test_parse_CoNLL2009_2(self):
        data = dedent("""\
            #\tid='1'-document_id='36:1047'-span='1'
            1\t+\t+\tPunc\tPunc\t_\t0\tROOT\t_\t_
            2\tIn\tin\tr\tr\tr|-|-|-|-|-|-|-|-\t5\tAuxP\t_\t_
            3\tDei\tDeus\tn\tPropn\tn|-|s|-|-|-|m|g|-\t4\tATR\t_\t_
            4\tnomine\tnomen\tn\tn\tn|-|s|-|-|-|n|b|-\t2\tADV\t_\t_
            5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_

        """)

        sentences = parse(
            data,
            fields=('id', 'form', 'lemma', 'upostag', 'xpostag', 'feats',
                    'head', 'deprel', 'deps', 'misc'),
            field_parsers={
                "feats": lambda line, i: [feat for feat in line[i].split("|")]
            })
        self.assertEqual(
            sentences[0][4],
            Token([
                ('id', 5),
                ('form', 'regnante'),
                ('lemma', 'regno'),
                ('upostag', 't'),
                ('xpostag', 't'),
                ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
                ('head', 0),
                ('deprel', 'ADV'),
                ('deps', None),
                ('misc', None),
            ]))
        self.assertEqual(sentences[0].metadata,
                         Token([('id', "'1'-document_id='36:1047'-span='1'")]))
Example #2
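TokenList.to_tree() raises a ParseException when no token has head 0, i.e. the sentence has no root.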
    def test_no_root_nodes(self):
        tokenlist = TokenList([
            Token([('id', 1), ('form', 'To'), ('head', 1)]),
            Token([('id', 2), ('form', 'appear'), ('head', 2)]),
        ])
        with self.assertRaises(ParseException):
            tokenlist.to_tree()
Example #3
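Indexing a Token with a missing key raises KeyError, while Token.get() returns None or a supplied default.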
    def test_invalid_key_access(self):
        token = Token({"id": 1, "xpostag": "DT", "upostag": "DET"})
        with self.assertRaises(KeyError):
            token["inexistent_value"]

        self.assertEqual(token.get("inexistent_value"), None)
        self.assertEqual(token.get("inexistent_value", "HEJ"), "HEJ")
Example #4
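Metadata comment lines are parsed into key-value string pairs by default; a custom entry in metadata_parsers can post-process a value, here splitting global.columns into a list.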
    def test_custom_metadata_parsers(self):
        data = dedent("""\
            # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
            # newdoc id = mf920901-001
            # newpar id = mf920901-001-p1
            # sent_id = mf920901-001-p1s1A
            # text = Slovenská ústava: pro i proti
            # text_en = Slovak constitution: pros and cons
        """)
        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, Token([
            ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))

        _, metadata = parse_token_and_metadata(
            data,
            metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
        )
        self.assertEqual(metadata, Token([
            ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))
Example #5
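TokenTree.serialize() flattens a tree back into CoNLL-U lines ordered by token id, independent of the nesting structure.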
    def test_flatten(self):
        tree = TokenTree(token=Token([("id", 2), ("form", "dog")]),
                         children=[
                             TokenTree(token=Token([("id", 1), ("form", "a")]),
                                       children=[])
                         ])
        self.assertEqual(
            tree.serialize(),
            dedent("""\
                1\ta
                2\tdog

            """))
        tree = TokenTree(token=Token([("id", 1), ("form", "dog")]),
                         children=[
                             TokenTree(token=Token([("id", 2), ("form", "a")]),
                                       children=[])
                         ])
        self.assertEqual(
            tree.serialize(),
            dedent("""\
                1\tdog
                2\ta

            """))
Example #6
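End-to-end parse() of a sentence (using the test module's shared CoNLL-U data fixture, not shown here): string representation, per-token fields with nested feats, iteration over forms, and sentence metadata.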
    def test_parse(self):
        sentences = parse(data)
        self.assertEqual(len(sentences), 1)

        sentence = sentences[0]

        self.assertEqual(
            str(sentence),
            "TokenList<The, quick, brown, fox, jumps, over, the, lazy, dog, .>"
        )

        self.assertEqual(
            sentence[0],
            Token([('id', 1), ('form', 'The'), ('lemma', 'the'),
                   ('upos', 'DET'), ('xpos', 'DT'),
                   ('feats', Token([('Definite', 'Def'),
                                    ('PronType', 'Art')])), ('head', 4),
                   ('deprel', 'det'), ('deps', None), ('misc', None)]))
        self.assertEqual(
            sentence[8],
            Token([('id', 9), ('form', 'dog'), ('lemma', 'dog'),
                   ('upos', 'NOUN'), ('xpos', 'NN'),
                   ('feats', Token([('Number', 'Sing')])), ('head', 5),
                   ('deprel', 'nmod'), ('deps', None),
                   ('misc', Token([("SpaceAfter", "No")]))]))
        self.assertEqual(
            [token["form"] for token in sentence],
            "The quick brown fox jumps over the lazy dog .".split(" "))

        self.assertEqual(sentence.metadata["text"],
                         "The quick brown fox jumps over the lazy dog.")
Example #7
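serialize_field() renders a Token used as a key-value field: an empty Token becomes an empty string, and a None value is serialized as an underscore.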
    def test_ordered_dict(self):
        data = Token()
        self.assertEqual(serialize_field(data), "")

        data = Token([('SpaceAfter', 'No')])
        self.assertEqual(serialize_field(data), "SpaceAfter=No")

        data = Token([('Translit', None)])
        self.assertEqual(serialize_field(data), "Translit=_")
Example #8
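parse_line() turns a single tab-separated CoNLL-U line into a Token using DEFAULT_FIELDS.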
    def test_parse_line(self):
        line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
        self.assertEqual(
            parse_line(line, fields=DEFAULT_FIELDS),
            Token([('id', 1), ('form', 'The'), ('lemma', 'the'),
                   ('upos', 'DET'), ('xpos', 'DT'),
                   ('feats', Token([('Definite', 'Def'),
                                    ('PronType', 'Art')])), ('head', 4),
                   ('deprel', 'det'), ('deps', None), ('misc', None)]))
Example #9
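A field name may repeat in the fields tuple; every matching column is parsed with that field's default parser.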
    def test_custom_fields(self):
        data = dedent("""\
            1\t1\t1
            2\t2\t2
        """)
        tokens, _ = parse_token_and_metadata(data, fields=("id", "id", "id"))
        self.assertEqual(tokens, [
            Token([("id", 1), ("id", 1), ("id", 1)]),
            Token([("id", 2), ("id", 2), ("id", 2)]),
        ])
Example #10
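TokenList.to_tree() builds a TokenTree from the head references, attaching each token under its head.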
    def test_simple_tree(self):
        tokenlist = TokenList([
            Token([("id", 2), ("form", "dog"), ("head", 0)]),
            Token([("id", 1), ("form", "a"), ("head", 2)]),
        ])
        tree = TokenTree(token=Token([("id", 2), ("form", "dog"),
                                      ("head", 0)]),
                         children=[
                             TokenTree(token=Token([("id", 1), ("form", "a"),
                                                    ("head", 2)]),
                                       children=[])
                         ])
        self.assertTreeEqual(tokenlist.to_tree(), tree)
Example #11
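Tokens with a negative head value are dropped when building the tree.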
    def test_removes_negative_nodes(self):
        tokenlist = TokenList([
            Token([("id", 2), ("form", "dog"), ("head", 0)]),
            Token([("id", 1), ("form", "a"), ("head", 2)]),
            Token([("id", 3), ("form", "😍"), ("head", -1)]),
        ])
        tree = TokenTree(token=Token([("id", 2), ("form", "dog"),
                                      ("head", 0)]),
                         children=[
                             TokenTree(token=Token([("id", 1), ("form", "a"),
                                                    ("head", 2)]),
                                       children=[])
                         ])
        self.assertTreeEqual(tokenlist.to_tree(), tree)
Example #12
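field_parsers only overrides the listed fields; fields without an entry, like id here, keep their default parsers.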
    def test_default_field_parsers_when_undefined(self):
        data = dedent("""\
            1\tfrom
            2\tparis
        """)
        fields = ("id", "form")
        field_parsers = {
            # Rely on default 'id' field parser
            "form": lambda line, i: line[i].upper()
        }
        tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
        self.assertEqual(tokens, [
            Token([("id", 1), ("form", "FROM")]),
            Token([("id", 2), ("form", "PARIS")]),
        ])
Example #13
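For an all-underscore line, nullable fields (id, xpos, feats, head, deps, misc) become None while plain string fields keep the literal underscore.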
    def test_parse_line_nullable_fields(self):
        line = "_\t_\t_\t_\t_\t_\t_\t_\t_\t_"
        self.assertEqual(
            parse_line(line, fields=DEFAULT_FIELDS),
            Token([('id', None), ('form', '_'), ('lemma', '_'), ('upos', '_'),
                   ('xpos', None), ('feats', None), ('head', None),
                   ('deprel', '_'), ('deps', None), ('misc', None)]))
Example #14
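TokenList.remove() matches both Token instances and plain dicts.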
    def test_remove(self):
        tokenlist = TokenList([{"id": 1}, {"id": 2}])
        tokenlist.remove(Token({"id": 1}))
        self.assertEqual(tokenlist, TokenList([{"id": 2}]))

        tokenlist.remove({"id": 2})
        self.assertEqual(tokenlist, TokenList())
Example #15
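Passing a shorter fields sequence parses only the leading columns and ignores the rest of the line.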
    def test_parse_line_only_id_head(self):
        line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
        self.assertEqual(parse_line(line, fields=["id", "form"]),
                         Token([
                             ('id', 1),
                             ('form', 'The'),
                         ]))
Example #16
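parse_line() also accepts columns separated by runs of spaces instead of tabs.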
    def test_parse_line_two_spaces(self):
        line = "1  The  the  DET  DT  Definite=Def|PronType=Art  4  det  _  _"
        self.assertEqual(parse_line(line, fields=["id", "form"]),
                         Token([
                             ('id', 1),
                             ('form', 'The'),
                         ]))
Example #17
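For CoNLL-U Plus input, parse() reads the column layout from the global.columns comment, exposing extra columns such as parseme:mwe (column names are lowercased).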
    def test_parse_conllu_plus(self):
        data = dedent("""\
            # global.columns = ID FORM UPOS HEAD DEPREL MISC PARSEME:MWE
            # source_sent_id = conllu http://hdl.handle.net/11234/1-2837 UD_German-GSD/de_gsd-ud-train.conllu train-s16
            # sent_id = train-s16
            # text = Der CDU-Politiker strebt
            1\tDer\tDET\t2\tdet\t_\t*
            2\tCDU\tPROPN\t4\tcompound\tSpaceAfter=No\t*
            3\t-\tPUNCT\t2\tpunct\tSpaceAfter=No\t*
            4\tPolitiker\tNOUN\t5\tnsubj\t_\t*
            5\tstrebt\tVERB\t0\troot\t_\t2:VPC.full
        """)

        sentences = parse(data)

        self.assertEqual(
            sentences[0][4],
            Token([
                ('id', 5),
                ('form', 'strebt'),
                ('upos', 'VERB'),
                ('head', 0),
                ('deprel', 'root'),
                ('misc', None),
                ('parseme:mwe', '2:VPC.full'),
            ]))
Example #18
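parse_tree() returns the sentence as a TokenTree rooted at the head-0 token (same shared data fixture as above): root token fields, child representations, metadata, a serialize() round-trip, and print_tree() output.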
    def test_parse_tree(self):
        sentences = parse_tree(data)
        self.assertEqual(len(sentences), 1)

        root = sentences[0]
        self.assertEqual(
            str(root), "TokenTree<token={id=5, form=jumps}, children=[...]>")

        self.assertEqual(
            root.token,
            Token([('id', 5), ('form', 'jumps'), ('lemma', 'jump'),
                   ('upos', 'VERB'), ('xpos', 'VBZ'),
                   ('feats',
                    Token([
                        ("Mood", "Ind"),
                        ("Number", "Sing"),
                        ("Person", "3"),
                        ("Tense", "Pres"),
                        ("VerbForm", "Fin"),
                    ])), ('head', 0), ('deprel', 'root'), ('deps', None),
                   ('misc', None)]))

        self.assertEqual([str(child) for child in root.children], [
            "TokenTree<token={id=4, form=fox}, children=[...]>",
            "TokenTree<token={id=9, form=dog}, children=[...]>",
            "TokenTree<token={id=10, form=.}, children=None>",
        ])

        self.assertEqual(root.metadata["text"],
                         "The quick brown fox jumps over the lazy dog.")

        self.assertEqual(root.serialize(), data)

        self.assertEqual(
            capture_print(root.print_tree),
            dedent("""\
                (deprel:root) form:jumps lemma:jump upos:VERB [5]
                    (deprel:nsubj) form:fox lemma:fox upos:NOUN [4]
                        (deprel:det) form:The lemma:the upos:DET [1]
                        (deprel:amod) form:quick lemma:quick upos:ADJ [2]
                        (deprel:amod) form:brown lemma:brown upos:ADJ [3]
                    (deprel:nmod) form:dog lemma:dog upos:NOUN [9]
                        (deprel:case) form:over lemma:over upos:ADP [6]
                        (deprel:det) form:the lemma:the upos:DET [7]
                        (deprel:amod) form:lazy lemma:lazy upos:ADJ [8]
                    (deprel:punct) form:. lemma:. upos:PUNCT [10]
            """))
Example #19
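parse_token_and_metadata() tolerates a blank line inside a sentence block and still returns a single token list.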
    def test_newlines_in_sentence(self):
        data = dedent("""\
            # meta = data
            1\thej
            2\tdå

            3\thej
            4\tdå
        """)
        tokens, metadata = parse_token_and_metadata(data)
        self.assertListEqual(tokens, [
            Token([("id", 1), ("form", "hej")]),
            Token([("id", 2), ("form", "då")]),
            Token([("id", 3), ("form", "hej")]),
            Token([("id", 4), ("form", "då")]),
        ])
        self.assertEqual(metadata, Token([("meta", "data")]))
Example #20
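TokenList.insert() accepts plain dicts and coerces them to Token.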
    def test_insert(self):
        tokenlist = TokenList()
        tokenlist.insert(0, Token({"id": 1}))
        self.assertEqual(tokenlist, TokenList([{"id": 1}]))

        tokenlist.insert(1, {"id": 2})
        self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
        self.assertEqual(type(tokenlist[1]), Token)
Example #21
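TokenList.append() likewise coerces plain dicts to Token.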
    def test_append(self):
        tokenlist = TokenList()
        tokenlist.append(Token({"id": 1}))
        self.assertEqual(tokenlist, TokenList([{"id": 1}]))

        tokenlist.append({"id": 2})
        self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
        self.assertEqual(type(tokenlist[1]), Token)
Example #22
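A helper from a TACRED dataset reader (hence the external convert_ptb_token helper and typing imports) that converts one example dict into a conllu.TokenList, deriving BIO-style NER tags over the subject and object spans and packing the relation into the metadata.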
    def _tacred_example_to_token_list(
            self, example: Dict[str, Any]) -> conllu.TokenList:
        id_ = example["id"]
        tokens = example["token"]
        ner = example["stanford_ner"]

        subj_start = example["subj_start"]
        subj_end = example["subj_end"]
        obj_start = example["obj_start"]
        obj_end = example["obj_end"]

        subj_tag = example["subj_type"]
        obj_tag = example["obj_type"]

        label = example["relation"]

        metadata = {
            "text": " ".join(tokens),
            "sentence_id": str(id_),
            "relations": ";".join([
                str(subj_start + 1),
                str(subj_end + 1),
                str(obj_start + 1),
                str(obj_end + 1),
                label,
            ]),
        }

        prev_tag = None
        token_dicts = []
        for idx, (token, tag) in enumerate(zip(tokens, ner)):
            if subj_start <= idx <= subj_end:
                tag = subj_tag

            if obj_start <= idx <= obj_end:
                tag = obj_tag

            prefix = ""
            if tag != "O":
                if tag != prev_tag:
                    prefix = "B-"
                else:
                    prefix = "I-"

            prev_tag = tag

            token_dicts.append(
                Token({
                    "id": str(idx + 1),
                    "form": convert_ptb_token(token),
                    "ner": prefix + tag,
                }))

        return conllu.TokenList(tokens=token_dicts,
                                metadata=Metadata(metadata))
Example #23
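TokenList.filter() supports nested lookups with double underscores (feats__Degree), combines multiple conditions, and returns an empty TokenList for unknown or malformed paths.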
    def test_deep_filtering(self):
        tokenlist = TokenList([
            {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            {"form": "fox", "feats": Token([('Number', 'Sing')])},
        ])
        self.assertEqual(
            tokenlist.filter(feats__Degree="Pos"),
            TokenList([
                {"form": "quick", "feats": Token([('Degree', 'Pos')])},
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos"),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos", id=1),
            TokenList([])
        )
        self.assertEqual(
            tokenlist.filter(unknown__property__value="undefined"),
            TokenList([])
        )
        self.assertEqual(
            tokenlist.filter(unknown___property____value="undefined"),
            TokenList([])
        )
Example #24
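filter() calls can be chained, each narrowing the previous result; an empty filter() is a no-op.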
    def test_nested_filtering(self):
        tokenlist = TokenList([
            {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            {"form": "fox", "feats": Token([('Number', 'Sing')])},
        ])
        self.assertEqual(
            tokenlist.filter(feats__Degree="Pos").filter(form="brown"),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown").filter(feats__Degree="Pos"),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(id=0),
            TokenList([])
        )
Example #25
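A line with fewer columns than fields yields a Token containing only the columns that are present.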
    def test_parse_line_fewer_columns(self):
        line = "1\tThe\tthe\tDET\tDT"
        self.assertEqual(parse_line(line, fields=DEFAULT_FIELDS), Token([
            ('id', 1),
            ('form', 'The'),
            ('lemma', 'the'),
            ('upos', 'DET'),
            ('xpos', 'DT'),
        ]))
Example #26
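A custom field parser receives the full list of split columns plus the field's index; this one repeats the raw string five times instead of converting it to an int.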
    def test_parse_custom_fieldparsers(self):
        line = "1\t2"
        custom_fieldparsers = {
            "id": lambda line, i: line[i] * 5,
        }
        self.assertEqual(
            parse_line(line, fields=["id"], field_parsers=custom_fieldparsers),
            Token([
                ('id', "11111"),
            ]))
Example #27
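Parses CoNLL-2009 data with custom parsers for pfeats, phead and apreds, where the apreds parser collects all remaining columns and maps underscores to None.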
    def test_parse_CoNLL2009_1(self):
        data = dedent("""\
            #\tid\tform\tlemma\tplemma\tpos\tppos\tfeats\tpfeats\thead\tphead\tdeprel\tpdeprel\tfillpred\tpred\tapreds
            1\tZ\tz\tz\tR\tR\tSubPOS=R|Cas=2\tSubPOS=R|Cas=2\t10\t10\tAuxP\tAuxP\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
            2\ttéto\ttento\ttento\tP\tP\tSubPOS=D|Gen=F|Num=S|Cas=2\tSubPOS=D|Gen=F|Num=S|Cas=2\t3\t3\tAtr\tAtr\tY\ttento\t_\tRSTR\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
            3\tknihy\tkniha\tkniha\tN\tN\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\t1\t1\tAdv\tAdv\tY\tkniha\t_\t_\t_\t_\t_\t_\t_\tDIR1\t_\t_\t_\t_\t_\t_\t_\t_

        """)

        sentences = parse(
            data,
            fields=('id', 'form', 'lemma', 'plemma', 'pos', 'ppos', 'feats',
                    'pfeats', 'head', 'phead', 'deprel', 'pdeprel', 'fillpred',
                    'pred', 'apreds'),
            field_parsers={
                "pfeats": lambda line, i: parse_dict_value(line[i]),
                "phead": lambda line, i: parse_int_value(line[i]),
                "apreds": lambda line, i: [
                    apred_field if apred_field != "_" else None
                    for apred_field in line[i:]
                ],
            },
        )
        self.assertEqual(
            sentences[0][2],
            Token([('id', 3), ('form', 'knihy'), ('lemma', 'kniha'),
                   ('plemma', 'kniha'), ('pos', 'N'), ('ppos', 'N'),
                   ('feats',
                    Token([('SubPOS', 'N'), ('Gen', 'F'), ('Num', 'S'),
                           ('Cas', '2'), ('Neg', 'A')])),
                   ('pfeats',
                    Token([('SubPOS', 'N'), ('Gen', 'F'), ('Num', 'S'),
                           ('Cas', '2'), ('Neg', 'A')])), ('head', 1),
                   ('phead', 1), ('deprel', 'Adv'), ('pdeprel', 'Adv'),
                   ('fillpred', 'Y'), ('pred', 'kniha'),
                   ('apreds', [
                       None, None, None, None, None, None, None, 'DIR1', None,
                       None, None, None, None, None, None, None
                   ])]))
Example #28
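parse_dict_value() parses key=value pairs separated by "|": bare keys get empty values, duplicate keys keep the last value, and empty or all-underscore input yields None.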
    def test_parse_dict_value(self):
        self.assertEqual(
            parse_dict_value("key1"),
            Token([("key1", "")])
        )
        self.assertEqual(
            parse_dict_value("key1=val1"),
            Token([("key1", "val1")])
        )
        self.assertEqual(
            parse_dict_value("key1=val1|key2=val2"),
            Token([("key1", "val1"), ("key2", "val2")])
        )
        self.assertEqual(
            parse_dict_value("key1=val1|key2|key3=val3"),
            Token([("key1", "val1"), ("key2", ""), ("key3", "val3")])
        )
        self.assertEqual(
            parse_dict_value("key1=val1|key1=val2"),
            Token([("key1", "val2")])
        )
        self.assertEqual(
            parse_dict_value("key1=_|_|_=val1"),
            Token([("key1", None)])
        )
        self.assertEqual(parse_dict_value(""), None)
        self.assertEqual(parse_dict_value("_"), None)
Example #29
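Field parsers can consume several columns at once; overriding id also replaces its default integer parsing, so the ids stay strings.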
    def test_custom_field_parsers(self):
        data = dedent("""\
            1\tbackwards\tline
            2\tparis\tsirap
        """)
        fields = ("id", "backwards")

        # A field parser that takes all remaining fields, reverses their letters and joins them
        def parse_backwards(value):
            return " ".join([part[::-1] for part in value])

        # This overrides the default parsers, so the id is parsed as a string
        field_parsers = {
            "id": lambda line, i: line[i],
            "backwards": lambda line, i: parse_backwards(line[i:len(line)])
        }

        tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
        self.assertEqual(tokens, [
            Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
            Token([("id", '2'), ("backwards", "sirap paris")]),
        ])
Example #30
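Field parser lookup is alias-aware: parsers registered as xpos and upostag apply to the xpostag and upos fields respectively.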
    def test_parse_fieldparsers_alias_two_ways(self):
        line = "1\t2"
        custom_fieldparsers = {
            "xpos": lambda line, i: line[i] * 5,
            "upostag": lambda line, i: line[i] * 5,
        }
        self.assertEqual(
            parse_line(line, fields=["xpostag", "upos"], field_parsers=custom_fieldparsers),
            Token([
                ('xpostag', "11111"),
                ('upos', "22222"),
            ])
        )