示例#1
0
def parse_comment_line(line, metadata_parsers=None):
    """Parse one ``#``-prefixed comment line into a list of (key, value) pairs.

    The text after the leading ``#`` is split into a key and a value by
    ``parse_pair_value``. If a parser is registered for the key (or a
    ``"__fallback__"`` parser exists), it is called as ``parser(key, value)``
    and a truthy result replaces the default pair.

    :param line: The raw comment line; surrounding whitespace is stripped.
    :param metadata_parsers: Optional mapping of key -> callable. Entries are
        layered on top of a copy of ``DEFAULT_METADATA_PARSERS``.
    :returns: A list of ``(key, value)`` tuples, or an empty list when the
        key or value is missing and no parser produced a result.
    :raises ParseException: If the stripped line does not start with ``#``
        (this includes empty and whitespace-only lines).
    """
    line = line.strip()

    # startswith() also rejects an empty line with a ParseException, where
    # indexing line[0] would have raised an unrelated IndexError.
    if not line.startswith('#'):
        raise ParseException(
            "Invalid comment format, comment must start with '#'")

    key, value = parse_pair_value(line[1:])

    # Always start from a copy of the defaults so neither the module-level
    # mapping nor the caller's mapping is modified.
    parsers = DEFAULT_METADATA_PARSERS.copy()
    if metadata_parsers:
        parsers.update(metadata_parsers)

    custom_result = None
    if key in parsers:
        custom_result = parsers[key](key, value)
    elif "__fallback__" in parsers:
        custom_result = parsers["__fallback__"](key, value)

    # Allow returning pair instead of list of pairs from metadata parsers
    if custom_result:
        if isinstance(custom_result, tuple):
            key, value = custom_result
            return [(text(key), value)]
        return [(text(key), value) for key, value in custom_result]

    if not key or not value:
        # Lines without value are invalid by default
        return []

    return [(text(key), value)]
示例#2
0
    def test_parse_tree(self):
        """parse_tree() yields a single tree rooted at the token with id 5."""
        # `data` is not defined in this method; it comes from an outer scope.
        trees = parse_tree(data)
        self.assertEqual(len(trees), 1)

        root = trees[0]
        expected_root_repr = "TokenTree<token={id=5, form=jumps}, children=[...]>"
        self.assertEqual(text(root), expected_root_repr)

        # The root token carries every parsed column.
        expected_feats = OrderedDict([
            ("Mood", "Ind"),
            ("Number", "Sing"),
            ("Person", "3"),
            ("Tense", "Pres"),
            ("VerbForm", "Fin"),
        ])
        expected_token = OrderedDict([
            ('id', 5),
            ('form', 'jumps'),
            ('lemma', 'jump'),
            ('upostag', 'VERB'),
            ('xpostag', 'VBZ'),
            ('feats', expected_feats),
            ('head', 0),
            ('deprel', 'root'),
            ('deps', None),
            ('misc', None)
        ])
        self.assertEqual(root.token, expected_token)

        # Direct children, compared through their text representation.
        child_reprs = [text(child) for child in root.children]
        expected_child_reprs = [
            "TokenTree<token={id=4, form=fox}, children=[...]>",
            "TokenTree<token={id=9, form=dog}, children=[...]>",
            "TokenTree<token={id=10, form=.}, children=None>",
        ]
        self.assertEqual(child_reprs, expected_child_reprs)

        expected_text = "The quick brown fox jumps over the lazy dog."
        self.assertEqual(root.metadata["text"], expected_text)

        # Serializing the tree must reproduce the input.
        self.assertEqual(root.serialize(), data)

        expected_tree = dedent("""\
                (deprel:root) form:jumps lemma:jump upostag:VERB [5]
                    (deprel:nsubj) form:fox lemma:fox upostag:NOUN [4]
                        (deprel:det) form:The lemma:the upostag:DET [1]
                        (deprel:amod) form:quick lemma:quick upostag:ADJ [2]
                        (deprel:amod) form:brown lemma:brown upostag:ADJ [3]
                    (deprel:nmod) form:dog lemma:dog upostag:NOUN [9]
                        (deprel:case) form:over lemma:over upostag:ADP [6]
                        (deprel:det) form:the lemma:the upostag:DET [7]
                        (deprel:amod) form:lazy lemma:lazy upostag:ADJ [8]
                    (deprel:punct) form:. lemma:. upostag:PUNCT [10]
            """)
        printed_tree = capture_print(root.print_tree)
        self.assertEqual(printed_tree, expected_tree)
        )
示例#3
0
文件: parser.py 项目: grivaz/conllu
def serialize_field(field):
    """Serialize one parsed field value back into its column string.

    Handled shapes, in order:

    * ``None`` -> ``'_'``
    * ``OrderedDict`` -> ``key=value`` items joined with ``'|'``; a ``None``
      value is written as ``'_'`` and an empty mapping as ``'_'``
    * ``tuple`` -> the text of each item concatenated with no separator
    * ``list`` of 2-item pairs ``(key, value)`` -> ``value:key`` items joined
      with ``'|'``; an empty list is written as ``'_'``
    * anything else -> ``"{}".format(field)``

    :param field: The parsed value of a single column.
    :returns: The serialized string for that column.
    :raises ParseException: If ``field`` is a list containing an item that is
        not of length 2.
    """
    if field is None:
        return '_'

    if isinstance(field, OrderedDict):
        # Joining zero items would give '', i.e. an empty column; use the
        # same placeholder as for None instead.
        if not field:
            return '_'

        return '|'.join(
            '='.join((key, "_" if value is None else value))
            for key, value in field.items())

    if isinstance(field, tuple):
        return "".join([text(item) for item in field])

    if isinstance(field, list):
        # Guard before inspecting items: indexing field[0] on an empty list
        # would raise IndexError.
        if not field:
            return '_'

        # Validate every pair, not just the first, so a malformed later item
        # is reported as a ParseException rather than an unpacking error.
        if any(len(pair) != 2 for pair in field):
            raise ParseException(
                "Can't serialize '{}', invalid format".format(field))

        # Pairs are stored as (key, value) but written as "value:key".
        return "|".join(
            [text(value) + ":" + text(key) for key, value in field])

    return "{}".format(field)
示例#4
0
    def test_parse(self):
        """parse() returns one sentence whose tokens and metadata match the fixture."""
        # `data` is not defined in this method; it comes from an outer scope.
        parsed = parse(data)
        self.assertEqual(len(parsed), 1)

        sentence = parsed[0]
        expected_repr = "TokenList<The, quick, brown, fox, jumps, over, the, lazy, dog, .>"
        self.assertEqual(text(sentence), expected_repr)

        # First token: feats present, misc empty.
        expected_first = OrderedDict([
            ('id', 1),
            ('form', 'The'),
            ('lemma', 'the'),
            ('upostag', 'DET'),
            ('xpostag', 'DT'),
            ('feats', OrderedDict([
                ('Definite', 'Def'),
                ('PronType', 'Art'),
            ])),
            ('head', 4),
            ('deprel', 'det'),
            ('deps', None),
            ('misc', None),
        ])
        self.assertEqual(sentence[0], expected_first)

        # Ninth token: misc is parsed into a mapping as well.
        expected_ninth = OrderedDict([
            ('id', 9),
            ('form', 'dog'),
            ('lemma', 'dog'),
            ('upostag', 'NOUN'),
            ('xpostag', 'NN'),
            ('feats', OrderedDict([('Number', 'Sing')])),
            ('head', 5),
            ('deprel', 'nmod'),
            ('deps', None),
            ('misc', OrderedDict([("SpaceAfter", "No")])),
        ])
        self.assertEqual(sentence[8], expected_ninth)

        forms = [token["form"] for token in sentence]
        expected_forms = "The quick brown fox jumps over the lazy dog .".split(" ")
        self.assertEqual(forms, expected_forms)

        expected_text = "The quick brown fox jumps over the lazy dog."
        self.assertEqual(sentence.metadata["text"], expected_text)
示例#5
0
def parse_line(line, fields, field_parsers=None):
    """Split one token line into columns and parse each named field.

    :param line: The raw line; columns are separated by a tab or by two or
        more spaces.
    :param fields: Field names, matched to columns by position.
    :param field_parsers: Optional mapping of field name -> callable invoked
        as ``parser(columns, index)``. Falls back to
        ``DEFAULT_FIELD_PARSERS`` when omitted or empty.
    :returns: An ``OrderedDict`` of field name -> parsed value, in field order.
    :raises ParseException: If the line has no separator, or a field parser
        raises ``ParseException`` (re-raised with the field name prepended).
    """
    # Be backwards compatible if people called parse_line without field_parsers before
    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS

    columns = re.split(r"\t| {2,}", line)
    if len(columns) == 1:
        raise ParseException(
            "Invalid line format, line must contain either tabs or two spaces."
        )

    parsed = OrderedDict()

    # Allow parsing CoNNL-U files with fewer columns: zip() stops as soon as
    # the column indices run out, so surplus field names are ignored.
    for index, field in zip(range(len(columns)), fields):
        if field not in field_parsers:
            # No dedicated parser: keep the raw column text.
            parsed[text(field)] = columns[index]
            continue

        try:
            value = field_parsers[field](columns, index)
        except ParseException as e:
            raise ParseException(
                "Failed parsing field '{}': ".format(field) + str(e))

        parsed[text(field)] = value

    return parsed
示例#6
0
    def test_multiple_sentences(self):
        """Two blank-line-separated sentences parse into two token lists."""
        two_sentences = dedent("""\
            1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
            2   dog     dog    NOUN   NN   Number=Sing                 5   nmod    _   SpaceAfter=No
            3  .       .      PUNCT  .    _                           5   punct   _   _

            1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
            2   dog     dog    NOUN   NN   Number=Sing                 5   nmod    _   SpaceAfter=No
            3  .       .      PUNCT  .    _                           5   punct   _   _

        """)
        parsed = parse(two_sentences)
        expected_repr = "[TokenList<The, dog, .>, TokenList<The, dog, .>]"
        self.assertEqual(text(parsed), expected_repr)
示例#7
0
def parse_line(line, fields, field_parsers=None):
    """Split one token line into columns and parse each named field.

    :param line: The raw line; columns are separated by a tab or by two or
        more spaces.
    :param fields: Field names, matched to columns by position. Names beyond
        the number of columns present are ignored.
    :param field_parsers: Optional mapping of field name -> callable invoked
        as ``parser(columns, index)``. Falls back to
        ``DEFAULT_FIELD_PARSERS`` when omitted or empty. The mapping passed
        in is not modified.
    :returns: A ``Token`` mapping field name -> parsed value.
    :raises ParseException: If the line has no separator, or a field parser
        raises ``ParseException`` (re-raised with the field name prepended).
    """
    # Be backwards compatible if people called parse_line without field_parsers before.
    # Work on a private copy: registering the aliases below must not leak into
    # the caller's dict or into the shared DEFAULT_FIELD_PARSERS.
    field_parsers = dict(field_parsers or DEFAULT_FIELD_PARSERS)

    # Support xpostag/upostag as aliases for xpos/upos (both ways)
    for short_name, long_name in (("xpos", "xpostag"), ("upos", "upostag")):
        if long_name not in field_parsers and short_name in field_parsers:
            field_parsers[long_name] = field_parsers[short_name]
        elif short_name not in field_parsers and long_name in field_parsers:
            field_parsers[short_name] = field_parsers[long_name]

    line = re.split(r"\t| {2,}", line)

    if len(line) == 1:
        raise ParseException(
            "Invalid line format, line must contain either tabs or two spaces."
        )

    data = Token()

    for i, field in enumerate(fields):
        # Allow parsing CoNNL-U files with fewer columns
        if i >= len(line):
            break

        if field in field_parsers:
            try:
                value = field_parsers[field](line, i)
            except ParseException as e:
                raise ParseException(
                    "Failed parsing field '{}': ".format(field) + str(e))

        else:
            # No dedicated parser: keep the raw column text.
            value = line[i]

        data[text(field)] = value

    return data
示例#8
0
文件: models.py 项目: zoharai/conllu
 def __repr__(self):
     """Return a short summary: the token's id and form, and whether children exist."""
     token_id = text(self.token['id'])
     token_form = self.token['form']
     # Children are never expanded; only their presence is shown.
     children_marker = '[...]' if self.children else 'None'

     token_part = 'token={id=' + token_id + ', form=' + token_form + '}, '
     children_part = 'children=' + children_marker
     return 'TokenTree<' + token_part + children_part + '>'