Exemplo n.º 1
0
    def test_empty(self):
        """A line whose ``id`` column cannot be parsed raises a ParseException naming that field."""
        bad_line = "invalid_id\t_\t_\t_\t_\t_\t_\t_\t_\t"
        with self.assertRaises(ParseException) as raised:
            parse_line(bad_line, fields=DEFAULT_FIELDS)

        prefix = "Failed parsing field 'id'"
        message = str(raised.exception)
        # Only the start of the message is compared; anything after the prefix is not checked.
        self.assertEqual(message[:len(prefix)], prefix)
Exemplo n.º 2
0
    def test_parse_line_with_spaces(self):
        """A line separated by single spaces raises a ParseException starting with 'Invalid line format'."""
        space_separated = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
        with self.assertRaises(ParseException) as raised:
            parse_line(space_separated, fields=DEFAULT_FIELDS)

        prefix = "Invalid line format"
        message = str(raised.exception)
        # Only the start of the message is compared; anything after the prefix is not checked.
        self.assertEqual(message[:len(prefix)], prefix)
Exemplo n.º 3
0
def lazy_parse(text: str, fields: Tuple = DEFAULT_FIELDS):
    """Lazily yield one list of parsed lines per blank-line-separated sentence.

    Empty lines and lines whose first non-blank character is ``#`` are
    skipped; every remaining line is passed to ``parse_line`` with ``fields``.
    Empty chunks between separators yield nothing.
    """
    for block in text.split("\n\n"):
        if not block:
            continue
        parsed = []
        for row in block.split("\n"):
            if not row or row.strip().startswith("#"):
                continue
            parsed.append(parse_line(row, fields))
        yield parsed
Exemplo n.º 4
0
def parse_token_and_metadata(data, fields=None, field_parsers=None):
    """Split raw sentence text into parsed token lines and ``text`` comment values.

    Args:
        data: Newline-separated sentence text. Must be truthy.
        fields: Field names handed to ``parse_line``; falls back to
            ``DEFAULT_FIELDS`` when falsy.
        field_parsers: Parsers handed to ``parse_line``; falls back to
            ``DEFAULT_FIELD_PARSERS`` when falsy.

    Returns:
        A ``(tokens, texts)`` tuple. ``tokens`` holds the ``parse_line`` result
        of every non-blank line that does not start with ``#``. ``texts`` holds
        the value of every comment line that ``parse_comment_line`` reports
        under the name ``"text"``.

    Raises:
        ParseException: If ``data`` is falsy.
    """
    if not data:
        raise ParseException(
            "Can't create TokenList, no data sent to constructor.")

    active_fields = fields or DEFAULT_FIELDS
    active_parsers = field_parsers or DEFAULT_FIELD_PARSERS

    tokens, texts = [], []

    # Surrounding whitespace is removed first; lines that end up empty are dropped.
    stripped_lines = (raw.strip() for raw in data.split('\n'))
    for entry in filter(None, stripped_lines):
        if not entry.startswith('#'):
            tokens.append(parse_line(entry, active_fields, active_parsers))
            continue

        key, value = parse_comment_line(entry)
        if key == "text":
            texts.append(value)

    return tokens, texts
Exemplo n.º 5
0
 def test_parse_line_nullable_fields(self):
     """With ``_`` in all ten columns, some fields parse to ``None`` and the rest keep ``'_'``."""
     line = "_\t_\t_\t_\t_\t_\t_\t_\t_\t_"
     parsed = parse_line(line, fields=DEFAULT_FIELDS)
     expected = Token([
         ('id', None),
         ('form', '_'),
         ('lemma', '_'),
         ('upos', '_'),
         ('xpos', None),
         ('feats', None),
         ('head', None),
         ('deprel', '_'),
         ('deps', None),
         ('misc', None),
     ])
     self.assertEqual(parsed, expected)
Exemplo n.º 6
0
def lazy_parse(text: str, fields: Tuple[str, ...] = DEFAULT_FIELDS):
    """Lazily yield ``(annotation, features)`` for each blank-line-separated sentence.

    Comment lines (first non-blank character ``#``) fill ``features``:

    * a line starting with ``# prompt`` stores everything after its first
      ``:`` under the key ``'prompt'``;
    * any other comment is split on whitespace into ``name:value`` items,
      each stored as ``features[name] = value``.

    Every other non-blank line is split on whitespace into
    ``index_label, *columns, label``. The columns are tab-joined and passed
    to ``parse_line`` with ``fields``; the result is then updated with
    ``unpack_token_index(index_label)`` and ``{'label': int(label)}``.

    Blank lines inside a sentence and whitespace-only sentences (for example
    the residue of a trailing newline) are skipped instead of failing on
    unpacking an empty split.

    Yields:
        A tuple ``(annotation, features)`` where ``annotation`` is the list of
        per-line outputs and ``features`` is the dict built from comments.

    Raises:
        IndexError: If a ``# prompt`` line contains no ``:``.
        ValueError: If a feature item contains no ``:``, if a token line has
            fewer than two whitespace-separated items, or if the label is
            not an integer.
    """
    for sentence in text.split("\n\n"):
        if not sentence.strip():
            continue
        annotation = []
        features = {}
        for raw_line in sentence.split("\n"):
            line = raw_line.strip()
            if not line:
                # Stray blank line (e.g. a trailing newline): nothing to parse.
                continue

            if line.startswith("#"):
                if line.startswith('# prompt'):
                    # maxsplit=1 keeps colons that belong to the prompt text.
                    features['prompt'] = line.split(':', 1)[1]
                else:
                    for new_feature in line[1:].split():
                        # maxsplit=1 lets a value contain ':' itself.
                        name, value = new_feature.split(':', 1)
                        features[name] = value
                continue

            index_label, *data = line.split()
            *data, label = data

            output = parse_line('\t'.join(data), fields)
            output.update(unpack_token_index(index_label))
            output.update({'label': int(label)})

            annotation.append(output)

        yield annotation, features
Exemplo n.º 7
0
 def test_parse_line_only_id_head(self):
     """Requesting only ``id`` and ``form`` returns just those two parsed columns."""
     line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
     parsed = parse_line(line, fields=["id", "form"])
     expected = OrderedDict([('id', 1), ('form', 'The')])
     self.assertEqual(parsed, expected)
Exemplo n.º 8
0
 def test_parse_line_two_spaces(self):
     """Columns separated by two spaces are parsed like tab-separated ones."""
     line = "1  The  the  DET  DT  Definite=Def|PronType=Art  4  det  _  _"
     parsed = parse_line(line, fields=["id", "form"])
     expected = OrderedDict([('id', 1), ('form', 'The')])
     self.assertEqual(parsed, expected)
Exemplo n.º 9
0
 def test_parse_line(self):
     """A full ten-column line parses into a Token with typed values per field."""
     line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
     parsed = parse_line(line, fields=DEFAULT_FIELDS)
     expected_feats = Token([('Definite', 'Def'), ('PronType', 'Art')])
     expected = Token([
         ('id', 1),
         ('form', 'The'),
         ('lemma', 'the'),
         ('upos', 'DET'),
         ('xpos', 'DT'),
         ('feats', expected_feats),
         ('head', 4),
         ('deprel', 'det'),
         ('deps', None),
         ('misc', None),
     ])
     self.assertEqual(parsed, expected)
Exemplo n.º 10
0
 def test_parse_line_fewer_columns(self):
     """A line with only five columns yields a Token holding just those five fields."""
     line = "1\tThe\tthe\tDET\tDT"
     parsed = parse_line(line, fields=DEFAULT_FIELDS)
     expected = Token([
         ('id', 1), ('form', 'The'), ('lemma', 'the'),
         ('upos', 'DET'), ('xpos', 'DT'),
     ])
     self.assertEqual(parsed, expected)
Exemplo n.º 11
0
 def test_parse_line(self):
     """A full ten-column line parsed with default arguments gives the expected ordered mapping."""
     line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
     parsed = parse_line(line)
     expected_feats = OrderedDict([('Definite', 'Def'), ('PronType', 'Art')])
     expected = OrderedDict([
         ('id', 1),
         ('form', 'The'),
         ('lemma', 'the'),
         ('upostag', 'DET'),
         ('xpostag', 'DT'),
         ('feats', expected_feats),
         ('head', 4),
         ('deprel', 'det'),
         ('deps', None),
         ('misc', None),
     ])
     self.assertEqual(parsed, expected)
Exemplo n.º 12
0
 def test_parse_custom_fieldparsers(self):
     """A parser supplied through ``field_parsers`` is applied to the ``id`` column."""
     def repeat_five_times(line, i):
         return line[i] * 5

     parsed = parse_line(
         "1\t2",
         fields=["id"],
         field_parsers={"id": repeat_five_times},
     )
     self.assertEqual(parsed, Token([('id', "11111")]))
Exemplo n.º 13
0
def _lazy_parse(text: str, fields=DEFAULT_FIELDS):
    """
    Generate, for each sentence of a conllu string, the list of
    ``parse_line`` results for its token lines.

    Sentences are separated by a blank line; empty lines and lines whose
    first non-blank character is ``#`` are ignored.
    """
    def is_token_line(candidate):
        return bool(candidate) and not candidate.strip().startswith("#")

    for chunk in text.split("\n\n"):
        if not chunk:
            continue
        yield [
            parse_line(row, fields)
            for row in chunk.split("\n")
            if is_token_line(row)
        ]
Exemplo n.º 14
0
 def test_parse_fieldparsers_alias_two_ways(self):
     """Parsers keyed ``xpos``/``upostag`` are applied to fields named ``xpostag``/``upos``."""
     def repeat_five_times(line, i):
         return line[i] * 5

     parsers = {"xpos": repeat_five_times, "upostag": repeat_five_times}
     parsed = parse_line("1\t2", fields=["xpostag", "upos"], field_parsers=parsers)
     expected = Token([('xpostag', "11111"), ('upos', "22222")])
     self.assertEqual(parsed, expected)
Exemplo n.º 15
0
 def test_parse_fieldparsers_doesnt_alias_when_exists(self):
     """A parser registered under the exact field name wins over its alias."""
     def repeat_five_times(line, i):
         return line[i] * 5

     def keep_as_is(line, i):
         return line[i]

     parsers = {
         "xpos": repeat_five_times,
         "xpostag": keep_as_is,
         "upos": repeat_five_times,
         "upostag": keep_as_is,
     }
     parsed = parse_line("1\t2", fields=["xpostag", "upostag"], field_parsers=parsers)
     expected = Token([('xpostag', "1"), ('upostag', "2")])
     self.assertEqual(parsed, expected)
Exemplo n.º 16
0
    def extract_token_info_from_companion_data(self):
        """Collect forms, lemmas, POS tags and token ranges from ``self.companion``.

        Each companion entry is tab-joined and parsed with ``parse_line``
        using ``DEFAULT_FIELDS``. ``None`` values are dropped from the
        ``tokens``, ``lemmas`` and ``pos_tags`` lists, whereas ``token_range``
        has one tuple per parsed line, built from the colon-separated integers
        of the first value in that line's ``misc`` mapping.

        Returns:
            A dict with the keys ``"tokens"``, ``"lemmas"``, ``"pos_tags"``
            and ``"token_range"``.
        """
        parsed_rows = [
            parse_line('\t'.join(columns), DEFAULT_FIELDS)
            for columns in self.companion
        ]

        def non_null(key):
            # Values of ``key`` across all rows, skipping rows where it is None.
            return [row[key] for row in parsed_rows if row[key] is not None]

        tokens = non_null("form")
        lemmas = non_null("lemma")
        pos_tags = non_null("upostag")

        token_range = []
        for row in parsed_rows:
            first_misc = list(row["misc"].values())[0]
            token_range.append(tuple(int(piece) for piece in first_misc.split(':')))

        return {
            "tokens": tokens,
            "lemmas": lemmas,
            "pos_tags": pos_tags,
            "token_range": token_range,
        }
Exemplo n.º 17
0
def lazy_parse(text: str, fields: Tuple = DEFAULT_FIELDS):
    """Lazily yield ``(annotations, arc_indices, arc_labels)`` per sentence.

    Sentences are separated by a blank line. Within a sentence, empty lines
    and lines whose first non-blank character is ``#`` are skipped; each
    remaining line is parsed with ``parse_line``.

    ``arc_indices`` holds one ``(child, parent)`` pair per token whose
    ``head`` is not 0, and ``arc_labels`` holds the matching ``deprel``
    values in the same order.
    """
    for block in text.split("\n\n"):
        if not block:
            continue

        annotations = []
        for row in block.split("\n"):
            if row and not row.strip().startswith("#"):
                annotations.append(parse_line(row, fields))

        child_parent_pairs = []
        relations = []
        for position, token in enumerate(annotations):
            parent = token["head"]
            # A head of 0 marks the root, which has no incoming arc.
            if parent != 0:
                # Heads are 1-indexed, so subtract one to get the parent's
                # position in ``annotations``.
                child_parent_pairs.append((position, parent - 1))
                relations.append(token["deprel"])

        yield annotations, child_parent_pairs, relations
Exemplo n.º 18
0
def lazy_parse(text: str, fields: Tuple = DEFAULT_FIELDS):
    """Lazily yield, per blank-line-separated sentence, its parsed token lines.

    Empty lines and lines whose first non-blank character is ``#`` are left
    out; the rest are parsed with ``parse_line`` using ``fields``.
    """
    non_empty_sentences = (part for part in text.split("\n\n") if part)
    for sentence_text in non_empty_sentences:
        token_lines = [
            entry for entry in sentence_text.split("\n")
            if entry and not entry.strip().startswith("#")
        ]
        yield [parse_line(entry, fields) for entry in token_lines]
Exemplo n.º 19
0
 def test_parse_line_with_no_tabs(self):
     """A line separated by single spaces instead of tabs raises ParseException."""
     space_separated = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
     self.assertRaises(ParseException, parse_line, space_separated)
Exemplo n.º 20
0
 def test_parse_line_with_spaces(self):
     """A line separated by single spaces raises ParseException when parsed with DEFAULT_FIELDS."""
     space_separated = "1 The the DET DT Definite=Def|PronType=Art 4 det _ _"
     self.assertRaises(
         ParseException, parse_line, space_separated, fields=DEFAULT_FIELDS)
Exemplo n.º 21
0
 def test_empty(self):
     """A line with ``invalid_id`` in the first column raises ParseException."""
     bad_line = "invalid_id\t_\t_\t_\t_\t_\t_\t_\t_\t"
     self.assertRaises(
         ParseException, parse_line, bad_line, fields=DEFAULT_FIELDS)
Exemplo n.º 22
0
# Three positional command-line arguments: the CoNLL companion file, the
# input MRP file and the path of the augmented output file.
parser.add_argument("conll", type=str, help="Augment CoNLL file")
parser.add_argument("mrp", type=str, help="Input MRP file")
parser.add_argument("output", type=str, help="Output Augmented file")
args = parser.parse_args()

conll_file = args.conll
mrp_file = args.mrp
out_file = args.output

# Maps a sentence id to the list of parsed token lines of that sentence.
augs = {}
with open(conll_file, 'r', encoding='utf8') as f_c:
    # Sentences in the companion file are separated by a blank line.
    conlls = f_c.read().split('\n\n')
    for conll in conlls:
        # The first line of the chunk, minus its first character, is the id
        # (presumably a "#<id>" header line -- confirm against the data).
        id = conll.split('\n')[0][1:]
        # Every line after the first one is parsed as a token line.
        augs[id] = [
            parse_line(line, DEFAULT_FIELDS)
            for line in conll.strip().split('\n')[1:]
        ]
    #print augs.keys()
# Copy MRP records (one JSON object per line) to the output file, attaching
# the companion tokens found for the record's id. Records whose id has no
# companion entry are reported on stdout and not written.
with open(mrp_file, 'r', encoding='utf8') as f_m, open(out_file,
                                                       'w',
                                                       encoding='utf8') as fo:
    line = f_m.readline()
    # NOTE(review): nothing in the visible part of this loop reads the next
    # line, so `line` never changes here. Unless a further readline() follows
    # this excerpt, the loop would not terminate -- confirm.
    while line:
        # OrderedDict keeps the key order of the input record.
        mrp = json.loads(line, object_pairs_hook=collections.OrderedDict)
        id = mrp['id']
        if id not in augs:
            print("id:{} not in companion".format(id))
        else:
            mrp['companion'] = dict(sent_id=id, toks=augs[id])
            fo.write((json.dumps(mrp) + '\n'))