示例#1
0
    def parse_conll_data(self, corpus, conll_in):
        """Load predicates and their arguments from token-per-line rows.

        Every line of ``conll_in`` is whitespace-split. A row with fewer than
        eight fields ends the current sentence and contributes a newline to
        the document text. For any other row:

        * field 3 is the token; tokens are joined with single spaces to form
          the document text, and spans are ``[start, end)`` character offsets
          into that text;
        * field 7 marks the token as a predicate when it is not ``"-"``;
        * each field from index 8 onwards is read as a bracketed argument
          column: a leading ``(`` opens an argument whose role is the text
          between the bracket and the ``*``, a trailing ``)`` closes it, and
          ``(V*)`` is skipped.

        The i-th predicate of a sentence is paired with the arguments found
        in the i-th argument column.
        NOTE(review): this looks like the CoNLL-2012 SRL layout -- confirm
        against the files actually fed to this method.

        Args:
            corpus: Forwarded to the ``DEDocument`` constructor.
            conll_in: Iterable of text lines (e.g. an open file).

        Returns:
            The ``DEDocument`` with text, predicates, fillers and argument
            mentions added.
        """
        doc = DEDocument(corpus)

        # Pieces of the document text; joined once at the end instead of
        # growing a string with ``+=`` on every token.
        text_pieces = []
        offset = 0

        # Per-sentence state, reset by ``flush_sentence``.
        open_arg_text = []  # running surface text of the open arg per column
        sent_predicates = []  # (start, end, token) in order of appearance
        sent_args = defaultdict(list)  # column -> [[role, start, end, text]]

        # (predicate, arguments) pairs collected over the whole input.
        props = []

        def flush_sentence():
            """Move the finished sentence into ``props`` and reset the state."""
            for pred_index, predicate in enumerate(sent_predicates):
                props.append((predicate, sent_args[pred_index]))
            # The argument lists stay alive through ``props``; only the
            # containers that referenced them are emptied here.
            sent_predicates.clear()
            sent_args.clear()
            open_arg_text.clear()

        for line in conll_in:
            parts = line.strip().split()
            if len(parts) < 8:
                # Separator (or other short) row: sentence boundary.
                text_pieces.append("\n")
                offset += 1
                flush_sentence()
                continue

            token = parts[3]
            sense = parts[7]
            pb_annos = parts[8:]

            if not open_arg_text:
                # First row of a sentence: one slot per argument column.
                open_arg_text.extend([None] * len(pb_annos))

            start = offset
            end = start + len(token)

            text_pieces.append(token + " ")
            offset += len(token) + 1

            # Extend every argument that is currently open with this token.
            for col, pending in enumerate(open_arg_text):
                if pending:
                    open_arg_text[col] = pending + " " + token

            if sense != "-":
                sent_predicates.append((start, end, token))

            for col, anno in enumerate(pb_annos):
                if anno == "(V*)":
                    # The predicate itself is not recorded as an argument.
                    continue

                if anno.startswith("("):
                    role = anno.strip("(").strip(")").strip("*")
                    sent_args[col].append([role, start])
                    open_arg_text[col] = token
                if anno.endswith(")"):
                    sent_args[col][-1].append(end)
                    sent_args[col][-1].append(open_arg_text[col])
                    # Empty string marks the column as having no open arg.
                    open_arg_text[col] = ""

        # The input may end without a trailing separator row; flush so the
        # propositions of the last sentence are not lost.
        flush_sentence()

        doc.set_text("".join(text_pieces))

        for (p_start, p_end, p_token), args in props:
            hopper = doc.add_hopper()
            pred = doc.add_predicate(hopper, Span(p_start, p_end), p_token)

            if pred is None:
                continue

            for role, arg_start, arg_end, filler_text in args:
                filler = doc.add_filler(Span(arg_start, arg_end), filler_text)
                doc.add_argument_mention(pred, filler.aid, role)

        return doc
示例#2
0
    def parse_ace_data(self, corpus, source_file, anno_file):
        """Build a document from a source text file and its XML annotations.

        The text returned by ``self.get_source_text`` becomes the document
        text. The annotation file is parsed with ``ET.parse`` and only its
        first ``document`` element is used: its ``entity`` elements become
        entities with one mention per head ``charseq``, its ``event``
        elements become hoppers with one predicate per anchor ``charseq``,
        and each ``event_argument`` whose ``REFID`` names a known entity is
        attached through ``find_close_mention``.

        Args:
            corpus: Forwarded to the ``DEDocument`` constructor.
            source_file: Path of the source text file.
            anno_file: Path (or file object) of the XML annotation file.

        Returns:
            The populated ``DEDocument``, or ``None`` when the annotation
            file contains no ``document`` element.
        """
        with open(source_file) as source_in:
            doc = DEDocument(corpus)
            doc.set_text(self.get_source_text(source_in))

            annotation_root = ET.parse(anno_file).getroot()

            # Only the first <document> element is consumed.
            xml_doc = next(annotation_root.iter("document"), None)
            if xml_doc is None:
                return None

            doc.set_id(xml_doc.attrib["DOCID"])

            # Entities: remember the mentions created for each entity id so
            # event arguments can be resolved against them below.
            mentions_by_entity = defaultdict(list)

            for entity_node in xml_doc.iter("entity"):
                full_type = "_".join(
                    (entity_node.attrib["TYPE"],
                     entity_node.attrib["SUBTYPE"])
                )
                entity_id = entity_node.attrib["ID"]

                ent = doc.add_entity(full_type, entity_id)

                for mention_node in entity_node:
                    for head in mention_node.iter("head"):
                        for charseq in head.iter("charseq"):
                            first = int(charseq.attrib["START"])
                            last = int(charseq.attrib["END"])

                            # END is bumped by one for the span
                            # (presumably END is inclusive -- confirm).
                            head_span = Span(first, last + 1)

                            mention = doc.add_entity_mention(
                                ent,
                                head_span,
                                charseq.text,
                                mention_node.attrib["ID"],
                                entity_type=full_type,
                                validate=False,
                            )
                            mentions_by_entity[entity_id].append(mention)

            # Events: one hopper per <event>, one predicate per anchor.
            for event_node in xml_doc.iter("event"):
                frame_type = "_".join(
                    (event_node.attrib["TYPE"],
                     event_node.attrib["SUBTYPE"])
                )
                hopper = doc.add_hopper(event_node.attrib["ID"])

                event_mentions = []

                for child in event_node:
                    # The child's ID is read only once an anchor charseq is
                    # found, so children without an anchor need no ID.
                    for anchor in child.iter("anchor"):
                        for charseq in anchor.iter("charseq"):
                            first = int(charseq.attrib["START"])
                            last = int(charseq.attrib["END"])
                            trigger_span = Span(first, last + 1)

                            event_mentions.append(
                                doc.add_predicate(
                                    hopper,
                                    trigger_span,
                                    charseq.text,
                                    eid=child.attrib["ID"],
                                    frame_type=frame_type,
                                    validate=False,
                                )
                            )

                for arg_node in event_node.iter("event_argument"):
                    role = arg_node.attrib["ROLE"]
                    candidates = mentions_by_entity[arg_node.attrib["REFID"]]

                    # References with no recorded entity mentions are skipped.
                    if not candidates:
                        continue

                    nearest_ent, nearest_evm, _ = find_close_mention(
                        event_mentions, candidates)
                    doc.add_argument_mention(nearest_evm, nearest_ent.aid,
                                             role)

            return doc