def parse_conll_data(self, corpus, conll_in):
    """Build a ``DEDocument`` from CoNLL-style predicate/argument lines.

    Each line is split on whitespace. A line with fewer than eight fields
    is treated as a sentence boundary: a newline is appended to the
    document text and the predicates/arguments collected for the sentence
    are recorded. On any other line the fourth field is the token, the
    eighth field is the sense (anything other than ``"-"`` marks the token
    as a predicate), and every field from the ninth on is one bracketed
    argument column. The i-th argument column is paired with the i-th
    predicate found in the same sentence.

    The document text is the tokens joined by single spaces, with a
    newline per boundary line; all spans are character offsets into it.

    Args:
        corpus: Passed through to the ``DEDocument`` constructor.
        conll_in: Iterable of lines (for example an open text file).

    Returns:
        The populated ``DEDocument``.
    """
    text_pieces = []
    offset = 0

    # Per-sentence state; emptied at every sentence boundary.
    open_arg_text = []  # per column: text of the span being built
    sent_predicates = []  # (start, end, token) in order of appearance
    sent_args = defaultdict(list)  # column -> [[role, start, end, text]]

    doc = DEDocument(corpus)

    # (predicate, argument list) pairs across the whole input.
    props = []

    def flush_sentence():
        """Record the finished sentence and reset the per-sentence state."""
        for column, predicate in enumerate(sent_predicates):
            props.append((predicate, sent_args[column]))
        sent_predicates.clear()
        sent_args.clear()
        open_arg_text.clear()

    for line in conll_in:
        parts = line.strip().split()

        if len(parts) < 8:
            text_pieces.append("\n")
            offset += 1
            flush_sentence()
            continue

        token = parts[3]
        sense = parts[7]
        pb_annos = parts[8:]

        # First token of a sentence: allocate one text slot per column.
        if not open_arg_text:
            open_arg_text.extend([None] * len(pb_annos))

        start = offset
        end = start + len(token)
        text_pieces.append(token + " ")
        offset = end + 1

        # Extend every span that was opened on an earlier token.
        for column, partial in enumerate(open_arg_text):
            if partial:
                open_arg_text[column] = partial + " " + token

        if sense != "-":
            sent_predicates.append((start, end, token))

        for column, anno in enumerate(pb_annos):
            # The single-token predicate marker is not an argument.
            if anno == "(V*)":
                continue

            if anno.startswith("("):
                role = anno.strip("(").strip(")").strip("*")
                sent_args[column].append([role, start])
                open_arg_text[column] = token

            if anno.endswith(")"):
                current = sent_args[column][-1]
                current.append(end)
                current.append(open_arg_text[column])
                # An empty string is falsy, so the span stops growing.
                open_arg_text[column] = ""

    # Input that does not end with a boundary line would otherwise lose
    # its final sentence.
    flush_sentence()

    doc.set_text("".join(text_pieces))

    for (p_start, p_end, p_token), args in props:
        hopper = doc.add_hopper()
        pred = doc.add_predicate(hopper, Span(p_start, p_end), p_token)

        if pred is None:
            continue

        for role, arg_start, arg_end, filler_text in args:
            filler = doc.add_filler(Span(arg_start, arg_end), filler_text)
            doc.add_argument_mention(pred, filler.aid, role)

    return doc
def parse_ace_data(self, corpus, source_file, anno_file):
    """Build a ``DEDocument`` from a source text file and its XML annotation.

    The document text is whatever ``self.get_source_text`` returns for the
    opened ``source_file``. ``anno_file`` is parsed with ``ET.parse``; for
    every ``document`` element the ``DOCID`` is set on the document, then
    its ``entity`` and ``event`` elements are loaded:

    * each ``charseq`` under a ``head`` of an entity child becomes an
      entity mention typed ``TYPE_SUBTYPE``;
    * each ``charseq`` under an ``anchor`` of an event child becomes a
      predicate in the event's hopper, with frame type ``TYPE_SUBTYPE``;
    * each ``event_argument`` whose ``REFID`` has recorded entity mentions
      is attached using the pair returned by ``find_close_mention``.

    Args:
        corpus: Passed through to the ``DEDocument`` constructor.
        source_file: Path of the source text file.
        anno_file: Path of the XML annotation file.

    Returns:
        The populated ``DEDocument``.
    """

    def span_of(charseq_node):
        # END is bumped by one when building the Span; presumably the
        # annotation uses an inclusive end offset -- TODO confirm.
        begin = int(charseq_node.attrib["START"])
        last = int(charseq_node.attrib["END"])
        return Span(begin, last + 1)

    with open(source_file) as source_in:
        doc = DEDocument(corpus)
        doc.set_text(self.get_source_text(source_in))

    root = ET.parse(anno_file).getroot()

    for xml_doc in root.iter("document"):
        doc.set_id(xml_doc.attrib["DOCID"])

        # Entities first: event arguments refer to them by entity id.
        mentions_by_entity = defaultdict(list)

        for entity in xml_doc.iter("entity"):
            full_type = "_".join(
                (entity.attrib["TYPE"], entity.attrib["SUBTYPE"])
            )
            entity_id = entity.attrib["ID"]
            ent = doc.add_entity(full_type, entity_id)

            for child in entity:
                for head in child.iter("head"):
                    for charseq in head.iter("charseq"):
                        # The child's ID is read only here, so children
                        # without a head never need one.
                        mention = doc.add_entity_mention(
                            ent,
                            span_of(charseq),
                            charseq.text,
                            child.attrib["ID"],
                            entity_type=full_type,
                            validate=False,
                        )
                        mentions_by_entity[entity_id].append(mention)

        # Events: anchors become predicates, then arguments are linked.
        for event_node in xml_doc.iter("event"):
            frame_type = "_".join(
                (event_node.attrib["TYPE"], event_node.attrib["SUBTYPE"])
            )
            hopper = doc.add_hopper(event_node.attrib["ID"])

            event_mentions = []
            for child in event_node:
                for anchor in child.iter("anchor"):
                    for charseq in anchor.iter("charseq"):
                        # As above: the ID is looked up only for children
                        # that actually carry an anchor.
                        predicate = doc.add_predicate(
                            hopper,
                            span_of(charseq),
                            charseq.text,
                            eid=child.attrib["ID"],
                            frame_type=frame_type,
                            validate=False,
                        )
                        event_mentions.append(predicate)

            for arg_node in event_node.iter("event_argument"):
                role = arg_node.attrib["ROLE"]
                candidates = mentions_by_entity[arg_node.attrib["REFID"]]

                # Arguments that point at something with no recorded
                # entity mention are skipped.
                if not candidates:
                    continue

                chosen_ent, chosen_evm, _ = find_close_mention(
                    event_mentions, candidates
                )
                doc.add_argument_mention(chosen_evm, chosen_ent.aid, role)

    return doc