def write_formatted(out, annotations_columns, annotations_structs, columns, structs, structs_count, text):
    """
    The 'formatted' XML part of the 'export' function: export XML with the same
    whitespace and indentation as in the original.
    """
    txt, anchor2pos, pos2anchor = util.corpus.read_corpus_text(text)
    structs_order = ["__token__"] + [s[0] for s in structs]
    anchors = defaultdict(dict)
    for elem, attrs in structs:
        for attr in attrs:
            struct = util.read_annotation(annotations_structs[attr[1]][0])
            for edge in struct:
                if util.edgeStart(edge) == util.edgeEnd(edge):
                    # Zero-width edge: open and close the element at the same anchor
                    anchors[util.edgeStart(edge)].setdefault("structs", {}).setdefault((elem, anchor2pos[util.edgeEnd(edge)], "close"), []).append((attr[0], struct[edge]))
                else:
                    anchors[util.edgeStart(edge)].setdefault("structs", {}).setdefault((elem, anchor2pos[util.edgeEnd(edge)]), []).append((attr[0], struct[edge]))
                    anchors[util.edgeEnd(edge)].setdefault("close", set()).add((elem, edge))

    for n, annot in enumerate(annotations_columns):
        n += structs_count
        for tok, value in util.read_annotation_iteritems(annot):
            if n > structs_count:  # Any column except the first (the word)
                value = "|" if value == "|/|" else value
            anchors[util.edgeStart(tok)].setdefault("token", []).append(value.replace("\n", " "))
            if n == structs_count:
                anchors[util.edgeEnd(tok)].setdefault("close", set()).add(("__token__", None))

    currpos = 0
    with open(out, "w") as OUT:
        OUT.write("<corpus>")
        for pos, anchor in sorted(pos2anchor.items(), key=lambda x: x[0]):
            # Escape "&" first so that the entities produced for "<" and ">" are not double-escaped
            OUT.write(txt[currpos:pos].replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;"))
            if anchor in anchors:
                if "close" in anchors[anchor]:
                    if ("__token__", None) in anchors[anchor]["close"]:
                        OUT.write("</w>")
                    OUT.write("".join("</%s>" % e[0]
                                      for e in sorted(anchors[anchor]["close"], key=lambda x: structs_order.index(x[0]))
                                      if not e[0] == "__token__"))

                if "structs" in anchors[anchor]:
                    for elem, annot in sorted(anchors[anchor]["structs"].items(), key=lambda x: (-x[0][1], -structs_order.index(x[0][0]))):
                        if elem not in ("close", "token"):
                            attrstring = "".join(' %s="%s"' % (attr, val.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;"))
                                                 for (attr, val) in annot if val and not attr == UNDEF)
                            # A key of length 3 marks a zero-width element, written as self-closing
                            close = "/" if len(elem) == 3 else ""
                            OUT.write("<%s%s%s>" % (elem[0], attrstring, close))

                if "token" in anchors[anchor]:
                    attrstring = "".join(' %s="%s"' % (columns[i + 1], a.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;"))
                                         for i, a in enumerate(anchors[anchor]["token"][1:]) if a)
                    OUT.write("<w%s>" % attrstring)

            currpos = pos
        OUT.write("</corpus>")
    util.log.info("Exported: %s", out)
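# The escaping order in write_formatted() matters: "&" must be replaced before
# "<" and ">", or the "&" inside the freshly produced "&lt;"/"&gt;" entities
# would be escaped a second time. A minimal, self-contained illustration
# (not used by the pipeline):
def _xml_escape_example(s):
    """
    >>> _xml_escape_example('1 < 2 & "x"')
    '1 &lt; 2 &amp; &quot;x&quot;'
    """
    return s.replace("&", "&amp;").replace('"', "&quot;").replace("<", "&lt;").replace(">", "&gt;")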
def align_texts(sentence1, sentence2, link1, link2, sent_parents1, sent_parents2, out_sentlink1, out_sentlink2):
    """Make a more fine-grained sentence alignment between the current text (1)
    and a parallel reference text (2).
    - sentence1 and sentence2 contain information about which word IDs there are in each sentence
    - link1 and link2 are existing annotations for the link IDs in the two texts
    - sent_parents1 and sent_parents2 contain information about which sentences there are in each of the old sentence links
    - out_sentlink1 and out_sentlink2 are the resulting annotations for the new sentence links
    """
    REVERSED_LINK2 = {v: k for k, v in util.read_annotation(link2).items()}
    SENTPARENTS1 = util.read_annotation(sent_parents1)
    SENTPARENTS2 = util.read_annotation(sent_parents2)
    SENT1 = util.read_annotation(sentence1)
    SENT2 = util.read_annotation(sentence2)

    OUT_SENTLINK1 = {}
    OUT_SENTLINK2 = {}

    linkcounter = 0

    # Loop through existing links and split them into smaller units if possible
    # (only if both links have text)
    for linkkey1, linkid in util.read_annotation_iteritems(link1):
        linkkey2 = REVERSED_LINK2[linkid]
        if linkkey1 in SENTPARENTS1 and linkkey2 in SENTPARENTS2:
            linkedsents1 = []
            linkedsents2 = []
            for sentid in SENTPARENTS1[linkkey1].split():
                linkedsents1.append((sentid, SENT1[sentid].split()))
            for sentid in SENTPARENTS2[linkkey2].split():
                linkedsents2.append((sentid, SENT2[sentid].split()))

            for s1, s2 in gachalign(linkedsents1, linkedsents2, mean="gacha"):
                linkcounter += 1
                if s1:
                    newlink1 = util.mkEdge('link', [util.edgeStart(s1[0]), util.edgeEnd(s1[-1])])
                    OUT_SENTLINK1[newlink1] = str(linkcounter)
                if s2:
                    newlink2 = util.mkEdge('link', [util.edgeStart(s2[0]), util.edgeEnd(s2[-1])])
                    OUT_SENTLINK2[newlink2] = str(linkcounter)

        # Keep the existing link annotation if it has text in one language
        # but is empty in the other one
        elif linkkey1 in SENTPARENTS1 or linkkey2 in SENTPARENTS2:
            linkcounter += 1
            newlink1 = util.mkEdge('link', [util.edgeStart(linkkey1), util.edgeEnd(linkkey1)])
            OUT_SENTLINK1[newlink1] = str(linkcounter)
            newlink2 = util.mkEdge('link', [util.edgeStart(linkkey2), util.edgeEnd(linkkey2)])
            OUT_SENTLINK2[newlink2] = str(linkcounter)

    util.write_annotation(out_sentlink1, OUT_SENTLINK1)
    util.write_annotation(out_sentlink2, OUT_SENTLINK2)
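# Sketch of the alignment step above (illustrative only; the IDs are made up).
# gachalign() gets one (sentence_id, [word_ids]) list per language and yields
# Gale-Church-style "beads": pairs of (possibly empty) lists of sentence IDs.
# A 2-1 bead, for instance, links two source sentences to one target sentence:
#
#     linkedsents1 = [("s1", ["w1", "w2"]), ("s2", ["w3", "w4"])]
#     linkedsents2 = [("t1", ["v1", "v2", "v3"])]
#     # gachalign(linkedsents1, linkedsents2, mean="gacha") might yield:
#     #     [(["s1", "s2"], ["t1"])]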
def order(chunknr, edge, _value):
    value = anchors[chunknr][util.edgeStart(edge)]  # Position in corpus
    return (chunknr, value)
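# order() serves as a sort key: annotations are ordered first by chunk number,
# then by the corpus position of the edge's start anchor (looked up in the
# enclosing anchors dict). A hypothetical call site, assuming items is a list
# of (chunknr, edge, value) triples:
#
#     items.sort(key=lambda item: order(*item))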
def run_wsd(wsdjar, sense_model, context_model, out, sentence, word, ref, lemgram, saldo, pos, text,
            sensefmt=util.SCORESEP + "%.3f", default_prob="-1", encoding=util.UTF8):
    """
    Run the word sense disambiguation tool (saldowsd.jar) to add probabilities
    to the saldo annotation. Unanalyzed senses (e.g. multiword expressions)
    receive the probability value given by default_prob.
    - wsdjar is the name of the Java program to be used for the WSD
    - sense_model and context_model are the models to be used with wsdjar
    - out is the resulting annotation file
    - sentence is an existing annotation for sentences and their children (words)
    - word is an existing annotation for wordforms
    - ref is an existing annotation for word references
    - lemgram and saldo are existing annotations for inflection tables and meanings
    - pos is an existing annotation for part-of-speech
    - text is an existing file with the input text and its anchors
    - sensefmt is a format string for how to print the sense and its probability
    - default_prob is the default value for unanalyzed senses
    """
    WORD = util.read_annotation(word)
    REF = util.read_annotation(ref)
    LEMGRAM = util.read_annotation(lemgram)
    SALDO = util.read_annotation(saldo)
    POS = util.read_annotation(pos)
    textpos = util.read_corpus_text(text)[1]

    # Sort sentences according to their text position, because WSD is context dependent
    sentences = sorted(util.read_annotation_iteritems(sentence), key=lambda x: textpos[util.edgeStart(x[0])])
    sentences = [sent.split() for _, sent in sentences]

    # Start the WSD process
    process = wsd_start(wsdjar, sense_model, context_model, encoding)

    # Construct the input and send it to the WSD
    stdin = build_input(sentences, WORD, REF, LEMGRAM, SALDO, POS)
    if encoding:
        stdin = stdin.encode(encoding)

    stdout, stderr = process.communicate(stdin)
    # TODO: Solve the hack below! The problem is that regular messages
    # ("Reading sense vectors...") are also piped to stderr.
    if len(stderr) > 52:
        util.system.kill_process(process)
        util.log.error(str(stderr))
        return

    if encoding:
        stdout = stdout.decode(encoding)

    process_output(out, stdout, sentences, SALDO, sensefmt, default_prob)

    # Kill the running subprocess
    util.system.kill_process(process)
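# A minimal sketch of the subprocess protocol assumed by run_wsd() above:
# wsd_start() is expected to return a process with piped stdin/stdout/stderr
# so that process.communicate() can feed it the whole input at once. The
# command-line arguments below are placeholders, not saldowsd.jar's real flags.
def _wsd_start_sketch(wsdjar, sense_model, context_model):
    import subprocess
    cmd = ["java", "-jar", wsdjar, sense_model, context_model]  # hypothetical arguments
    return subprocess.Popen(cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)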
def parse_swener_output(sentences, output, out_ne_ex, out_ne_type, out_ne_subtype, out_ne_name):
    """Parse the SweNER output and write annotation files."""
    out_ex_dict = {}
    out_type_dict = {}
    out_subtype_dict = {}
    out_name_dict = {}

    # Loop through the NE-tagged sentences and parse each one with ElementTree
    for sent, tagged_sent in zip(sentences, output.strip().split(SENT_SEP)):
        xml_sent = "<sroot>" + tagged_sent + "</sroot>"

        # Filter out tags of the form <EnamexXxxXxx>, since they seem to always
        # overlap with <ENAMEX> elements, making the XML invalid
        xml_sent = re.sub(r'</?Enamex[^>\s]+>', '', xml_sent)

        try:
            root = etree.fromstring(xml_sent)
        except etree.ParseError:  # assuming etree is xml.etree.ElementTree
            util.log.warning("Error parsing sentence. Skipping.")
            continue

        # Init token counter; needed to get start_id and end_id
        i = 0
        previous_end = 0
        children = list(root.iter())

        try:
            for count, child in enumerate(children):
                start_id = util.edgeStart(sent[i])
                start_i = i

                # If the current child has text, increase the token counter
                if child.text:
                    i += len(child.text.strip().split(TOK_SEP))

                    # Extract NE tags and save them in the dictionaries
                    if child.tag != "sroot":
                        if start_i < previous_end:
                            pass
                            # util.log.warning("Overlapping NE elements found; discarding one.")
                        else:
                            end_id = util.edgeEnd(sent[i - 1])
                            previous_end = i
                            edge = util.mkEdge('ne', [start_id, end_id])
                            out_ex_dict[edge] = child.tag
                            out_type_dict[edge] = child.get("TYPE")
                            out_subtype_dict[edge] = child.get("SBT")
                            out_name_dict[edge] = child.text

                        # If this child has a tail that doesn't start with a space, or no tail
                        # at all despite not being the last child, this NE ends in the middle
                        # of a token
                        if (child.tail and child.tail.strip() and not child.tail[0] == " ") or (
                                not child.tail and count < len(children) - 1):
                            i -= 1
                            # util.log.warning("Split token returned by name tagger.")

                # If the current child has text in the tail, increase the token counter
                if child.tail and child.tail.strip():
                    i += len(child.tail.strip().split(TOK_SEP))

                if (child.tag == "sroot" and child.text and not child.text[-1] == " ") or (
                        child.tail and not child.tail[-1] == " "):
                    # The next NE would start in the middle of a token, so decrease the counter by 1
                    i -= 1
        except IndexError:
            util.log.warning("Error parsing sentence. Skipping.")
            continue

    # Write annotations
    util.write_annotation(out_ne_ex, out_ex_dict)
    util.write_annotation(out_ne_type, out_type_dict)
    util.write_annotation(out_ne_subtype, out_subtype_dict)
    util.write_annotation(out_ne_name, out_name_dict)
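# A small illustration of the input format assumed above (made-up sentence;
# SweNER's real output may differ in detail). NE elements carry TYPE and SBT
# attributes, and iterating over the wrapped sentence yields them in order:
def _swener_parse_example():
    import xml.etree.ElementTree as ET
    tagged = ('<ENAMEX TYPE="PRS" SBT="HUM">Astrid Lindgren</ENAMEX> föddes i '
              '<ENAMEX TYPE="LOC" SBT="PPL">Vimmerby</ENAMEX> .')
    root = ET.fromstring("<sroot>" + tagged + "</sroot>")
    for child in root.iter():
        if child.tag != "sroot":
            print(child.tag, child.get("TYPE"), child.get("SBT"), child.text)
    # -> ENAMEX PRS HUM Astrid Lindgren
    # -> ENAMEX LOC PPL Vimmerby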
def make_span(edge):
    return slice(anchor2pos[util.edgeStart(edge)], anchor2pos[util.edgeEnd(edge)])
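# Since make_span() returns a slice of corpus positions, it can index directly
# into the corpus text (a usage sketch; txt is assumed to come from
# util.read_corpus_text as in the functions above):
#
#     txt[make_span(edge)]  # the original text covered by the edge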