def read_rogetmap(xml, verbose=True):
    """Parse the Roget map (Roget hierarchy) into a dictionary keyed by Roget head words.

    Return {headword: (subsection, section, class)}.
    """
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}

    # Only attributes are read, so "start" events alone are sufficient.
    # Iterating both "start" and "end" (as before) visited every element twice
    # and redundantly re-assigned each hierarchy level and headword.
    context = etree.iterparse(xml, events=("start",))

    # Current position in the hierarchy, updated as the tree is walked.
    # Initialized to None so a malformed file cannot raise NameError.
    l1 = l2 = l3 = None
    for _event, elem in context:
        if elem.tag == "class":
            l1 = elem.get("name")
        elif elem.tag == "section":
            l2 = elem.get("name")
        elif elem.tag == "subsection":
            l3 = elem.get("name")
        elif elem.tag == "headword":
            head = elem.get("name")
            lexicon[head] = (l3, l2, l1)

    testwords = ["Existence", "Health", "Amusement", "Marriage"]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read.")
    return lexicon
def read_blingbring(tsv, classmap, verbose=True):
    """Read the tsv version of the Blingbring lexicon (blingbring.xml).

    Return a lexicon dictionary:
        {senseid: {roget_head: roget_head,
                   roget_subsection: roget_subsection,
                   roget_section: roget_section,
                   roget_class: roget_class,
                   bring: bring_ID}
    """
    import csv

    # Propagate the caller's verbosity; it was previously hard-coded to True.
    rogetdict = read_rogetmap(xml=classmap, verbose=verbose)

    if verbose:
        log.info("Reading tsv lexicon")
    lexicon = {}
    classmapping = {}

    with open(tsv) as f:
        for line in csv.reader(f, delimiter="\t"):
            if line[0].startswith("#"):
                continue
            rogetid = line[1].split("/")[-1]
            if rogetid in rogetdict:
                roget_l3 = rogetdict[rogetid][0]  # subsection
                roget_l2 = rogetdict[rogetid][1]  # section
                roget_l1 = rogetdict[rogetid][2]  # class
            else:
                roget_l3 = roget_l2 = roget_l1 = ""
            senseids = set(line[3].split(":"))
            for senseid in senseids:
                lexicon.setdefault(senseid, set()).add(
                    (rogetid, roget_l3, roget_l2, roget_l1))

            # Make mapping between Roget and Bring classes
            if line[0].split("/")[1] == "B":
                classmapping[rogetid] = line[2]

    # Collapse the per-sense tuples into sets per hierarchy level.
    for senseid, rogetids in lexicon.items():
        roget_head = {tup[0] for tup in rogetids}
        roget_subsection = {tup[1] for tup in rogetids if tup[1]}
        roget_section = {tup[2] for tup in rogetids if tup[2]}
        roget_class = {tup[3] for tup in rogetids if tup[3]}
        lexicon[senseid] = {
            "roget_head": roget_head,
            "roget_subsection": roget_subsection,
            "roget_section": roget_section,
            "roget_class": roget_class,
            "bring": {classmapping[r] for r in roget_head}
        }

    testwords = ["fågel..1", "behjälplig..1", "köra_ner..1"]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read")
    return lexicon
def read_xml(xml):
    """Read the XML version of the crosslinked lexicon.

    Return {dalin_lemgram: {"saldo": saldo_lemgram, "match": match}}.
    """
    log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml, events=("start", "end"))  # "start" needed to save reference to root element
    context = iter(context)
    _event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == "LexicalEntry":
                lemma = elem.find("Lemma")
                dalin, saldo = [], ""
                for form in lemma.findall("FormRepresentation"):
                    cat = _findval(form, "category")
                    lem = _findval(form, "lemgram")
                    if cat == "modern":
                        saldo = lem
                    else:
                        match = _findval(form, "match")
                        dalin += [(lem, match)]

                # A plain loop instead of the previous side-effecting list
                # comprehension (which built a throwaway list of Nones).
                for d, m in dalin:
                    lexicon[d] = {"saldo": saldo, "match": m}

            # Done parsing section. Clear tree to save memory
            if elem.tag in ["LexicalEntry", "frame", "resFrame"]:
                root.clear()

    testwords = ["tigerhjerta..nn.1", "lågland..nn.1", "gud..nn.1"]
    util.test_lexicon(lexicon, testwords)

    log.info("OK, read")
    return lexicon
def read_swefn(xml, verbose=True):
    """Read the XML version of the Swedish FrameNet resource.

    Return a lexicon dictionary, {saldoID: {swefnID}}.
    """
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml, events=("start", "end"))  # "start" needed to save reference to root element
    context = iter(context)
    _event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == "LexicalEntry":
                sense = elem.find("Sense")
                sid = sense.get("id")
                # Strip the "swefn--" *prefix*. The previous
                # .lstrip("swefn--") removed any leading characters from the
                # set {s, w, e, f, n, -} and could therefore eat the start of
                # the actual ID (e.g. "swefn--new_X" would lose "new").
                if sid.startswith("swefn--"):
                    sid = sid[len("swefn--"):]
                for lu in sense.findall("feat[@att='LU']"):
                    saldosense = lu.get("val")
                    lexicon.setdefault(saldosense, set()).add(sid)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ["LexicalEntry", "frame", "resFrame"]:
                root.clear()

    testwords = [
        "slant..1", "befrielse..1", "granne..1", "sisådär..1",
        "mjölkcentral..1"
    ]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read.")
    return lexicon
def read_sensaldo(tsv, verbose=True):
    """Read the TSV version of the sensaldo lexicon (sensaldo-base.txt).

    Return a lexicon dictionary: {senseid: (class, ranking)}
    """
    if verbose:
        log.info("Reading TSV lexicon")

    lexicon = {}
    contents = tsv.read()  # `tsv` is an open file-like object, not a path
    for row in contents.split("\n"):
        # Skip blank lines and comment lines.
        if not row.lstrip():
            continue
        if row.startswith("#"):
            continue
        saldoid, label = row.split()
        lexicon[saldoid] = label

    testwords = ["förskräcklig..1", "ödmjukhet..1", "handla..1"]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read")
    return lexicon
def read_lmf(xml, annotation_elements=("gf", "lem", "saldo"), tagset="SUC", verbose=True):
    """Read the XML version of SALDO's morphological lexicon (saldom.xml).

    Return a lexicon dictionary, {wordform: {{annotation-type: annotation}: (
    set(possible tags), set(tuples with following words) )}}
     - annotation_element is the XML element for the annotation value (currently: 'gf' for baseform,
       'lem' for lemgram or 'saldo' for SALDO id)
     - tagset is the tagset for the possible tags (currently: 'SUC', 'Parole', 'Saldo')
    """
    # assert annotation_element in ("gf", "lem", "saldo"), "Invalid annotation element"
    tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower()]
    if verbose:
        log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml, events=("start", "end"))  # "start" needed to save reference to root element
    context = iter(context)
    event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == "LexicalEntry":
                # annotations must be hashable because it is used as a dict key below.
                annotations = HashableDict()
                for a in annotation_elements:
                    annotations[a] = tuple(x.text for x in elem.findall(a))
                pos = elem.findtext("pos")
                inhs = elem.findtext("inhs")
                if inhs == "-":
                    inhs = ""
                inhs = inhs.split()

                # Check the paradigm for an "x", meaning a multi-word expression with a required gap
                p = elem.findtext("p")
                x_find = re.search(r"_x(\d*)_", p)
                x_insert = x_find.groups()[0] if x_find else None
                if x_insert == "":
                    # "_x_" with no digit means the gap is after the first word.
                    x_insert = "1"

                # Only vbm and certain paradigms allow gaps
                gap_allowed = (pos == "vbm" or p in (u"abm_x1_var_än", u"knm_x_ju_ju",
                                                     u"pnm_x1_inte_ett_dugg", u"pnm_x1_vad_än",
                                                     u"ppm_x1_för_skull"))

                table = elem.find("table")
                # Accumulates the parts of the current multi-word expression.
                multiwords = []

                for form in list(table):
                    word = form.findtext("wf")
                    param = form.findtext("param")

                    if param in ("frag", "c", "ci", "cm"):
                        # We don't use these wordforms, so skip
                        continue
                    elif param[-1].isdigit() and param[-2:] != "-1":
                        # Handle multi-word expressions
                        multiwords.append(word)
                        # param ends in "<part>-<total>", e.g. "...:2-3".
                        multipart, multitotal = param.split(":")[-1].split("-")
                        particle = bool(re.search(r"vbm_.+?p.*?\d+_", p))  # Multi-word with particle

                        # Add a "*" where the gap should be
                        if x_insert and multipart == x_insert:
                            multiwords.append("*")

                        if multipart == multitotal:
                            # Last part of the expression: store it under its first
                            # word, with the remaining words as a following-words tuple.
                            lexicon.setdefault(multiwords[0], {}).setdefault(
                                annotations, (set(), set(), gap_allowed, particle))[1].add(
                                    tuple(multiwords[1:]))
                            multiwords = []
                    else:
                        # Single word expressions
                        if param[-2:] == "-1":
                            param = param.rsplit(" ", 1)[0]
                            if pos == "vbm":
                                pos = "vb"
                        saldotag = " ".join([pos] + inhs + [param])
                        tags = tagmap.get(saldotag)
                        if tags:
                            lexicon.setdefault(word, {}).setdefault(
                                annotations, (set(), set(), False, False))[0].update(tags)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ["LexicalEntry", "frame", "resFrame"]:
                root.clear()

    testwords = [
        "äggtoddyarna", "Linköpingsbors", "katabatiska", "väg-", "formar", "in",
        "datorrelaterade"
    ]
    util.test_lexicon(lexicon, testwords)

    if verbose:
        log.info("OK, read")
    return lexicon
def read_lmf(xml, annotation_elements=("writtenForm", "lemgram"), tagset="SUC", verbose=True, skip_multiword=False, translate_tags=True): """Read the XML version of a morphological lexicon in lmf format (dalinm.xml, swedbergm.xml). Return a lexicon dictionary, {wordform: {{annotation-type: annotation}: ( set(possible tags), set(tuples with following words) )}} - annotation_element is the XML element for the annotation value, "writtenForm" for baseform, "lemgram" for lemgram writtenForm is translated to "gf" and lemgram to "lem" (for compatability with Saldo) - skip_multiword is a flag telling whether to make special entries for multiword expressions. Set this to False only if the tool used for text annotation cannot handle this at all """ # assert annotation_element in ("writtenForm lemgram") "Invalid annotation element" if verbose: log.info("Reading XML lexicon") lexicon = {} tagmap = util.tagsets.mappings["saldo_to_" + tagset.lower()] context = etree.iterparse( xml, events=("start", "end")) # "start" needed to save reference to root element context = iter(context) event, root = next(context) for event, elem in context: if event == "end": if elem.tag == "LexicalEntry": annotations = HashableDict() lem = elem.find("Lemma").find("FormRepresentation") for a in annotation_elements: if a == "writtenForm": key = "gf" elif a == "lemgram": key = "lem" annotations[key] = tuple([findval(lem, a)]) pos = findval(lem, "partOfSpeech") inhs = findval(lem, "inherent") if inhs == "-": inhs = "" inhs = inhs.split() # there may be several WordForms for forms in elem.findall("WordForm"): word = findval(forms, "writtenForm") param = findval(forms, "msd") multiwords = [] wordparts = word.split() for i, word in enumerate(wordparts): if (not skip_multiword) and len(wordparts) > 1: # Handle multi-word expressions multiwords.append(word) # We don't use any particles or mwe:s with gaps since that information is not formally # expressed in the historical lexicons particle = False mwe_gap = 
False # but keep the fields so that the file format matches the normal saldo-pickle format # is it the last word in the multi word expression? if i == len(wordparts) - 1: lexicon.setdefault( multiwords[0], {}).setdefault(annotations, (set(), set(), mwe_gap, particle))[1].add( tuple(multiwords[1:])) multiwords = [] else: # Single word expressions particle = False # we don't use any particles or mwe:s with gaps mwe_gap = False # but keep the fields so that the file format match the normal saldo-pickle format if translate_tags: tags = convert_default(pos, inhs, param, tagmap) if tags: lexicon.setdefault(word, {}).setdefault( annotations, (set(), set(), mwe_gap, particle))[0].update(tags) else: saldotag = " ".join( [pos, param] ) # this tag is rather useless, but at least gives some information tags = tuple([saldotag]) lexicon.setdefault(word, {}).setdefault( annotations, (set(), set(), mwe_gap, particle))[0].update(tags) # Done parsing section. Clear tree to save memory if elem.tag in ["LexicalEntry", "frame", "resFrame"]: root.clear() if verbose: testwords = ["äplebuske", "stöpljus", "katt"] util.test_lexicon(lexicon, testwords) log.info("OK, read") return lexicon