def stanza_resources_file(
        resources_file: ModelOutput = ModelOutput("stanza/resources.json")):
    """Download and unzip the Stanza dependency model."""
    # Stanza refuses to run without a resources.json describing the language;
    # write a minimal one for Swedish so it stops complaining.
    swedish_config = {
        "lang_name": "Swedish",
        "tokenize": {"orchid": {}, "best": {}},
        "default_processors": {"tokenize": "orchid"},
        "default_dependencies": {},
    }
    resources_file.write(json.dumps({"sv": swedish_config}))
def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"),
                    swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"),
                    dalin: Model = Model("hunpos/hist/dalinm.hunpos"),
                    saldosuc_morphtable: Model = Model("hunpos/saldo_suc-tags.morphtable")):
    """Read files and make a morphtable together with the information from SALDO (saldosuc_morphtable).

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable").
        swedberg (str, optional): Wordlist from Swedberg and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/swedberg-gender.hunpos").
        dalin (str, optional): Wordlist from Dalin and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/dalinm.hunpos").
        saldosuc_morphtable (str, optional): SALDO Hunpos morphtable.
            Defaults to Model("hunpos/saldo_suc-tags.morphtable").
    """
    words = {}
    _read_saldosuc(words, saldosuc_morphtable.path)

    for fil in [dalin, swedberg]:
        # Use a context manager so the wordlist file handle is closed
        # deterministically (the old open(...).readlines() leaked it).
        with open(fil.path, encoding="utf-8") as wordlist:
            for line in wordlist:
                if not line.strip():
                    continue
                xs = line.split("\t")
                word, msd = xs[0].strip(), xs[1].strip()
                if " " in word:
                    if msd.startswith("nn"):
                        # We assume that the head of a noun mwe is the last word
                        word = word.split()[-1]
                    if msd.startswith("vb"):
                        # We assume that the head of a verbal mwe is the first word
                        word = word.split()[0]

                # If the tag is not present, we try to translate it anyway
                suc = SALDO_TO_SUC.get(msd, "")
                if not suc:
                    suc = _force_parse(msd)
                if suc:
                    # Keep both a lowercase and a title-cased entry, since
                    # Hunpos morphtables are case sensitive.
                    words.setdefault(word.lower(), set()).update(suc)
                    words.setdefault(word.title(), set()).update(suc)

    # Name the handle 'outfile' instead of shadowing the 'out' parameter.
    with open(out.path, encoding="UTF-8", mode="w") as outfile:
        for w, ts in words.items():
            outfile.write("\t".join([w] + list(ts)) + "\n")
def build_tokenlist(
        saldo_model: Model = Model("saldo/saldo.pickle"),
        out: ModelOutput = ModelOutput("segment/bettertokenizer.sv.saldo-tokens"),
        segmenter: str = Config("segment.token_wordlist_segmenter"),
        model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer.

    Args:
        saldo_model: Pickled SALDO lexicon to harvest wordforms from.
        out: Resulting token list, one wordform per line, sorted.
        segmenter: Key into SEGMENTERS choosing the tokenizer class.
        model: Optional tokenizer model; pickled files are unpickled,
            anything else is passed to the segmenter as a path.
    """
    segmenter_args = []
    if model:
        # BUG FIX: Path.suffix includes the leading dot (".pickle"), so the
        # old comparison against "pickle"/"pkl" never matched and pickled
        # models were passed as raw paths instead of being loaded.
        if model.path.suffix in [".pickle", ".pkl"]:
            # BUG FIX: open the path, not the Model object itself.
            with open(model.path, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(segmenter, "span_tokenize"), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()
    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as f:
        lexicon = pickle.load(f)
    for w in lexicon:
        w2 = list(map(split_triple, lexicon[w]))
        # Continuation words of multi-word units that are not lexicon
        # entries themselves must be added to the token list explicitly.
        mwu_extras = [
            contw for w3 in w2 for cont in w3[2] for contw in cont
            if contw not in lexicon
        ]
        for wf in mwu_extras + [w]:
            spans = list(segmenter.span_tokenize(wf))
            # Only keep forms the segmenter would otherwise split apart.
            if len(spans) > 1 and not wf.endswith(","):
                wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
def saldo_morphtable(out: ModelOutput = ModelOutput("hunpos/saldo_suc-tags.morphtable"),
                     saldo_model: Model = Model("saldo/saldo.pickle"),
                     suc: Model = Model("hunpos/suc3_morphtable.words"),
                     morphtable_base: Model = Model("hunpos/suc.morphtable"),
                     morphtable_patterns: Model = Model("hunpos/suc.patterns"),
                     add_capitalized: bool = True,
                     add_lowercase: bool = False):
    """Create a morphtable file for use with Hunpos.

    A morphtable contains wordforms from SALDO's morphology (with accompanying
    tags) which are missing in SUC3. Since the morphtable is case sensitive,
    both the original form and a capitalized form is saved.

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/saldo_suc-tags.morphtable").
        saldo_model (str, optional): Path to a pickled SALDO model.
            Defaults to Model("saldo/saldo.pickle").
        suc (str, optional): Tab-separated file with wordforms from SUC, containing:
            frequency, wordform, tag. Defaults to Model("hunpos/suc3_morphtable.words").
        morphtable_base (str, optional): Existing morphtable file, whose contents
            will be included in the new one. Defaults to Model("hunpos/suc.morphtable").
        morphtable_patterns (str, optional): Optional file with regular expressions.
            Defaults to Model("hunpos/suc.patterns").
        add_capitalized (bool, optional): Whether or not capitalized word forms
            should be added. Defaults to True.
        add_lowercase (bool, optional): Whether or not lower case word forms
            should be added. Defaults to False.
    """
    lex = saldo.SaldoLexicon(saldo_model.path)
    tags = defaultdict(set)

    # Harvest every wordform in SALDO together with its usable MSD tags.
    for word in list(lex.lexicon.keys()):
        # Multi-word expressions (non-empty continuation field) are skipped.
        entries = [entry for entry in lex.lookup(word) if len(entry[2]) == 0]
        for entry in entries:
            for msd in entry[1]:
                # Tags containing "-" are not usable for Hunpos.
                if "-" in msd:
                    continue
                tags[word].add(msd)
                if add_capitalized:
                    # Also register a capitalized variant of the form.
                    capitalized = word[0].upper() + word[1:]
                    if word != capitalized:
                        tags[capitalized].add(msd)
                if add_lowercase:
                    # Also register a lower case variant of the form.
                    lowered = word.lower()
                    if word != lowered:
                        tags[lowered].add(msd)

    # Remove SALDO entries that SUC already covers.
    with open(suc.path, encoding="UTF-8") as suctags:
        for line in suctags:
            _, word, msd = line.strip("\n").split("\t")
            if word in tags:
                # SUC takes precedence over SALDO for this exact form.
                del tags[word]
            elif not msd.startswith("PM") and word.lower() != word and word.lower() in tags:
                # Not a proper name, and its lowercase form exists in SALDO:
                # drop the SALDO entry.
                del tags[word.lower()]

    # Collect regular-expression patterns, if a pattern file was given.
    pattern_list = []
    if morphtable_patterns:
        with open(morphtable_patterns.path, encoding="UTF-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    pattern_name, _, pattern_tags = line.strip().split("\t", 2)
                    pattern_list.append("[[%s]]\t%s\n" % (pattern_name, pattern_tags))

    # Assemble the morphtable: base file first, then patterns, then SALDO forms.
    with open(out.path, encoding="UTF-8", mode="w") as outfile:
        if morphtable_base:
            with open(morphtable_base.path, encoding="UTF-8") as base:
                for line in base:
                    outfile.write(line)
        for pattern in pattern_list:
            outfile.write(pattern)
        for word in sorted(tags):
            outfile.write("%s\t%s\n" % (word, "\t".join(tags[word])))