Пример #1
0
def stanza_resources_file(
        resources_file: ModelOutput = ModelOutput("stanza/resources.json")):
    """Download and unzip the Stanza dependency model."""
    # Write resources.json file to keep Stanza from complaining
    res = json.dumps({
        "sv": {
            "lang_name": "Swedish",
            "tokenize": {
                "orchid": {},
                "best": {}
            },
            "default_processors": {
                "tokenize": "orchid"
            },
            "default_dependencies": {},
        }
    })
    resources_file.write(res)
Пример #2
0
def hist_morphtable(out: ModelOutput = ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable"),
                    swedberg: Model = Model("hunpos/hist/swedberg-gender.hunpos"),
                    dalin: Model = Model("hunpos/hist/dalinm.hunpos"),
                    saldosuc_morphtable: Model = Model("hunpos/saldo_suc-tags.morphtable")):
    """Read files and make a morphtable together with the information from SALDO (saldosuc_morphtable).

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/hist/dalinm-swedberg_saldo_suc-tags.morphtable").
        swedberg (str, optional): Wordlist from Swedberg and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/swedberg-gender.hunpos").
        dalin (str, optional): Wordlist from Dalin and corresponding SALDO MSD-tags.
            Defaults to Model("hunpos/hist/dalinm.hunpos").
        saldosuc_morphtable (str, optional): SALDO Hunpos morphtable.
            Defaults to Model("hunpos/saldo_suc-tags.morphtable").
    """
    words = {}
    _read_saldosuc(words, saldosuc_morphtable.path)
    for fil in [dalin, swedberg]:
        for line in open(fil.path, encoding="utf-8").readlines():
            if not line.strip():
                continue
            xs = line.split("\t")
            word, msd = xs[0].strip(), xs[1].strip()
            if " " in word:
                if msd.startswith("nn"):  # We assume that the head of a noun mwe is the last word
                    word = word.split()[-1]
                if msd.startswith("vb"):  # We assume that the head of a verbal mwe is the first word
                    word = word.split()[0]

            # If the tag is not present, we try to translate it anyway
            suc = SALDO_TO_SUC.get(msd, "")
            if not suc:
                suc = _force_parse(msd)
            if suc:
                words.setdefault(word.lower(), set()).update(suc)
                words.setdefault(word.title(), set()).update(suc)
    with open(out.path, encoding="UTF-8", mode="w") as out:
        for w, ts in list(words.items()):
            line = ("\t".join([w] + list(ts)) + "\n")
            out.write(line)
Пример #3
0
def build_tokenlist(
        saldo_model: Model = Model("saldo/saldo.pickle"),
        out: ModelOutput = ModelOutput(
            "segment/bettertokenizer.sv.saldo-tokens"),
        segmenter: str = Config("segment.token_wordlist_segmenter"),
        model: Model = Model("segment/bettertokenizer.sv")):
    """Build a list of words from a SALDO model, to help BetterWordTokenizer."""
    segmenter_args = []
    if model:
        if model.path.suffix in ["pickle", "pkl"]:
            with open(model, "rb") as m:
                model_arg = pickle.load(m)
        else:
            model_arg = model.path
        segmenter_args.append(model_arg)
    assert segmenter in SEGMENTERS, "Available segmenters: %s" % ", ".join(
        sorted(SEGMENTERS))
    segmenter = SEGMENTERS[segmenter]
    segmenter = segmenter(*segmenter_args)
    assert hasattr(
        segmenter, "span_tokenize"
    ), "Segmenter needs a 'span_tokenize' method: %r" % segmenter

    wordforms = set()

    # Skip strings already handled by the tokenizer.
    # Also skip words ending in comma (used by some multi word expressions in SALDO).
    with open(saldo_model.path, "rb") as F:
        lexicon = pickle.load(F)
        for w in lexicon:
            w2 = list(map(split_triple, lexicon[w]))
            mwu_extras = [
                contw for w3 in w2 for cont in w3[2] for contw in cont
                if contw not in lexicon
            ]
            for wf in mwu_extras + [w]:
                spans = list(segmenter.span_tokenize(wf))
                if len(spans) > 1 and not wf.endswith(","):
                    wordforms.add(wf)

    out.write("\n".join(sorted(wordforms)))
Пример #4
0
def saldo_morphtable(out: ModelOutput = ModelOutput(
    "hunpos/saldo_suc-tags.morphtable"),
                     saldo_model: Model = Model("saldo/saldo.pickle"),
                     suc: Model = Model("hunpos/suc3_morphtable.words"),
                     morphtable_base: Model = Model("hunpos/suc.morphtable"),
                     morphtable_patterns: Model = Model("hunpos/suc.patterns"),
                     add_capitalized: bool = True,
                     add_lowercase: bool = False):
    """Create a morphtable file for use with Hunpos.

    A morphtable contains wordforms from SALDO's morphology (with accompanying tags) which are missing in SUC3.
    Since the morphtable is case sensitive, both the original form and a capitalized form
    is saved.

    Args:
        out (str, optional): Resulting morphtable file to be written.
            Defaults to ModelOutput("hunpos/saldo_suc-tags.morphtable").
        saldo_model (str, optional): Path to a pickled SALDO model.
            Defaults to Model("saldo/saldo.pickle").
        suc (str, optional): Tab-separated file with wordforms from SUC, containing: frequency, wordform, tag.
            Defaults to Model("hunpos/suc3_morphtable.words").
        morphtable_base (str, optional): Existing morphtable file, whose contents will be included in the new one.
            Defaults to Model("hunpos/suc.morphtable").
        morphtable_patterns (str, optional): Optional file with regular expressions.
            Defaults to Model("hunpos/suc.patterns").
        add_capitalized (bool, optional): Whether or not capitalized word forms should be added. Defaults to True.
        add_lowercase (bool, optional): Whether or not lower case word forms should be added. Defaults to False.
    """
    lex = saldo.SaldoLexicon(saldo_model.path)
    tags = defaultdict(set)

    # Get all wordforms from SALDO
    for word in list(lex.lexicon.keys()):
        words = lex.lookup(word)
        # Filter out multi word expressions
        words = [x for x in words if len(x[2]) == 0]
        if words:
            # Only use MSD not containing "-"
            for w in words:
                for msd in w[1]:
                    if "-" not in msd:
                        tags[word].add(msd)
                        if add_capitalized:
                            # Add a capitalized form of the word
                            capitalized = word[0].upper() + word[1:]
                            if not word == capitalized:
                                tags[capitalized].add(msd)
                        if add_lowercase:
                            # Add a lower case form of the word
                            lower = word.lower()
                            if not word == lower:
                                tags[lower].add(msd)

    # Read SUC words
    with open(suc.path, encoding="UTF-8") as suctags:
        for line in suctags:
            _, word, msd = line.strip("\n").split("\t")

            # Don't keep SALDO words already in SUC
            if word in tags:
                del tags[word]
            # If the word is not a name, and exists as lowercase in SALDO, remove it
            elif not msd.startswith("PM") and not word.lower(
            ) == word and word.lower() in tags:
                del tags[word.lower()]

    # Read regular expressions from pattern file
    pattern_list = []
    if morphtable_patterns:
        with open(morphtable_patterns.path, encoding="UTF-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    pattern_name, _, pattern_tags = line.strip().split("\t", 2)
                    pattern_list.append("[[%s]]\t%s\n" %
                                        (pattern_name, pattern_tags))

    with open(out.path, encoding="UTF-8", mode="w") as out:
        if morphtable_base:
            with open(morphtable_base.path, encoding="UTF-8") as base:
                for line in base:
                    out.write(line)

        for pattern in pattern_list:
            out.write(pattern)

        for word in sorted(tags):
            out.write("%s\t%s\n" % (word, "\t".join(tags[word])))