def load_wiktionary():
    # Lazily populate the module-level wiktionary lemma set on first use.
    global wiktionary
    if wiktionary is not None:
        return
    try:
        wiktionary = set(x.lower() for x in json_load(script_path("wiktionary_lemmas.json")))
    except Exception:
        # The data file ships separately; tell the user how to fetch it.
        print("run python -m natas.download")
        wiktionary = set()
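def _demo_is_word(word):
    # Hypothetical helper, not part of the original module: a minimal sketch
    # of how the lazily loaded lemma set can back a "is this a known English
    # word?" check. Membership tests are done in lowercase, matching how the
    # set is built above.
    load_wiktionary()
    return word.lower() in wiktionary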
def _load_transducer(filename, invert):
    # The model directory may ship a metadata.json describing the FST backend.
    metadata_filename = os.path.join(os.path.dirname(filename), "metadata.json")
    try:
        metadata = mikatools.json_load(metadata_filename)
    except Exception:
        # Don't crash if the JSON file is missing or malformed.
        metadata = {}
    if metadata.get("fst_type") == "foma":
        return FomaFSTWrapper(filename, invert)
    else:
        input_stream = hfst.HfstInputStream(filename)
        return input_stream.read()
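# Illustrative usage (the path and filename below are invented for this
# example): whether a FomaFSTWrapper or an HFST transducer comes back is
# decided entirely by the "fst_type" field of the adjacent metadata.json.
#
#   fst = _load_transducer("/path/to/models/analyser.hfst", invert=False)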
from mikatools import script_path, json_load
from onmt.translate.translator import Translator
from onmt.decoders.ensemble import load_test_model
from onmt.translate import GNMTGlobalScorer
from itertools import islice, repeat
import configargparse as cfargparse
import spacy
import os

wiktionary = set(x.lower() for x in json_load(script_path("wiktionary_lemmas.json")))
is_in_data_cache = {"ceec_eng": {}, "ocr_fin": {}}
models = {}

def set_spacy(nlp):
    models["spacy"] = nlp

def _get_spacy():
    # Load the spaCy model on demand and cache it in the models dict.
    if "spacy" not in models:
        try:
            models["spacy"] = spacy.load('en_core_web_md')
        except IOError:
            raise Exception("Spacy model was not loaded! Run: python -m spacy download en_core_web_md")
    return models["spacy"]

def split_corpus(f, shard_size):
    # Yield the corpus in shards of shard_size items; a non-positive
    # shard_size yields the whole corpus as a single shard.
    if shard_size <= 0:
        yield f
    else:
        while True:
            shard = list(islice(f, shard_size))
            if not shard:
                break
            yield shard
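def _demo_split_corpus():
    # Illustrative sketch, not part of the original module: split_corpus
    # shards any iterator into lists of at most shard_size items, with a
    # shorter final shard when the input does not divide evenly.
    shards = list(split_corpus(iter(range(5)), 2))
    assert shards == [[0, 1], [2, 3], [4]]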
def model_info(language):
    # Print the human-readable metadata shipped alongside a language's models.
    filename = os.path.join(__where_models(language), "metadata.json")
    d = mikatools.json_load(filename)
    mikatools.print_json_help(d)
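# Illustrative usage (the language code below is just an example): prints the
# metadata.json of the downloaded models for that language.
#
#   model_info("fin")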
#encoding: utf-8
from __future__ import unicode_literals
import re, unicodedata
import mikatools

isos = mikatools.json_load(mikatools.script_path("lang_codes.json"))

# Match a base character together with any trailing combining diacritics,
# or else any single character.
pattern = re.compile(
    r'(\w[\u02F3\u0300\u2013\u032E\u208D\u203F\u0311\u0323\u035E\u031C\u02FC\u030C\u02F9\u0328\u032D:\u02F4\u032F\u0330\u035C\u0302\u0327\u03572\u0308\u0351\u0304\u02F2\u0352\u0355\u00B7\u032C\u030B\u2019\u0339\u00B4\u0301\u02F1\u0303\u0306\u030A7\u0325\u0307\u0354`\u02F0]+|\w|\W)',
    re.UNICODE | re.IGNORECASE)

def char_split(word):
    # Normalize to NFKC first so precomposed and decomposed forms split alike.
    word = unicodedata.normalize('NFKC', word)
    return pattern.findall(word)

def filter_arabic(text, keep_vowels=True, combine_by=""):
    # Keep only Arabic-script runs, optionally retaining vowel/diacritic marks.
    if keep_vowels:
        return combine_by.join(re.findall(r"[ء-ي'ًٌٍَُِّْـ']+", text))
    else:
        return combine_by.join(re.findall(r"[ء-ي]+", text))

def iso_to_name(iso):
    return isos[iso]
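def _demo_char_split():
    # Illustrative sketch, not part of the original file: a base letter is
    # kept together with a following combining mark from the pattern's class
    # (here U+0354, which NFKC leaves decomposed since it has no precomposed
    # form), while plain letters split one by one.
    assert char_split("a\u0354b") == ["a\u0354", "b"]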