Example #1
import os
from mikatools import script_path
from onmt.decoders.ensemble import load_test_model

def _load_model(name):
	# Prefer a model bundled with the package, else treat name as a path.
	model_path = script_path("models/" + name)
	if not os.path.exists(model_path):
		model_path = name
	# opennmt_opts, _default_kwargs and models live in the surrounding module.
	opt = opennmt_opts(model_path, **_default_kwargs())
	m = load_test_model(opt)
	models[name] = m
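
These examples lean on mikatools.script_path, which resolves a file name relative to the directory of the calling script rather than the current working directory, so bundled files are found no matter where Python is launched from. A rough illustrative equivalent (not the real mikatools implementation; the model file name is hypothetical):

import os

def script_path_sketch(relative):
    # Anchor the path at this script's directory, like mikatools.script_path.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), relative)

print(script_path_sketch("models/my_model.pt"))  # hypothetical file name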
Example #2
from mikatools import script_path, json_load

wiktionary = None

def load_wiktionary():
    global wiktionary
    # Load the lemma set only once; later calls return immediately.
    if wiktionary is not None:
        return
    try:
        wiktionary = set([
            x.lower() for x in json_load(script_path("wiktionary_lemmas.json"))
        ])
    except Exception:
        print("run python -m natas.download")
        wiktionary = set()
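
A short usage sketch for the loader above, assuming the lemma file has been downloaded (e.g. via python -m natas.download); the early-return guard makes repeated calls cheap:

load_wiktionary()             # first call reads wiktionary_lemmas.json
load_wiktionary()             # no-op: the set is already populated
print("house" in wiktionary)  # lemmas are stored lowercased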
Example #3
from mikatools import script_path, json_load
from onmt.translate.translator import Translator
from onmt.decoders.ensemble import load_test_model
from onmt.translate import GNMTGlobalScorer
from itertools import islice, repeat
import configargparse as cfargparse
import spacy
import os


wiktionary = set([x.lower() for x in json_load(script_path("wiktionary_lemmas.json"))])

is_in_data_cache = {"ceec_eng":{}, "ocr_fin":{}}

def set_spacy(nlp):
	models["spacy"] = nlp

def _get_spacy():
	if "spacy" not in models:
		try:
			models["spacy"] = spacy.load('en_core_web_md')
		except IOError:
			raise Exception("Spacy model was not loaded! Run: python -m spacy download en_core_web_md")
	return models["spacy"]

def split_corpus(f, shard_size):
    if shard_size <= 0:
        yield f
    else:
        # Yield successive shard_size-item chunks until f is exhausted.
        while True:
            shard = list(islice(f, shard_size))
            if not shard:
                break
            yield shard
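
split_corpus is a generator: with a positive shard_size it yields successive fixed-size chunks of an iterable (typically an open file), otherwise it yields the whole iterable once. A quick sketch, assuming the completed function above:

# Shard a ten-element iterator into chunks of four.
lines = iter("line %d" % i for i in range(10))
for shard in split_corpus(lines, 4):
    print(len(shard))  # prints 4, 4, 2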
Example #4
#encoding: utf-8
from __future__ import unicode_literals
import re, unicodedata
import mikatools

isos = mikatools.json_load(mikatools.script_path("lang_codes.json"))

pattern = re.compile(
    r'(\w[\u02F3\u0300\u2013\u032E\u208D\u203F\u0311\u0323\u035E\u031C\u02FC\u030C\u02F9\u0328\u032D:\u02F4\u032F\u0330\u035C\u0302\u0327\u03572\u0308\u0351\u0304\u02F2\u0352\u0355\u00B7\u032C\u030B\u2019\u0339\u00B4\u0301\u02F1\u0303\u0306\u030A7\u0325\u0307\u0354`\u02F0]+|\w|\W)',
    re.UNICODE | re.IGNORECASE)


def char_split(word):
    # Group each base letter with the combining marks that follow it.
    word = unicodedata.normalize('NFKC', word)
    return pattern.findall(word)


def filter_arabic(text, keep_vowels=True, combine_by=""):
    # Keep only Arabic-script runs; optionally retain vowel diacritics.
    if keep_vowels:
        return combine_by.join(re.findall(r"[ء-ي'ًٌٍَُِّْـ']+", text))
    else:
        return combine_by.join(re.findall(r"[ء-ي]+", text))


def iso_to_name(iso):
    # Look up the language name for an ISO code from lang_codes.json.
    return isos[iso]
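
For illustration: on plain ASCII, char_split simply splits into single characters (combining marks from the class in `pattern` would stay attached to the preceding letter), and filter_arabic keeps only the Arabic-script runs. A small sketch using the functions above (the "fi" key is an assumption about lang_codes.json):

print(char_split("cat"))           # ['c', 'a', 't']
print(filter_arabic("hi مرحبا!"))  # 'مرحبا'
print(iso_to_name("fi"))           # assumes 'fi' is a key in lang_codes.json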
Example #5
import numpy as np
import tensorflow as tf
from .utils.math import *
from .utils.bdi import *
from .utils.model import *
from . import pickle2 as pickle
from .dan_eval import SentiDAN
from mikatools import script_path

# TensorFlow 1.x session setup for the SentiDAN sentiment model.
config = tf.ConfigProto()
sess = tf.Session(config=config)
cnn = SentiDAN(sess)
cnn.load(script_path('senti_model.bin'))
infile = script_path('checkpoints/en-es-bimap-1.bin')

MAX_LEN = 64
N = 5
dic = load_model(infile)
W_src = dic['W_source']      # source-side mapping matrix
W_trg = dic['W_target']      # target-side mapping matrix
src_lang = dic['source_lang']
trg_lang = dic['target_lang']
model = dic['model']
with open(script_path('pickle/%s.bin' % src_lang), 'rb') as fin:
    src_wv = pickle.load(fin)
with open(script_path('pickle/%s.bin' % trg_lang), 'rb') as fin:
    trg_wv = pickle.load(fin)
src_pad_id = src_wv.add_word('<pad>', np.zeros(src_wv.vec_dim,
                                               dtype=np.float32))
trg_pad_id = trg_wv.add_word('<pad>', np.zeros(trg_wv.vec_dim,
                                               dtype=np.float32))
src_proj_emb = np.empty(src_wv.embedding.shape, dtype=np.float32)
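
The example cuts off right after allocating the buffer for the projected source embeddings. In bilinear-mapping setups like this, the natural next step is to project each embedding table through its mapping matrix; the following is a hedged sketch of that step, not code taken from this project:

# Sketch (assumption): map both vocabularies into the shared space.
np.dot(src_wv.embedding, W_src, out=src_proj_emb)
trg_proj_emb = np.empty(trg_wv.embedding.shape, dtype=np.float32)
np.dot(trg_wv.embedding, W_trg, out=trg_proj_emb)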