from util_np import np, vpack, sample, partition
from util_sp import load_spm, encode_capped, encode_capped_sample_pair
from util_tf import tf, pipe
# assumed project-local imports for names used below; `A` holds the parsed
# command-line arguments and comes from the part of the script not shown here
from util import Record
from util_io import load_json, load_txt

config = load_json(A.config)
P = Record(config['paths'])
C = Record(config['model'])
T = Record(config['train'])

tf.set_random_seed(A.seed)

#############
# load data #
#############

vocab = load_spm(P.vocab)
valid = np.load(P.valid)

def batch(size=T.batch_train, path=P.train, vocab=vocab, seed=A.seed, kudo=A.sample, max_len=T.max_len):
    pac = lambda arrs: vpack(arrs, (size, max(map(len, arrs))), eos, np.int32)
    enc = encode_capped_sample_pair if kudo else encode_capped
    raw = tuple(load_txt(path))
    eos = vocab.eos_id()
    bat = []
    for i in sample(len(raw), seed):
        # the loop body below is a sketch of the truncated original:
        # grow a batch of capped encodings, emit it padded once full
        if size == len(bat):
            if kudo:
                # sampled encoding yields (source, target) pairs
                yield tuple(map(pac, zip(*bat)))
            else:
                yield pac(bat)
            bat = []
        bat.append(enc(vocab, raw[i], max_len))
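# illustration: `vpack` above packs ragged integer encodings into one dense
# padded batch. the stand-in below is a plain-numpy sketch of that behaviour,
# inferred from the call sites in this repo; the real util_np.vpack may
# differ in details such as overflow handling.

import numpy as np

def vpack_sketch(arrs, shape, fill, dtype):
    """pack variable-length rows into a `shape` array padded with `fill`."""
    out = np.full(shape, fill, dtype=dtype)
    for row, arr in zip(out, arrs):
        row[:len(arr)] = arr
    return out

# vpack_sketch([[5, 6, 7], [8]], (2, 3), fill=1, dtype=np.int32)
# -> [[5 6 7]
#     [8 1 1]]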
import csv
# assumed project-local imports; path_csv, path_txt, path_vocab, path_train,
# path_valid, and valid_size are defined in the part of the script not shown
from util_io import load_txt, save_txt
from util_np import np
from util_sp import spm, load_spm, encode

def load_ibm_claim(path):
    # signature reconstructed from the call in load_all below;
    # column 3 of the ibm claims csv holds the sentence
    rows = csv.reader(load_txt(path))
    next(rows)  # skip the header row
    for row in rows:
        yield row[3]

def load_all():
    for split in "q_mc_heldout.csv", "q_mc_test.csv", "q_mc_train.csv", "test_set.csv":
        yield from load_ibm_claim("{}/{}".format(path_csv, split))

# extract all sentences
save_txt(path_txt, load_all())

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# load and shuffle sentences
sents = list(load_txt(path_txt))
np.random.seed(0)
np.random.shuffle(sents)

# train valid split
valid = sents[:valid_size]
train = sents[valid_size:]

# save train and valid data
save_txt(path_train, train)
np.save(path_valid, encode(vocab, valid))
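# for reference, a hedged sketch of what the `spm` wrapper above likely does
# with the sentencepiece library; the vocabulary size here is an illustrative
# assumption, not the project's actual setting.

import sentencepiece

def train_spm_sketch(name, path, size=8192):
    sentencepiece.SentencePieceTrainer.Train(
        "--model_prefix={} --input={} --vocab_size={}".format(name, path, size))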
path_vocab = "../trial/data/vocab.model"
path_data = "../data/stance.npz"

from model import tf, vAe
from util import partial
from util_io import load_txt
from util_np import np, partition, vpack
import util_sp as sp

# load test sentences
text = np.load(path_data)['text']

# load sentencepiece model
vocab = sp.load_spm(path_vocab)

# load the model
model = vAe('infer')

# restore the session; path_ckpt (the checkpoint to restore) is assumed
# to be defined alongside the paths above
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

################################
# deterministic representation #
################################

# encode text with sentence piece model
data = list(map(partial(sp.encode_capped, vocab), text))
data = vpack(data, (len(data), max(map(len, data))), vocab.eos_id(), np.int32)
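# sketch of the batched encoding step that presumably follows: run the model
# over `data` slice by slice. assumptions: `partition(n, m)` from util_np
# yields (start, end) bounds, and the vAe graph exposes its latent mean as
# `model.mu` fed through a placeholder `model.tgt`; both names are guesses
# about this project's interface.

def represent_sketch(sess, model, data, size=128):
    reps = [sess.run(model.mu, {model.tgt: data[i:j]})
            for i, j in partition(len(data), size)]
    return np.concatenate(reps, axis=0)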
from itertools import permutations
from model import Model, batch_run
from tqdm import tqdm
from trial import config as C, paths as P, train as T
from util import partial, comp, select
from util_io import pform, load_txt, save_txt
from util_np import np, partition, batch_sample
from util_sp import load_spm, encode, decode
from util_tf import tf, pipe

tf.set_random_seed(C.seed)

C.trial = "m3_"
C.ckpt = 3

langs = 'en', 'nl', 'de', 'da', 'sv'
vocab = tuple(
    load_spm(pform(P.data, "vocab_{}.model".format(lang)))
    for lang in langs)
sents = tuple(
    encode(voc, load_txt(pform(P.data, "eval_{}.txt".format(lang))))
    for lang, voc in zip(langs, vocab))
# all ordered pairs of the 5 languages
index = tuple(permutations(range(5), 2))

model = Model.new(**select(C, *Model._new))
model = tuple(model.data(i, j).infer() for i, j in index)

sess = tf.InteractiveSession()
saver = tf.train.Saver()

def trans(sents, model, vocab):
    for preds in batch_run(sess, model, model.pred, sents, batch=C.batch_infer):
        # the body below is a sketch of the truncated original: decode
        # each batch of predictions back to text
        yield from decode(vocab, preds)
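# hedged usage sketch: after restoring the checkpoint into `sess` (the
# restore call falls outside this excerpt, and the path below is a guess),
# translate the evaluation sentences for every ordered language pair; the
# output filename pattern is illustrative, not the original's.

# saver.restore(sess, pform(P.ckpt, C.trial, C.ckpt))
for (i, j), m in zip(index, tqdm(model)):
    save_txt(pform(P.data, "pred_{}{}.txt".format(langs[i], langs[j])),
             trans(sents[i], m, vocab[j]))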