Example #1
File: train.py Project: argsim/argsim
from util import Record                    # Record is used below; assumed to live in util
from util_io import load_json, load_txt    # both are used below; util_io as in example #4
from util_np import np, vpack, sample, partition
from util_sp import load_spm, encode_capped, encode_capped_sample_pair
from util_tf import tf, pipe

config = load_json(A.config)  # A holds command-line arguments parsed in the part of the file not shown
P = Record(config['paths'])
C = Record(config['model'])
T = Record(config['train'])

tf.set_random_seed(A.seed)

#############
# load data #
#############

vocab = load_spm(P.vocab)
valid = np.load(P.valid)


# generator of padded id batches over the training corpus
def batch(size=T.batch_train,
          path=P.train,
          vocab=vocab,
          seed=A.seed,
          kudo=A.sample,
          max_len=T.max_len):
    pac = lambda arrs: vpack(arrs, (size, max(map(len, arrs))), eos, np.int32)
    enc = encode_capped_sample_pair if kudo else encode_capped
    raw = tuple(load_txt(path))
    eos = vocab.eos_id()
    bat = []
    for i in sample(len(raw), seed):
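        # the example is truncated here; a minimal sketch of the rest of the
        # loop, assuming the usual accumulate-and-yield batching pattern and
        # that enc takes (vocab, text, cap) -- neither is confirmed by the
        # snippet:
        if len(bat) == size:
            if kudo:
                # sampled pairs: pack sources and targets separately
                src, tgt = zip(*bat)
                yield pac(src), pac(tgt)
            else:
                yield pac(bat)
            bat = []
        bat.append(enc(vocab, raw[i], max_len))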
Example #2
import csv

# the opening of this example is cut off; it evidently belongs inside the
# load_ibm_claim function called below, which yields the claim-text column
def load_ibm_claim(path):
    rows = csv.reader(load_txt(path))
    next(rows)  # skip the header row
    for row in rows:
        yield row[3]

def load_all():
    for split in "q_mc_heldout.csv", "q_mc_test.csv", "q_mc_train.csv", "test_set.csv":
        yield from load_ibm_claim("{}/{}".format(path_csv, split))

# extract all sentences
save_txt(path_txt, load_all())

# train a sentence piece model
spm(name= path_vocab, path= path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# load and shuffle sentences
sents = list(load_txt(path_txt))
np.random.seed(0)
np.random.shuffle(sents)

# train valid split
valid = sents[:valid_size]
train = sents[valid_size:]

# save train and valid data
save_txt(path_train, train)
np.save(path_valid, encode(vocab, valid))
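The spm and encode helpers above come from the project's util_sp module. As a
rough reference for what they likely wrap, here is a standalone sketch using
the sentencepiece package directly; the vocab_size and file names are
assumptions, not taken from the example:

import sentencepiece

# train a sentencepiece model on one-sentence-per-line text
# (hypothetical settings; util_sp.spm may choose different defaults)
sentencepiece.SentencePieceTrainer.Train(
    "--input=sents.txt --model_prefix=vocab --vocab_size=8192")

# load the trained model and encode a sentence to ids
sp = sentencepiece.SentencePieceProcessor()
sp.Load("vocab.model")
ids = sp.EncodeAsIds("an example sentence")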
Example #3
path_vocab = "../trial/data/vocab.model"
path_data = "../data/stance.npz"

from model import tf, vAe
from util import partial
from util_io import load_txt
from util_np import np, partition, vpack
import util_sp as sp

# load test sentences
text = np.load(path_data)['text']

# load sentencepiece model
vocab = sp.load_spm(path_vocab)

# load the model
model = vAe('infer')
# restore the session
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)  # path_ckpt is defined in the part of the file not shown

################################
# deterministic representation #
################################

# encode text with sentence piece model
data = list(map(partial(sp.encode_capped, vocab), text))
data = vpack(data, (len(data), max(map(len, data))), vocab.eos_id(), np.int32)
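The example stops after packing the batch. A hypothetical continuation,
assuming the vAe graph exposes an input placeholder tgt and a posterior-mean
tensor mu, and that partition(n, m) yields (start, end) index pairs (none of
which is confirmed by the snippet), would push the packed ids through the
encoder in chunks:

# hypothetical: encode partition by partition and stack the posterior means
# as the deterministic sentence representations
z = np.concatenate(
    [sess.run(model.mu, {model.tgt: data[i:j]})
     for i, j in partition(len(data), 128)],
    axis=0)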
Example #4
File: eval_all.py Project: ysmiraak/tau
from itertools import permutations  # used for index below; missing from the original imports
from model import Model, batch_run
from tqdm import tqdm
from trial import config as C, paths as P, train as T
from util import partial, comp, select
from util_io import pform, load_txt, save_txt
from util_np import np, partition, batch_sample
from util_sp import load_spm, encode, decode
from util_tf import tf, pipe
tf.set_random_seed(C.seed)

C.trial = "m3_"
C.ckpt = 3

langs = 'en', 'nl', 'de', 'da', 'sv'
vocab = tuple(
    load_spm(pform(P.data, "vocab_{}.model".format(lang))) for lang in langs)
sents = tuple(
    encode(voc, load_txt(pform(P.data, "eval_{}.txt".format(lang))))
    for lang, voc in zip(langs, vocab))

index = tuple(permutations(range(5), 2))  # all ordered (source, target) language pairs
model = Model.new(**select(C, *Model._new))
model = tuple(model.data(i, j).infer() for i, j in index)  # one inference graph per direction

sess = tf.InteractiveSession()
saver = tf.train.Saver()


def trans(sents, model, vocab):
    for preds in batch_run(sess, model, model.pred, sents,
                           batch=C.batch_infer):
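        # the example is truncated here; a plausible completion given the
        # imports, decoding each batch of predictions back to text:
        yield from decode(vocab, preds)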