Example #1
from itertools import islice
from util_io import load_txt, save_txt
from util_np import np, partition
from util_tf import tf
# CharWright, model, infer, trim_str, ckpt, and mode come from the surrounding project
sess = tf.InteractiveSession()

# load model
cws = CharWright.load("../data/cws.pkl")
cwt = CharWright.load("../data/cwt.pkl")
m = model('infer', cws.dwh(), cwt.dwh())
saver = tf.train.Saver()
saver.restore(sess, "../ckpt/{}".format(ckpt))

# the first 4096 instances are used for validation
src = np.array(list(islice(load_txt("../data/src.txt"), 4096)))
tgt = np.array(list(islice(load_txt("../data/tgt.txt"), 4096)))
# order the validation instances by source length so that each batch holds sentences of similar length
val = np.array(sorted(range(len(src)), key=lambda i: len(src[i])))
src = src[val]
tgt = tgt[val]


def translate(src, mode):
    # translate in consecutive batches of 256 sentences
    for i, j in partition(len(src), 256):
        src_idx, len_src = cws(src[i:j], ret_img=False, ret_idx=True)
        pred, pidx = infer(mode, m, sess, cwt, src_idx, len_src)
        yield from trim_str(pidx, cwt)
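
For readers without the project's util_np, a minimal sketch of what partition presumably does, inferred from its use above: yield consecutive (start, end) index pairs over the data, here in steps of 256.

def partition_sketch(n, step):
    # yield consecutive (start, end) pairs covering range(n) in chunks of `step`
    for start in range(0, n, step):
        yield start, min(start + step, n)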


save_txt("../tmp/prd", translate(src, mode))
save_txt("../tmp/tgt", tgt)

# sacrebleu -tok intl -b -i ../tmp/prd ../tmp/tgt
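
The commented command above can also be run from Python; a sketch assuming sacrebleu's corpus_bleu API, with the same files:

import sacrebleu
hyp = list(load_txt("../tmp/prd"))
ref = list(load_txt("../tmp/tgt"))
print(sacrebleu.corpus_bleu(hyp, [ref], tokenize="intl").score)
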
Example #2
            if sentence[:7] == "Label##":
                stance, reason = sentence[7:].lower().split("-")
                if "other" == reason: continue  #exclude OTHER class
                label = "{}-{}-{}".format(topic, stance, reason)
                count = 1
                try:
                    while text[idx + count][:6] == "Line##":
                        folds.append(fold)
                        labels.append(label)
                        arguments.append(text[idx + count][6:])
                        count += 1
                except IndexError:
                    continue

# save the data
save_txt("../data/test_data.txt", arguments)
np.save("../data/test_labels.npy", np.asarray(labels))
np.save("../data/test_folds.npy", np.asarray(folds))

#######
# NEW #
#######

import os
import numpy as np
from util_io import load_txt, save_txt, clean
import re

datadir = '../data/reason/reason'

posts, labels, topics = [], [], []
for topic in 'abortion', 'gayRights', 'marijuana', 'obama':
Example #3
from collections import defaultdict
from tqdm import tqdm

# collect the distinct sentence classes, deduplicated by object identity
classes = {id(cls): cls for cls in sent2class.values()}
del sent2class
aligned = []
for sent_lang in tqdm(classes.values(), ncols=70):
    lang2sents = defaultdict(list)
    for sent, lang in sent_lang:
        lang2sents[lang].append(sent)
    # keep only classes in which every language occurs exactly once
    if len(langs) == len(lang2sents) and all(1 == len(sents)
                                             for sents in lang2sents.values()):
        aligned.append(tuple(lang2sents[lang][0] for lang in langs))
aligned.sort()
del classes
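
A toy illustration (hypothetical data) of the filter above: a class survives only when every language is represented by exactly one sentence.

langs_demo = ('en', 'de')
sent_lang_demo = [("hello .", "en"), ("hallo .", "de")]
by_lang = defaultdict(list)
for sent, lang in sent_lang_demo:
    by_lang[lang].append(sent)
# both languages present, one sentence each -> this class would be kept
assert len(langs_demo) == len(by_lang) and all(1 == len(s) for s in by_lang.values())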

# save aligned corpora
for lang, sents in zip(langs, zip(*aligned)):
    save_txt(pform(P.raw, lang), sents)
del aligned

##################
# prep and split #
##################

# train one sentencepiece model for each language
vocab = tuple(
    spm(pform(P.data, "vocab_{}".format(lang)), pform(P.raw, lang), C.dim_voc,
        C.bos, C.eos, C.unk) for lang in langs)
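
A sketch of what the spm helper above presumably wraps, namely the sentencepiece trainer; the flag names are sentencepiece's own, but the exact options util_sp passes are an assumption.

import sentencepiece
sentencepiece.SentencePieceTrainer.train(
    input="../data/raw_en",         # hypothetical path in place of pform(P.raw, lang)
    model_prefix="vocab_en",        # hypothetical prefix in place of pform(P.data, "vocab_en")
    vocab_size=8192,                # stands in for C.dim_voc
    bos_id=1, eos_id=2, unk_id=0)   # stand in for C.bos, C.eos, C.unk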

# remove long sentences
short = []
for sents in zip(*(load_txt(pform(P.raw, lang)) for lang in langs)):
    sents = [v.encode_as_ids(s) for v, s in zip(vocab, sents)]
Example #4
#############
# load data #
#############

src_tgt = []
for src, tgt in zip(load_txt(path_src), load_txt(path_tgt)):
    src = src.strip()
    tgt = tgt.strip()
    if 3 <= len(src) <= max_char and 3 <= len(tgt) <= max_char:
        src_tgt.append((src, tgt))

np.random.seed(0)
np.random.shuffle(src_tgt)

src, tgt = zip(*src_tgt)
del src_tgt

#############
# save data #
#############

cws = CharWright.new(chars(src))
cwt = CharWright.new(chars(tgt))

cws.save("../data/cws.pkl")
cwt.save("../data/cwt.pkl")

save_txt("../data/src.txt", src)
save_txt("../data/tgt.txt", tgt)
Example #5
File: data.py Project: ysmiraak/eti
####################
# filter and split #
####################

train_src = []
train_tgt = []
valid_src = []
valid_tgt = []
valid_raw = []
for src, tgt in src_tgt:
    s = vocab_src.encode_as_ids(src)
    t = vocab_tgt.encode_as_ids(tgt)
    # keep only pairs whose encoded lengths fit within the cap
    if 0 < len(s) <= C.cap and 0 < len(t) <= C.cap:
        if len(valid_raw) < C.total_valid:
            # validation: keep the ids (packed into npy below) and the raw targets
            valid_src.append(s)
            valid_tgt.append(t)
            valid_raw.append(tgt)
        else:
            # training: keep the raw strings (saved as txt below)
            train_src.append(src)
            train_tgt.append(tgt)

#############
# save data #
#############

save_txt(pform(P.data, "train_src.txt"), train_src)
save_txt(pform(P.data, "train_tgt.txt"), train_tgt)
save_txt(pform(P.data, "valid_tgt.txt"), valid_raw)
np.save( pform(P.data, "valid_tgt.npy"), vpack(valid_tgt, (C.total_valid, C.cap), C.eos, np.uint16))
np.save( pform(P.data, "valid_src.npy"), vpack(valid_src, (C.total_valid, C.cap), C.eos, np.uint16))
Example #6
from util_io import load_txt, save_txt
from util_np import np, partition, vpack
from util_tf import tf  # tf is used below; import path as in the project's other scripts
import pandas as pd
import util_sp as sp
# vAe and decode come from the surrounding project

# load data
df = pd.read_csv(path_csv)
emb = np.load(path_emb)
emb_sp = np.load(path_emb_sp)

# load sentencepiece model
vocab = sp.load_spm(path_vocab)

# load the model
model = vAe('infer')
# restore the checkpoint into an interactive session
sess = tf.InteractiveSession()
tf.train.Saver().restore(sess, path_ckpt)

###########################
# generate from centroids #
###########################

for col in "euc euc_sp cos cos_sp".split():
    cluster = df["cls_{}".format(col)].values
    centroids = np.stack(
        [np.mean(emb[cluster == c], axis=0) for c in range(cluster.max() + 1)])
    y = decode(sess, model, centroids, steps=512)
    save_txt("../trial/centroids_{}".format(col), sp.decode(vocab, y))
Example #7
import csv
from util_io import load_txt, save_txt
from util_np import np, vpack
from util_sp import load_spm, spm, encode

def load_ibm_claim(path):
    # load_txt yields lines, which csv.reader accepts as an iterable
    rows = csv.reader(load_txt(path))
    next(rows)  # skip the header row
    for row in rows:
        yield row[3]  # the claim text column

def load_all():
    for split in "q_mc_heldout.csv", "q_mc_test.csv", "q_mc_train.csv", "test_set.csv":
        yield from load_ibm_claim("{}/{}".format(path_csv, split))

# extract all sentences
save_txt(path_txt, load_all())

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# load and shuffle sentences
sents = list(load_txt(path_txt))
np.random.seed(0)
np.random.shuffle(sents)

# train valid split
valid = sents[:valid_size]
train = sents[valid_size:]
Example #8
File: eval_all.py Project: ysmiraak/tau
C.trial = "m3_"
C.ckpt = 3

langs = 'en', 'nl', 'de', 'da', 'sv'
vocab = tuple(
    load_spm(pform(P.data, "vocab_{}.model".format(lang))) for lang in langs)
sents = tuple(
    encode(voc, load_txt(pform(P.data, "eval_{}.txt".format(lang))))
    for lang, voc in zip(langs, vocab))

index = tuple(permutations(range(5), 2))  # all 20 ordered language pairs (i, j) with i != j
model = Model.new(**select(C, *Model._new))
model = tuple(model.data(i, j).infer() for i, j in index)  # one inference graph per pair

sess = tf.InteractiveSession()
saver = tf.train.Saver()


def trans(sents, model, vocab):
    for preds in batch_run(sess, model, model.pred, sents,
                           batch=C.batch_infer):
        yield from decode(vocab, preds)


saver.restore(sess, pform(P.ckpt, C.trial, C.ckpt))
for (i, j), m in zip(index, model):
    print(langs[j], "<-", langs[i])
    save_txt(pform(P.pred, C.trial, langs[j], "_", langs[i]),
             trans(sents[i], m, vocab[j]))
Example #9
import os

from util_io import load_txt, save_txt, clean
from util_np import np, vpack
from util_sp import load_spm, spm
# load_json, pform, encode_capped, and the path_* constants come from the surrounding project

posts = tuple(
    clean(post[3])
    # extract the cleaned raw texts
    for filename in sorted(os.listdir(path_raw))
    # each json: posts, annotations, metadata
    for post in load_json(pform(path_raw, filename))[0]
    # each post: id, side(unused), author, raw text, annotations, parent post id, category (unused), timestamp
)

# remove empty posts
posts = tuple(post for post in posts if 0 < len(post))

# save the raw texts
save_txt(path_txt, posts)

# train a sentence piece model
spm(name=path_vocab, path=path_txt)

# load the trained sentence piece model
vocab = load_spm(path_vocab + ".model")

# length control
posts = [encode_capped(vocab, post, cap=512) for post in posts]
save_txt(path_train, map(vocab.decode_ids, posts))

# validation data
posts = tuple(map(clean, load_txt(path_val)))
posts = [encode_capped(vocab, post, cap=512) for post in posts]
np.save(path_valid, vpack(posts, (len(posts), 512), vocab.eos_id(), np.int32))
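
A sketch of what encode_capped presumably does, inferred from its use above: encode a post with the sentencepiece model and keep at most cap ids (the real helper may cap differently, e.g. at a piece boundary or with the eos id appended).

def encode_capped_sketch(vocab, text, cap):
    return vocab.encode_as_ids(text)[:cap]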