Example #1
    def test_lbl_nrp(self):
        vocab_size = 10000
        k = 10000
        s = 1000

        generator = Generator(k, s)
        ris = [generator.generate() for _ in range(vocab_size)]
        ri_tensor = RandomIndexTensor.from_ri_list(ris, k, s)
        # ri_tensor = to_sparse_tensor_value(ris, k)

        model = LBL_NRP(ctx_size=3,
                        vocab_size=vocab_size,
                        k_dim=k,
                        ri_tensor=ri_tensor,
                        embed_dim=10,
                        embed_share=True,
                        use_gate=True,
                        use_hidden=True,
                        h_dim=4,
                        use_dropout=True,
                        embed_dropout=True)

        runner = tx.ModelRunner(model)
        # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        options = None
        runner.set_session(runtime_stats=True, run_options=options)
        runner.set_log_dir("/tmp/")
        runner.log_graph()
        runner.config_optimizer(
            tf.train.GradientDescentOptimizer(learning_rate=0.05))
        result = runner.run(np.array([[0, 2, 1]]))
        print(np.shape(result))
Example #2
    def test_get_sign(self):
        dim = 100
        act = 10
        gen = Generator(dim, act)

        signs = [str(i) for i in range(10)]
        sign_index = TrieSignIndex(gen, vocabulary=signs)

        for s in signs:
            self.assertTrue(sign_index.contains(s))
            id = sign_index.get_id(s)
            self.assertTrue(sign_index.contains_id(id))
            s2 = sign_index.get_sign(id)
            self.assertEqual(s, s2)

        #get sign for an id that doesn't exist
        id = 86
        s = sign_index.get_sign(id)
        self.assertEqual(s,None)
        self.assertFalse(sign_index.contains_id(id))

        self.assertEqual(len(sign_index.sign_trie),len(signs))

        self.assertTrue(sign_index.contains_id(len(signs)-1))
        self.assertFalse(sign_index.contains_id(len(signs)))
Example #3
    def test_contains_id(self):
        dim = 100
        act = 10

        gen = Generator(dim, act)
        sign_index = SignIndex(gen)

        id = 0
        self.assertFalse(sign_index.contains_id(id))
Example #4
    def test_get(self):
        dim = 100
        act = 10

        gen = Generator(dim, act)
        sign_index = SignIndex(gen)

        sign_index.add("0")
        ri0 = sign_index.get_ri("0")
        self.assertIsInstance(ri0, RandomIndex)

        self.assertEqual(ri0.dim, dim)
Example #5
    def test_contains(self):
        dim = 100
        act = 10

        gen = Generator(dim, act)
        sign_index = SignIndex(generator=gen)

        sign_index.add("0")
        self.assertTrue(sign_index.contains("0"))
        self.assertFalse(sign_index.contains("1"))

        sign_index.remove("0")
        self.assertFalse(sign_index.contains("0"))
Example #6
    def load(input_file):
        """
        loads a random index state from a file created using save

        :param filename: name of the file e.g. index.hdf5
        :param dir: directory where this file is located
        :return: a new TrieSignIndex
        """
        h5index = h5py.File(input_file, 'r')

        signs = h5index["signs"]
        indexes = h5index["ri"]
        ri_k = indexes.attrs["k"]
        ri_s = indexes.attrs["s"]

        # set random state
        random_state = pickle.loads(indexes.attrs["state"].tostring())
        random.setstate(random_state)

        generator = Generator(dim=ri_k, num_active=ri_s)
        index = TrieSignIndex(generator,
                              vocabulary=list(signs[:]),
                              pregen_indexes=False)

        random_indexes = {}

        signs = list(signs[:])
        indexes = list(indexes[:])

        # load random indexes into index
        for i in range(len(indexes)):
            w = signs[i]
            id = index.get_id(w)
            ri = ri_from_indexes(ri_k, indexes[i])
            random_indexes[id] = ri

        index.random_indexes = random_indexes

        h5index.close()

        return index
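The save side of this round trip is not shown in this listing, so here is a minimal sketch, assuming the on-disk layout that load (above) and test_save (Example #10 below) read back: a "signs" dataset, an "ri" dataset with one row of non-zero positions per sign, and "k", "s" and a pickled random "state" stored as attributes. The helper name, dtypes and shapes are assumptions, not the library's actual save implementation.

import pickle
import random

import h5py
import numpy as np


def save_index_sketch(output_file, signs, ri_index_rows, k, s):
    # hypothetical counterpart to load(): writes only what load() reads back
    with h5py.File(output_file, "w") as h5:
        str_dtype = h5py.special_dtype(vlen=str)
        h5.create_dataset("signs", data=np.array(signs, dtype=object),
                          dtype=str_dtype)
        # one row of non-zero positions per sign (assumed layout)
        ri = h5.create_dataset("ri",
                               data=np.asarray(ri_index_rows, dtype=np.int64))
        ri.attrs["k"] = k
        ri.attrs["s"] = s
        # stored as a uint8 array so that load() can call .tostring() on it
        ri.attrs["state"] = np.frombuffer(pickle.dumps(random.getstate()),
                                          dtype=np.uint8)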
Example #7
    def test_load(self):
        """ The ids should be the same when the index is loaded back up

        """
        dim = 100
        act = 10
        gen = Generator(dim, act)

        signs1 = [str(i) for i in range(1000)]
        index1 = TrieSignIndex(gen, vocabulary=signs1)

        filename = "index.hdf5"
        directory = os.path.dirname(os.path.abspath(__file__))
        index_file = directory + "/" + filename

        self.assertFalse(os.path.exists(index_file))
        try:
            index1.save(index_file)
            self.assertTrue(os.path.exists(index_file))

            index2 = TrieSignIndex.load(index_file)
            self.assertEqual(len(index2),len(index1))

            for sign in signs1:
                self.assertTrue(index1.contains(sign))
                self.assertTrue(index2.contains(sign))
                id1 = index1.get_id(sign)
                id2 = index2.get_id(sign)
                self.assertEqual(id1,id2)

                ri1 = index1.get_ri(sign).to_vector()
                ri2 = index2.get_ri(sign).to_vector()

                np.testing.assert_array_equal(ri1,ri2)
        except:
            raise
        finally:
            if os.path.exists(index_file):
                os.remove(index_file)
        self.assertFalse(os.path.exists(index_file))
Example #8
    def test_nnlm_nrp(self):
        vocab_size = 100000
        embed_dim = 512
        k = 4000
        s = 4
        ctx_size = 5
        batch_size = 128

        generator = Generator(k, s, symmetric=False)
        ris = [generator.generate() for _ in range(vocab_size)]
        ri_tensor = RandomIndexTensor.from_ri_list(ris, k, s)
        # ri_tensor = to_sparse_tensor_value(ris, k)
        # ri_tensor = tf.convert_to_tensor_or_sparse_tensor(ri_tensor)

        model = NNLM_NRP(ctx_size=ctx_size,
                         vocab_size=vocab_size,
                         k_dim=k,
                         s_active=s,
                         ri_tensor=ri_tensor,
                         embed_dim=embed_dim,
                         embed_share=False,
                         h_dim=128,
                         use_dropout=True,
                         embed_dropout=True)
        runner = tx.ModelRunner(model)
        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # options = None
        runner.set_session(runtime_stats=True, run_options=options)
        runner.set_log_dir(self.logdir)
        runner.log_graph()
        runner.config_optimizer(
            tf.train.GradientDescentOptimizer(learning_rate=0.05))

        data = np.random.randint(0, vocab_size, [batch_size, ctx_size])
        labels = np.random.randint(0, vocab_size, [batch_size, 1])

        for _ in tqdm(range(10)):
            runner.train(data, labels)
Example #9
    def test_size(self):
        gen = Generator(100, 10)
        sign_index = TrieSignIndex(generator=gen)

        # adding elements should increase size
        self.assertEqual(len(sign_index), 0)

        sign_index.add("0")
        self.assertEqual(len(sign_index), 1)

        # duplicated elements are not added
        sign_index.add("0")
        self.assertEqual(len(sign_index), 1)

        sign_index.add("1")
        self.assertEqual(len(sign_index), 2)

        # removing elements should reduce size
        size_before = len(sign_index)

        sign_index.remove("0")
        size_after = len(sign_index)
        self.assertEqual(size_after, size_before - 1)
Example #10
    def test_save(self):
        dim = 100
        act = 10
        gen = Generator(dim, act)

        signs = [str(i) for i in range(10)]
        sign_index = TrieSignIndex(gen, vocabulary=signs)

        filename = "index.hdf5"
        directory = os.path.dirname(os.path.abspath(__file__))
        output_file = directory+"/"+filename

        self.assertFalse(os.path.exists(output_file))
        try:
            sign_index.save(output_file)
            self.assertTrue(os.path.exists(output_file))

            h5file = h5py.File(output_file,'r')

            h5signs = h5file["signs"]
            h5ri = h5file["ri"]

            self.assertEqual(len(h5signs),len(signs))

            print(h5ri[0])
            print(h5ri.attrs["k"])
            print(h5ri.attrs["s"])
            print(h5ri.attrs["state"].tostring())

            h5file.close()
        except:
            raise
        finally:
            if os.path.exists(output_file):
                os.remove(output_file)
        self.assertFalse(os.path.exists(output_file))
Example #11
def run(**kwargs):
    arg_dict.from_dict(kwargs)
    args = arg_dict.to_namespace()

    # ======================================================================================
    # Load Params, Prepare results assets
    # ======================================================================================
    # os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    # print(args.corpus)

    # Experiment parameter summary
    res_param_filename = os.path.join(args.out_dir,
                                      "params_{id}.csv".format(id=args.run_id))
    with open(res_param_filename, "w") as param_file:
        writer = csv.DictWriter(f=param_file, fieldnames=arg_dict.keys())
        writer.writeheader()
        writer.writerow(arg_dict)
        param_file.flush()

    # make dir for model checkpoints
    if args.save_model:
        model_ckpt_dir = os.path.join(args.out_dir,
                                      "model_{id}".format(id=args.run_id))
        os.makedirs(model_ckpt_dir, exist_ok=True)
        model_path = os.path.join(model_ckpt_dir,
                                  "nnlm_{id}.ckpt".format(id=args.run_id))

    # start perplexity file
    ppl_header = ["id", "run", "epoch", "step", "lr", "dataset", "perplexity"]
    ppl_fname = os.path.join(args.out_dir,
                             "perplexity_{id}.csv".format(id=args.run_id))

    ppl_file = open(ppl_fname, "w")
    ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
    ppl_writer.writeheader()

    # ======================================================================================
    # CORPUS, Vocab and RIs
    # ======================================================================================
    corpus = h5py.File(os.path.join(args.corpus,
                                    "ptb_{}.hdf5".format(args.ngram_size)),
                       mode='r')
    vocab = marisa_trie.Trie(corpus["vocabulary"])

    # generates k-dimensional random indexes with s_active units
    all_positive = args.ri_all_positive
    ri_generator = Generator(dim=args.k_dim,
                             num_active=args.s_active,
                             symmetric=not all_positive)

    # pre-gen indices for vocab
    # it doesn't matter which ri gets assigned to which word since we are pre-generating the indexes
    ris = [ri_generator.generate() for i in range(len(vocab))]
    ri_tensor = ris_to_sp_tensor_value(ris, dim=args.k_dim)

    # ri_tensor = RandomIndexTensor.from_ri_list(ris, args.k_dim, args.s_active)

    # ======================================================================================

    def data_pipeline(data,
                      epochs=1,
                      batch_size=args.batch_size,
                      shuffle=False):
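        # rough summary (assumption): read the hdf5 data in large chunks,
        # optionally repeat it for several epochs, optionally shuffle with a
        # bounded buffer, then yield fixed-size batches (padding=False, so the
        # last batch of the stream may be smaller)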
        def chunk_fn(x):
            return chunk_it(x, chunk_size=batch_size * 1000)

        if epochs > 1:
            data = repeat_apply(chunk_fn, data, epochs)
        else:
            data = chunk_fn(data)

        if shuffle:
            data = shuffle_it(data, args.shuffle_buffer_size)

        data = batch_it(data, size=batch_size, padding=False)
        return data

    # ======================================================================================
    # MODEL
    # ======================================================================================
    # Activation functions
    if args.h_act == "relu":
        h_act = tx.relu
        h_init = tx.he_normal_init()
    elif args.h_act == "tanh":
        h_act = tx.tanh
        h_init = tx.glorot_uniform()
    elif args.h_act == "elu":
        h_act = tx.elu
        h_init = tx.he_normal_init()

    # Parameter Init
    if args.embed_init == "normal":
        embed_init = tx.random_normal(mean=0., stddev=args.embed_init_val)
    elif args.embed_init == "uniform":
        embed_init = tx.random_uniform(minval=-args.embed_init_val,
                                       maxval=args.embed_init_val)

    if args.logit_init == "normal":
        logit_init = tx.random_normal(mean=0., stddev=args.logit_init_val)
    elif args.logit_init == "uniform":
        logit_init = tx.random_uniform(minval=-args.logit_init_val,
                                       maxval=args.logit_init_val)

    if args.f_init == "normal":
        f_init = tx.random_normal(mean=0., stddev=args.f_init_val)
    elif args.f_init == "uniform":
        f_init = tx.random_uniform(minval=-args.f_init_val,
                                   maxval=args.f_init_val)

    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
    #                                        log_device_placement=True))
    # with tf.device('/gpu:{}'.format(args.gpu)):

    model = NNLM_NRP(ctx_size=args.ngram_size - 1,
                     vocab_size=len(vocab),
                     k_dim=args.k_dim,
                     s_active=args.s_active,
                     ri_tensor=ri_tensor,
                     embed_dim=args.embed_dim,
                     embed_init=embed_init,
                     embed_share=args.embed_share,
                     logit_init=logit_init,
                     logit_bias=args.logit_bias,
                     h_dim=args.h_dim,
                     num_h=args.num_h,
                     h_activation=h_act,
                     h_init=h_init,
                     use_dropout=args.dropout,
                     keep_prob=args.keep_prob,
                     embed_dropout=args.embed_dropout,
                     l2_loss=args.l2_loss,
                     l2_loss_coef=args.l2_loss_coef,
                     f_init=f_init)

    model_runner = tx.ModelRunner(model)

    # sess = tf.Session(config=tf.ConfigProto(
    #      allow_soft_placement=True, log_device_placement=True))
    # model_runner.set_session(sess)

    # sess = tf.Session(config=tf.ConfigProto(
    #    allow_soft_placement=True, log_device_placement=True))
    # model_runner.set_session(sess)

    # we use an InputParam because we might want to change it during training
    lr_param = tx.InputParam(value=args.lr)
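    # (the lr decay branch in the training loop below updates lr_param.value
    # between epochs, which is why a plain constant is not used here)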
    if args.optimizer == "sgd":
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=lr_param.tensor)
    elif args.optimizer == "adam":
        optimizer = tf.train.AdamOptimizer(learning_rate=lr_param.tensor,
                                           beta1=args.optimizer_beta1,
                                           beta2=args.optimizer_beta2,
                                           epsilon=args.optimizer_epsilon)
    elif args.optimizer == "ams":
        optimizer = tx.AMSGrad(learning_rate=lr_param.tensor,
                               beta1=args.optimizer_beta1,
                               beta2=args.optimizer_beta2,
                               epsilon=args.optimizer_epsilon)

    def clip_grad_global(grads):
        grads, _ = tf.clip_by_global_norm(grads, 12)
        return grads

    def clip_grad_local(grad):
        return tf.clip_by_norm(grad, args.clip_value)

    if args.clip_grads:
        clip_fn = clip_grad_local if args.clip_local else clip_grad_global
        model_runner.config_optimizer(optimizer,
                                      optimizer_params=lr_param,
                                      gradient_op=clip_fn,
                                      global_gradient_op=not args.clip_local)
    else:
        model_runner.config_optimizer(optimizer, optimizer_params=lr_param)

    # assert(model_runner.session == sess)
    # ======================================================================================
    # EVALUATION
    # ======================================================================================

    def eval_model(runner,
                   dataset_it,
                   len_dataset=None,
                   display_progress=False):
        if display_progress:
            pb = tqdm(total=len_dataset, ncols=60)
        batches_processed = 0
        sum_loss = 0
        for batch in dataset_it:
            batch = np.array(batch, dtype=np.int64)
            ctx = batch[:, :-1]
            target = batch[:, -1:]

            mean_loss = runner.eval(ctx, target)
            sum_loss += mean_loss

            if display_progress:
                pb.update(args.batch_size)
            batches_processed += 1

        if display_progress:
            pb.close()

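        # perplexity = exp(average cross-entropy); this assumes runner.eval
        # returns the mean loss for each batch, as the name mean_loss suggests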
        return np.exp(sum_loss / batches_processed)

    def evaluation(runner: tx.ModelRunner,
                   pb,
                   cur_epoch,
                   step,
                   display_progress=False):
        pb.write("[Eval Validation]")

        val_data = corpus["validation"]
        ppl_validation = eval_model(
            runner, data_pipeline(val_data, epochs=1, shuffle=False),
            len(val_data), display_progress)
        res_row = {
            "id": args.id,
            "run": args.run,
            "epoch": cur_epoch,
            "step": step,
            "lr": lr_param.value,
            "dataset": "validation",
            "perplexity": ppl_validation
        }
        ppl_writer.writerow(res_row)

        pb.write("Eval Test")
        test_data = corpus["test"]
        ppl_test = eval_model(
            runner, data_pipeline(test_data, epochs=1, shuffle=False),
            len(test_data), display_progress)

        res_row = {
            "id": args.id,
            "run": args.run,
            "epoch": cur_epoch,
            "step": step,
            "lr": lr_param.value,
            "dataset": "test",
            "perplexity": ppl_test
        }
        ppl_writer.writerow(res_row)

        ppl_file.flush()

        pb.write("valid. ppl = {} \n test ppl {}".format(
            ppl_validation, ppl_test))
        return ppl_validation

    # ======================================================================================
    # TRAINING LOOP
    # ======================================================================================
    # preparing evaluation steps
    # ceil is used for the step counts because the batches at the end are padded

    epoch_step = 0
    global_step = 0
    current_epoch = 0
    patience = 0

    cfg = tf.ConfigProto()
    cfg.gpu_options.allow_growth = True
    sess = tf.Session(config=cfg)
    model_runner.set_session(sess)
    model_runner.init_vars()

    training_dset = corpus["training"]
    progress = tqdm(total=len(training_dset) * args.epochs)
    training_data = data_pipeline(training_dset,
                                  epochs=args.epochs,
                                  shuffle=True)

    evals = []
    try:
        for ngram_batch in training_data:
            epoch = progress.n // len(training_dset) + 1
            # Start New Epoch
            if epoch != current_epoch:
                current_epoch = epoch
                epoch_step = 0
                progress.write("epoch: {}".format(current_epoch))

            # Eval Time
            if epoch_step == 0:
                current_eval = evaluation(model_runner, progress, epoch,
                                          global_step)
                evals.append(current_eval)

                if global_step > 0:
                    if args.early_stop:
                        if evals[-2] - evals[-1] < args.eval_threshold:
                            if patience >= 3:
                                progress.write("early stop")
                                break
                            patience += 1
                        else:
                            patience = 0

                    # lr decay only at the start of each epoch
                    if args.lr_decay and len(evals) > 0:
                        if evals[-2] - evals[-1] < args.eval_threshold:
                            lr_param.value = max(
                                lr_param.value * args.lr_decay_rate,
                                args.lr_decay_threshold)
                            progress.write("lr changed to {}".format(
                                lr_param.value))

            # ================================================
            # TRAIN MODEL
            # ================================================
            ngram_batch = np.array(ngram_batch, dtype=np.int64)
            ctx_ids = ngram_batch[:, :-1]
            word_ids = ngram_batch[:, -1:]

            model_runner.train(ctx_ids, word_ids)
            progress.update(args.batch_size)

            epoch_step += 1
            global_step += 1

        # if training did not stop early, evaluate the last state of the model
        if not args.early_stop or patience < 3:
            evaluation(model_runner, progress, epoch, epoch_step)
        ppl_file.close()

        if args.save_model:
            model_runner.save_model(model_name=model_path,
                                    step=global_step,
                                    write_state=False)

        model_runner.close_session()
        progress.close()
        tf.reset_default_graph()

    except Exception as e:
        traceback.print_exc()
        os.remove(ppl_file.name)
        os.remove(param_file.name)
        raise e
Example #12
 def setUp(self):
     dim = 10
     act = 4
     self.generator = Generator(dim=dim, num_active=act)
     self.sign_index = SignIndex(self.generator)
Example #13
param("l2_loss", str2bool, False)
param("l2_loss_coef", float, 1e-5)

args = parser.parse_args()

# ======================================================================================
# CORPUS, Vocab and RIs
# ======================================================================================
corpus = h5py.File(os.path.join(args.corpus,
                                "ptb_{}.hdf5".format(args.ngram_size)),
                   mode='r')
vocab = marisa_trie.Trie(corpus["vocabulary"])

print("generating random indexes")
# generates k-dimensional random indexes with s_active units
ri_generator = Generator(dim=args.k_dim, num_active=args.s_active)

# pre-gen indices for vocab
# it doesn't matter which ri gets assigned to which word since we are pre-generating the indexes (a rough illustration follows this example)
ris = [ri_generator.generate() for i in range(len(vocab))]

#ri_tensor = RandomIndexTensor.from_ri_list(ris, args.k_dim, args.s_active)
ri_tensor = ris_to_sp_tensor_value(ris, dim=args.k_dim)

print("done")
# ======================================================================================

# corpus
training_dataset = corpus["training"]
test_dataset = corpus["test"]
validation_dataset = corpus["validation"]
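The comments above only describe the random indexes in passing; as a rough, library-free illustration of the idea (an assumption about what Generator produces, not deepsign's actual implementation), a k-dimensional random index with s active units can be pictured as a sparse ternary vector:

import numpy as np


def random_index_sketch(k, s, symmetric=True, rng=np.random):
    # toy stand-in for Generator(dim=k, num_active=s): a k-dimensional vector
    # with s non-zero entries; with symmetric=True half are +1 and half -1,
    # otherwise all active units are +1 (the "all positive" case)
    v = np.zeros(k, dtype=np.int8)
    positions = rng.choice(k, size=s, replace=False)
    if symmetric:
        signs = np.concatenate([np.ones(s // 2), -np.ones(s - s // 2)])
        rng.shuffle(signs)
    else:
        signs = np.ones(s)
    v[positions] = signs
    return v


# e.g. a small vocabulary of pre-generated indexes, analogous to the ris list above
ris_sketch = [random_index_sketch(k=1000, s=4) for _ in range(10)]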
Example #14
import tensorflow as tf
import tensorx as tx
from deepsign.models.nrp import RandomIndexTensor
from deepsign.rp.ri import Generator, RandomIndex
import numpy as np

sess = tf.InteractiveSession()

vocab_size = 8
k = 6
s = 2
embed_dim = 3

generator = Generator(k, s)
ris = [generator.generate() for _ in range(vocab_size)]
ri_tensor = RandomIndexTensor.from_ri_list(ris, k, s)
ri_input = ri_tensor.gather([[0, 1, 0], [1, 2, 0]])

sp = ri_input.to_sparse_tensor()
sp = tx.TensorLayer(sp, k)
print(sp.tensor.eval())

embed = tx.Lookup(sp, seq_size=3, lookup_shape=[k, embed_dim])

tf.global_variables_initializer().run()

print(np.shape(embed.tensor.eval()))
Example #15
 def setUp(self):
     dim = 1000
     act = 10
     self.generator = Generator(dim=dim, num_active=act)
Example #16
import tensorx as tx
import tensorflow as tf
from deepsign.rp.ri import Generator
import numpy as np
from tqdm import tqdm
from deepsign.data.transform import ris_to_sp_tensor_value

ri_dim = 1000
ri_s = 10

gen = Generator(ri_dim, ri_s)

vocab_size = 10

ris = [gen.generate() for _ in range(vocab_size)]

dummy_logits = tf.constant(np.random.uniform(size=[10, ri_dim * 2]))
#out = tx.sigmoid(dummy_logits)
out = dummy_logits

with tf.Session() as ss:
    samples = tx.sample_sigmoid_from_logits(out, 100)
    #out_pos, out_neg = tf.split(samples, 2, axis=-1)

    # this seems slow
    for i in tqdm(range(1000)):
        #pos, neg = ss.run([out_pos, out_neg])
        s = ss.run(samples)

    #assert (np.shape(pos) == np.shape(neg))
    #print(np.shape(pos))
Example #17
corpus_hdf5 = h5py.File(corpus_file, 'r')
corpus_dataset = corpus_hdf5["sentences"]
# iterates over lines but loads them as chunks
#n_rows = 100000
#sentences = chunk_it(corpus_dataset,n_rows=n_rows, chunk_size=20000)
n_rows = len(corpus_dataset)
sentences = chunk_it(corpus_dataset, chunk_size=100000)

pipeline = WaCKyPipe(datagen=sentences)
# ======================================================================================
# Load Vocabulary
# ======================================================================================
vocab_file = data_dir + "wacky_vocab_6M_spacy.hdf5"
vocab_hdf5 = h5py.File(vocab_file, 'r')

ri_gen = Generator(dim=k, num_active=s)
print("Loading Vocabulary...")
sign_index = TrieSignIndex(ri_gen, list(vocab_hdf5["vocabulary"][:]), pregen_indexes=False)

if subsampling:
    freq = TrieSignIndex.map_frequencies(list(vocab_hdf5["vocabulary"][:]),
                                         list(vocab_hdf5["frequencies"][:]),
                                         sign_index)

    total_freq = np.sum(vocab_hdf5["frequencies"])

print("done")

# ======================================================================================
# Neural Random Projections Model
# ======================================================================================
Example #18
 def setUp(self):
     dim = 10
     act = 2
     self.generator = Generator(dim=dim, num_active=act)
     self.sign_index = SignIndex(self.generator)
     self.perm_generator = PermutationGenerator(dim=dim)
Example #19
from tensorx_old.layers import Input
from tensorx_old.models.nrp2 import NRP
from tensorx_old.init import glorot_init
from deepsign.rp.ri import Generator as RIGen
import numpy as np

import tensorflow as tf

# random index dimension
k = 100
s = 4
h_dim = 500
ri_gen = RIGen(active=s, dim=k)

r = ri_gen.generate()

labels_p = Input(n_units=k, name="ri_pos")
labels_n = Input(n_units=k, name="ri_neg")
labels = Input(n_units=k, name="ri_labels")

model = NRP(k_dim=k, h_dim=h_dim)
loss = model.get_loss(labels_p, labels_n)

optimizer = tf.train.AdagradOptimizer(0.1)
train_step = optimizer.minimize(loss)

# test model training
init = tf.global_variables_initializer()
with tf.Session() as ss:
    ss.run(init)
Example #20
    corpus_file = "/data/gold_standards/wacky.hdf5"
    result_path = home + "/data/results/"
    corpus_file = home + corpus_file

    print("Reading hdf5 dataset from: ", corpus_file)
    dataset_name = "sentences_lemmatised"

    # open hdf5 file and get the dataset
    h5f = h5py.File(corpus_file, 'r')
    dataset = h5f[dataset_name]
    return dataset

# do something with the dataset

# Create Sign RI Index
ri_gen = Generator(dim=ri_dim, num_active=ri_num_active)
sign_index = SignIndex(ri_gen)

max_sentences = 200000


def load_spacy():
    t0 = time.time()
    # load tokenizer only
    nlp = English(entity=False, load_vectors=False, parser=True, tagger=True)
    t1 = time.time()
    print("Done: {0:.2f} secs ".format(t1 - t0))
    return nlp

nlp = load_spacy()
Example #21
t0 = time.time()
trie = marisa_trie.Trie(list(vocabulary))
t1 = time.time()
print("vocab loaded")
print(t1 - t0)

top10w = list(vocabulary[0:10])
top10f = list(frequencies[0:10])
top10ids = [trie.get(top10w[i]) for i in range(10)]
top10w_trie = [trie.restore_key(i) for i in top10ids]

print(top10w)
print(top10f)
print(top10w_trie)

ri_gen = Generator(dim=1000, num_active=10)

t0 = time.time()
sign_index = TrieSignIndex(ri_gen, list(vocabulary[:]))
t1 = time.time()
print(t1 - t0)

print(top10ids)
top10w_index = [sign_index.get_sign(i) for i in top10ids]
print(top10w_index)

#test load top ten
print("=============================================")
index = TrieSignIndex(generator=ri_gen, vocabulary=top10w)
print(top10w)
top10ids = [index.get_id(w) for w in top10w]
Example #22
    def test_nce_nrp(self):
        vocab_size = 1000
        k = 500
        s = 8
        embed_size = 128
        nce_samples = 10
        noise_ratio = 0.1
        use_nce = True

        vocab = [str(i) for i in range(vocab_size)]

        generator = Generator(k, s)
        sign_index = TrieSignIndex(generator,
                                   vocabulary=vocab,
                                   pregen_indexes=True)
        ris = [
            sign_index.get_ri(sign_index.get_sign(i))
            for i in range(len(sign_index))
        ]
        # ris = [generator.generate() for _ in range(vocab_size)]

        ri_tensor = ris_to_sp_tensor_value(ri_seq=ris,
                                           dim=k,
                                           all_positive=False)

        ri_tensor_input = tx.SparseInput(n_units=k, value=ri_tensor)

        if use_nce:
            label_inputs = tx.SparseInput(k, name="target_random_indices")
        else:
            label_inputs = [
                tx.Input(1, dtype=tf.int64, name="ids"),
                tx.InputParam(dtype=tf.int32,
                              value=vocab_size,
                              name="vocab_size")
            ]

        eval_label_inputs = [
            tx.Input(1, dtype=tf.int64, name="ids_eval"),
            tx.InputParam(dtype=tf.int32, value=vocab_size, name="vocab_size")
        ]

        model = NRP(
            run_inputs=tx.SparseInput(n_units=k, name="random_index_inputs"),
            label_inputs=label_inputs,
            eval_label_input=eval_label_inputs,
            ctx_size=2,
            # vocab_size=vocab_size,
            k_dim=k,
            ri_tensor_input=ri_tensor_input,  # current dictionary state
            embed_dim=embed_size,
            h_dim=128,
            num_h=1,
            h_activation=tx.relu,
            use_dropout=True,
            embed_dropout=True,
            keep_prob=0.70,
            use_nce=use_nce,
            nce_samples=nce_samples,
            nce_noise_amount=noise_ratio,
            noise_input=tx.SparseInput(k, name="noise"))

        tf.summary.histogram("embeddings", model.embeddings.weights)
        for h in model.h_layers:
            tf.summary.histogram("h", h.linear.weights)

        # model.eval_tensors.append(model.train_loss_tensors[0])
        runner = tx.ModelRunner(model)
        runner.set_log_dir("/tmp")
        runner.log_graph()

        options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # options = None
        runner.set_session(runtime_stats=True, run_options=options)

        # options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

        # runner.config_optimizer(tf.train.GradientDescentOptimizer(learning_rate=0.005))#,
        # SGD with 0.025

        # lr = tx.InputParam(init_value=0.0002)
        lr = tx.InputParam(value=0.025)
        # runner.config_optimizer(tf.train.AdamOptimizer(learning_rate=lr.tensor, beta1=0.9), params=lr,
        runner.config_optimizer(
            tf.train.GradientDescentOptimizer(learning_rate=lr.tensor),
            optimizer_params=lr,
            global_gradient_op=False,
            # gradient_op=lambda grad: tf.clip_by_global_norm(grad, 10.0)[0])
            gradient_op=lambda grad: tf.clip_by_norm(grad, 1.0))

        data = np.array([[0, 2], [5, 7], [9, 8], [3, 4], [1, 9], [12, 8]])
        labels = np.array([[32], [56], [12], [2], [5], [23]])

        ppl_curve = []
        n = 256
        batch_size = 128

        dataset = np.column_stack((data, labels))
        # print(dataset)
        dataset = views.repeat_it([dataset], n)
        dataset = views.flatten_it(dataset)
        # shuffle with a small buffer (6 items at a time)
        dataset = views.shuffle_it(dataset, 6)
        dataset = views.batch_it(dataset, batch_size)

        # print(np.array(list(dataset)))
        # d = list(views.take_it(1, views.shuffle_it(d, 4)))[0]

        data_stream = dataset

        for batch in tqdm(data_stream, total=n * 5 / batch_size):
            sample = np.array(batch)

            ctx = sample[:, :-1]
            ctx = ctx.flatten()
            ctx_ris = [sign_index.get_ri(sign_index.get_sign(i)) for i in ctx]
            ctx_ris = ris_to_sp_tensor_value(
                ctx_ris,
                dim=sign_index.feature_dim(),
                all_positive=not sign_index.generator.symmetric)
            lbl_ids = sample[:, -1:]
            lbl = lbl_ids.flatten()

            if use_nce:
                lbl_ris = [
                    sign_index.get_ri(sign_index.get_sign(i)) for i in lbl
                ]
                lbl_ris = ris_to_sp_tensor_value(
                    lbl_ris,
                    dim=sign_index.feature_dim(),
                    all_positive=not sign_index.generator.symmetric)

                noise = generate_noise(k_dim=k,
                                       batch_size=lbl_ris.dense_shape[0] *
                                       nce_samples,
                                       ratio=noise_ratio)
                runner.train(ctx_ris, [lbl_ris, noise],
                             output_loss=True,
                             write_summaries=True)
            else:
                runner.train(model_input_data=ctx_ris,
                             loss_input_data=lbl_ids,
                             output_loss=True,
                             write_summaries=True)

        runner.close_session()
Example #23
ppl_writer = csv.DictWriter(f=ppl_file, fieldnames=ppl_header)
ppl_writer.writeheader()

# ======================================================================================
# CORPUS, Vocab and RIs
# ======================================================================================
corpus = h5py.File(os.path.join(args.corpus,
                                "ptb_{}.hdf5".format(args.ngram_size)),
                   mode='r')
vocab = marisa_trie.Trie(corpus["vocabulary"])

print("generating random indexes")
# generates k-dimensional random indexes with s_active units
all_positive = args.ri_all_positive
ri_generator = Generator(dim=args.k_dim,
                         num_active=args.s_active,
                         symmetric=not all_positive)

# pre-gen indices for vocab
# it doesn't matter which ri gets assigned to which word since we are pre-generating the indexes
ris = [ri_generator.generate() for i in range(len(vocab))]
ri_tensor = ris_to_sp_tensor_value(ris, dim=args.k_dim)
# ri_tensor = RandomIndexTensor.from_ri_list(ris, args.k_dim, args.s_active)

print("done")

# ======================================================================================


def data_pipeline(data, epochs=1, batch_size=args.batch_size, shuffle=False):
    def chunk_fn(x):