def train_embedding(preprocess, datatype):
    if preprocess:
        embedding_path = PRE_ALL_PATH
    else:
        embedding_path = NOPRE_ALL_PATH
    encode_time = True

    if datatype is not DataType.ALL:
        word_sequences = reader.load_data(preprocess, datatype, encode_time)
        sequences_for_training = []
        for idx, words in word_sequences.items():
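            # removing NULL_TOKEN leaves a double space at each boundary, so splitting on '  ' recovers the individual sentences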
            sentences = [s for s in ' '.join(words).replace(NULL_TOKEN, '').split('  ') if s != '']
            for s in sentences:
                sequences_for_training.append(s)
    else:
        sequences_for_training = []
        word_sequences = reader.load_data(preprocess, DataType.VISITOR, encode_time)
        for idx, words in word_sequences.items():
            sentences = [s for s in ' '.join(words).replace(NULL_TOKEN, '').split('  ') if s != '']
            for s in sentences:
                sequences_for_training.append(s)
        word_sequences = reader.load_data(preprocess, DataType.AGENT, encode_time)
        for idx, words in word_sequences.items():
            sentences = [s for s in ' '.join(words).replace(NULL_TOKEN, '').split('  ') if s != '']
            for s in sentences:
                sequences_for_training.append(s)

    print('data load completed. start training.')
    print(sequences_for_training[0:2])

    cores = multiprocessing.cpu_count()
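    # skip-gram Word2Vec (sg=1) with 50-dimensional vectors; note that gensim >= 4.0 renames the `size` argument to `vector_size`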
    model = gensim.models.Word2Vec(sentences=SequenceIterator(sequences_for_training), size=50,
                                   sg=1, min_count=5, window=5, workers=cores)
    model.save(embedding_path)
def main(_):
    raw_data = reader.load_data(FLAGS.parse_data_path)
    train_sents, train_trees, dev_sents, dev_trees, vocab_dict, pos_dict, label_dict = raw_data  # items in ids
    config = get_config(FLAGS.parse_lang)

    with tf.Session() as session:
        with tf.variable_scope(FLAGS.parse_scope_name):
            m = NNParser(config=config)

        # CheckPoint State
        if not os.path.exists(FLAGS.parse_train_dir):
            os.makedirs(FLAGS.parse_train_dir)

        ckpt = tf.train.get_checkpoint_state(FLAGS.parse_train_dir)
        if ckpt:
            print("Loading model parameters from %s" % ckpt.model_checkpoint_path)
            m.saver.restore(session,
                            tf.train.latest_checkpoint(FLAGS.parse_train_dir))
        else:
            print("Created model with fresh parameters.")
            session.run(tf.global_variables_initializer())

        # train dataset should be generated only once and consumed by the run_epoch function
        for i in range(config.max_max_epoch):
            # keep the base learning rate for the first max_epoch epochs, then decay it exponentially
            lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)
            print("Epoch: %d Learning rate: %.4f" % (i + 1, session.run(m.lr)))

            # new iterator
            train_dataset = transition_system.generate_examples(
                train_sents, train_trees, m.batch_size, label_dict)
            train_perplexity = run_epoch(session, m, m.train_op, train_dataset)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
Example #3
    def sub_proc(sub_filenames, q, idx):
        for _, a_file in enumerate(sub_filenames):
            df = reader.load_data(a_file)

            vid = np.asarray(df.vid.values, dtype=np.int64)
            cid = np.asarray(df.cid.values, dtype=np.int64)
            title_length = np.asarray(df.title_length.values, dtype=np.int64)
            class_id = np.asarray(df.class_id.values, dtype=np.int64)
            second_class = np.asarray(df.second_class.values, dtype=np.int64)
            is_intact = np.asarray(df.is_intact.values, dtype=np.int64)
            stars = df.stars.values

            sample_member = [
                vid, cid, title_length, class_id, second_class, is_intact,
                stars
            ]

            collector = dict()

            # build one int64 feature row per vid, keyed by vid
            for i, k in enumerate(vid):
                sample = [
                    vid[i], cid[i], title_length[i], class_id[i],
                    second_class[i], is_intact[i]
                ]
                sample = np.asarray(sample, dtype=np.int64)
                sample = np.concatenate([sample, stars[i]])
                collector[k] = sample

            q.put(collector, block=True, timeout=None)
def calibrate(reck_heaters, calib, heater_name, heater_index, input, outputs):
	
	datafilename = take_fringe(reck_heaters, heater_index)
	
	data_for_fitting = load_data(datafilename,outputs)
	
	calib.datafilename = data_for_fitting
	calib.fit(heater_name)
Example #5
    def functor(idx, q, sub_filenames):
        collector = dict()
        for i, a_file in enumerate(sub_filenames):
            df = reader.load_data(a_file)
            uids = df.did.values
            watches = df.watch.values
            uid_vid_map = get_user_watch_map(uids, watches)
            full_uid_vid_map.update(uid_vid_map)
            collector.update(full_uid_vid_map)
        status, output = getstatusoutput('free -g')
        print('done with sub_files: {}, mem:\n{}'.format(
            sub_filenames, output))
        q.put(collector, block=True)
        status, output = getstatusoutput('free -g')
        print('put into queue, mem: {}'.format(output))
Example #6
def get_full_user_map(path, num_parallel_reads=None):
    assert isinstance(num_parallel_reads,
                      int), "invalid type of num_parallel_reads."

    if num_parallel_reads > 1:
        full_uid_vid_map = _get_full_user_map_parallel(path,
                                                       num_parallel_reads)

    else:
        filenames = utils.path_to_list(path, key_word='user')
        full_uid_vid_map = dict()
        for i, a_file in enumerate(filenames):
            df = reader.load_data(a_file)
            uids = df.did.values
            watches = df.watch.values
            uid_vid_map = get_user_watch_map(uids, watches)
            full_uid_vid_map.update(uid_vid_map)

    return full_uid_vid_map
Example #7
def main():

    parser = argparse.ArgumentParser(
        description='Generates some cards for bunker')
    parser.add_argument('--xlsx', help='path to excel file')
    args = parser.parse_args()

    fname = args.xlsx

    data = load_data(fname)
    gen = SimpleGenerator(data)

    while True:
        print('Your card is:')
        card = gen.generate()
        print_card(card)
        print('\n\n')
        inp = input("Press Enter to continue...\n")
        if inp == 'Q':
            break
Example #8
    def __init__(self, config, data_path=None, vocabulary=None, name=None):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        raw_context, raw_questions, raw_choices, raw_labels, self.choices_map = \
                read.load_data(data_path)
        all_choices = read.build_choices(raw_choices)
        self.epoch_size = ((len(raw_context) // batch_size) - 1) // num_steps
        # build vocab for train data
        if not vocabulary:
            self.vocabulary = read.get_vocab(raw_questions,\
                    raw_context,min_frequency=500)
        else:
            self.vocabulary = vocabulary

        raw_choices = [" ".join(x) for x in raw_choices]
        self.all_choices = read.vocab_transform(all_choices, self.vocabulary)
        self.questions = read.vocab_transform(raw_questions, self.vocabulary)
        self.context = read.vocab_transform(raw_context, self.vocabulary)
        self.labels = read.vocab_transform(raw_labels, self.vocabulary)
        self.choices = read.vocab_transform(raw_choices, self.vocabulary)


if __name__ == "__main__":

    #    tf.reset_default_graph()
    options = get_params()
    root = "/Users/liuhongbing/Documents/tensorflow/data/snli_1.0/"
    train = [l.strip().split('\t') for l in open(root + 'snli_1.0_train.txt')]
    dev = [l.strip().split('\t') for l in open(root + 'snli_1.0_dev.txt')]
    test = [l.strip().split('\t') for l in open(root + 'snli_1.0_test.txt')]
    vocab = get_vocab(train)
    print("vocab (incr. maxfeatures accordingly):", len(vocab))

    X_train, Y_train, Z_train = load_data(train, vocab)
    X_dev, Y_dev, Z_dev = load_data(dev, vocab)
    X_test, Y_test, Z_test = load_data(test, vocab)
    print('Build model...')

    model = build_model(options)

    config_str = getConfig(options)
    MODEL_ARCH = root + "/Attention_neural/arch_att" + config_str + ".yaml"
    MODEL_WGHT = root + "/Attention_neural/weights_att" + config_str + ".weights"

    MAXLEN = options.xmaxlen
    X_train = pad_sequences(X_train,
                            maxlen=MAXLEN,
                            value=vocab["unk"],
                            padding='pre')
Example #10
import json
import os

from keras.models import load_model
from numpy import argmax

import CONFIG
from reader import KerasBatchGenerator, load_data

_, _, _total_words, reversed_dictionary, dictionary = load_data()

_model = load_model(os.path.join(os.getcwd(), 'model', 'model.h5'))

while True:
    input_string = input('\n\nEnter 3 words: \n')
    input_string = input_string.split()
    input_string = input_string[:3]  # keep at most the first three words
    idx = []
    for i in input_string:
        if i == '.':
            i = '<eos>'
        try:
            idx.append(dictionary[i])
        except KeyError:
            print('Word ', i, ' does not exist')
            i = '<unk>'
            idx.append(dictionary[i])

    string = ''
def preprocess():
  # Load training and eval data
  samples,labels,categories = reader.load_data("../data_set/train2014/", "../data_set/annotations/instances_train2014.json")
Example #12
def main(_):

    data_path = FLAGS.data_path
    if FLAGS.clear_save and os.path.exists(FLAGS.save_path):
        shutil.rmtree(FLAGS.save_path)
    if FLAGS.testing:
        train_path = os.path.join(data_path, 'test')
    else:
        train_path = os.path.join(data_path, 'train')
    val_path = os.path.join(data_path, 'val')
    test_path = os.path.join(data_path, 'test')

    if not os.path.exists(FLAGS.save_path):
        os.makedirs(FLAGS.save_path)

    log_fi = os.path.join(FLAGS.save_path,'output.log')
    lg.basicConfig(filename=log_fi,level=lg.DEBUG,\
    format='%(asctime)s %(message)s')

    # print("Loading train data from %s" % train_path)
    train = RawInput(rn.load_data(train_path, return_entities=True))

    print("Loading val data from %s"%val_path)
    val = RawInput(rn.load_data(val_path, return_entities=True),
                   vocabulary=train.vocab)

    print("Loading test data from %s" % test_path)
    test = RawInput(rn.load_data(test_path, return_entities=True),
                    vocabulary=train.vocab)

    if FLAGS.use_glove:
        embedding = rn.glove_embedding(FLAGS.glove_path,train.vocab)
    else:
        embedding = None

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale)
        print("Loading model..")
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                if FLAGS.use_glove:
                    m = Model(vocab_size=train.vocab_size,
                              choices_idx=train.transformed_labels_idx,
                              pre_embedding=embedding)
                else:
                    m = Model(vocab_size=train.vocab_size,
                              choices_idx=train.transformed_labels_idx)

        with tf.Session() as session:
            saver = tf.train.Saver(tf.all_variables())
            ckpt = tf.train.get_checkpoint_state(FLAGS.save_path)
            if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
                print("Loading parameters from %s" % ckpt.model_checkpoint_path)
                lg.info("Loading parameters from %s" % ckpt.model_checkpoint_path)
                saver.restore(session, ckpt.model_checkpoint_path)
            else:
                print("New session.")
                lg.info("New session.")
                session.run(tf.initialize_all_variables())
            all_st = time.time()
            for i in range(FLAGS.max_epoch):
                train_iter = rn.batch_iter(
                    train.contexts, train.questions,
                    train.choices, train.labels, train.choices_map, train.context_lens,
                    train.qs_lens, batch_size=FLAGS.batch_size, entity_inds=train.entities)
                train_cost, train_acc = run_epoch(
                    session, m, train_iter, train_op=m.train_op, verbose=False,
                    vocab=train.vocab)
                print("Train cost: after " + str(i) + " epoch is " + str(train_cost))
                print("Train acc: after " + str(i) +  " epoch is " + str(train_acc))
                lg.info("Train cost: after " + str(i) + " epoch is " + str(train_cost))
                lg.info("Train acc: after " + str(i) +  "epoch is " + str(train_acc))

                if i % FLAGS.ckpt_steps == 0:
                    checkpoint_path = os.path.join(FLAGS.save_path, "wdw.ckpt")
                    saver.save(session, checkpoint_path, global_step=i)

                val_iter = rn.batch_iter(
                    val.contexts, val.questions,
                    val.choices, val.labels, val.choices_map, val.context_lens,
                    val.qs_lens, batch_size=FLAGS.batch_size, entity_inds=val.entities)
                val_cost, val_acc = run_epoch(
                    session, m, val_iter, train_op=None, verbose=False,
                    vocab=train.vocab, is_testing=True)
                lg.info("Val cost: after " + str(i) + " epoch is " + str(val_cost))
                lg.info("Val acc: after " + str(i) + " epoch is " + str(val_acc))
                print("Val cost: after " + str(i) + " epoch is " + str(val_cost))
                print("Val acc: after " + str(i) + " epoch is " + str(val_acc))

                test_iter = rn.batch_iter(
                    test.contexts, test.questions,
                    test.choices, test.labels, test.choices_map, test.context_lens,
                    test.qs_lens, batch_size=FLAGS.batch_size, entity_inds=test.entities)
                print("Checking on test set.")
                test_cost, test_acc = run_epoch(session, m, test_iter, train_op=None,
                                                verbose=False, vocab=train.vocab,is_testing=True)

                test_str = ("Test Accuracy: %s\n" % test_acc)
                print(test_str)
                lg.info(test_str)
Example #13
def main(_):
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    logger = logging.getLogger()

    # Do what you need to load datasets from FLAGS.data_dir
    dataset = None

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)

    # load data sets
    Q_train, P_train, A_start_train, A_end_train, A_len_train, P_raw_train, A_raw_train, Q_len_train, P_len_train = load_data(
        FLAGS.data_dir, "train")
    Q_dev, P_dev, A_start_dev, A_end_dev, A_len_dev, P_raw_dev, A_raw_dev, Q_len_dev, P_len_dev = load_data(
        FLAGS.data_dir, "val")
    #Q_test, P_test, A_start_test, A_end_test = load_data(FLAGS.data_dir, "test")

    # see some data
    logger.info("Training samples read... %s" % (len(Q_train)))
    logger.info("Dev samples read... %s" % (len(Q_dev)))
    # logger.info("Before Padding: \n Q_train[0]: %s \n P_train[0]: %s \n A_start_train[0]: %s \n A_end_train[0]: %s" % (Q_train[0], P_train[0], A_start_train[0], A_end_train[0]))

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
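    # e.g. with maxlen=4, padding='post': [5, 6] -> [5, 6, PAD_ID, PAD_ID];
    # longer sequences are truncated down to maxlen (from the front, Keras' default truncating='pre')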
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_train = pad_sequences(Q_train,
                            maxlen=QMAXLEN,
                            value=PAD_ID,
                            padding='post')
    P_train = pad_sequences(P_train,
                            maxlen=PMAXLEN,
                            value=PAD_ID,
                            padding='post')
    A_start_train = pad_sequences(A_start_train,
                                  maxlen=PMAXLEN,
                                  value=0,
                                  padding='post')
    A_end_train = pad_sequences(A_end_train,
                                maxlen=PMAXLEN,
                                value=0,
                                padding='post')
    # materialize the zip so it can be indexed and len()'d below (zip is a lazy iterator on Python 3)
    train_data = list(zip(P_train, Q_train, P_len_train, Q_len_train, A_start_train,
                          A_end_train, A_len_train, P_raw_train, A_raw_train))

    # see the effect of padding
    # logger.info("After Padding: \n Q_train[0]: %s \n P_train[0]: %s \n A_start_train[0]: %s \n A_end_train[0]: %s" % (Q_train[0], P_train[0], A_start_train[0], A_end_train[0]))
    # repeat on dev and test set
    Q_dev = pad_sequences(Q_dev, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_dev = pad_sequences(P_dev, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_dev = pad_sequences(A_start_dev,
                                maxlen=PMAXLEN,
                                value=0,
                                padding='post')
    A_end_dev = pad_sequences(A_end_dev,
                              maxlen=PMAXLEN,
                              value=0,
                              padding='post')
    dev_data = list(zip(P_dev, Q_dev, P_len_dev, Q_len_dev, A_start_dev, A_end_dev,
                        A_len_dev, P_raw_dev, A_raw_dev))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    with tf.Graph().as_default():
        with tf.Session() as sess:
            logger.info("Loading embeddings")
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' +
                                 str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']
            logger.info("Embeddings loaded with shape: %s %s" %
                        (pretrained_embeddings.shape))

            qa = QASystem(FLAGS,
                          pretrained_embeddings,
                          vocab_dim=len(vocab.keys()))

            initialize_model(sess, qa, train_dir)

            # a reasonable model should perhaps give decent results (f1 in double digits) even with training on smaller set of train_data
            if FLAGS.tiny_sample:
                sample_pct = FLAGS.tiny_sample_pct  # sample sample_pct % from train and test for local dev
                sam_train = np.random.choice(
                    range(len(train_data)),
                    int(sample_pct / 100 * len(train_data)))
                # no need to sample dev
                sam_dev = range(
                    len(dev_data)
                )  #np.random.choice(range(len(dev_data)), int(FLAGS.dev_tiny_sample_pct/100*len(dev_data)))
                # small sample
                train_data = [train_data[i] for i in sam_train]
                dev_data = [dev_data[i] for i in sam_dev]

            qa.train(sess, train_data, dev_data)
Example #14
from reader import load_data
from reader import get_vocab
from reader import vocab_transform
from reader import batch_iter


contexts, questions, choices, labels, choices_map, context_lens, qs_lens, entities =\
    load_data(data_path="wdw/test", return_entities=True)

# # # 2. Fit vocabulary with questions and context.
# vocab = get_vocab(contexts, questions)
#
# # # 3. Transform context and questions
# contexts = vocab_transform(contexts, vocab)
# questions = vocab_transform(questions, vocab)
#
# # 4. Give to batch_iter
# readers = batch_iter(contexts, questions, choices, labels, choices_map,
#            context_lens, qs_lens)
#
# # for q, c, ch, lab, ch_map, c_lens, q_lens in readers:
# #     print(c.shape)
# #     break
Example #15
def main(_):

    train_path = os.path.join(FLAGS.data_wdw, 'train')
    val_path = os.path.join(FLAGS.data_wdw, 'val')
    test_path = os.path.join(FLAGS.data_wdw, 'test')

    print("Loading train data from %s" % train_path)
    train = RawInput(rn.load_data(train_path))

    print("Loading val data from %s" % val_path)
    val = RawInput(rn.load_data(val_path), vocabulary=train.vocab)
    if len(train.labels_idx) < len(val.labels_idx):
        print("More validation choices than train")

    print("Loading test data from %s" % test_path)
    test = RawInput(rn.load_data(test_path), vocabulary=train.vocab)
    if len(train.labels_idx) < len(test.labels_idx):
        print("More test choices than train")

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale)
        print("Loading model..")
        with tf.name_scope("Train"):
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                m = Model(is_training=True,
                          vocab_size=train.vocab_size,
                          labels_idx=train.labels_idx)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(FLAGS.max_epoch):
                train_iter = rn.batch_iter(
                    train.contexts,
                    train.questions,
                    train.choices,
                    train.labels,
                    train.choices_map,
                    train.context_lens,
                    train.qs_lens,
                    batch_size=FLAGS.batch_size,
                    context_num_steps=FLAGS.context_steps,
                    question_num_steps=FLAGS.question_steps)

                #             lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
                #             m.assign_lr(session, config.learning_rate * lr_decay)

                val_iter = rn.batch_iter(
                    val.contexts,
                    val.questions,
                    val.choices,
                    val.labels,
                    val.choices_map,
                    val.context_lens,
                    val.qs_lens,
                    batch_size=FLAGS.batch_size,
                    context_num_steps=FLAGS.context_steps,
                    question_num_steps=FLAGS.question_steps)

                print("Epoch: %d" % (i + 1))
                run_epoch(session,
                          m,
                          train_iter,
                          eval_op=m.train_op,
                          verbose=True)
                print("Checking on validation set.")
                ave_cost, ave_acc = run_epoch(session,
                                              m,
                                              val_iter,
                                              eval_op=None,
                                              verbose=False)
                print("Avg. Val Accuracy: %s" % ave_acc)
                print("Avg. Vac Cost: %s" % ave_cost)

            test_iter = rn.batch_iter(test.contexts,
                                      test.questions,
                                      test.choices,
                                      test.labels,
                                      test.choices_map,
                                      test.context_lens,
                                      test.qs_lens,
                                      batch_size=FLAGS.batch_size,
                                      context_num_steps=FLAGS.context_steps,
                                      question_num_steps=FLAGS.question_steps)
            print("\nChecking on test set.")
            test_cost, test_acc = run_epoch(session,
                                            m,
                                            test_iter,
                                            eval_op=None,
                                            verbose=False)
            print("\nAvg. Test Accuracy: %s\n" % test_acc)
            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session,
                              FLAGS.save_path,
                              global_step=sv.global_step)
Example #16
from reader import load_data
from reader import get_vocab
from reader import vocab_transform
from reader import batch_iter


contexts, questions, choices, labels, choices_map, context_lens, qs_lens =\
    load_data(data_path="wdw/test")

# # 2. Fit vocabulary with questions and context.
vocab = get_vocab(contexts, questions)

# # 3. Transform context and questions
contexts = vocab_transform(contexts, vocab)
questions = vocab_transform(questions, vocab)

# 4. Give to batch_iter
readers = batch_iter(contexts, questions, choices, labels, choices_map,
                     context_lens, qs_lens)

# for q, c, ch, lab, ch_map, c_lens, q_lens in readers:
#     print(c.shape)
#     break
Example #17
Author: Gabriela Tavares, [email protected]

Adapted from jmetzen.github.io/2015-11-27/vae.html
"""

import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt

np.random.seed(0)
tf.set_random_seed(0)

# Load data.
import reader
dataset = reader.load_data('juri_train.csv', 'juri_test.csv')
n_samples = dataset.train.num_examples


def xavier_init(fan_in, fan_out, constant=1):
    """ Xavier initialization of network weights"""
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
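    # uniform samples in [-limit, limit] with limit = sqrt(6 / (fan_in + fan_out)), the Glorot/Xavier uniform bound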
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval=low,
                             maxval=high,
                             dtype=tf.float32)


class VariationalAutoencoder(object):
Example #18
def main():
    raw_data = rdr.load_data("input/")

    train_data, valid_data, test_data, vocabulary, reversed_dictionary = raw_data

    test(MODEL_PATH, test_data, reversed_dictionary, vocabulary)
def test():
    data_path = "./data/zh"
    print("Data Path: " + data_path)
    train_sents, train_trees, dev_sents, dev_trees, vocab_dict, pos_dict, label_dict = reader.load_data(
        data_path)

    print("Vocab Dict Size %d" % len(vocab_dict))
    print("POS Dict Size %d" % len(pos_dict))
    print("Label Dict Size %d" %
          len(label_dict))  # unique labels size, Nl, not arc label num

    train_dataset = generate_examples(train_sents, train_trees, 1,
                                      label_dict)  # Unknown feature index
    for step, (x, y) in enumerate(train_dataset):
        if (step <= 10):
            print("Step id: %d" % step)
            print(x)
            print(y)
        else:
            break
Example #20
def main(_):

    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)


    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way

    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    _, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    prefix = os.path.join("data", "squad")

    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix)
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))

    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    dev_deposit_path = pjoin(os.path.join("data", "squad"), "dev")
    x_ids_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_ids_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev") # for our purposes this is as test set.
    question_uuid_data = []
    with open(os.path.join("data", "squad") + "/dev.quid") as f:
        for line in f:
            question_uuid_data.append((line))

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model

    with tf.Graph().as_default():
        with tf.Session() as sess:
            embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
            pretrained_embeddings = embeddings['glove']

            qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))

            initialize_model(sess, qa, train_dir)

            # get predicted start-end indices
            a_s = [] # store all start index preds
            a_e = [] # store all end index preds
            a_s_l = []
            a_e_l = []

            f1 = exact_match = total = 0
            answers = {}
            prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
            for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle = False)):
                batch_test =  batch[:4]
                (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                a_s = (np.argmax(ys, axis=1))
                a_e = (np.argmax(ye, axis=1))
                a_s_l = a_s_l + list(a_s)
                a_e_l = a_e_l + list(a_e)

                for j in range(len(a_s)):
                    p_raw = batch[7][j]
                    a_raw = batch[8][j]
                    s = a_s[j]
                    e = a_e[j]
                    pred_raw = ' '.join(p_raw.split()[s:e + 1])
                    f1 += f1_score(pred_raw, a_raw)
                    exact_match += exact_match_score(pred_raw, a_raw)
                    total += 1
                    answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                prog.update(i + 1, [("processed", i + 1)])
            exact_match = 100.0 * exact_match / total
            f1 = 100.0 * f1 / total
            print(("First Answer Entity level F1/EM: %.2f/%.2f", f1, exact_match))

        #answers = generate_answers(question_uuid_data, a_s_l, a_e_l, context_data, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
Example #21
import os

from keras.callbacks import ModelCheckpoint, TensorBoard

import CONFIG
from keras_model import model
from reader import KerasBatchGenerator, load_data, save_json

train_data, valid_data, _total_words, reversed_dictionary, dictionary = load_data(
)

train_data_generator = KerasBatchGenerator(train_data,
                                           CONFIG._num_steps,
                                           CONFIG._batch_size,
                                           _total_words,
                                           skip_step=CONFIG._num_steps)
valid_data_generator = KerasBatchGenerator(valid_data,
                                           CONFIG._num_steps,
                                           CONFIG._batch_size,
                                           _total_words,
                                           skip_step=CONFIG._num_steps)

_model = model(total_words=_total_words,
               hidden_size=CONFIG._hidden_size,
               num_steps=CONFIG._num_steps,
               optimizer='adam')

print(_model.summary())

checkpointer = ModelCheckpoint(filepath=os.path.join(os.getcwd(), 'model',
                                                     'checkpoint',
Example #22
def train(config, evaluator, restore=False):

    data, num_emb = reader.load_data(config)

    train_set, dev_set, test_set = data['train'], data['valid'], data['test']

    if not os.path.exists(config.model_dir):
        os.mkdir(config.model_dir)
    if not os.path.exists(config.log_dir):
        os.mkdir(config.log_dir)
    if not os.path.exists(config.log_train_dir):
        os.mkdir(config.log_train_dir)

    if not restore:
        train_files = glob.glob(config.log_train_dir + '/*')
        for train_file in train_files:
            os.remove(train_file)

    if len(config.gpu_chosen) > 0:
        gpu_options = tf.GPUOptions(
            visible_device_list=",".join(map(str, config.gpu_chosen)),
            per_process_gpu_memory_fraction=config.gup_per_fraction)
    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=config.gup_per_fraction)

    with tf.Graph().as_default(), \
     tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:

        # with tf.variable_scope("model", reuse=None):
        model = config.model_func(config)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        sess.run(init)

        if not config.DEBUG:
            word_embedding = np.loadtxt(config.word_vec_path, dtype=np.float32)
            # with tf.variable_scope("model", reuse=True):
            with tf.variable_scope("Embed", reuse=True):
                embedding = tf.get_variable(
                    "embedding", [config.vocab_size, config.wordvec_size])
                ea = embedding.assign(word_embedding)
                sess.run(ea)

        best_valid_score = 0.0
        best_valid_epoch = 0

        if restore:
            saver.restore(sess, config.model_path)

        with open(config.log_train_acc_path, "w") as train_acc_fp, \
         open(config.log_valid_acc_path, "w") as valid_acc_fp:
            for epoch in range(config.num_epoch):

                start_time = time.time()

                if epoch > config.decay_epoch:
                    learning_rate = sess.run(model.learning_rate)
                    lr_decay = config.lr_decay
                    #learning_rate = config.learning_rate
                    #lr_decay = config.lr_decay**max(epoch-config.decay_epoch, 0.0)
                    sess.run(
                        tf.assign(model.learning_rate,
                                  learning_rate * lr_decay))

                print('=' * 40)
                print(("Epoch %d, Learning rate: %.4f") %
                      (epoch + 1, sess.run(model.learning_rate)))
                avg_loss = evaluator.train(train_set, model, sess)
                print(('\ntrain loss: %.4f') % avg_loss)

                if (epoch + 1) % 5 == 0:
                    train_score = evaluator.evaluate(train_set, model, sess)[0]
                    print(('train top1 acc: %.4f') % train_score)
                    train_acc_fp.write("%d: %.4f\n" % (epoch + 1, train_score))

                valid_score = evaluator.evaluate(dev_set, model, sess)[0]
                print(('valid top1 acc: %.4f') % valid_score)
                valid_acc_fp.write("%d: %.4f\n" % (epoch + 1, valid_score))

                if valid_score > best_valid_score:
                    best_valid_score = valid_score
                    best_valid_epoch = epoch
                    if config.model_save_by_best_valid:
                        saver.save(sess, config.model_path)

                if not config.model_save_by_best_valid and (
                        epoch + 1) % config.model_save_period == 0:
                    saver.save(sess, config.model_path)

                if config.model_save_by_best_valid and epoch - best_valid_epoch > config.early_stop_epoch:
                    break

                print("time per epoch is %.2f min" %
                      ((time.time() - start_time) / 60.0))

        if not config.model_save_by_best_valid:
            saver.save(sess, config.model_path)

        print(("\nbest valid top1 acc: %.4f") % best_valid_score)
        test_score = evaluator.evaluate(test_set, model, sess)[0]
        print(('*' * 10 + 'test top1 acc: %.4f') % test_score)
Example #23
        self.class_ = key.split('/')[-2]

    def __str__(self):
        self.normalize_beans()
        out_str = ','.join([str(i) for i in self.norm_beans])
        out_str += "," + self.class_
        return out_str

    def normalize_beans(self):
        s = sum(self.beans)
        self.norm_beans = [i*1e0/s for i in self.beans]


if __name__ == "__main__":

    files = reader.load_data('../../../1/data')

    descriptors = []
    k = 0
    for filename in files:
        img = cv2.imread(filename)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        sift = cv2.SIFT()
        kp, des = sift.detectAndCompute(gray, None)
        for index in xrange(len(kp)):
            descriptors.append(SiftDescriptor(kp[index], des[index], filename))
        k += 1
        print k

    kmeans_input_des = np.vstack([x.des for x in descriptors])
Example #24
    test_batch_size = 80
    period = [1, 4, 16, 64]
    epoch = 200
    parameter_configs = {
        "learning_rate": 0.001,
        "lstm_hidden_size": 192,
        "temperature": 1,
        "period": [1, 4, 16, 64]
    }

    print('Loading data...')
    #----------------------------data------------------------------------------
    train_filepath = './dataset/train_data.p'
    val_filepath = './dataset/val_data.p'
    test_filepath = './dataset/test_data.p'
    train_data, train_labels = reader.load_data(train_filepath, time_length, 1)
    val_data, val_labels = reader.load_data(val_filepath, time_length, 1)
    test_data, test_labels = reader.load_data(test_filepath, time_length, 1)

    print train_data.shape
    print val_data.shape
    print test_data.shape

    test_nums = test_data.shape[0]
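    # adding 0.0 forces float division under Python 2 so the batch count rounds up correctly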
    test_batch_nums = int(math.ceil(test_nums / (test_batch_size + 0.0)))

    #shuffle
    train_data_tmp = np.zeros(train_data.shape)
    train_labels_tmp = np.zeros(train_labels.shape)
    count = 0
    samples_train_nums = train_data.shape[0]
Example #25
    config = Config()
    model = LSTM_RBM(config)

    # load data
    if config.new_data:
        pitches = reader.data2index('./pitches.pkl')
        config.n_visible = pitches[3]
        inputs_data = pitches[0]
        index_to_data = pitches[1]
        data_to_index = pitches[2]
        reader.save_data('pitches_i2d.pkl', pitches[1])
        reader.save_data('pitches_d2i.pkl', pitches[2])
        reader.save_data('pitches_len.pkl', pitches[3])
        print ('information of new data has been saved.')
    else:
        data_to_index = reader.load_data('./pitches_d2i.pkl')
        index_to_data = reader.load_data('./pitches_i2d.pkl')
        raw_data = reader.load_data('./pitches.pkl')
        inputs_data = reader.convert_to_index(raw_data, data_to_index)
        len_pitches = reader.load_data('./pitches_len.pkl')
        config.n_visible = len_pitches
        print ('information of needed data has been loaded.')

    outputs = []
    with tf.Session() as sess, tf.device('/cpu:0'):
        if config.new_data:
            sess.run(tf.initialize_all_variables())
            print ('check point: initialize variables')
        else:
            model.load_params(sess)
            print ('check point: load_params')
Example #26
def read_train_eval(testid, preprocess, maxseq, modelType, encodeTime, dropout,
                    earlyStop, seedNum, batchSize, maxEpoch, topn):
    '''
    :param testid: identifier for this training run
    :param preprocess: whether sequences are stemmed or not
    :param maxseq: the maximum sequence length
    :param modelType: one of SIMPLE_RNN | LSTM_RNN | GRU_RNN
    :param encodeTime: passed through to load_data (whether to encode time information)
    :param dropout: dropout rate
    :param earlyStop: whether training stops when errors are saturated
    :param seedNum: random seed
    :param batchSize: training batch size
    :param maxEpoch: maximum number of training epochs
    :param topn: whether to restrict sequences to the top-N most frequent word tokens
    :return:
    '''
    N = 1000
    TRAIN_INSTANCE_DIR = os.path.join(
        'log',
        '{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(testid, preprocess, maxseq,
                                            modelType, dropout, earlyStop,
                                            seedNum, batchSize, maxEpoch))

    if not os.path.isdir(TRAIN_INSTANCE_DIR):
        os.mkdir(TRAIN_INSTANCE_DIR)
    log_csvfile = os.path.join(TRAIN_INSTANCE_DIR, 'log.csv')
    result_file = os.path.join(TRAIN_INSTANCE_DIR, 'results.txt')

    print('Load data')
    session_data = load_data(preprocess=preprocess,
                             maxseq=maxseq,
                             encodeTime=encodeTime)
    label_data = load_label()
    topN_words = load_topn_words(session_data, N)

    sequences, labels = filter_labeled_data(session_data, label_data)

    print('Load embedding')
    if preprocess:
        w2v_model = load_embedding(embeddingType=EmbeddingType.PRE_ALL)
    else:
        w2v_model = load_embedding(embeddingType=EmbeddingType.NOPRE_ALL)

    print('Pre-processing sequences')
    print(' - Get word vectors')
    vocab_size, embedding_dim, word_indices, embedding_matrix = \
        get_wordvectors_from_keyedvectors(w2v_model, seed=seedNum)

    print(' - Transform sequences')
    if topn is False:
        transformed_seq = transform_sequence(sequences,
                                             word_indices=word_indices)
    else:
        transformed_seq = transform_sequence_using_topn(
            sequences, word_indices, w2v_model, topN_words)

    print(' - Transform labels')
    transformed_labels = transform_label(label_data)
    print(' - Transform seq data to list')
    X, y = transform_labeled_data_listform(transformed_seq, transformed_labels)

    sss = StratifiedShuffleSplit(n_splits=1,
                                 test_size=0.2,
                                 random_state=seedNum)
    for train_index, test_index in sss.split(X, y):
        pass

    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

    X_train, y_train = random_oversampling(X_train, y_train, seed=seedNum)
    X_test, y_test = random_oversampling(X_test, y_test, seed=seedNum)

    X_train = sequence.pad_sequences(X_train, maxlen=maxseq)
    X_test = sequence.pad_sequences(X_test, maxlen=maxseq)

    list_callbacks = [CSVLogger(log_csvfile, separator=',', append=False)]
    if earlyStop:
        earlyStopping = EarlyStopping(monitor='val_loss',
                                      patience=10,
                                      verbose=1,
                                      mode='auto')
        list_callbacks.append(earlyStopping)

    if modelType is ModelType.GRU_RNN:
        model = GRU_RNN(vocab_size=vocab_size,
                        maxlen=maxseq,
                        dropout=dropout,
                        embedding=embedding_matrix,
                        embedding_dim=embedding_dim)()
        model.fit({'text': X_train},
                  y_train,
                  validation_data=({
                      'text': X_test
                  }, y_test),
                  batch_size=batchSize,
                  epochs=maxEpoch,
                  verbose=1,
                  callbacks=list_callbacks)
        y_pred = model.predict({'text': X_test},
                               batch_size=batchSize,
                               verbose=1)
    elif modelType is ModelType.LSTM_RNN:
        model = LSTM_RNN(vocab_size=vocab_size,
                         maxlen=maxseq,
                         dropout=dropout,
                         embedding=embedding_matrix,
                         embedding_dim=embedding_dim)()
        model.fit({'text': X_train},
                  y_train,
                  validation_data=({
                      'text': X_test
                  }, y_test),
                  batch_size=batchSize,
                  epochs=maxEpoch,
                  verbose=1,
                  callbacks=list_callbacks)
        y_pred = model.predict({'text': X_test},
                               batch_size=batchSize,
                               verbose=1)
    elif modelType is ModelType.SIMPLE_RNN:
        model = SIMPLE_RNN(vocab_size=vocab_size,
                           maxlen=maxseq,
                           dropout=dropout,
                           embedding=embedding_matrix,
                           embedding_dim=embedding_dim)()
        model.fit({'text': X_train},
                  y_train,
                  validation_data=({
                      'text': X_test
                  }, y_test),
                  batch_size=batchSize,
                  epochs=maxEpoch,
                  verbose=1,
                  callbacks=list_callbacks)
        y_pred = model.predict({'text': X_test},
                               batch_size=batchSize,
                               verbose=1)
    else:
        print('This function should be set for XXX_single modeltype.')
        exit()

    print('Evaluation..')
    with open(result_file, 'wt') as f:
        writer.eval(y_pred, y_test, file=f)
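# A minimal invocation sketch (the argument values below are illustrative assumptions, not values taken from the project):
# read_train_eval(testid='run01', preprocess=True, maxseq=100, modelType=ModelType.LSTM_RNN,
#                 encodeTime=True, dropout=0.2, earlyStop=True, seedNum=42,
#                 batchSize=32, maxEpoch=50, topn=False)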
Example #27
import collections
import reader
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import GRU, RepeatVector, TimeDistributed, Dense
from keras.models import Model, Sequential
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

eng_sen = reader.load_data('data/small_vocab_en')
fre_sen = reader.load_data('data/small_vocab_fr')


def tokenize(x):
    x_tk = Tokenizer(char_level=False)
    x_tk.fit_on_texts(x)
    return x_tk.texts_to_sequences(x), x_tk


def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding="post")


def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)
    preprocess_x = pad(preprocess_x)
Example #28
# def main(_):
# if (os.path.exists(FLAGS.save_path)):
#     shutil.rmtree(FLAGS.save_path)
# os.makedirs(FLAGS.save_path)
# t_log = open(os.path.join(FLAGS.save_path, 'train.txt'),'w')
# v_log = open(os.path.join(FLAGS.save_path, 'val.txt'),'w')
# te_log = open(os.path.join(FLAGS.save_path, 'test.txt'),'w')

data_path = "/home/manoj/oogie-boogie/wdw"
train_path = os.path.join(data_path, 'test')
val_path = os.path.join(data_path, 'test')
test_path = os.path.join(data_path, 'test')

config = Config()
print("Loading train data from %s" % train_path)
train = RawInput(rn.load_data(train_path))

# print("Loading val data from %s"%val_path)
# val = RawInput(rn.load_data(val_path),vocabulary=train.vocab,c_len=train.c_len,\
#         q_len=train.q_len)
# if len(train.labels_idx) < len(val.labels_idx):
#     print("More validation choices than train")
#
# print("Loading test data from %s"%test_path)
# test = RawInput(rn.load_data(test_path),vocabulary=train.vocab,c_len=train.c_len,\
#         q_len=train.q_len)
# if len(train.labels_idx) < len(test.labels_idx):
#     print("More test choices than train")

with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale,