Пример #1
0
def do_eval(test_data_path, shuffle=False):
    """Evaluate a saved ESIM model on a labelled CSV and print loss and accuracy.

    Args:
        test_data_path: path handed to ``get_input_from_csv``; it must yield
            two question lists and a label list.
        shuffle: when true, the three lists are passed through
            ``shuffle_data`` before evaluation.

    Raises:
        ValueError: if ``FLAGS.load_model`` is unset or empty.
    """
    # Validate once, up front: an empty --load_model used to slip past the
    # `is None` check and only fail after the data, the spaCy model and the
    # GloVe matrix had already been loaded.
    if not FLAGS.load_model:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Load Testing Data
    question_1, question_2, labels = get_input_from_csv(test_data_path)

    if shuffle:
        question_1, question_2, labels = shuffle_data(question_1, question_2, labels)

    # Load Pre-trained Model
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for Glove
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)
    model = esim.build_model(FLAGS.load_model)

    # Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    q1_test, q2_test = convert_questions_to_word_ids(question_1, question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    labels = to_categorical(np.asarray(labels, dtype='int32'))

    # scores[0] is reported as the loss and scores[1] as the accuracy below.
    scores = model.evaluate([q1_test, q2_test], labels, batch_size=FLAGS.batch_size, verbose=1)

    print("=================== RESULTS =====================")
    print("[*] LOSS OF TEST DATA: %.4f" % scores[0])
    print("[*] ACCURACY OF TEST DATA: %.4f" % scores[1])
Пример #2
0
def decode():
    """Interactively decode sentences typed on stdin, one per line.

    Loads the GloVe embeddings and the vocabulary, builds the model with a
    batch size of 1, then loops: prompt with "> ", read a line, run one
    forward step and print the greedy (argmax) decoding. The loop ends when
    stdin returns an empty string (EOF).
    """
    if FLAGS.embed_path:
        embed_path = FLAGS.embed_path
    else:
        embed_path = pjoin(
            "data", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = utils.load_glove_embeddings(embed_path)

    with tf.Session() as sess:
        # The vocabulary is loaded before the model is created because
        # FLAGS.vocab_size is set from it.
        vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
        en_vocab, rev_fr_vocab = preprocess_data.initialize_vocabulary(
            vocab_path)
        vocab_len = len(en_vocab)
        FLAGS.vocab_size = vocab_len
        print("embeddings.shape[0]: " + str(embeddings.shape[0]))
        print("len(en_vocab):" + str(vocab_len))
        assert embeddings.shape[0] == vocab_len

        model = create_model(sess, embeddings, True)
        model.batch_size = 1  # one sentence is decoded at a time

        while True:
            # Prompt, then block on the next line of input.
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            if not sentence:
                break

            token_ids = preprocess_data.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)

            # Use the first bucket whose first size holds the sentence; if
            # none does, fall back to the last bucket and warn.
            n_tokens = len(token_ids)
            candidates = [
                idx for idx, bucket in enumerate(_buckets)
                if bucket[0] >= n_tokens
            ]
            if candidates:
                bucket_id = candidates[0]
            else:
                bucket_id = len(_buckets) - 1
                logging.warning("Sentence truncated: %s", sentence)

            # Build a 1-element batch and run a single forward-only step.
            batch = {bucket_id: [(token_ids, [])]}
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                batch, bucket_id)
            step_result = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
            output_logits = step_result[2]

            # Greedy decoding: take the argmax of every output position.
            outputs = []
            for logit in output_logits:
                outputs.append(int(np.argmax(logit, axis=1)))

            # Drop everything from the first EOS symbol onwards.
            eos = preprocess_data.EOS_ID
            if eos in outputs:
                outputs = outputs[:outputs.index(eos)]

            words = [tf.compat.as_str(rev_fr_vocab[tok]) for tok in outputs]
            print(" ".join(words))
Пример #3
0
def train(train_data, val_data, batch_size, n_epochs, save_dir=None):
    """Train an ESIM model on a CSV of question pairs, checkpointing the best weights.

    Args:
        train_data: path handed to ``get_input_from_csv``; it must yield two
            question lists and a label list.
        val_data: currently unused. Validation is done on a 20% split of the
            training data (``validation_split=0.2``). The parameter is kept
            so existing callers keep working.
        batch_size: batch size forwarded to ``model.fit``.
        n_epochs: number of epochs forwarded to ``model.fit``.
        save_dir: directory for checkpoint files; defaults to ``'checkpoints'``.
            It is created if it does not exist.
    """
    # Stage 1: Read training data (csv) && Preprocessing them
    tf.logging.info('Loading training and validataion data ...')
    train_question_1, train_question_2, train_labels = get_input_from_csv(train_data)

    # Stage 2: Load Pre-trained embedding matrix (Using GLOVE here)
    tf.logging.info('Loading pre-trained embedding matrix ...')
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for Glove
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    # Stage 3: Build Model
    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)

    # Resume from saved weights when --load_model is given, else start fresh.
    if FLAGS.load_model:
        model = esim.build_model(FLAGS.load_model)
    else:
        model = esim.build_model()

    # Stage 4: Convert the "raw data" to word-ids format && convert "labels" to one-hot vectors
    tf.logging.info('Converting questions into ids ...')
    q1_train, q2_train = convert_questions_to_word_ids(train_question_1, train_question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)
    train_labels = to_categorical(np.asarray(train_labels, dtype='int32'))

    # Stage 5: Training
    tf.logging.info('Start training ...')

    callbacks = []
    save_dir = save_dir if save_dir is not None else 'checkpoints'
    # Create the checkpoint directory up front; otherwise the first save can
    # fail only after a whole epoch has been trained.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    filepath = os.path.join(save_dir, "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    # Only keep weights that improve validation accuracy.
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks.append(checkpoint)

    if FLAGS.tensorboard:
        graph_dir = os.path.join('.', 'GRAPHs')
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)
        tb = TensorBoard(log_dir=graph_dir, histogram_freq=0, write_graph=True, write_images=True)
        callbacks.append(tb)

    model.fit(
        x=[q1_train, q2_train],
        y=train_labels,
        batch_size=batch_size,
        epochs=n_epochs,
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True,
        verbose=FLAGS.verbose
    )
Пример #4
0
    def forward(self, char_encoded, C_lengths, raw_sentences):
        """
        Build word representations for a batch and run them through wordRNN.

        ELMo, GloVe and one-hot embeddings are loaded from ``raw_sentences``,
        character embeddings come from ``self.charRNN``; the four are
        concatenated along dim 2, passed through dropout and then through
        ``self.wordRNN``.

        :param char_encoded: first argument forwarded to ``self.charRNN``
        :param C_lengths: second argument forwarded to ``self.charRNN``
        :param raw_sentences: batch of sentences; its length is the batch size
        :return: the first element of the ``self.wordRNN`` output
        """
        n_sentences = len(raw_sentences)
        target = self.device

        elmo = load_elmo_embeddings(raw_sentences).to(target)
        glove = load_glove_embeddings(raw_sentences).to(target)
        chars = self.charRNN(char_encoded, C_lengths).to(target)
        one_hot = load_onehot_embeddings(raw_sentences).to(target)

        # charRNN output is 2-D (one row per word); regroup the rows per
        # sentence so it can be concatenated with the other embeddings.
        total_words, char_dim = chars.size()
        words_per_sentence = total_words // n_sentences
        chars = chars.view(n_sentences, words_per_sentence, char_dim)

        combined = torch.cat([elmo, glove, chars, one_hot], dim=2)

        # Dropout is applied before the word-level RNN.
        combined = self.dropout(combined)

        encoded, _ = self.wordRNN(combined)
        return encoded
Пример #5
0
def do_pred(test_data_path):
    """Run a saved ESIM model on unlabelled question pairs and print each prediction.

    Args:
        test_data_path: path handed to ``get_test_from_csv``; it must yield
            two question lists.

    Raises:
        ValueError: if ``FLAGS.load_model`` is unset or empty.
    """
    # Validate once, up front: an empty --load_model used to slip past the
    # `is None` check and only fail after the data, the spaCy model and the
    # GloVe matrix had already been loaded.
    if not FLAGS.load_model:
        raise ValueError("You need to specify the model location by --load_model=[location]")

    # Load Testing Data
    question_1, question_2 = get_test_from_csv(test_data_path)

    # Load Pre-trained Model
    if FLAGS.best_glove:
        import en_core_web_md
        nlp = en_core_web_md.load()  # load best-matching version for Glove
    else:
        nlp = spacy.load('en')
    embedding_matrix = load_glove_embeddings(nlp.vocab, n_unknown=FLAGS.num_unknown)  # shape=(1071074, 300)

    tf.logging.info('Build model ...')
    esim = ESIM(embedding_matrix, FLAGS.max_length, FLAGS.num_hidden, FLAGS.num_classes, FLAGS.keep_prob, FLAGS.learning_rate)
    model = esim.build_model(FLAGS.load_model)

    # Convert the "raw data" to word-ids format (there are no labels here)
    q1_test, q2_test = convert_questions_to_word_ids(question_1, question_2, nlp, max_length=FLAGS.max_length, tree_truncate=FLAGS.tree_truncate)

    predictions = model.predict([q1_test, q2_test])
    print("[*] Predictions Results: \n", predictions[0])

    for i in range(len(q1_test)):
        print("=============== %d Prediction ===============" % i)
        print("Q1: %s" % question_1[i])
        print("Q2: %s" % question_2[i])

        # Index 1 is reported as "duplicate", anything else as "not duplicate";
        # the printed score is the value at the reported index.
        if np.argmax(predictions[i]) == 1:
            print("IS_DUPLICATE: YES  score: %.6f" % predictions[i][1])
        else:
            print("IS_DUPLICATE: NO   score: %.6f" % predictions[i][0])
Пример #6
0
    def loadwordmodel(self, wordembfile, destfile, wordembsize, log, device):
        """Populate ``self.word_embs`` with one embedding per entry of ``self.idx2word``.

        If ``destfile`` exists it is loaded as a cache. Otherwise the
        pre-trained vectors in ``wordembfile`` are read (fasttext or glove,
        chosen by substring of the file name), one row is built per entry of
        ``self.idx2word`` (zeros when the entry has no vector), row 0 is
        zeroed for <PAD>, and the result is saved to ``destfile``.

        :param wordembfile: path to the pre-trained embedding file
        :param destfile: path of the cached tensor to load or create
        :param wordembsize: size of the zero vector used for missing words
        :param log: logger-like object providing ``info`` and ``error``
        :param device: CUDA device index; negative values keep the tensor
            where it was created or loaded
        :raises SystemExit: with status 1 when ``wordembfile`` is missing or
            its name matches neither 'fasttext' nor 'glove'
        """
        if not os.path.exists(destfile):
            log.info('loading pre-trained word embeddings from ' +
                     wordembfile + '... (takes several minutes)')
            if not os.path.exists(wordembfile):
                log.error('word embedding model ' + wordembfile +
                          ' cannot be found!')
                # Exit non-zero so callers and shells see the failure.
                sys.exit(1)
            if 'fasttext' in wordembfile:
                from gensim.models import FastText
                wordvectors = FastText.load_fasttext_format(wordembfile)
            elif 'glove' in wordembfile:
                wordvectors = utils.load_glove_embeddings(wordembfile)
            else:
                # The file exists but its format cannot be inferred from its name.
                log.error('word embedding model ' + wordembfile +
                          ' has an unsupported format (expected fasttext or glove)!')
                sys.exit(1)

            word_embs = []
            missing = 0
            for word in self.idx2word:
                try:
                    word_embs.append(
                        torch.from_numpy(wordvectors[word]).float())
                except KeyError:
                    # No pre-trained vector: fall back to an all-zero embedding.
                    missing += 1
                    word_embs.append(torch.zeros(wordembsize))

            log.info('number of words without a pretrained word embedding: ' +
                     str(missing) + '/' + str(len(self.idx2word)))

            self.word_embs = torch.stack(word_embs)
            self.word_embs[0].fill_(0)  # fill embedding for <PAD> with 0s
            torch.save(self.word_embs, destfile)
        else:
            # The cache is what is read here, so name the file actually loaded.
            log.info('loading cached word embeddings from ' +
                     destfile + '...')
            self.word_embs = torch.load(destfile)
            log.info('loaded pre-trained word vectors successfully!')

        if device >= 0:
            self.word_embs = self.word_embs.to('cuda:' + str(device))
Пример #7
0
# Script fragment: trains and evaluates an estimator initialised with GloVe
# embeddings, then prepares a precision/recall summary of its predictions.

# Map word -> index, plus the inverse map index -> word.
word_index = imdb.get_word_index(os.path.join(project_path, 'data/imdb_word_index.json'))
word_inverted_index = {v: k for k, v in word_index.items()}
# The first indexes in the map are reserved to represent things other than tokens
index_offset = 3
word_inverted_index[-1 - index_offset] = '_' # Padding at the end
word_inverted_index[ 1 - index_offset] = '>' # Start of the sentence
word_inverted_index[ 2 - index_offset] = '?' # OOV
word_inverted_index[ 3 - index_offset] = ''  # Un-used





# Per-example sequence lengths, capped at sentence_size.
x_len_train = np.array([min(len(x), sentence_size) for x in x_train_variable])
x_len_test = np.array([min(len(x), sentence_size) for x in x_test_variable])
# The embedding matrix is handed to the model function through `params`.
embedding_matrix = load_glove_embeddings('data/glove.6B.50d.txt', word_index, vocab_size, embedding_size)
params = {'embedding_initializer': embedding_matrix}
# NOTE(review): the model directory is named 'cnn_pretrained' although the
# model function is lstm_model_fn - confirm this is intentional.
lstm_classifier = tf.estimator.Estimator(model_fn=lstm_model_fn,
                                                   model_dir=os.path.join(model_dir, 'cnn_pretrained'),
                                                   params=params)
# Save a reference to the classifier to run predictions later
# train_input_fn / eval_input_fn are called here and their return value is
# passed as input_fn, so they are presumably factories returning a callable.
lstm_classifier.train(input_fn=train_input_fn(x_train, x_len_train, y_train, x_train_variable), steps=500)
eval_results = lstm_classifier.evaluate(input_fn=eval_input_fn(x_test, x_len_test, y_test))

# Take the first 'logistic' value of every prediction on the test set.
predictions = np.array([p['logistic'][0] for p in lstm_classifier.predict(input_fn=eval_input_fn(x_test, x_len_test, y_test))])

# Start from a clean default graph before building the PR-curve summary op.
tf.reset_default_graph()
pr = summary_lib.pr_curve('precision_recall', predictions=predictions, labels=y_test.astype(bool),
                          num_thresholds=21)
with tf.Session() as sess:
    # Summaries go to an 'eval' sub-directory of the estimator's model dir.
    writer = tf.summary.FileWriter(os.path.join(lstm_classifier.model_dir, 'eval'), sess.graph)
Пример #8
0
def train():
    """Train a en->fr translation model using WMT data.

    Loads the GloVe embeddings and the vocabulary, builds the model on
    '/gpu:1' (soft placement allowed) and then loops over random batches.
    Every ``FLAGS.steps_per_print`` steps the running perplexity is printed.
    Every ``FLAGS.steps_per_checkpoint`` steps the statistics are printed and
    appended to 'data/log.txt', the learning rate is decayed if the loss is
    worse than each of the last three checkpoints, a checkpoint is saved and
    every non-empty development bucket is evaluated. Training stops once a
    checkpoint perplexity below 2 has been seen more than 20 times.

    The log file is opened in a ``with`` block so it is closed even if
    training is interrupted or raises.
    """
    data_config = DataConfig(FLAGS.data_dir)
    embed_path = FLAGS.embed_path or pjoin(
        "data", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = utils.load_glove_embeddings(embed_path)

    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, _ = preprocess_data.initialize_vocabulary(vocab_path)
    FLAGS.vocab_size = len(vocab)
    print(embeddings.shape[0], len(vocab))
    assert embeddings.shape[0] == len(vocab)

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=True)
    with open('data/log.txt', 'w') as logFile, \
            tf.Session(config=session_config) as sess:
        # Create model.
        with tf.device('/gpu:1'):
            print("Creating %d layers of %d units." %
                  (FLAGS.num_layers, FLAGS.size))
            model = create_model(sess, embeddings, False)

            # Count trainable parameters (and time how long that takes).
            tic = time.time()
            params = tf.trainable_variables()
            num_params = sum(
                np.prod(tf.shape(t.value()).eval()) for t in params)
            toc = time.time()
            logging.info("Number of params: %d (retreival took %f secs)" %
                         (num_params, toc - tic))

            # Read data into buckets and compute their sizes.
            print("Reading development and training data (limit: %d)." %
                  FLAGS.max_train_data_size)
            dev_set = read_data(data_config.val_from, data_config.val_to)
            train_set = read_data(data_config.train_from, data_config.train_to,
                                  FLAGS.max_train_data_size)
            train_bucket_sizes = [
                len(train_set[b]) for b in range(len(_buckets))
            ]
            train_total_size = float(sum(train_bucket_sizes))

            # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
            # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
            # the size of the i-th training bucket, as used later.
            train_buckets_scale = [
                sum(train_bucket_sizes[:i + 1]) / train_total_size
                for i in range(len(train_bucket_sizes))
            ]

            # This is the training loop.
            step_time, loss = 0.0, 0.0
            current_step = 0
            previous_losses = []
            breakCount = 0

            while True:
                # Choose a bucket according to data distribution. We pick a random number
                # in [0, 1] and use the corresponding interval in train_buckets_scale.
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])

                # Get a batch and make a step.
                start_time = time.time()
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    train_set, bucket_id)
                _, step_loss, _ = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, False)
                # step_time and loss are running averages over one
                # checkpoint interval; both are reset after each checkpoint.
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += step_loss / FLAGS.steps_per_checkpoint
                current_step += 1

                if current_step % FLAGS.steps_per_print == 0:
                    # exp() is skipped for large losses and reported as inf.
                    perplexity = math.exp(
                        float(loss)) if loss < 300 else float("inf")
                    print(
                        "global step %d learning rate %.4f step_loss %.2f perplexity "
                        "%.2f" %
                        (model.global_step.eval(), model.learning_rate.eval(),
                         step_loss, perplexity))

                # Once in a while, we save checkpoint, print statistics, and run evals.
                if current_step % FLAGS.steps_per_checkpoint == 0:
                    # Print statistics for the previous epoch.
                    perplexity = math.exp(
                        float(loss)) if loss < 300 else float("inf")
                    print("checkpoint here")
                    # Build the line once so console and log file always agree.
                    checkpoint_msg = (
                        "====== global step %d learning rate %.4f step-time %.2f perplexity "
                        "%.2f" %
                        (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
                    print(checkpoint_msg)
                    logFile.write(checkpoint_msg)
                    logFile.write("\n")
                    # Flush so the log can be followed while training runs.
                    logFile.flush()
                    # Decrease learning rate if no improvement was seen over last 3 times.
                    if len(previous_losses) > 2 and loss > max(
                            previous_losses[-3:]):
                        sess.run(model.learning_rate_decay_op)
                    previous_losses.append(loss)
                    # Save checkpoint and zero timer and loss.
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   "translate.ckpt")
                    model.saver.save(sess,
                                     checkpoint_path,
                                     global_step=model.global_step)
                    if perplexity < 2:
                        breakCount += 1
                        print("breakCount ", breakCount)
                    step_time, loss = 0.0, 0.0
                    # Run evals on development set and print their perplexity.
                    for dev_bucket_id in range(len(_buckets)):
                        if len(dev_set[dev_bucket_id]) == 0:
                            print("  eval: empty bucket %d" % (dev_bucket_id))
                            continue
                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                            dev_set, dev_bucket_id)
                        _, eval_loss, _ = model.step(sess, encoder_inputs,
                                                     decoder_inputs,
                                                     target_weights,
                                                     dev_bucket_id, True)
                        eval_ppx = math.exp(float(
                            eval_loss)) if eval_loss < 300 else float("inf")
                        print("  eval: bucket %d perplexity %.2f" %
                              (dev_bucket_id, eval_ppx))
                    sys.stdout.flush()
                    # Stop once enough low-perplexity checkpoints were seen;
                    # the log file is closed when the `with` block exits.
                    if breakCount > 20:
                        print("successfully breakdown")
                        break