Пример #1
0
    def run_epoch(self, epoch, frameTrain, frameVal):
        """
        Run 1 epoch. Train on training examples, evaluate on validation set.

        Validation only runs when ``epoch >= 11``; for earlier epochs the
        validation MSE is reported as -1 and the validation loss list is empty.

        :param epoch: epoch number, only used to decide whether to validate
        :param frameTrain: training examples; ``shape[0]`` is used as their count
        :param frameVal: validation examples; ``shape[0]`` is used as their count
        :return: ``(total_train_mse, train_losses, total_val_mse, val_losses)``;
            each total is the sum of the per-batch losses divided by the number
            of examples, each list holds the per-batch losses
        """
        batch_size = self.options["batch_size"]

        train_losses = []
        numTrain = frameTrain.shape[0]
        prog = Progbar(target=1 + int(numTrain / batch_size))
        for i, frameBatch in enumerate(get_minibatches(frameTrain, batch_size)):
            batch = loadData(frameBatch, **(self.options))
            loss, lr, gs = self.optimize(*batch)
            train_losses.append(loss)
            if (self.global_step % self.options["print_every"]) == 0:
                # The batch size must come from self.options; there is no
                # bare ``options`` name in scope here.
                logging.info("Iteration {0}: with minibatch training l2_loss = {1:.3g} and mse of {2:.2g}"\
                      .format(self.global_step, loss, loss / batch_size))
            prog.update(i + 1, [("train loss", loss)], [("learning rate", lr),
                                                        ("global step", gs)])
        total_train_mse = np.sum(train_losses) / numTrain

        val_losses = []
        if epoch >= 11:
            numVal = frameVal.shape[0]
            prog = Progbar(target=1 + int(numVal / batch_size))
            for i, frameBatch in enumerate(get_minibatches(frameVal, batch_size)):
                batch = loadData(frameBatch, **(self.options))
                loss = self.validate(*batch)
                val_losses.append(loss)
                prog.update(i + 1, [("validation loss", loss)])
            total_val_mse = np.sum(val_losses) / numVal
        else:
            # Sentinel meaning "validation was skipped for this epoch"
            total_val_mse = -1

        return total_train_mse, train_losses, total_val_mse, val_losses
    def validate(self, sess, valid_dataset_address):
        """
        Sum the validation cost over the validation dataset.

        Only meant for unseen examples, i.e. when you want to check the model.
        Every minibatch is handed to self.test(), which computes the cost for
        that minibatch; the costs are added up and returned.

        In debug mode (self.FLAGS.debug) a single minibatch obtained from
        get_sample() is evaluated; otherwise the minibatches come from
        get_minibatches().

        :param sess: forwarded unchanged to self.test()
        :param valid_dataset_address: forwarded to get_sample() / get_minibatches()
        :return: sum of the values returned by self.test() (0. if no batches)
        """
        flags = self.FLAGS
        if flags.debug:
            sample, _ = get_sample(valid_dataset_address,
                                   flags.context_paragraph_max_length,
                                   flags.batch_size)
            # get_sample yields one minibatch; wrap it so the loop below can
            # treat both cases as a sequence of minibatches
            batches = [sample]
        else:
            batches = get_minibatches(valid_dataset_address,
                                      flags.context_paragraph_max_length,
                                      flags.batch_size)

        total_cost = 0.
        for questions, contexts, answer_starts, answer_ends in batches:
            total_cost += self.test(sess, questions, contexts,
                                    answer_starts, answer_ends)
        return total_cost
Пример #3
0
    def run_epoch(self, sess, train_writer, inputs, labels, epochNum):
        """Train for one epoch and write the summaries to TensorBoard.

        Args:
            sess: tf.Session() object
            train_writer: writer object that receives the TensorBoard summaries
            inputs: np.ndarray of shape (n_samples, n_features)
            labels: np.ndarray of shape (n_samples,)
            epochNum: the number (0-indexed) of the epoch we should run
        Returns:
            Sum of the minibatch losses divided by the expected number of
            minibatches for this epoch.
        """
        # One op that evaluates every registered TensorBoard summary node
        summary_op = tf.summary.merge_all()

        n_samples = inputs.shape[0]
        batch_size = self.config.batch_size
        # Number of minibatches, counting a trailing partial batch if any
        n_batches = int(n_samples / batch_size)
        if n_samples % batch_size > 0:
            n_batches += 1

        # Global step of the first minibatch of this epoch
        first_step = epochNum * n_batches

        batches = get_minibatches([inputs, labels], batch_size, shuffle=False)
        total_loss = 0
        for offset, (batch_inputs, batch_labels) in enumerate(batches):
            batch_loss, batch_summary = self.train_on_batch(
                sess, summary_op, batch_inputs, batch_labels)
            total_loss += batch_loss
            train_writer.add_summary(batch_summary, first_step + offset)

        return 1.0 * total_loss / n_batches
Пример #4
0
def test(data_matrix, data_labels, saved_model_path, batch_size=1000):
    """Evaluate a saved model and report loss, confusion matrix and accuracy.

    The graph is rebuilt with build_model(), the weights are restored from
    ``saved_model_path`` and the data is processed minibatch by minibatch.
    The predicted and the true class of every row are taken as the index of
    the largest entry of that row.

    Args:
        data_matrix: inputs, passed to build_model() and util.get_minibatches()
        data_labels: labels, passed to build_model() and util.get_minibatches()
        saved_model_path: checkpoint path handed to ``saver.restore``
        batch_size: minibatch size handed to util.get_minibatches()
    """
    tf.reset_default_graph()
    pred, input_placeholder, labels_placeholder, _, loss_op = build_model(
        data_matrix, data_labels)
    saver = tf.train.Saver()
    loss_list = []
    label_list = []
    pred_list = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, saved_model_path)
        print("Model restored.")

        minibatches = util.get_minibatches(data_matrix, data_labels,
                                           batch_size)
        for tup in minibatches:
            pred_temp, loss, labels_temp = sess.run(
                [pred, loss_op, labels_placeholder],
                feed_dict={
                    input_placeholder: tup[0],
                    labels_placeholder: tup[1]
                })
            # argmax returns the first index of the row maximum, which is what
            # np.where(row == max(row))[0][0] computed row by row
            pred_list.extend(np.argmax(pred_temp, axis=1))
            label_list.extend(np.argmax(labels_temp, axis=1))

            loss_list.append(loss)
        # Single-argument print call: same output under Python 2 and 3
        print("Loss: " + str(np.mean(loss_list)) + "\n")

    util.outputConfusionMatrix(pred_list, label_list,
                               "confusion_matrix_baseline_test")
    util.get_accuracy(pred_list, label_list)
Пример #5
0
    def run_epoch(self, session, inputs):
        """Runs an epoch of training.
        Args:
            session: forwarded unchanged to self.optimize()
            inputs: datasets represented as a dictionary with the keys
                'Questions', 'Paragraphs', 'Labels', 'Questions_masks' and
                'Paragraphs_masks'
        Returns:
            losses: list with one ``[start_index_loss, end_index_loss]`` pair
                per minibatch, in processing order.
        """
        losses = []

        columns = [inputs['Questions'], inputs['Paragraphs'], inputs['Labels'],
                   inputs['Questions_masks'], inputs['Paragraphs_masks']]
        for [question_batch, context_batch, labels_batch, q_mask_batch, p_mask_batch] in \
                get_minibatches(columns, self.config.batch_size):
            start_index_loss, end_index_loss = self.optimize(
                session, question_batch, context_batch, labels_batch, q_mask_batch, p_mask_batch)

            # Record both losses; the start loss used to be stored twice and
            # the end loss was dropped.
            losses.append([start_index_loss, end_index_loss])

        # np.mean over an empty list is a scalar nan that cannot be indexed,
        # so only log when at least one minibatch was processed.
        if losses:
            mean = np.mean(losses, axis=0)
            logging.info(
                "Logged mean epoch losses: start index : %f end index : %f ",
                mean[0], mean[1])

        return losses
Пример #6
0
    def evaluate(self, session, dataset):
        """Run the model over a dataset with the train phase switched off.

        :param session: object whose ``run(fetches, feed)`` evaluates the graph
        :param dataset: pair ``(test, loader)``; ``test`` is split into
            minibatches whose items are indexed as ``b[0]`` (fed to the image
            placeholder) and ``b[1]`` (fed to the ground-truth placeholder),
            each converted with ``loader`` first
        :return: ``(total_loss, metrics)`` where ``total_loss`` is the sum of
            ``loss * len(batch)`` over all minibatches and ``metrics`` is a list
            of ``(b[0], psnr, ssim)`` tuples, one per example
        """
        input_feed = {self.train_phase: False}
        output_feed = [self.loss, self.psnr, self.out]

        test, loader = dataset

        total_loss = 0.
        metrics = []

        # Floor division keeps the target an integer under true division too
        # (identical to the previous "/" for ints under Python 2).
        prog = Progbar(target=(len(test) - 1) // self.flags.batch_size + 1)
        for i, batch in enumerate(
                get_minibatches(test, self.flags.batch_size, shuffle=False)):
            input_feed[self.im_placeholder] = [loader(b[0]) for b in batch]
            input_feed[self.gt_placeholder] = [loader(b[1]) for b in batch]

            loss, psnr, out = session.run(output_feed, input_feed)
            # Weight the batch loss by the batch size before accumulating
            total_loss += loss * len(batch)
            # SSIM is computed between (input - model output) and ground truth
            all_ssim = [
                ssim(im - resid, gt, multichannel=True)
                for resid, im, gt in zip(out, input_feed[self.im_placeholder],
                                         input_feed[self.gt_placeholder])
            ]
            metrics.extend(zip([b[0] for b in batch], psnr, all_ssim))
            prog.update(i + 1, exact=[("total loss", total_loss)])

        return total_loss, metrics
Пример #7
0
 def get_mini(self, dataset_trainOrVal, batch_size, shuffle, span=True):
     """Build minibatches from the context/question columns of a dataset.

     The columns are looked up in ``dataset_trainOrVal`` by key and passed,
     in a fixed order, to util.get_minibatches(). The '.span' column is
     appended last when ``span`` is truthy.
     """
     column_keys = [
         '.ids.context',
         '.ids.question',
         '.ids.context.mask',
         '.ids.question.mask',
         '.ids.context.lens',
         '.ids.question.lens',
     ]
     columns = [dataset_trainOrVal[key] for key in column_keys]
     if span:
         columns.append(dataset_trainOrVal['.span'])
     return util.get_minibatches(columns, batch_size, shuffle)
Пример #8
0
    def run_epoch(self, sess, train, label):
        """
        Train on every minibatch of (train, label) once, updating a progress bar.

        :param sess: forwarded unchanged to self.train_on_batch()
        :param train: training inputs; ``len(train)`` sizes the progress bar
        :param label: labels, batched together with ``train``
        :return: None
        """
        prog = Progbar(target=1 + int(len(train) / self.config.batch_size))
        batches = get_minibatches([train, label], self.config.batch_size)
        # enumerate supplies the batch index the progress bar needs; without
        # it ``i`` was undefined and the first update raised NameError.
        for i, (inputs_minibatch, labels_minibatch) in enumerate(batches):
            # train_on_batch returns (loss, grad_norm); only the loss is shown
            loss, _ = self.train_on_batch(sess, inputs_minibatch,
                                          labels_minibatch)
            prog.update(i + 1, [("train loss", loss)])
Пример #9
0
    def evaluate_answer(self,
                        session,
                        dataset,
                        sample=100,
                        log=False,
                        ds_type="dev"):
        """
        Score the model's answers against the true answer spans.

        Decoding is slow, so only the first shuffled minibatch of ``sample``
        examples drawn from ``dataset`` is scored.

        :param session: session should always be centrally managed in train.py
        :param dataset: data handed to get_minibatches(); the sampled batch
                        must unpack into (questions, contexts, spans)
        :param sample: how many examples in dataset we look at
        :param log: whether the final scores are written to the logger
        :param ds_type: label for the dataset used in the final log line
        :return: (f1, em) -- the F1 and exact-match scores summed over the
                 sampled examples (they are not divided by the sample count)
        """
        batch_iter = get_minibatches(dataset, sample, shuffle=True)
        questions, contexts, spans = next(batch_iter)

        started = time.time()
        preds = self.decode(session, [questions, contexts])
        pred_spans = self._preds_to_spans(preds)
        elapsed = time.time() - started
        logger.info("evaluate_answer pred_spans: took %f secs" % (elapsed))

        logger.info("Predicted/actual spans: ")
        for predicted, actual in zip(pred_spans, spans):
            logger.info(str(predicted) + " || " + str(actual))

        f1 = 0.
        em = 0.
        for context, pred_span, actual_span in zip(contexts, pred_spans, spans):
            em += self.exact_match_score(pred_span, actual_span)

            # Spans are inclusive on both ends, hence the + 1 on the stop index
            pred_start, pred_end = pred_span[0], pred_span[1]
            true_start, true_end = actual_span[0], actual_span[1]
            tokens_pred = context[pred_start:pred_end + 1]
            tokens_actual = context[true_start:true_end + 1]
            f1 += self.f1_score(tokens_pred, tokens_actual)

        if log:
            summary = "{}: F1: {}, EM: {}, for {} samples".format(
                ds_type, f1, em, sample)
            logger.info(summary)

        return f1, em
Пример #10
0
    def fit(self, sess, saver, train_examples_raw, dev_set_raw):
        """Train for ``self.config.n_epochs`` epochs, keeping the best model.

        After every epoch the model is evaluated on the development set; when
        the last entry of the entity-level scores beats the best value seen so
        far, the model is saved through ``saver`` (if one is given).

        :param sess: forwarded to train_on_batch(), evaluate() and saver.save()
        :param saver: object with ``save(sess, path)``; falsy to disable saving
        :param train_examples_raw: raw training data, preprocessed here
        :param dev_set_raw: raw development data, preprocessed here
        :return: best score observed over all epochs (0. if never exceeded)
        """
        train_examples = self.preprocess_sequence_data(train_examples_raw)
        dev_set = self.preprocess_sequence_data(dev_set_raw)

        best_score = 0.
        for epoch_idx in range(self.config.n_epochs):
            logger.info("Epoch %d out of %d", epoch_idx + 1,
                        self.config.n_epochs)
            # Progress bar sized for this epoch (created, but not updated below)
            n_steps = 1 + int(len(train_examples) / self.config.batch_size)
            prog = Progbar(target=n_steps)

            # Each minibatch is a sequence of examples; zip(*...) transposes it
            # into per-field sequences that become the positional arguments of
            # train_on_batch.
            minibatch_iter = get_minibatches(
                train_examples, minibatch_size=self.config.batch_size)
            for minibatch in minibatch_iter:
                fields = zip(*minibatch)
                self.train_on_batch(sess, *fields)

            logger.info("Evaluating on development data")
            token_cm, entity_scores = self.evaluate(sess, dev_set, dev_set_raw)
            logger.debug("Token-level confusion matrix:\n" +
                         token_cm.as_table())
            logger.debug("Token-level scores:\n" + token_cm.summary())
            logger.info("Entity level P/R/F1: %.2f/%.2f/%.2f", *entity_scores)

            # The last entity-level score drives model selection
            score = entity_scores[-1]
            improved = score > best_score
            if improved:
                best_score = score
            if improved and saver:
                logger.info("New best score! Saving model in %s",
                            self.config.model_output)
                saver.save(sess, self.config.model_output)

            print("")
            if self.report:
                self.report.log_epoch()
                self.report.save()
        return best_score
Пример #11
0
 def run_epoch(self, sess, inputs):
     """Train on every minibatch once and return the mean minibatch loss.

     Args:
         sess: forwarded unchanged to self.train_on_batch()
         inputs: datasets represented as a dictionary; the 'Questions',
             'Paragraphs' and 'Labels' entries are batched together
     Returns:
         Sum of the values returned by self.train_on_batch() divided by the
         number of minibatches.
     """
     columns = [inputs['Questions'], inputs['Paragraphs'], inputs['Labels']]
     batch_losses = [
         self.train_on_batch(sess, *batch)
         for batch in get_minibatches(columns, self.config.batch_size)
     ]
     return sum(batch_losses) / len(batch_losses)
Пример #12
0
def train(data_matrix,
          data_labels,
          save_path,
          title,
          hidden_size=256,
          lr=0.005,
          saved_model_path=None,
          RESUME=False,
          batch_size=256,
          n_epochs=30):
    """Train the model, checkpoint it and dump/plot the per-epoch losses.

    Args:
        data_matrix: inputs, passed to build_model() and util.get_minibatches()
        data_labels: labels, passed to build_model() and util.get_minibatches()
        save_path: checkpoint path of the final model; the best-so-far model
            is written to ``save_path + "--smallest loss"``
        title: label used in the loss dump file name and as the plot title
        hidden_size: forwarded to build_model()
        lr: forwarded to build_model()
        saved_model_path: checkpoint to restore from when ``RESUME`` is true
        RESUME: restore the weights from ``saved_model_path`` before training
        batch_size: minibatch size handed to util.get_minibatches()
        n_epochs: number of passes over the data
    """
    tf.reset_default_graph()
    _, input_placeholder, labels_placeholder, train_op, loss_op = build_model(
        data_matrix, data_labels, hidden_size=hidden_size, lr=lr)
    saver = tf.train.Saver()
    avg_loss_list = []
    best_loss = None  # lowest epoch loss seen so far
    with tf.Session() as sess:
        # Initialising once is enough; restore (if any) overwrites the values
        sess.run(tf.global_variables_initializer())
        if RESUME:
            saver.restore(sess, saved_model_path)
            print("Model restored.")

        for i in range(n_epochs):
            batch_loss_list = []
            print("Epoch " + str(i + 1) + ": ")
            # Rebuilt every epoch: if get_minibatches returns a one-shot
            # iterator (its implementation is not visible here), an iterable
            # created once before the loop would be empty from epoch 2 on.
            minibatches = util.get_minibatches(data_matrix, data_labels,
                                               batch_size)
            for tup in minibatches:
                _, loss = sess.run([train_op, loss_op],
                                   feed_dict={
                                       input_placeholder: tup[0],
                                       labels_placeholder: tup[1]
                                   })
                batch_loss_list.append(loss)
            epoch_loss = np.mean(batch_loss_list)
            avg_loss_list.append(epoch_loss)
            print("=====>loss: " + str(epoch_loss) + " ")

            # Compare against the best loss so far, not merely the previous
            # epoch, so the "smallest loss" checkpoint really is the minimum.
            # As before, the very first epoch never triggers this checkpoint.
            improved = best_loss is not None and epoch_loss < best_loss
            if improved:
                tmp_path = save_path + "--smallest loss"
                saver.save(sess, tmp_path)
                print("New min loss at epoch %s! Model saved in path %s" % (
                    str(i + 1), tmp_path))
            if best_loss is None or improved:
                best_loss = epoch_loss
        saver.save(sess, save_path)
        print("Final model saved in path: %s" % save_path)

    util.dumpVar("losses/ " + title + " " + today + ".pkl", avg_loss_list)
    generatePlots(range(len(avg_loss_list)), avg_loss_list, "Number of Epochs",
                  "Cross-Entropy Loss", title)
Пример #13
0
def test(data_matrix, data_labels, saved_model_path, title, batch_size=256):
    """Evaluate a saved model and print a few misclassified sentences.

    The graph is rebuilt with build_model(), the weights are restored from
    ``saved_model_path`` and the data is processed minibatch by minibatch.
    The predicted and the true class of every row are the index of the
    largest entry of that row. For every minibatch up to five misclassified
    rows are printed together with their predicted and correct labels.

    Args:
        data_matrix: inputs, passed to build_model() and util.get_minibatches()
        data_labels: labels, passed to build_model() and util.get_minibatches()
        saved_model_path: checkpoint path handed to ``saver.restore``
        title: label used in the confusion-matrix file name
        batch_size: minibatch size handed to util.get_minibatches()
    """
    tf.reset_default_graph()
    pred, input_placeholder, labels_placeholder, _, loss_op = build_model(
        data_matrix, data_labels)
    saver = tf.train.Saver()
    loss_list = []
    label_list = []
    pred_list = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, saved_model_path)
        print("Model restored.")

        minibatches = util.get_minibatches(data_matrix, data_labels,
                                           batch_size)
        for tup in minibatches:
            pred_temp, loss = sess.run(
                [pred, loss_op],
                feed_dict={
                    input_placeholder: tup[0],
                    labels_placeholder: tup[1]
                })
            # argmax returns the first index of the row maximum
            batch_preds = np.argmax(pred_temp, axis=1)
            batch_labels = np.argmax(tup[1], axis=1)
            pred_list.extend(batch_preds)
            label_list.extend(batch_labels)

            loss_list.append(loss)

            # Walk the rows of the *current* batch only, so the row index
            # matches tup[0]; indexing tup[0] with positions from the
            # cumulative lists picked wrong (or empty) sentences after the
            # first batch.
            count = 0
            for row, (predicted, correct) in enumerate(
                    zip(batch_preds, batch_labels)):
                if predicted != correct:
                    count += 1
                    sentence = reconstruct_sentence(
                        (tup[0][row:row + 1, :]).tolist())
                    print("sentence:  %s" % (sentence,))
                    print("predicted label:  %s" % (predicted,))
                    print("correct label:  %s" % (correct,))
                if count > 4:
                    break

        print("Loss: " + str(np.mean(loss_list)) + "\n")

    util.outputConfusionMatrix(
        pred_list, label_list,
        "confusion matrices/confusion_matrix " + title + " " + today)
    util.get_accuracy(pred_list, label_list)
Пример #14
0
def generate_answers(sess, model, dataset, rev_vocab, batch_size):
    """
    Loop over the dev or test dataset and generate answer.

    Note: output format must be answers[uuid] = "real answer"
    You must provide a string of words instead of just a list, or start and end index

    In main() function we are dumping onto a JSON file

    evaluate.py will take the output JSON along with the original JSON file
    and output a F1 and EM

    :param sess: active TF session
    :param model: a built QASystem model; ``model.answer`` must return an
        array of (start, end) rows that can be modified in place
    :param dataset: triple ``(contexts, questions, question_uuids)``
    :param rev_vocab: this is a list of vocabulary that maps index to actual words
    :param batch_size: minibatch size handed to get_minibatches()
    :return: dict mapping each question uuid to its space-joined answer string
    """

    answers = {}

    all_contexts, all_questions, all_question_uuids = dataset

    prog = Progbar(target=1 + int(len(dataset[0]) / batch_size))
    for i, batch in enumerate(
            get_minibatches([all_contexts, all_questions, all_question_uuids],
                            batch_size,
                            shuffle=False)):
        contexts, questions, question_uuids = batch
        # Number of non-zero token ids per context row
        context_lengths = (contexts != 0).sum(1)

        pred_spans = model.answer(sess, [questions, contexts])
        # The end index is inclusive (see range(s, e + 1) below), so the last
        # usable position is length - 1; clamping to length let the answer run
        # one token past the context. The floor of 0 avoids negative indices
        # for empty rows.
        last_valid = np.maximum(context_lengths - 1, 0)
        pred_spans[:, 1] = np.minimum(pred_spans[:, 1], last_valid)
        # Clamp the start only after the end is final so start <= end holds
        pred_spans[:, 0] = np.minimum(pred_spans[:, 0], pred_spans[:, 1])

        for (s, e), uuid, context in zip(pred_spans, question_uuids, contexts):
            answers[uuid] = " ".join(
                [rev_vocab[context[c]] for c in range(s, e + 1)])

        prog.update(i + 1, [])

    return answers
Пример #15
0
    def optimize(self, session, dataset, epoch):
        """Run one training pass over ``dataset`` and return the summed loss.

        :param session: object whose ``run(fetches, feed)`` evaluates the graph
        :param dataset: pair ``(train, loader)``; ``train`` is split into
            minibatches whose items are indexed as ``b[0]`` (fed to the image
            placeholder) and ``b[1]`` (fed to the ground-truth placeholder),
            each converted with ``loader`` first
        :param epoch: not used by this method
        :return: sum of the per-minibatch losses
        """
        input_feed = {self.train_phase: True}
        output_feed = [self.train_op, self.loss, self.norm]

        train, loader = dataset

        total_loss = 0.

        # Floor division keeps the target an integer under true division too
        # (identical to the previous "/" for ints under Python 2).
        prog = Progbar(target=(len(train) - 1) // self.flags.batch_size + 1)
        for i, batch in enumerate(get_minibatches(train,
                                                  self.flags.batch_size)):
            input_feed[self.im_placeholder] = [loader(b[0]) for b in batch]
            input_feed[self.gt_placeholder] = [loader(b[1]) for b in batch]

            _, loss, norm = session.run(output_feed, input_feed)
            prog.update(i + 1, [("train loss", loss), ("norm", norm)])
            total_loss += loss

        return total_loss
    def optimize(self, session, dataset_address):
        """
        Run one optimisation pass over the dataset and return the summed loss.

        In debug mode (self.FLAGS.debug) a single minibatch from get_sample()
        is used; otherwise the minibatches come from get_minibatches(). The
        global gradient norm is logged on the first minibatch and then every
        self.FLAGS.print_every minibatches.

        :param session: object whose ``run`` executes the update
        :param dataset_address: forwarded to get_sample() / get_minibatches()
        :return: sum over all minibatches of np.sum(loss)
        """
        flags = self.FLAGS
        if flags.debug:
            sample, _ = get_sample(dataset_address,
                                   flags.context_paragraph_max_length,
                                   flags.batch_size)
            # get_sample returns one minibatch; wrap it so the loop below
            # always iterates over a sequence of minibatches
            batches = [sample]
        else:
            batches = get_minibatches(dataset_address,
                                      flags.context_paragraph_max_length,
                                      flags.batch_size)

        epoch_loss = 0.0
        for step, (questions, contexts, starts, ends) in enumerate(batches):
            # Original author's note: batches come as [batch_size, 1] but
            # [batch_size,] is needed, hence the unlistify calls
            starts = unlistify(starts)
            ends = unlistify(ends)
            feed = self.create_feed_dict(questions, contexts, starts, ends,
                                         self.FLAGS.dropout)
            fetches = [self.updates, self.loss, self.global_grad_norm]
            outputs = session.run(fetches, feed_dict=feed)
            epoch_loss += np.sum(outputs[1])
            grad_norm = outputs[2]
            if step % self.FLAGS.print_every == 0:
                logging.info(
                    "Global grad norm for update: {}".format(grad_norm))
        return epoch_loss
Пример #17
0
    def run_epoch(self, session, train_dataset, val_dataset):
        """Train for one epoch, then score the train and validation sets.

        :param session: forwarded to self.optimize() and self.evaluate_answer()
        :param train_dataset: training data; ``len(train_dataset[0])`` sizes
            the progress bar
        :param val_dataset: validation data, only used for the final scoring
        :return: the F1 value reported by evaluate_answer() on ``val_dataset``
        """
        batch_size = self.config.batch_size
        n_steps = 1 + int(len(train_dataset[0]) / batch_size)
        prog = Progbar(target=n_steps, logger=logger)

        batches = get_minibatches(train_dataset, batch_size, shuffle=True)
        for step, train_batch in enumerate(batches, 1):
            _, loss, grad_norm = self.optimize(session, train_batch)
            metrics = [("train loss", loss), ("grad norm", grad_norm)]
            prog.update(step, metrics)

        # Log the training-set scores; only the validation F1 is returned
        self.evaluate_answer(
            session, train_dataset, log=True, ds_type="Train dataset")
        val_scores = self.evaluate_answer(
            session, val_dataset, log=True, ds_type="Dev dataset")
        f1, _ = val_scores
        return f1
Пример #18
0
def train(data_matrix, data_labels, save_path, batch_size=100, n_epochs=30):
    """Train the baseline model and save it to ``save_path``.

    Args:
        data_matrix: inputs, passed to build_model() and util.get_minibatches()
        data_labels: labels, passed to build_model() and util.get_minibatches()
        save_path: checkpoint path handed to ``saver.save``
        batch_size: minibatch size handed to util.get_minibatches()
        n_epochs: number of passes over the data
    """
    _, input_placeholder, labels_placeholder, train_op, loss_op = build_model(
        data_matrix, data_labels)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(n_epochs):
            print("Epoch " + str(i + 1) + ": ")
            # Rebuilt every epoch: if get_minibatches returns a one-shot
            # iterator (its implementation is not visible here), an iterable
            # created once before the loop would be empty from epoch 2 on.
            minibatches = util.get_minibatches(data_matrix, data_labels,
                                               batch_size)
            loss_list = []
            for tup in minibatches:
                _, loss = sess.run([train_op, loss_op],
                                   feed_dict={
                                       input_placeholder: tup[0],
                                       labels_placeholder: tup[1]
                                   })
                loss_list.append(loss)
            print("=====>loss: " + str(np.mean(loss_list)) + " ")
        # saver.save returns the path it actually wrote to
        save_path = saver.save(sess, save_path)
        print("Final baseline model saved in path: %s" % save_path)
Пример #19
0
 def minibatches(self, data, shuffle=True):
     """Transpose ``data`` into per-field arrays and split them into minibatches."""
     # zip(*data) regroups the examples field by field; each field becomes
     # one array (original author's note: [[sentences sets] [labels sets]])
     columns = []
     for field_values in zip(*data):
         columns.append(np.array(field_values))
     return get_minibatches(columns, self.config.batch_size, shuffle)
Пример #20
0
 def email_minibatches(email_examples, batch_size):
     """Convert email records to word ids and split them into minibatches.

     ``token_mapping`` and ``n_features`` are taken from the enclosing scope.
     """
     converted = email_records_to_word_ids(
         email_examples, token_mapping, n_features)
     body_ids, labels = converted
     columns = [body_ids, labels]
     return get_minibatches(columns, batch_size)