Example #1
    def computeAccuracy(self, dataPremiseMat, dataHypothesisMat, dataTarget,
                        predictFunc):
        """
        Computes the accuracy for the given network on a certain dataset.
        """
        numExamples = len(dataTarget)
        correctPredictions = 0.

        # Use a batch size of 1 so each example is scored individually
        minibatches = getMinibatchesIdx(len(dataTarget), 1)
        pad = "right"

        for _, minibatch in minibatches:
            batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                    convertDataToTrainingBatch(dataPremiseMat, self.numTimestepsPremise, dataHypothesisMat,
                                               self.numTimestepsHypothesis, pad, self.embeddingTable,
                                               dataTarget, minibatch)
            prediction = predictFunc(batchPremiseTensor, batchHypothesisTensor)
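            # Gold labels are one-hot vectors; argmax recovers each example's class index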
            batchGoldIdx = [ex.argmax(axis=0) for ex in batchLabels]

            correctPredictions += (
                np.array(prediction) == np.array(batchGoldIdx)).sum()

        return correctPredictions / numExamples
Example #2
    def computeAccuracy(self, dataPremiseMat, dataHypothesisMat, dataTarget,
                        predictFunc):
        """
        Computes the accuracy for the given network on a certain dataset.
        """
        numExamples = len(dataTarget)
        correctPredictions = 0.

        # Use a batch size of 1 so each example is scored individually
        minibatches = getMinibatchesIdx(len(dataTarget), 1)
        pad = "right"

        for _, minibatch in minibatches:
            batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                    convertDataToTrainingBatch(dataPremiseMat, self.numTimestepsPremise, dataHypothesisMat,
                                               self.numTimestepsHypothesis, pad, self.embeddingTable,
                                               dataTarget, minibatch)
            prediction = predictFunc(batchPremiseTensor, batchHypothesisTensor)
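            # Gold labels are one-hot vectors; argmax recovers each example's class index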
            batchGoldIdx = [ex.argmax(axis=0) for ex in batchLabels]

            correctPredictions += (np.array(prediction) ==
                                   np.array(batchGoldIdx)).sum()

        return correctPredictions/numExamples
Example #3
def main(exp_name, embed_data, train_data, train_data_stats, val_data, val_data_stats,
         test_data, test_data_stats, log_path, batch_size, num_epochs,
         unroll_steps, learn_rate, num_dense, dense_dim, penalty, reg_coeff):
    """
    Main run function for training model.
    :param exp_name:
    :param embed_data:
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data:
    :param test_data_stats:
    :param log_path:
    :param batch_size:
    :param num_epochs:
    :param unroll_steps:
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization
    :param reg_coeff: Regularization coefficient to use for each layer of the network; may
                      want to support a different coefficient for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
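    # Number of validation examples kept for the small overfitting check below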
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings


    train_prem, train_hyp = generate_data(train_data, train_data_stats, "left", "right", table, seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data, val_data_stats, "left", "right", table, seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test the model's ability to overfit, keep only a small subset of the validation examples
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")


    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem, input_size=vocab_size,
                        output_size=dim_embeddings, W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp, input_size=vocab_size,
                        output_size=dim_embeddings, W=embeddings_mat)


    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Stack 'num_dense' fully connected layers: tanh hidden layers
    # topped by a softmax output layer
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense-1:
                l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS, nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in, num_units=dense_dim, nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in, num_units=NUM_DENSE_UNITS, nonlinearity=lasagne.nonlinearities.softmax)

    network_output = get_output(l_output, {l_in_prem: x_p, l_in_hyp: x_h}) # Will have shape (batch_size, 3)
    f_dense_output = theano.function([x_p, x_h], network_output, on_unused_input='warn')

    # Compute cost
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1
    else:
        raise ValueError("Unsupported penalty: {0}".format(penalty))

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
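    # Note: reg_coeff is used both as the per-layer weight in layer_dict and as an
    # outer factor, so each layer's effective coefficient is reg_coeff**2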
    reg_cost = reg_coeff * regularize_layer_params_weighted(layer_dict, p_metric)
    cost = T.mean(T.nnet.categorical_crossentropy(network_output, target_values).mean()) + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1), T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    acc_num = 10

    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(str(total_num_ex),
                                                                          str(epoch)))

                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]

                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)

                stats.recordCost(total_num_ex, cost_val)
                # Periodically compute and log train/dev accuracy
                if total_num_ex%(acc_num*batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp, train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")

    except KeyboardInterrupt:
        pass
Example #4
    def train(self, numEpochs=1, batchSize=5, learnRateVal=0.1, numExamplesToTrain=-1, gradMax=3.,
                L2regularization=0.0, dropoutRate=0.0, sentenceAttention=False,
                wordwiseAttention=False):
        """
        Takes care of training model, including propagation of errors and updating of
        parameters.
        """
        expName = "Epochs_{0}_LRate_{1}_L2Reg_{2}_dropout_{3}_sentAttn_{4}_" \
                       "wordAttn_{5}".format(str(numEpochs), str(learnRateVal),
                                             str(L2regularization), str(dropoutRate),
                                             str(sentenceAttention), str(wordwiseAttention))
        self.configs.update(locals())
        trainPremiseIdxMat, trainHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
                                  self.trainData, self.trainDataStats)
        trainGoldLabel = convertLabelsToMat(self.trainData)

        valPremiseIdxMat, valHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
                                self.valData, self.valDataStats)
        valGoldLabel = convertLabelsToMat(self.valData)

        # If training on fewer than the full set of examples, restrict the validation
        # matrices (the training loop below draws its batches from these)
        if numExamplesToTrain > 0:
            valPremiseIdxMat = valPremiseIdxMat[:, range(numExamplesToTrain), :]
            valHypothesisIdxMat = valHypothesisIdxMat[:, range(numExamplesToTrain), :]
            valGoldLabel = valGoldLabel[range(numExamplesToTrain)]


        # Whether zero-padded on left or right
        pad = "right"

        # Get full premise/hypothesis tensors
        # batchPremiseTensor, batchHypothesisTensor, batchLabels = \
        #             convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise, valHypothesisIdxMat,
        #                                        self.numTimestepsHypothesis, "right", self.embeddingTable,
        #                                        valGoldLabel, range(len(valGoldLabel)))
        #sharedValPremise = theano.shared(batchPremiseTensor)
        #sharedValHypothesis = theano.shared(batchHypothesisTensor)
        #sharedValLabels = theano.shared(batchLabels)


        inputPremise = T.ftensor3(name="inputPremise")
        inputHypothesis = T.ftensor3(name="inputHypothesis")
        yTarget = T.fmatrix(name="yTarget")
        learnRate = T.scalar(name="learnRate", dtype='float32')


        fGradSharedHypothesis, fGradSharedPremise, fUpdatePremise, \
            fUpdateHypothesis, costFn, _, _ = self.trainFunc(inputPremise,
                                            inputHypothesis, yTarget, learnRate, gradMax,
                                            L2regularization, dropoutRate, sentenceAttention,
                                            wordwiseAttention, batchSize)

        totalExamples = 0
        stats = Stats(self.logger, expName)

        # Training
        self.logger.Log("Model configs: {0}".format(self.configs))
        self.logger.Log("Starting training with {0} epochs, {1} batchSize,"
                " {2} learning rate, {3} L2regularization coefficient, and {4} dropout rate".format(
            numEpochs, batchSize, learnRateVal, L2regularization, dropoutRate))


        predictFunc = self.predictFunc(inputPremise, inputHypothesis, dropoutRate)

        for epoch in xrange(numEpochs):
            self.logger.Log("Epoch number: %d" %(epoch))

            if numExamplesToTrain > 0:
                minibatches = getMinibatchesIdx(numExamplesToTrain, batchSize)
            else:
                minibatches = getMinibatchesIdx(len(trainGoldLabel), batchSize)

            numExamples = 0
            for _, minibatch in minibatches:
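                # dropoutMode of 1.0 enables dropout for training; it is reset to 0.0
                # before computing dev accuracy below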
                self.dropoutMode.set_value(1.0)
                numExamples += len(minibatch)
                totalExamples += len(minibatch)

                self.logger.Log("Processed {0} examples in current epoch".
                                format(str(numExamples)))

                batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                    convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise, valHypothesisIdxMat,
                                               self.numTimestepsHypothesis, pad, self.embeddingTable,
                                               valGoldLabel, minibatch)

                gradHypothesisOut = fGradSharedHypothesis(batchPremiseTensor,
                                       batchHypothesisTensor, batchLabels)
                gradPremiseOut = fGradSharedPremise(batchPremiseTensor,
                                       batchHypothesisTensor, batchLabels)
                fUpdatePremise(learnRateVal)
                fUpdateHypothesis(learnRateVal)

                predictLabels = self.predict(batchPremiseTensor, batchHypothesisTensor, predictFunc)
                #self.logger.Log("Labels in epoch {0}: {1}".format(epoch, str(predictLabels)))


                cost = costFn(batchPremiseTensor, batchHypothesisTensor, batchLabels)
                stats.recordCost(totalExamples, cost)

                # Note: Big time sink happens here
                if totalExamples%(100) == 0:
                    # TODO: Don't compute accuracy of dev set
                    self.dropoutMode.set_value(0.0)
                    devAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                                       valHypothesisIdxMat, valGoldLabel, predictFunc)
                    stats.recordAcc(totalExamples, devAccuracy, "dev")


        stats.recordFinalTrainingTime(totalExamples)

        # Save model to disk
        self.logger.Log("Saving model...")
        self.extractParams()
        configString = "batch={0},epoch={1},learnRate={2},dimHidden={3},dimInput={4}".format(str(batchSize),
                                            str(numEpochs), str(learnRateVal),
                                            str(self.dimHidden), str(self.dimInput))
        self.saveModel(currDir + "/savedmodels/basicLSTM_"+configString+".npz")
        self.logger.Log("Model saved!")

        # Set dropout to 0. again for testing
        self.dropoutMode.set_value(0.0)

        # Train Accuracy
        # trainAccuracy = self.computeAccuracy(trainPremiseIdxMat,
        #                              trainHypothesisIdxMat, trainGoldLabel, predictFunc)
        # self.logger.Log("Final training accuracy: {0}".format(trainAccuracy))

        # Val Accuracy
        valAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                    valHypothesisIdxMat, valGoldLabel, predictFunc)
        # TODO: change -1 for training acc to actual value when I enable train computation
        stats.recordFinalStats(totalExamples, -1, valAccuracy)
Example #5
def main(exp_name, embed_data, train_data, train_data_stats, val_data,
         val_data_stats, test_data, test_data_stats, log_path, batch_size,
         num_epochs, unroll_steps, learn_rate, num_dense, dense_dim, penalty,
         reg_coeff):
    """
    Main run function for training model.
    :param exp_name:
    :param embed_data:
    :param train_data:
    :param train_data_stats:
    :param val_data:
    :param val_data_stats:
    :param test_data:
    :param test_data_stats:
    :param log_path:
    :param batch_size:
    :param num_epochs:
    :param unroll_steps:
    :param learn_rate:
    :param num_dense: Number of dense fully connected layers to add after concatenation layer
    :param dense_dim: Dimension of dense FC layers -- note this only applies if num_dense > 1
    :param penalty: Penalty to use for regularization
    :param reg_coeff: Regularization coefficient to use for each layer of the network; may
                      want to support a different coefficient for different layers
    :return:
    """
    # Set random seed for deterministic results
    np.random.seed(0)
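    # Number of validation examples kept for the small overfitting check below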
    num_ex_to_train = 30

    # Load embedding table
    table = EmbeddingTable(embed_data)
    vocab_size = table.sizeVocab
    dim_embeddings = table.dimEmbeddings
    embeddings_mat = table.embeddings

    train_prem, train_hyp = generate_data(train_data,
                                          train_data_stats,
                                          "left",
                                          "right",
                                          table,
                                          seq_len=unroll_steps)
    val_prem, val_hyp = generate_data(val_data,
                                      val_data_stats,
                                      "left",
                                      "right",
                                      table,
                                      seq_len=unroll_steps)
    train_labels = convertLabelsToMat(train_data)
    val_labels = convertLabelsToMat(val_data)

    # To test the model's ability to overfit, keep only a small subset of the validation examples
    if num_ex_to_train > 0:
        val_prem = val_prem[0:num_ex_to_train]
        val_hyp = val_hyp[0:num_ex_to_train]
        val_labels = val_labels[0:num_ex_to_train]

    # Theano expressions for premise/hypothesis inputs to network
    x_p = T.imatrix()
    x_h = T.imatrix()
    target_values = T.fmatrix(name="target_output")

    # Embedding layer for premise
    l_in_prem = InputLayer((batch_size, unroll_steps))
    l_embed_prem = EmbeddingLayer(l_in_prem,
                                  input_size=vocab_size,
                                  output_size=dim_embeddings,
                                  W=embeddings_mat)

    # Embedding layer for hypothesis
    l_in_hyp = InputLayer((batch_size, unroll_steps))
    l_embed_hyp = EmbeddingLayer(l_in_hyp,
                                 input_size=vocab_size,
                                 output_size=dim_embeddings,
                                 W=embeddings_mat)

    # Ensure embedding matrix parameters are not trainable
    l_embed_hyp.params[l_embed_hyp.W].remove('trainable')
    l_embed_prem.params[l_embed_prem.W].remove('trainable')

    l_embed_hyp_sum = SumEmbeddingLayer(l_embed_hyp)
    l_embed_prem_sum = SumEmbeddingLayer(l_embed_prem)

    # Concatenate sentence embeddings for premise and hypothesis
    l_concat = ConcatLayer([l_embed_hyp_sum, l_embed_prem_sum])

    l_in = l_concat
    l_output = l_concat
    # Stack 'num_dense' fully connected layers: tanh hidden layers
    # topped by a softmax output layer
    if num_dense > 1:
        for n in range(num_dense):
            if n == num_dense - 1:
                l_output = DenseLayer(
                    l_in,
                    num_units=NUM_DENSE_UNITS,
                    nonlinearity=lasagne.nonlinearities.softmax)
            else:
                l_in = DenseLayer(l_in,
                                  num_units=dense_dim,
                                  nonlinearity=lasagne.nonlinearities.tanh)
    else:
        l_output = DenseLayer(l_in,
                              num_units=NUM_DENSE_UNITS,
                              nonlinearity=lasagne.nonlinearities.softmax)

    network_output = get_output(l_output, {
        l_in_prem: x_p,
        l_in_hyp: x_h
    })  # Will have shape (batch_size, 3)
    f_dense_output = theano.function([x_p, x_h],
                                     network_output,
                                     on_unused_input='warn')

    # Compute cost
    if penalty == "l2":
        p_metric = l2
    elif penalty == "l1":
        p_metric = l1
    else:
        raise ValueError("Unsupported penalty: {0}".format(penalty))

    layers = lasagne.layers.get_all_layers(l_output)
    layer_dict = {l: reg_coeff for l in layers}
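    # Note: reg_coeff is used both as the per-layer weight in layer_dict and as an
    # outer factor, so each layer's effective coefficient is reg_coeff**2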
    reg_cost = reg_coeff * regularize_layer_params_weighted(
        layer_dict, p_metric)
    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output,
                                        target_values).mean()) + reg_cost
    compute_cost = theano.function([x_p, x_h, target_values], cost)

    # Compute accuracy
    accuracy = T.mean(T.eq(T.argmax(network_output, axis=-1),
                           T.argmax(target_values, axis=-1)),
                      dtype=theano.config.floatX)
    compute_accuracy = theano.function([x_p, x_h, target_values], accuracy)

    label_output = T.argmax(network_output, axis=-1)
    predict = theano.function([x_p, x_h], label_output)

    # Define update/train functions
    all_params = lasagne.layers.get_all_params(l_output, trainable=True)
    updates = lasagne.updates.rmsprop(cost, all_params, learn_rate)
    train = theano.function([x_p, x_h, target_values], cost, updates=updates)

    # TODO: Augment embedding layer to allow for masking inputs

    stats = Stats(exp_name)
    acc_num = 10

    #minibatches = getMinibatchesIdx(val_prem.shape[0], batch_size)
    minibatches = getMinibatchesIdx(train_prem.shape[0], batch_size)
    print("Training ...")
    try:
        total_num_ex = 0
        for epoch in xrange(num_epochs):
            for _, minibatch in minibatches:
                total_num_ex += len(minibatch)
                stats.log("Processed {0} total examples in epoch {1}".format(
                    str(total_num_ex), str(epoch)))

                #prem_batch = val_prem[minibatch]
                #hyp_batch = val_hyp[minibatch]
                #labels_batch = val_labels[minibatch]

                prem_batch = train_prem[minibatch]
                hyp_batch = train_hyp[minibatch]
                labels_batch = train_labels[minibatch]

                train(prem_batch, hyp_batch, labels_batch)
                cost_val = compute_cost(prem_batch, hyp_batch, labels_batch)

                stats.recordCost(total_num_ex, cost_val)
                # Periodically compute and log train/dev accuracy
                if total_num_ex % (acc_num * batch_size) == 0:
                    train_acc = compute_accuracy(train_prem, train_hyp,
                                                 train_labels)
                    dev_acc = compute_accuracy(val_prem, val_hyp, val_labels)
                    stats.recordAcc(total_num_ex, train_acc, dataset="train")
                    stats.recordAcc(total_num_ex, dev_acc, dataset="dev")

    except KeyboardInterrupt:
        pass
Example #6
    def train(self,
              numEpochs=1,
              batchSize=5,
              learnRateVal=0.1,
              numExamplesToTrain=-1,
              gradMax=3.,
              L2regularization=0.0,
              dropoutRate=0.0,
              sentenceAttention=False,
              wordwiseAttention=False):
        """
        Takes care of training model, including propagation of errors and updating of
        parameters.
        """
        expName = "Epochs_{0}_LRate_{1}_L2Reg_{2}_dropout_{3}_sentAttn_{4}_" \
                       "wordAttn_{5}".format(str(numEpochs), str(learnRateVal),
                                             str(L2regularization), str(dropoutRate),
                                             str(sentenceAttention), str(wordwiseAttention))
        self.configs.update(locals())
        trainPremiseIdxMat, trainHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
            self.trainData, self.trainDataStats)
        trainGoldLabel = convertLabelsToMat(self.trainData)

        valPremiseIdxMat, valHypothesisIdxMat = self.embeddingTable.convertDataToIdxMatrices(
            self.valData, self.valDataStats)
        valGoldLabel = convertLabelsToMat(self.valData)

        # If training on fewer than the full set of examples, restrict the validation
        # matrices (the training loop below draws its batches from these)
        if numExamplesToTrain > 0:
            valPremiseIdxMat = valPremiseIdxMat[:,
                                                range(numExamplesToTrain), :]
            valHypothesisIdxMat = valHypothesisIdxMat[:,
                                                      range(numExamplesToTrain
                                                            ), :]
            valGoldLabel = valGoldLabel[range(numExamplesToTrain)]

        # Whether zero-padded on left or right
        pad = "right"

        # Get full premise/hypothesis tensors
        # batchPremiseTensor, batchHypothesisTensor, batchLabels = \
        #             convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise, valHypothesisIdxMat,
        #                                        self.numTimestepsHypothesis, "right", self.embeddingTable,
        #                                        valGoldLabel, range(len(valGoldLabel)))
        #sharedValPremise = theano.shared(batchPremiseTensor)
        #sharedValHypothesis = theano.shared(batchHypothesisTensor)
        #sharedValLabels = theano.shared(batchLabels)

        inputPremise = T.ftensor3(name="inputPremise")
        inputHypothesis = T.ftensor3(name="inputHypothesis")
        yTarget = T.fmatrix(name="yTarget")
        learnRate = T.scalar(name="learnRate", dtype='float32')


        fGradSharedHypothesis, fGradSharedPremise, fUpdatePremise, \
            fUpdateHypothesis, costFn, _, _ = self.trainFunc(inputPremise,
                                            inputHypothesis, yTarget, learnRate, gradMax,
                                            L2regularization, dropoutRate, sentenceAttention,
                                            wordwiseAttention, batchSize)

        totalExamples = 0
        stats = Stats(self.logger, expName)

        # Training
        self.logger.Log("Model configs: {0}".format(self.configs))
        self.logger.Log(
            "Starting training with {0} epochs, {1} batchSize,"
            " {2} learning rate, {3} L2regularization coefficient, and {4} dropout rate"
            .format(numEpochs, batchSize, learnRateVal, L2regularization,
                    dropoutRate))

        predictFunc = self.predictFunc(inputPremise, inputHypothesis,
                                       dropoutRate)

        for epoch in xrange(numEpochs):
            self.logger.Log("Epoch number: %d" % (epoch))

            if numExamplesToTrain > 0:
                minibatches = getMinibatchesIdx(numExamplesToTrain, batchSize)
            else:
                minibatches = getMinibatchesIdx(len(trainGoldLabel), batchSize)

            numExamples = 0
            for _, minibatch in minibatches:
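                # dropoutMode of 1.0 enables dropout for training; it is reset to 0.0
                # before computing dev accuracy below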
                self.dropoutMode.set_value(1.0)
                numExamples += len(minibatch)
                totalExamples += len(minibatch)

                self.logger.Log(
                    "Processed {0} examples in current epoch".format(
                        str(numExamples)))

                batchPremiseTensor, batchHypothesisTensor, batchLabels = \
                    convertDataToTrainingBatch(valPremiseIdxMat, self.numTimestepsPremise, valHypothesisIdxMat,
                                               self.numTimestepsHypothesis, pad, self.embeddingTable,
                                               valGoldLabel, minibatch)

                gradHypothesisOut = fGradSharedHypothesis(
                    batchPremiseTensor, batchHypothesisTensor, batchLabels)
                gradPremiseOut = fGradSharedPremise(batchPremiseTensor,
                                                    batchHypothesisTensor,
                                                    batchLabels)
                fUpdatePremise(learnRateVal)
                fUpdateHypothesis(learnRateVal)

                predictLabels = self.predict(batchPremiseTensor,
                                             batchHypothesisTensor,
                                             predictFunc)
                #self.logger.Log("Labels in epoch {0}: {1}".format(epoch, str(predictLabels)))

                cost = costFn(batchPremiseTensor, batchHypothesisTensor,
                              batchLabels)
                stats.recordCost(totalExamples, cost)

                # Note: Big time sink happens here
                if totalExamples % (100) == 0:
                    # TODO: Don't compute accuracy of dev set
                    self.dropoutMode.set_value(0.0)
                    devAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                                       valHypothesisIdxMat,
                                                       valGoldLabel,
                                                       predictFunc)
                    stats.recordAcc(totalExamples, devAccuracy, "dev")

        stats.recordFinalTrainingTime(totalExamples)

        # Save model to disk
        self.logger.Log("Saving model...")
        self.extractParams()
        configString = "batch={0},epoch={1},learnRate={2},dimHidden={3},dimInput={4}".format(
            str(batchSize), str(numEpochs), str(learnRateVal),
            str(self.dimHidden), str(self.dimInput))
        self.saveModel(currDir + "/savedmodels/basicLSTM_" + configString +
                       ".npz")
        self.logger.Log("Model saved!")

        # Set dropout to 0. again for testing
        self.dropoutMode.set_value(0.0)

        # Train Accuracy
        # trainAccuracy = self.computeAccuracy(trainPremiseIdxMat,
        #                              trainHypothesisIdxMat, trainGoldLabel, predictFunc)
        # self.logger.Log("Final training accuracy: {0}".format(trainAccuracy))

        # Val Accuracy
        valAccuracy = self.computeAccuracy(valPremiseIdxMat,
                                           valHypothesisIdxMat, valGoldLabel,
                                           predictFunc)
        # TODO: change -1 for training acc to actual value when I enable train computation
        stats.recordFinalStats(totalExamples, -1, valAccuracy)