def callback(new_theta):
    """
    scipy.optimize-style iteration callback.

    Installs the new parameter vector into the model and, every
    validation_frequency-th iteration, evaluates the model on the validation
    and training sets, tracking the best validation error seen so far.
    """
    # Update the parameters of the model
    self.theta.set_value(new_theta, borrow=True)
    # Only evaluate on val set every validation_frequencyth iteration
    if validation_xs is not None and (iteration_counter[0] + 1) % validation_frequency == 0:
        # Compute error on validation set
        validation_error = self.error(validation_xs, validation_ys)
        # Compute error on training set
        training_error = self.error(xs, ys)
        # Compute how much we've improved on the previous best validation error,
        # as a percentage reduction relative to that best.
        # BUG FIX: previously the improvement was reported as 0.0 exactly when we
        # improved (and non-zero only when we got *worse*), and the "best" error
        # was overwritten unconditionally, so it tracked the last error rather
        # than the best one.
        if validation_error < best_validation_error[0]:
            validation_improvement = (
                best_validation_error[0] - validation_error
            ) / best_validation_error[0] * 100.0
            # Only update the best when we actually improved on it
            best_validation_error[0] = validation_error
        else:
            validation_improvement = 0.0
        # Plot some graphs
        if plot_errors and validation_error is not None:
            validation_errors.append(validation_error)
            training_errors.append(training_error)
            plot_costs(plot_errors,
                       (training_errors, "training set error"),
                       (validation_errors, "val set error"))
    else:
        # Not an evaluation iteration: report nothing
        validation_error = training_error = validation_improvement = None
    if iteration_callback is not None:
        # TODO Compute training cost?
        iteration_callback(iteration_counter[0], 0.0, training_error,
                           validation_error, validation_improvement)
    iteration_counter[0] += 1
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          learning_rate=None, regularization=None, batch_size=20,
          batch_callback=None, validation_set=None, stopping_iterations=10,
          log=None, class_weights=None, cost_plot_filename=None,
          training_cost_prop_change_threshold=None, undersample=None,
          print_predictions=False):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args
    (iteration, error array).

    If a validation set (xs, ys) is given, it is used to compute an error after
    each iteration and to enforce a stopping criterion. The algorithm will
    terminate if it goes stopping_iterations iterations without an improvement
    in validation error.

    Updates for each target class can be weighted by giving a vector
    class_weights. Alternatively, give the string 'freq' to weight them by
    inverse class frequency, or leave as None to apply no weighting.

    If compute_error_frequency > 1 (default=5), this number of iterations are
    performed between each time the error is computed on the training set.
    (NOTE(review): this paragraph appears stale -- no such parameter exists on
    this method.)

    The algorithm will assume it has converged and stop early if the
    proportional change between successive training costs drops below
    training_cost_prop_change_threshold for five iterations in a row. If
    threshold is given as None, this stopping condition will not be used.

    If undersample is given it should be a float. The training data will be
    randomly undersampled to produce a set in which the expected number of
    instances of each class is undersample*min_freq, where min_freq is the
    number of instances of the least common observed class. A value of 1.0
    will produce a roughly balanced set. Every class that is observed at all
    will be included at least once. The sampling is performed once at the
    beginning of training.
    """
    if log is None:
        log = get_console_logger("MLP train")

    # Derive a second filename for the balanced-cost plot from the main one
    if cost_plot_filename is not None:
        _fname, __, _ext = cost_plot_filename.rpartition(".")
        balanced_cost_plot_filename = "%s_balanced.%s" % (_fname, _ext)
        log.info("Outputting balanced costs to: %s" % balanced_cost_plot_filename)
    else:
        balanced_cost_plot_filename = None

    # kwargs go to the training function; cost_kwargs to cost evaluation
    kwargs = {}
    cost_kwargs = {
        "reg_coef": 0.,  # Always compute the cost without regularization
    }
    if learning_rate is not None:
        kwargs["learning_rate"] = learning_rate
    if regularization is not None:
        kwargs["reg_coef"] = regularization
    log.info("Training params: learning rate=%s, reg coef=%s" %
             (learning_rate, regularization))
    log.info("Training with %s, batch size=%d" % (self.optimization, batch_size))
    if undersample is not None and undersample > 0.0:
        log.info("Undersampling the dataset with a ratio of %s" % undersample)

    # Work out how many batches to do
    if batch_size is None or batch_size == 0:
        num_batches = 1
    else:
        # NOTE(review): true division -- assumes Python 2 integer-division
        # semantics; under Python 3 this yields a float and range() below
        # would fail. Confirm target interpreter.
        num_batches = xs.shape[0] / batch_size
        if xs.shape[0] % batch_size != 0:
            num_batches += 1

    if undersample is not None and undersample > 0.0:
        # Undersample the training data to produce a (more) balanced set
        balanced_indices = balanced_array_sample(ys, balance_ratio=undersample,
                                                 min_inclusion=1)
        # Copy the data so we're not dealing with a view
        xs = numpy.copy(xs[balanced_indices])
        ys = numpy.copy(ys[balanced_indices])
        # Also sample the validation set similarly
        # NOTE(review): this indexes validation_set unconditionally -- looks
        # like it would fail if undersample is used without a validation set;
        # confirm callers always pass one.
        balanced_validation_indices = balanced_array_sample(
            validation_set[1], balance_ratio=undersample, min_inclusion=1)
        validation_set = (
            numpy.copy(validation_set[0][balanced_validation_indices]),
            numpy.copy(validation_set[1][balanced_validation_indices]))
        log.info("Sampled %d training and %d validation instances" %
                 (xs.shape[0], validation_set[0].shape[0]))

    # Work out class weighting
    # Do this after undersampling: if both are used, we only want the weights
    # to account for any imbalance left after undersampling
    if class_weights is not None:
        if class_weights == 'freq':
            # Use inverse frequency to weight class updates
            # This procedure is modelled directly on what liblinear does
            class_counts = self.get_class_counts(ys).astype(numpy.float64)
            # Replace zero-counts with 1s to avoid division by zero
            class_counts = numpy.maximum(class_counts, 1.0)
            class_weights = 1.0 / class_counts
            class_weights *= self.network.num_classes / class_weights.sum()
            log.info("Inverse-frequency class weighting")
        elif class_weights == 'log':
            # Use a different scheme, inversely proportional to the log of the
            # class frequencies
            class_counts = self.get_class_counts(ys).astype(numpy.float64)
            class_counts = numpy.maximum(class_counts, 1.0)
            class_weights = 1.0 / (1.0 + numpy.log(class_counts))
            class_weights *= self.network.num_classes / class_weights.sum()
            log.info("Log-inverse-frequency class weighting")
        else:
            log.info("Custom vector class weighting")
        kwargs["class_weights"] = class_weights
        cost_kwargs["class_weights"] = class_weights
    else:
        log.info("No class weighting")

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []
    # The costs using the balanced metric
    bal_val_costs = []
    bal_training_costs = []

    # Compute costs using the initialized network
    training_cost = self.compute_cost(xs, ys, **cost_kwargs)
    training_costs.append(training_cost)
    if validation_set is not None:
        val_cost = self.compute_cost(validation_set[0], validation_set[1],
                                     **cost_kwargs)
        val_costs.append(val_cost)
    else:
        val_cost = None

    # NOTE(review): the initial validation metrics below use validation_set
    # unconditionally -- presumably callers always supply one; confirm, as
    # this would raise when validation_set is None.
    log.info("Computing initial validation set metrics:")
    class_accuracies = self.network.per_class_accuracy(
        validation_set[0], validation_set[1])
    # Drop NaN entries (classes with no instances) before averaging
    class_accuracies = class_accuracies[numpy.where(
        numpy.logical_not(numpy.isnan(class_accuracies)))]
    mean_class_accuracy = class_accuracies.mean()
    log.info("Per-class accuracy: %.4f%% (mean over %d classes)" %
             (mean_class_accuracy, class_accuracies.shape[0]))
    # Also compute mean log prob of targets over val set
    mean_log_prob = self.network.mean_log_prob(validation_set[0],
                                               validation_set[1])
    log.info("Mean target log prob: %.4f" % mean_log_prob)
    mean_per_class_log_prob = self.network.mean_per_class_target_log_prob(
        validation_set[0], validation_set[1])
    log.info("Mean per-class mean target log prob: %.4f" %
             mean_per_class_log_prob)

    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost
    # Counts successive iterations with a tiny training-cost change
    below_threshold_its = 0

    # Count the instances we're learning from to give an idea of how hard a
    # time the model's got
    training_class_counts = numpy.bincount(ys)
    training_class_counts = training_class_counts[
        training_class_counts.nonzero()]
    log.info(
        "Training instances per class: min=%d, max=%d (%d unseen classes)" %
        (int(training_class_counts.min()), int(training_class_counts.max()),
         self.network.num_classes - training_class_counts.shape[0]))

    for i in range(iterations):
        # Shuffle the training data between iterations, as one should with SGD
        shuffle = numpy.random.permutation(xs.shape[0])
        xs[:] = xs[shuffle]
        ys[:] = ys[shuffle]

        err = 0.0
        if num_batches > 1:
            for batch in range(num_batches):
                # Update the model with this batch's data
                batch_err = self._train_fn(
                    xs[batch * batch_size:(batch + 1) * batch_size],
                    ys[batch * batch_size:(batch + 1) * batch_size],
                    **kwargs)
                err += batch_err
                if batch_callback is not None:
                    batch_callback(batch, num_batches, batch_err)
        else:
            # Batch training: no need to loop
            err = self._train_fn(xs, ys, **kwargs)

        # Go back and compute training cost
        training_cost = self.compute_cost(xs, ys, **cost_kwargs)
        training_costs.append(training_cost)
        # Training set error
        train_error = self.network.error(xs, ys)
        # Balanced (per-class) training cost for the balanced plot
        bal_training_costs.append(
            -self.network.mean_per_class_target_log_prob(xs, ys))

        if validation_set is not None:
            if print_predictions:
                # Perform some predictions on a random sample of the val set
                for randind in numpy.random.randint(
                        validation_set[0].shape[0], size=5):
                    # Get the network's predictions
                    predictions = self.network.predict(
                        validation_set[0][None, randind, :])
                    predictions = predictions[0, None]
                    log.info("Input: %s. Predictions: %s" % (
                        list(numpy.where(validation_set[0][randind] > 0)[0]),
                        list(predictions)))
            # Compute the cost function on the validation set
            val_cost = self.compute_cost(validation_set[0],
                                         validation_set[1], **cost_kwargs)
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same,
                # it's better to use the new set of weights (with, presumably,
                # a better training error)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations without improving validation cost"
                    % stopping_iterations)
                break

            # Compute various metrics
            # Per-class accuracy on val set
            class_accuracies = self.network.per_class_accuracy(
                validation_set[0], validation_set[1])
            class_accuracies = class_accuracies[numpy.where(
                numpy.logical_not(numpy.isnan(class_accuracies)))]
            mean_class_accuracy = class_accuracies.mean()
            # Mean log prob of targets over val set
            mean_log_prob = self.network.mean_log_prob(
                validation_set[0], validation_set[1])
            mean_per_class_log_prob = self.network.mean_per_class_target_log_prob(
                validation_set[0], validation_set[1])
            log.info(
                "Completed iteration %d, training cost=%.5f, val cost=%.5f, training error=%.2f%%, "
                "per-class accuracy: %.4f%%, mean tgt logprob: %.4f, per-class tgt logprob: %.4f"
                % (i, training_cost, val_cost, train_error * 100.0,
                   mean_class_accuracy, mean_log_prob,
                   mean_per_class_log_prob))
            bal_val_costs.append(-mean_per_class_log_prob)

            if best_iter < i:
                log.info("No improvement in validation cost")
        else:
            log.info(
                "Completed iteration %d, training cost=%.5f, training error=%.2f%%"
                % (i, training_cost, train_error * 100.0))

        if cost_plot_filename:
            # Plot the cost function as we train
            columns = [(training_costs, "Train cost")]
            if validation_set is not None:
                columns.append((val_costs, "Val cost"))
            ax = plot_costs(None, *columns)
            # Add a line at the most recent best val cost
            ax.axvline(float(best_iter + 1), color="b")
            ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                    "Best val cost", color="b")
            plt.savefig(cost_plot_filename)

            # Same plot again using the balanced (per-class) cost metric
            bal_columns = [(bal_training_costs, "Train cost (balanced)")]
            if validation_set is not None:
                bal_columns.append((bal_val_costs, "Val cost (balanced)"))
            plot_costs(balanced_cost_plot_filename, *bal_columns)

        if iteration_callback is not None:
            iteration_callback(i, training_cost, val_cost, train_error,
                               best_iter)

        # Check the proportional change between this iteration's training cost
        # and the last
        if len(training_costs) > 2 and training_cost_prop_change_threshold is not None:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) / training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, batch_iterator, total_samples, iterations=10000,
          validation_set=None, stopping_iterations=10, cost_plot_filename=None,
          iteration_callback=None, log=None,
          training_cost_prop_change_threshold=0.0005, batch_callback=None,
          first_it_last_layer=False):
    """
    Fine-tune the network using SGD over batches produced by batch_iterator.

    total_samples is the number of training samples per pass; it is used only
    to decay the learning rate over the first iteration and to report
    progress in tenths.

    If a validation set is given, it is used for early stopping: training
    stops after stopping_iterations iterations without an improvement in
    validation cost, and the best weights seen are restored at the end.
    Training also halts when the proportional change in training cost stays
    below training_cost_prop_change_threshold for five iterations in a row.
    """
    if log is None:
        log = get_console_logger("Autoencoder tune")
    log.info(
        "Tuning params: learning rate=%s (->%s), regularization=%s" %
        (self.learning_rate, self.min_learning_rate, self.regularization))
    if self.update_empty_vectors:
        log.info("Training empty vectors")
    if self.update_input_vectors:
        log.info("Updating basic word representations")

    ######## Compile functions
    network = self.model.pair_projection_model
    # Prepare cost/update functions for training
    cost, updates = self.get_triple_cost_updates()
    # Unregularized cost, used for evaluation only
    cost_without_reg, __ = self.get_triple_cost_updates(regularization=0.)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=network.triple_inputs,
        outputs=cost_without_reg,
    )
    train_fn = theano.function(
        inputs=network.triple_inputs + [
            # Allow the learning rate to be set per update
            theano.Param(self.learning_rate_var, default=self.learning_rate),
        ],
        outputs=cost,
        updates=updates,
    )
    # Doesn't do anything now: used to do something different
    first_pass_train_fn = train_fn
    ###########

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Keep a copy of the best weights so far
    val_cost = 0.
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        # NOTE(review): this initial best is the *total* validation cost,
        # whereas inside the loop val_cost is divided by the number of
        # samples -- so the first iteration always counts as an improvement;
        # confirm whether that is intended.
        best_val_cost = cost_fn(validation_set)

    # Counts successive iterations with a tiny training-cost change
    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        batch_num = 0
        learning_rate = self.learning_rate
        seen_samples = 0
        tenth_progress = -1
        if i == 0 and first_it_last_layer:
            # On the first iteration, use the training function that only
            # updates the final layer
            log.info(
                "First pass: only updating final layer (logistic regression)"
            )
            train = first_pass_train_fn
        else:
            train = train_fn
        for batch_num, batch_inputs in enumerate(batch_iterator):
            # Shuffle the training data between iterations, as one should with SGD
            # Just shuffle within batches
            shuffle = numpy.random.permutation(batch_inputs[0].shape[0])
            for batch_data in batch_inputs:
                batch_data[:] = batch_data[shuffle]
            # Update the model with this batch's data
            err += train(*batch_inputs, learning_rate=learning_rate)
            seen_samples += batch_inputs[0].shape[0]
            # Update the learning rate, so it falls away as we go through
            # Do this only on the first iteration. After that, LR should just
            # stay at the min
            if i == 0:
                learning_rate = max(
                    self.min_learning_rate,
                    self.learning_rate *
                    (1. - float(seen_samples) / total_samples))
            # Log progress every tenth of the pass
            current_tenth_progress = int(
                math.floor(10. * float(seen_samples) / total_samples))
            if current_tenth_progress > tenth_progress:
                tenth_progress = current_tenth_progress
                mean_cost_so_far = err / (batch_num + 1)
                log.info("%d%% of iteration: training cost so far = %.5g" %
                         (current_tenth_progress * 10, mean_cost_so_far))
                if i == 0:
                    log.info("Learning rate updated to %g" % learning_rate)
            if batch_callback is not None:
                batch_callback(i, batch_num)
        # NOTE(review): batch_num is the last 0-based enumerate index, so this
        # guard also fires when the iterator yields exactly one batch --
        # confirm iterators always produce >= 2 batches.
        if batch_num == 0:
            raise ModelTrainingError(
                "zero batches returned by training data iterator")
        training_costs.append(err / (batch_num + 1))

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set) / validation_set.shape[0]
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same,
                # it's better to use the new set of weights (with, presumably,
                # a better training error)
                if val_cost == best_val_cost:
                    log.info(
                        "Same validation cost: %.4f, using new weights" %
                        val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations of increasing validation cost"
                    % stopping_iterations)
                break

        log.info(
            "COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g" %
            (i, training_costs[-1], val_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than
            # others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            # Add a line at the most recent best val cost
            ax.axvline(float(best_iter), color="b")
            ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                    "Best val cost", color="b")
            plt.savefig(cost_plot_filename)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_costs[-1], val_cost, 0.0,
                               best_iter)

        # Check the proportional change between this iteration's training cost
        # and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) /
                training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          validation_xs=None, validation_ys=None, learning_rate=0.1,
          regularization=0.01, plot_errors=None, plot_cost=None, minibatch=1,
          early_stopping_iterations=5, batch_getter=None,
          validation_batch_size=0):
    """
    Train on data stored in Theano tensors.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args
    (iteration, error array).

    Returns True if we stop early because of early stopping criterion
    """
    if plot_errors:
        warnings.warn(
            "Error plotting is no longer implemented for logistic regression"
        )

    if batch_getter is None:
        # Default getter produces an iterator over fixed minibatch sizes and
        # assumes the xs and ys are arrays
        def batch_getter(x_source, y_source, batch_size):
            if batch_size == 0:
                # Interpret minibatch == 0 as batch
                batch_size = x_source.shape[0]
            for batch in range(
                    int(math.ceil(float(x_source.shape[0]) / batch_size))):
                # BUG FIX: slice by the batch_size parameter, not the
                # enclosing 'minibatch' value. Previously, e.g., validation
                # evaluation with validation_batch_size=0 computed one batch
                # but sliced only the first 'minibatch' rows of the set.
                yield (x_source[batch_size * batch:batch_size * (batch + 1)],
                       y_source[batch_size * batch:batch_size * (batch + 1)])

    # Build the cost function
    # The cost to minimize, including L2 regularization
    cost = self.xent + regularization * (self.w ** 2).mean()
    # Unregularized summed cost, used for reporting train/val costs
    _summed_cost_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=self.xent_sum,
    )
    # Plain gradient-descent updates for every parameter
    updates = [(param, param - learning_rate * T.grad(cost, param))
               for param in self.params]
    # Build the training function
    _train_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=cost,
        updates=updates,
    )

    validation_costs = []
    costs = []
    best_val_cost = numpy.inf
    best_weights = None
    last_best_iter = 0
    early_stop = False

    for i in range(iterations):
        new_best = False
        # Do an update for each minibatch
        training_cost = 0.
        training_points = 0
        for batch_xs, batch_ys in batch_getter(xs, ys, minibatch):
            # Randomize the order within the batch
            permutation = numpy.random.permutation(batch_xs.shape[0])
            batch_xs = batch_xs[permutation].copy()
            batch_ys = batch_ys[permutation].copy()
            # Update on the batch
            _train_fn(batch_xs, batch_ys)
            # Compute the training cost as we go
            training_cost += _summed_cost_fn(batch_xs, batch_ys)
            training_points += batch_xs.shape[0]
        # Mean training cost per data point
        training_cost /= training_points
        # Compute training set cost
        costs.append(training_cost)

        if validation_xs is not None:
            # Compute cost in val set (without regularization, of course)
            validation_cost = 0.
            validation_points = 0
            for val_batch_xs, val_batch_ys in batch_getter(
                    validation_xs, validation_ys, validation_batch_size):
                validation_cost += _summed_cost_fn(val_batch_xs, val_batch_ys)
                validation_points += val_batch_xs.shape[0]
            validation_cost /= validation_points
            validation_costs.append(validation_cost)
            # Check whether we've got a new set of best weights, according to
            # validation cost
            if i == 0 or validation_costs[-1] < best_val_cost:
                best_val_cost = validation_costs[-1]
                best_weights = self.get_weights()
                last_best_iter = i
                new_best = True
            # Test for the early stopping condition
            if i - last_best_iter >= early_stopping_iterations:
                # We've gone for enough iterations without an improvement in
                # validation cost
                # Give up and use best weights so far
                self.set_weights(best_weights)
                early_stop = True

        if iteration_callback is not None:
            # The empty lists are where val and training errors used to be,
            # left for backwards compat
            iteration_callback(i, costs, validation_costs, [], [], new_best)

        # Plot some graphs
        if plot_cost:
            plot_costs(plot_cost, (costs, "training cost"),
                       (validation_costs, "val cost"))

        if early_stop:
            # We've decided to give up here on the basis of validation cost
            return True
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=0.0005,
          learning_rate=0.1, regularization=0., class_weights_vector=None,
          corruption_level=0., continuous_corruption=False, loss="xent"):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    batch_iterator should be a repeatable iterator producing batches.

    iteration_callback is called after each iteration with args
    (iteration, error array).

    If a validation set (matrix) is given, it is used to compute an error
    after each iteration and to enforce a stopping criterion. The algorithm
    will terminate if it goes stopping_iterations iterations without an
    improvement in validation error.

    The algorithm will assume it has converged and stop early if the
    proportional change between successive training costs drops below
    training_cost_prop_change_threshold for five iterations in a row.

    Uses L2 regularization.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")
    log.info(
        "Training params: learning rate=%s, noise ratio=%.1f%% (%s), regularization=%s"
        % (learning_rate, corruption_level * 100.0,
           "continuous corruption" if continuous_corruption else
           "zeroing corruption", regularization))
    log.info("Training with SGD")

    ######## Compile functions
    # Prepare cost/update functions for training
    cost, updates = self.network.get_cost_updates(
        self.learning_rate, self.regularization,
        class_cost_weights=class_weights_vector,
        corruption_level=corruption_level,
        continuous_corruption=continuous_corruption,
        loss=loss)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=[self.network.x, Param(self.regularization, default=0.0)],
        outputs=cost,
    )
    train_fn = theano.function(
        inputs=[
            self.network.x,
            Param(self.learning_rate, default=0.1),
            Param(self.regularization, default=0.0)
        ],
        outputs=cost,
        updates=updates,
    )
    # Prepare a function to test how close to the identity function the
    # learned mapping is
    # A lower value indicates that it's generalizing more (though not
    # necessarily better)
    identity_ratio = T.mean(
        T.sum(self.network.get_prediction_dist() * (self.network.x > 0),
              axis=1))
    identity_ratio_fn = theano.function(inputs=[self.network.x],
                                        outputs=identity_ratio)
    ###########

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Keep a copy of the best weights so far
    val_cost = 0.
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        # BUG FIX: normalize by the number of validation samples so that this
        # initial best is comparable to the per-sample val costs computed in
        # the loop below (previously it was the un-normalized total cost, so
        # the first iteration always registered as an improvement)
        best_val_cost = cost_fn(validation_set) / validation_set.shape[0]

        log.info("Computing initial validation scores")
        f_score, precision, recall, f_score_classes = self.compute_f_scores(
            validation_set)
        log.info(
            "F-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
            (f_score * 100.0, f_score_classes, precision * 100.0,
             recall * 100.0))
        identity_ratio = identity_ratio_fn(validation_set)
        log.info("Identity ratio = %.4g" % identity_ratio)

    # Counts successive iterations with a tiny training-cost change
    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        batch_num = 0
        for batch_num, batch in enumerate(batch_iterator):
            # Shuffle the training data between iterations, as one should
            # with SGD
            # Just shuffle within batches
            shuffle = numpy.random.permutation(batch.shape[0])
            batch[:] = batch[shuffle]
            # Update the model with this batch's data
            err += train_fn(batch, learning_rate=learning_rate,
                            regularization=regularization)
        # BUG FIX: the mean over N batches is err/(batch_num+1), since
        # batch_num is the last 0-based index; the previous err/batch_num was
        # an off-by-one and raised ZeroDivisionError when the iterator
        # produced a single batch. (An empty iterator now records 0.0 rather
        # than crashing.)
        training_costs.append(err / (batch_num + 1))

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set) / validation_set.shape[0]
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same,
                # it's better to use the new set of weights (with, presumably,
                # a better training error)
                if val_cost == best_val_cost:
                    log.info(
                        "Same validation cost: %.4f, using new weights" %
                        val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations of increasing validation cost"
                    % stopping_iterations)
                break

        log.info(
            "COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g" %
            (i, training_costs[-1], val_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than
            # others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            # Add a line at the most recent best val cost
            ax.axvline(float(best_iter), color="b")
            ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                    "Best val cost", color="b")
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)

        if validation_set is not None:
            f_score, precision, recall, f_score_classes = self.compute_f_scores(
                validation_set)
            log.info(
                "Validation f-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%"
                % (f_score * 100.0, f_score_classes, precision * 100.0,
                   recall * 100.0))
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Validation identity ratio = %.4g" % identity_ratio)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_costs[-1], val_cost, 0.0,
                               best_iter)

        # Check the proportional change between this iteration's training cost
        # and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) /
                training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=0.0005,
          learning_rate=0.1, regularization=0., class_weights_vector=None,
          corruption_level=0., continuous_corruption=False, loss="xent"):
    """
    Train each layer of the stacked autoencoder in turn.

    See the single-autoencoder trainer: the same procedure is applied to each layer in
    sequence. Because the layers are already properly stacked, the cost/updates obtained
    for a layer are a function of the original network input, but only update that
    layer's own weights.

    If a validation set (matrix) is given, a per-example validation cost is computed
    after each iteration and a layer's training stops after stopping_iterations
    iterations without an improvement in validation cost.

    A layer's training also stops early once the proportional change between successive
    training costs stays below training_cost_prop_change_threshold for five iterations
    in a row.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")

    # Because the layers are all already properly stacked, when we get the cost/updates for a layer,
    # it's already a function of the original input, but only updates the layer itself
    for layer_num, layer in enumerate(self.network.layers):
        log.info("TRAINING LAYER %d" % layer_num)
        ## Compile functions
        # Prepare cost/update functions for training
        cost, updates = layer.get_cost_updates(
            self.learning_rate, self.regularization,
            class_cost_weights=class_weights_vector,
            corruption_level=corruption_level,
            continuous_corruption=continuous_corruption,
            loss=loss)
        # Prepare training functions
        # Note that these use the initial input, not the layer input
        cost_fn = theano.function(
            inputs=[self.input, Param(self.regularization, default=0.0)],
            outputs=cost,
        )
        train_fn = theano.function(
            inputs=[
                self.input,
                Param(self.learning_rate, default=0.1),
                Param(self.regularization, default=0.0)
            ],
            outputs=cost,
            updates=updates,
        )
        # Prepare a function to test how close to the identity function the learned mapping is
        # A lower value indicates that it's generalizing more (though not necessarily better)
        identity_ratio = T.mean(
            T.sum(layer.get_prediction_dist() * (layer.x > 0), axis=1))
        identity_ratio_fn = theano.function(inputs=[self.input],
                                            outputs=identity_ratio)

        # Keep a record of costs, so we can plot them
        val_costs = []
        training_costs = []

        # Keep a copy of the best weights so far
        val_cost = 0.
        best_weights = best_iter = best_val_cost = None
        if validation_set is not None:
            best_weights = layer.get_weights()
            best_iter = -1
            # FIX: normalize by the number of validation examples so the initial best
            # cost is comparable to the per-example val costs computed each iteration
            # (previously the initial value was the unnormalized total cost)
            best_val_cost = cost_fn(validation_set) / validation_set.shape[0]

            log.info("Computing initial validation scores")
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Identity ratio = %.4g" % identity_ratio)

        log.info("Computing initial training cost")
        batch_costs = [cost_fn(batch) for batch in batch_iterator]
        initial_cost = sum(batch_costs) / len(batch_costs)
        log.info("Cost = %g (%d batches)" % (initial_cost, len(batch_costs)))

        below_threshold_its = 0

        for i in range(iterations):
            err = 0.0
            batch_num = 0
            for batch_num, batch in enumerate(batch_iterator):
                # Shuffle the training data between iterations, as one should with SGD
                # Just shuffle within batches
                shuffle = numpy.random.permutation(batch.shape[0])
                batch[:] = batch[shuffle]
                # Update the model with this batch's data
                err += train_fn(batch,
                                learning_rate=learning_rate,
                                regularization=regularization)
            # FIX: batch_num is the index of the last batch, so the number of batches
            # is batch_num + 1 (the old division by batch_num was off by one and
            # divided by zero when there was exactly one batch)
            training_costs.append(err / (batch_num + 1))

            if validation_set is not None:
                # Compute the cost function on the validation set (per example)
                val_cost = cost_fn(validation_set) / validation_set.shape[0]
                val_costs.append(val_cost)
                if val_cost <= best_val_cost:
                    # We assume that, if the validation error remains the same, it's better to use the new set of
                    # weights (with, presumably, a better training error)
                    if val_cost == best_val_cost:
                        log.info("Same validation cost: %.4f, using new weights" % val_cost)
                    else:
                        log.info("New best validation cost: %.4f" % val_cost)
                    # Update our best estimate
                    best_weights = layer.get_weights()
                    best_iter = i
                    best_val_cost = val_cost
                if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                    # We've gone on long enough without improving validation error
                    # Time to call a halt and use the best validation error we got
                    log.info("Stopping after %d iterations of increasing validation cost" %
                             stopping_iterations)
                    break
                log.info("COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g" %
                         (i, training_costs[-1], val_cost))
            else:
                log.info("COMPLETED ITERATION %d: training cost=%.5g" %
                         (i, training_costs[-1]))

            if cost_plot_filename:
                # Plot the cost function as we train
                # Skip the first costs, as they're usually so much higher that the rest is indistinguishable
                columns = [(training_costs[1:], "Train cost")]
                if validation_set is not None:
                    columns.append((val_costs[1:], "Val cost"))
                ax = plot_costs(None, *columns)
                # FIX: best_iter/best_val_cost are only set when a validation set was
                # given; guard so plotting without one doesn't crash
                if best_iter is not None:
                    # Add a line at the most recent best val cost
                    ax.axvline(float(best_iter), color="b")
                    ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                            "Best val cost", color="b")
                from matplotlib import pyplot as plt
                plt.savefig(cost_plot_filename)

            if validation_set is not None:
                identity_ratio = identity_ratio_fn(validation_set)
                log.info("Validation identity ratio = %.4g" % identity_ratio)

            if iteration_callback is not None:
                # Not computing training error at the moment
                iteration_callback(i, training_costs[-1], val_cost, 0.0, best_iter)

            # Check the proportional change between this iteration's training cost and the last
            if len(training_costs) > 2:
                training_cost_prop_change = abs(
                    (training_costs[-2] - training_costs[-1]) / training_costs[-2])
                if training_cost_prop_change < training_cost_prop_change_threshold:
                    # Very small change in training cost - maybe we've converged
                    below_threshold_its += 1
                    if below_threshold_its >= 5:
                        # We've had enough iterations with very small changes: we've converged
                        log.info(
                            "Proportional change in training cost (%g) below %g for five successive iterations: "
                            "converged" % (training_cost_prop_change,
                                           training_cost_prop_change_threshold))
                        break
                    else:
                        log.info(
                            "Proportional change in training cost (%g) below %g for %d successive iterations: "
                            "waiting until it's been low for five iterations" %
                            (training_cost_prop_change,
                             training_cost_prop_change_threshold, below_threshold_its))
                else:
                    # Reset the below threshold counter
                    below_threshold_its = 0

        if best_weights is not None:
            # Use the weights that gave us the best error on the validation set
            layer.set_weights(best_weights)
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          validation_xs=None, validation_ys=None, validation_frequency=1,
          learning_rate=0.1, regularization=0.01, plot_errors=None, plot_cost=None):
    """
    Run full-batch gradient-descent training on data held in Theano tensors.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args (iteration, error array).
    """
    alpha_var = T.scalar("alpha")
    # One gradient-descent step: returns the unregularized cost and updates theta in place
    step = theano.function(
        inputs=[
            self.x, self.y,
            Param(alpha_var, default=0.1),
            Param(self.reg_coef, default=0.01)
        ],
        outputs=self._cost_without_reg,
        updates=[(self.theta, self.theta - alpha_var * self.gtheta)],
    )

    previous_val_error = numpy.inf
    # Histories accumulated for the optional plots
    val_error_history = []
    train_error_history = []
    cost_history = []

    for iteration in range(iterations):
        iteration_cost = step(xs, ys, alpha=learning_rate, reg=regularization)

        # Errors stay unset except on validation iterations
        val_error = train_error = val_change = None
        evaluate_now = (validation_xs is not None
                        and (iteration + 1) % validation_frequency == 0)
        if evaluate_now:
            # Error rate on the held-out set and on the training set
            val_error = self.error(validation_xs, validation_ys)
            train_error = self.error(xs, ys)
            # Percentage change relative to the previous validation error:
            # 0 when we improved, positive when we got worse
            if val_error < previous_val_error:
                val_change = 0.0
            else:
                val_change = (val_error - previous_val_error) / previous_val_error * 100.0
            previous_val_error = val_error

        if iteration_callback is not None:
            iteration_callback(iteration, iteration_cost, train_error,
                               val_error, val_change)

        # Plot some graphs
        if plot_cost:
            cost_history.append(iteration_cost)
            plot_costs(plot_cost, (cost_history, "training cost"))
        if plot_errors and val_error is not None:
            val_error_history.append(val_error)
            train_error_history.append(train_error)
            plot_costs(plot_errors,
                       (train_error_history, "training set error"),
                       (val_error_history, "val set error"))
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          learning_rate=None, regularization=None, batch_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=None):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    The input is given as an iterator over batches that should produce (x, y) pairs.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args
    (iteration, training cost, val cost, best iteration).

    If a validation set (xs, ys) is given, it is used to compute an error after each
    iteration and to enforce a stopping criterion. The algorithm will terminate if it
    goes stopping_iterations iterations without an improvement in validation error.

    The algorithm will assume it has converged and stop early if the proportional change
    between successive training costs drops below training_cost_prop_change_threshold
    for five iterations in a row. If the threshold is given as None, this stopping
    condition will not be used.
    """
    if log is None:
        log = get_console_logger("MLP train")

    # plot_costs is None when matplotlib couldn't be imported at module load time
    if plot_costs is None and cost_plot_filename is not None:
        warnings.warn("disabling plotting, since matplotlib couldn't be loaded")
        cost_plot_filename = None
    elif cost_plot_filename is not None:
        log.info("Plotting costs to %s" % cost_plot_filename)

    # Only pass on the optimizer params the caller actually supplied, so the
    # compiled training function's own defaults apply otherwise
    kwargs = {}
    if learning_rate is not None:
        kwargs["learning_rate"] = learning_rate
    if regularization is not None:
        kwargs["reg_coef"] = regularization
    log.info("Training params: learning rate=%s, reg coef=%s, algorithm=%s" %
             (learning_rate, regularization, self.optimization))

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Compute costs using the initialized network
    initial_batch_costs = [self.compute_cost(xs, ys) for (xs, ys) in batch_iterator]
    training_cost = sum(initial_batch_costs) / len(initial_batch_costs)
    log.info("Initial training cost: %g" % training_cost)
    training_costs.append(training_cost)
    if validation_set is not None:
        val_cost = self.compute_cost(validation_set[0], validation_set[1])
        val_costs.append(val_cost)
    else:
        val_cost = None
    log.info("Training on %d batches" % len(initial_batch_costs))

    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost

    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        batch_num = 0
        for batch_num, (xs, ys) in enumerate(batch_iterator):
            # Shuffle the training data between iterations, as one should with SGD
            # We only do it within batches
            shuffle = numpy.random.permutation(xs.shape[0])
            xs[:] = xs[shuffle]
            ys[:] = ys[shuffle]
            # Update the model with this batch's data
            batch_err = self._train_fn(xs, ys, **kwargs)
            err += batch_err
            if batch_callback is not None:
                batch_callback(batch_num, batch_err)

        # Go back and compute training cost
        # FIX: batch_num is the index of the last batch, so there were batch_num + 1
        # batches; the old division by batch_num was off by one and raised
        # ZeroDivisionError when the iterator produced a single batch. The initial
        # cost above uses sum/len, confirming the mean-over-batches intent.
        training_cost = err / (batch_num + 1)
        training_costs.append(training_cost)

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = self.compute_cost(validation_set[0], validation_set[1])
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same, it's better to use the new set of
                # weights (with, presumably, a better training error)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info("Stopping after %d iterations without improving validation cost" %
                         stopping_iterations)
                break

            log.info("Completed iteration %d, training cost=%.5f, val cost=%.5f" %
                     (i, training_cost, val_cost))
            if best_iter < i:
                log.info("No improvement in validation cost")
        else:
            log.info("Completed iteration %d, training cost=%.5f" % (i, training_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Training cost is usually so high on the first iteration that it makes it impossible to see others
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax, fig = plot_costs(None, *columns, return_figure=True)
            if best_iter is not None:
                # Add a line at the most recent best val cost
                ax.axvline(float(best_iter + 1), color="b")
                ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                        "Best val cost", color="b")
            # Write out to a file
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)
            plt.close(fig)

        if iteration_callback is not None:
            iteration_callback(i, training_cost, val_cost, best_iter)

        # Check the proportional change between this iteration's training cost and the last
        if len(training_costs) > 2 and training_cost_prop_change_threshold is not None:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) / training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes: we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold, below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        # If val set wasn't given, the network just has the latest weights
        self.network.set_weights(best_weights)
def train(self, xs, iterations=10000, iteration_callback=None, batch_size=20,
          batch_callback=None, validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=0.0005,
          learning_rate=0.1, regularization=None, class_weights=None,
          corruption_level=0., continuous_corruption=False, loss="xent"):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    xs are the vectors to train on. Targets needn't be given, since the input and output
    are the same in an autoencoder.

    iteration_callback is called after each iteration with args
    (iteration, training cost, val cost, 0.0, best iteration).

    If a validation set (matrix, or (matrix, ys) pair whose ys are discarded) is given,
    it is used to compute an error after each iteration and to enforce a stopping
    criterion. The algorithm will terminate if it goes stopping_iterations iterations
    without an improvement in validation error.

    The algorithm will assume it has converged and stop early if the proportional change
    between successive training costs drops below training_cost_prop_change_threshold
    for five iterations in a row.

    Uses L2 regularization.

    Several params are included just to implement the same interface as
    single_hidden_layer. Might want to change this later to be a bit neater.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")
    log.info("Training params: learning rate=%s, noise ratio=%.1f%% (%s), regularization=%.2f" %
             (learning_rate, self.network.corruption_level * 100.0,
              "continuous corruption" if self.network.continuous_corruption else "zeroing corruption",
              regularization))
    log.info("Training with SGD, batch size=%d" % batch_size)

    if class_weights is None:
        # Don't apply any weighting
        class_weights_vector = None
    elif class_weights == "freq":
        # Apply inverse frequency weighting
        class_counts = numpy.maximum(xs.sum(axis=0), 1.0)
        class_weights_vector = 1. / class_counts
        class_weights_vector *= xs.shape[1] / class_weights_vector.sum()
        log.info("Using inverse frequency class weighting in cost function")
    elif class_weights == "log":
        # Apply inverse log frequency weighting
        class_counts = numpy.maximum(xs.sum(axis=0), 1.0)
        class_weights_vector = 1. / (numpy.log(class_counts) + 1.)
        class_weights_vector *= xs.shape[1] / class_weights_vector.sum()
        log.info("Using inverse log frequency class weighting in cost function")
    else:
        raise ValueError("invalid class weighting '%s'" % class_weights)

    ######## Compile functions
    # Prepare cost/update functions for training
    cost, updates = self.network.get_cost_updates(
        self.learning_rate, self.regularization,
        class_cost_weights=class_weights_vector,
        corruption_level=corruption_level,
        continuous_corruption=continuous_corruption,
        loss=loss)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=[self.network.x, Param(self.regularization, default=0.0)],
        outputs=cost,
    )
    train_fn = theano.function(
        inputs=[
            self.network.x,
            Param(self.learning_rate, default=0.1),
            Param(self.regularization, default=0.0)
        ],
        outputs=cost,
        updates=updates,
    )
    # Prepare a function to test how close to the identity function the learned mapping is
    # A lower value indicates that it's generalizing more (though not necessarily better)
    identity_ratio = T.mean(
        T.sum(self.network.get_prediction_dist() * (self.network.x > 0), axis=1))
    identity_ratio_fn = theano.function(inputs=[self.network.x],
                                        outputs=identity_ratio)
    ###########

    # FIX: all of the validation-set preparation used to run unconditionally and
    # crashed with a TypeError when validation_set was None, even though the rest of
    # the method (and the sibling trainers) explicitly supports a missing val set
    prediction_targets = prediction_contexts = None
    if validation_set is not None:
        # Throw away ys in validation set
        validation_set = validation_set[0]
        # Prepare a prediction validation set by holding one event out of every chain in the val set
        prediction_targets = numpy.array(
            [random.choice(numpy.where(x_row > 0)[0]) for x_row in validation_set],
            dtype=numpy.int16)
        prediction_contexts = validation_set.copy()
        prediction_contexts[range(prediction_contexts.shape[0]), prediction_targets] = 0.
        # Sample so the expected class distribution is roughly balanced
        prediction_balanced_sample = balanced_array_sample(
            prediction_targets, balance_ratio=4., min_inclusion=1)
        prediction_targets = prediction_targets[prediction_balanced_sample]
        prediction_contexts = prediction_contexts[prediction_balanced_sample]
        log.info("Prepared roughly balanced prediction set from validation set with %d examples" %
                 prediction_contexts.shape[0])

    # Work out how many batches to do
    if batch_size is None or batch_size == 0:
        num_batches = 1
    else:
        # FIX: explicit floor division, so the count stays an int on Python 3 as well
        num_batches = xs.shape[0] // batch_size
        if xs.shape[0] % batch_size != 0:
            num_batches += 1

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Compute costs using the initialized network
    training_cost = cost_fn(xs)
    training_costs.append(training_cost)
    if validation_set is not None:
        val_cost = cost_fn(validation_set)
        val_costs.append(val_cost)

        log.info("Computing initial validation scores")
        f_score, precision, recall, f_score_classes = self.compute_f_scores(validation_set)
        log.info("F-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
                 (f_score * 100.0, f_score_classes, precision * 100.0, recall * 100.0))
        log_prob = self.network.prediction_log_prob(prediction_contexts, prediction_targets)
        log.info("Logprob = %.4g" % log_prob)
        gen_log_prob = self.network.generalization_log_prob(prediction_contexts, prediction_targets)
        log.info("Generalization logprob = %.4g" % gen_log_prob)
        identity_ratio = identity_ratio_fn(validation_set)
        log.info("Identity ratio = %.4g" % identity_ratio)
    else:
        val_cost = None

    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost

    below_threshold_its = 0

    for i in range(iterations):
        # Shuffle the training data between iterations, as one should with SGD
        shuffle = numpy.random.permutation(xs.shape[0])
        xs[:] = xs[shuffle]

        err = 0.0
        if num_batches > 1:
            for batch in range(num_batches):
                # Update the model with this batch's data
                batch_err = train_fn(xs[batch * batch_size:(batch + 1) * batch_size],
                                     learning_rate=learning_rate,
                                     regularization=regularization)
                err += batch_err
                if batch_callback is not None:
                    batch_callback(batch, num_batches, batch_err)
        else:
            # Batch training: no need to loop
            ### Always perform one batch iteration to start with to get us into a good part of the space
            train_fn(xs, learning_rate=learning_rate, regularization=regularization)

        # Go back and compute training cost
        training_cost = cost_fn(xs)
        training_costs.append(training_cost)

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set)
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same, it's better to use the new set of
                # weights (with, presumably, a better training error)
                if val_cost == best_val_cost:
                    log.info("Same validation cost: %.4f, using new weights" % val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info("Stopping after %d iterations of increasing validation cost" %
                         stopping_iterations)
                break
            log.info("COMPLETED ITERATION %d: training cost=%.5f, val cost=%.5f" %
                     (i, training_cost, val_cost))
        else:
            log.info("COMPLETED ITERATION %d: training cost=%.5f" % (i, training_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            # FIX: best_iter is only set when a val set was given; guard like the
            # sibling trainer so plotting without one doesn't crash
            if best_iter is not None:
                # Add a line at the most recent best val cost
                ax.axvline(float(best_iter), color="b")
                ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                        "Best val cost", color="b")
            # FIX: import locally before saving, as the sibling trainers do
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)

        if validation_set is not None:
            f_score, precision, recall, f_score_classes = self.compute_f_scores(validation_set)
            log.info("Validation f-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
                     (f_score * 100.0, f_score_classes, precision * 100.0, recall * 100.0))
            #log_prob = self.network.prediction_log_prob(prediction_contexts, prediction_targets)
            #log.info("Prediction logprob = %.4g" % log_prob)
            gen_log_prob = self.network.generalization_log_prob(prediction_contexts, prediction_targets)
            log.info("Generalization logprob = %.4g" % gen_log_prob)
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Validation identity ratio = %.4g" % identity_ratio)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_cost, val_cost, 0.0, best_iter)

        # Check the proportional change between this iteration's training cost and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) / training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes: we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold, below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)