def callback(new_theta):
    """
    scipy.optimize-style iteration callback.

    Installs the new parameter vector into the model and, every
    validation_frequency-th iteration, evaluates the model on the validation
    and training sets, tracking the best validation error seen so far.
    """
    # Update the parameters of the model
    self.theta.set_value(new_theta, borrow=True)
    # Only evaluate on val set every validation_frequencyth iteration
    if validation_xs is not None and (iteration_counter[0] + 1) % validation_frequency == 0:
        # Compute error on validation set
        validation_error = self.error(validation_xs, validation_ys)
        # Compute error on training set
        training_error = self.error(xs, ys)
        # Compute how much we've improved on the previous best validation error,
        # as a percentage reduction relative to that best.
        # BUG FIX: previously the improvement was reported as 0.0 exactly when we
        # improved (and non-zero only when we got *worse*), and the "best" error
        # was overwritten unconditionally, so it tracked the last error rather
        # than the best one.
        if validation_error < best_validation_error[0]:
            validation_improvement = (
                best_validation_error[0] - validation_error
            ) / best_validation_error[0] * 100.0
            # Only update the best when we actually improved on it
            best_validation_error[0] = validation_error
        else:
            validation_improvement = 0.0
        # Plot some graphs
        if plot_errors and validation_error is not None:
            validation_errors.append(validation_error)
            training_errors.append(training_error)
            plot_costs(plot_errors,
                       (training_errors, "training set error"),
                       (validation_errors, "val set error"))
    else:
        # Not an evaluation iteration: report nothing
        validation_error = training_error = validation_improvement = None
    if iteration_callback is not None:
        # TODO Compute training cost?
        iteration_callback(iteration_counter[0], 0.0, training_error,
                           validation_error, validation_improvement)
    iteration_counter[0] += 1
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          learning_rate=None, regularization=None, batch_size=20,
          batch_callback=None, validation_set=None, stopping_iterations=10,
          log=None, class_weights=None, cost_plot_filename=None,
          training_cost_prop_change_threshold=None, undersample=None,
          print_predictions=False):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args
    (iteration, error array).

    If a validation set (xs, ys) is given, it is used to compute an error after
    each iteration and to enforce a stopping criterion. The algorithm will
    terminate if it goes stopping_iterations iterations without an improvement
    in validation error.

    Updates for each target class can be weighted by giving a vector
    class_weights. Alternatively, give the string 'freq' to weight them by
    inverse class frequency, or leave as None to apply no weighting.

    If compute_error_frequency > 1 (default=5), this number of iterations are
    performed between each time the error is computed on the training set.
    (NOTE(review): this paragraph appears stale -- no such parameter exists on
    this method.)

    The algorithm will assume it has converged and stop early if the
    proportional change between successive training costs drops below
    training_cost_prop_change_threshold for five iterations in a row. If
    threshold is given as None, this stopping condition will not be used.

    If undersample is given it should be a float. The training data will be
    randomly undersampled to produce a set in which the expected number of
    instances of each class is undersample*min_freq, where min_freq is the
    number of instances of the least common observed class. A value of 1.0
    will produce a roughly balanced set. Every class that is observed at all
    will be included at least once. The sampling is performed once at the
    beginning of training.
    """
    if log is None:
        log = get_console_logger("MLP train")

    # Derive a second filename for the balanced-cost plot from the main one
    if cost_plot_filename is not None:
        _fname, __, _ext = cost_plot_filename.rpartition(".")
        balanced_cost_plot_filename = "%s_balanced.%s" % (_fname, _ext)
        log.info("Outputting balanced costs to: %s" % balanced_cost_plot_filename)
    else:
        balanced_cost_plot_filename = None

    # kwargs go to the training function; cost_kwargs to cost evaluation
    kwargs = {}
    cost_kwargs = {
        "reg_coef": 0.,  # Always compute the cost without regularization
    }
    if learning_rate is not None:
        kwargs["learning_rate"] = learning_rate
    if regularization is not None:
        kwargs["reg_coef"] = regularization
    log.info("Training params: learning rate=%s, reg coef=%s" %
             (learning_rate, regularization))
    log.info("Training with %s, batch size=%d" % (self.optimization, batch_size))
    if undersample is not None and undersample > 0.0:
        log.info("Undersampling the dataset with a ratio of %s" % undersample)

    # Work out how many batches to do
    if batch_size is None or batch_size == 0:
        num_batches = 1
    else:
        # NOTE(review): true division -- assumes Python 2 integer-division
        # semantics; under Python 3 this yields a float and range() below
        # would fail. Confirm target interpreter.
        num_batches = xs.shape[0] / batch_size
        if xs.shape[0] % batch_size != 0:
            num_batches += 1

    if undersample is not None and undersample > 0.0:
        # Undersample the training data to produce a (more) balanced set
        balanced_indices = balanced_array_sample(ys, balance_ratio=undersample,
                                                 min_inclusion=1)
        # Copy the data so we're not dealing with a view
        xs = numpy.copy(xs[balanced_indices])
        ys = numpy.copy(ys[balanced_indices])
        # Also sample the validation set similarly
        # NOTE(review): this indexes validation_set unconditionally -- looks
        # like it would fail if undersample is used without a validation set;
        # confirm callers always pass one.
        balanced_validation_indices = balanced_array_sample(
            validation_set[1], balance_ratio=undersample, min_inclusion=1)
        validation_set = (
            numpy.copy(validation_set[0][balanced_validation_indices]),
            numpy.copy(validation_set[1][balanced_validation_indices]))
        log.info("Sampled %d training and %d validation instances" %
                 (xs.shape[0], validation_set[0].shape[0]))

    # Work out class weighting
    # Do this after undersampling: if both are used, we only want the weights
    # to account for any imbalance left after undersampling
    if class_weights is not None:
        if class_weights == 'freq':
            # Use inverse frequency to weight class updates
            # This procedure is modelled directly on what liblinear does
            class_counts = self.get_class_counts(ys).astype(numpy.float64)
            # Replace zero-counts with 1s to avoid division by zero
            class_counts = numpy.maximum(class_counts, 1.0)
            class_weights = 1.0 / class_counts
            class_weights *= self.network.num_classes / class_weights.sum()
            log.info("Inverse-frequency class weighting")
        elif class_weights == 'log':
            # Use a different scheme, inversely proportional to the log of the
            # class frequencies
            class_counts = self.get_class_counts(ys).astype(numpy.float64)
            class_counts = numpy.maximum(class_counts, 1.0)
            class_weights = 1.0 / (1.0 + numpy.log(class_counts))
            class_weights *= self.network.num_classes / class_weights.sum()
            log.info("Log-inverse-frequency class weighting")
        else:
            log.info("Custom vector class weighting")
        kwargs["class_weights"] = class_weights
        cost_kwargs["class_weights"] = class_weights
    else:
        log.info("No class weighting")

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []
    # The costs using the balanced metric
    bal_val_costs = []
    bal_training_costs = []

    # Compute costs using the initialized network
    training_cost = self.compute_cost(xs, ys, **cost_kwargs)
    training_costs.append(training_cost)
    if validation_set is not None:
        val_cost = self.compute_cost(validation_set[0], validation_set[1],
                                     **cost_kwargs)
        val_costs.append(val_cost)
    else:
        val_cost = None

    # NOTE(review): the initial validation metrics below use validation_set
    # unconditionally -- presumably callers always supply one; confirm, as
    # this would raise when validation_set is None.
    log.info("Computing initial validation set metrics:")
    class_accuracies = self.network.per_class_accuracy(
        validation_set[0], validation_set[1])
    # Drop NaN entries (classes with no instances) before averaging
    class_accuracies = class_accuracies[numpy.where(
        numpy.logical_not(numpy.isnan(class_accuracies)))]
    mean_class_accuracy = class_accuracies.mean()
    log.info("Per-class accuracy: %.4f%% (mean over %d classes)" %
             (mean_class_accuracy, class_accuracies.shape[0]))
    # Also compute mean log prob of targets over val set
    mean_log_prob = self.network.mean_log_prob(validation_set[0],
                                               validation_set[1])
    log.info("Mean target log prob: %.4f" % mean_log_prob)
    mean_per_class_log_prob = self.network.mean_per_class_target_log_prob(
        validation_set[0], validation_set[1])
    log.info("Mean per-class mean target log prob: %.4f" %
             mean_per_class_log_prob)

    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost
    # Counts successive iterations with a tiny training-cost change
    below_threshold_its = 0

    # Count the instances we're learning from to give an idea of how hard a
    # time the model's got
    training_class_counts = numpy.bincount(ys)
    training_class_counts = training_class_counts[
        training_class_counts.nonzero()]
    log.info(
        "Training instances per class: min=%d, max=%d (%d unseen classes)" %
        (int(training_class_counts.min()), int(training_class_counts.max()),
         self.network.num_classes - training_class_counts.shape[0]))

    for i in range(iterations):
        # Shuffle the training data between iterations, as one should with SGD
        shuffle = numpy.random.permutation(xs.shape[0])
        xs[:] = xs[shuffle]
        ys[:] = ys[shuffle]

        err = 0.0
        if num_batches > 1:
            for batch in range(num_batches):
                # Update the model with this batch's data
                batch_err = self._train_fn(
                    xs[batch * batch_size:(batch + 1) * batch_size],
                    ys[batch * batch_size:(batch + 1) * batch_size],
                    **kwargs)
                err += batch_err
                if batch_callback is not None:
                    batch_callback(batch, num_batches, batch_err)
        else:
            # Batch training: no need to loop
            err = self._train_fn(xs, ys, **kwargs)

        # Go back and compute training cost
        training_cost = self.compute_cost(xs, ys, **cost_kwargs)
        training_costs.append(training_cost)
        # Training set error
        train_error = self.network.error(xs, ys)
        # Balanced (per-class) training cost for the balanced plot
        bal_training_costs.append(
            -self.network.mean_per_class_target_log_prob(xs, ys))

        if validation_set is not None:
            if print_predictions:
                # Perform some predictions on a random sample of the val set
                for randind in numpy.random.randint(
                        validation_set[0].shape[0], size=5):
                    # Get the network's predictions
                    predictions = self.network.predict(
                        validation_set[0][None, randind, :])
                    predictions = predictions[0, None]
                    log.info("Input: %s. Predictions: %s" % (
                        list(numpy.where(validation_set[0][randind] > 0)[0]),
                        list(predictions)))
            # Compute the cost function on the validation set
            val_cost = self.compute_cost(validation_set[0],
                                         validation_set[1], **cost_kwargs)
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same,
                # it's better to use the new set of weights (with, presumably,
                # a better training error)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations without improving validation cost"
                    % stopping_iterations)
                break

            # Compute various metrics
            # Per-class accuracy on val set
            class_accuracies = self.network.per_class_accuracy(
                validation_set[0], validation_set[1])
            class_accuracies = class_accuracies[numpy.where(
                numpy.logical_not(numpy.isnan(class_accuracies)))]
            mean_class_accuracy = class_accuracies.mean()
            # Mean log prob of targets over val set
            mean_log_prob = self.network.mean_log_prob(
                validation_set[0], validation_set[1])
            mean_per_class_log_prob = self.network.mean_per_class_target_log_prob(
                validation_set[0], validation_set[1])
            log.info(
                "Completed iteration %d, training cost=%.5f, val cost=%.5f, training error=%.2f%%, "
                "per-class accuracy: %.4f%%, mean tgt logprob: %.4f, per-class tgt logprob: %.4f"
                % (i, training_cost, val_cost, train_error * 100.0,
                   mean_class_accuracy, mean_log_prob,
                   mean_per_class_log_prob))
            bal_val_costs.append(-mean_per_class_log_prob)

            if best_iter < i:
                log.info("No improvement in validation cost")
        else:
            log.info(
                "Completed iteration %d, training cost=%.5f, training error=%.2f%%"
                % (i, training_cost, train_error * 100.0))

        if cost_plot_filename:
            # Plot the cost function as we train
            columns = [(training_costs, "Train cost")]
            if validation_set is not None:
                columns.append((val_costs, "Val cost"))
            ax = plot_costs(None, *columns)
            # Add a line at the most recent best val cost
            ax.axvline(float(best_iter + 1), color="b")
            ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                    "Best val cost", color="b")
            plt.savefig(cost_plot_filename)

            # Same plot again using the balanced (per-class) cost metric
            bal_columns = [(bal_training_costs, "Train cost (balanced)")]
            if validation_set is not None:
                bal_columns.append((bal_val_costs, "Val cost (balanced)"))
            plot_costs(balanced_cost_plot_filename, *bal_columns)

        if iteration_callback is not None:
            iteration_callback(i, training_cost, val_cost, train_error,
                               best_iter)

        # Check the proportional change between this iteration's training cost
        # and the last
        if len(training_costs) > 2 and training_cost_prop_change_threshold is not None:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) / training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, batch_iterator, total_samples, iterations=10000,
          validation_set=None, stopping_iterations=10, cost_plot_filename=None,
          iteration_callback=None, log=None,
          training_cost_prop_change_threshold=0.0005, batch_callback=None,
          first_it_last_layer=False):
    """
    Fine-tune the network using SGD over batches produced by batch_iterator.

    total_samples is the number of training samples per pass; it is used only
    to decay the learning rate over the first iteration and to report
    progress in tenths.

    If a validation set is given, it is used for early stopping: training
    stops after stopping_iterations iterations without an improvement in
    validation cost, and the best weights seen are restored at the end.
    Training also halts when the proportional change in training cost stays
    below training_cost_prop_change_threshold for five iterations in a row.
    """
    if log is None:
        log = get_console_logger("Autoencoder tune")
    log.info(
        "Tuning params: learning rate=%s (->%s), regularization=%s" %
        (self.learning_rate, self.min_learning_rate, self.regularization))
    if self.update_empty_vectors:
        log.info("Training empty vectors")
    if self.update_input_vectors:
        log.info("Updating basic word representations")

    ######## Compile functions
    network = self.model.pair_projection_model
    # Prepare cost/update functions for training
    cost, updates = self.get_triple_cost_updates()
    # Unregularized cost, used for evaluation only
    cost_without_reg, __ = self.get_triple_cost_updates(regularization=0.)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=network.triple_inputs,
        outputs=cost_without_reg,
    )
    train_fn = theano.function(
        inputs=network.triple_inputs + [
            # Allow the learning rate to be set per update
            theano.Param(self.learning_rate_var, default=self.learning_rate),
        ],
        outputs=cost,
        updates=updates,
    )
    # Doesn't do anything now: used to do something different
    first_pass_train_fn = train_fn
    ###########

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Keep a copy of the best weights so far
    val_cost = 0.
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        # NOTE(review): this initial best is the *total* validation cost,
        # whereas inside the loop val_cost is divided by the number of
        # samples -- so the first iteration always counts as an improvement;
        # confirm whether that is intended.
        best_val_cost = cost_fn(validation_set)

    # Counts successive iterations with a tiny training-cost change
    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        batch_num = 0
        learning_rate = self.learning_rate
        seen_samples = 0
        tenth_progress = -1
        if i == 0 and first_it_last_layer:
            # On the first iteration, use the training function that only
            # updates the final layer
            log.info(
                "First pass: only updating final layer (logistic regression)"
            )
            train = first_pass_train_fn
        else:
            train = train_fn
        for batch_num, batch_inputs in enumerate(batch_iterator):
            # Shuffle the training data between iterations, as one should with SGD
            # Just shuffle within batches
            shuffle = numpy.random.permutation(batch_inputs[0].shape[0])
            for batch_data in batch_inputs:
                batch_data[:] = batch_data[shuffle]
            # Update the model with this batch's data
            err += train(*batch_inputs, learning_rate=learning_rate)
            seen_samples += batch_inputs[0].shape[0]
            # Update the learning rate, so it falls away as we go through
            # Do this only on the first iteration. After that, LR should just
            # stay at the min
            if i == 0:
                learning_rate = max(
                    self.min_learning_rate,
                    self.learning_rate *
                    (1. - float(seen_samples) / total_samples))
            # Log progress every tenth of the pass
            current_tenth_progress = int(
                math.floor(10. * float(seen_samples) / total_samples))
            if current_tenth_progress > tenth_progress:
                tenth_progress = current_tenth_progress
                mean_cost_so_far = err / (batch_num + 1)
                log.info("%d%% of iteration: training cost so far = %.5g" %
                         (current_tenth_progress * 10, mean_cost_so_far))
                if i == 0:
                    log.info("Learning rate updated to %g" % learning_rate)
            if batch_callback is not None:
                batch_callback(i, batch_num)
        # NOTE(review): batch_num is the last 0-based enumerate index, so this
        # guard also fires when the iterator yields exactly one batch --
        # confirm iterators always produce >= 2 batches.
        if batch_num == 0:
            raise ModelTrainingError(
                "zero batches returned by training data iterator")
        training_costs.append(err / (batch_num + 1))

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set) / validation_set.shape[0]
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same,
                # it's better to use the new set of weights (with, presumably,
                # a better training error)
                if val_cost == best_val_cost:
                    log.info(
                        "Same validation cost: %.4f, using new weights" %
                        val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations of increasing validation cost"
                    % stopping_iterations)
                break

        log.info(
            "COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g" %
            (i, training_costs[-1], val_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than
            # others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            # Add a line at the most recent best val cost
            ax.axvline(float(best_iter), color="b")
            ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                    "Best val cost", color="b")
            plt.savefig(cost_plot_filename)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_costs[-1], val_cost, 0.0,
                               best_iter)

        # Check the proportional change between this iteration's training cost
        # and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) /
                training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          validation_xs=None, validation_ys=None, learning_rate=0.1,
          regularization=0.01, plot_errors=None, plot_cost=None, minibatch=1,
          early_stopping_iterations=5, batch_getter=None,
          validation_batch_size=0):
    """
    Train on data stored in Theano tensors.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args
    (iteration, error array).

    Returns True if we stop early because of early stopping criterion
    """
    if plot_errors:
        warnings.warn(
            "Error plotting is no longer implemented for logistic regression"
        )

    if batch_getter is None:
        # Default getter produces an iterator over fixed minibatch sizes and
        # assumes the xs and ys are arrays
        def batch_getter(x_source, y_source, batch_size):
            if batch_size == 0:
                # Interpret minibatch == 0 as batch
                batch_size = x_source.shape[0]
            for batch in range(
                    int(math.ceil(float(x_source.shape[0]) / batch_size))):
                # BUG FIX: slice by the batch_size parameter, not the
                # enclosing 'minibatch' value. Previously, e.g., validation
                # evaluation with validation_batch_size=0 computed one batch
                # but sliced only the first 'minibatch' rows of the set.
                yield (x_source[batch_size * batch:batch_size * (batch + 1)],
                       y_source[batch_size * batch:batch_size * (batch + 1)])

    # Build the cost function
    # The cost to minimize, including L2 regularization
    cost = self.xent + regularization * (self.w ** 2).mean()
    # Unregularized summed cost, used for reporting train/val costs
    _summed_cost_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=self.xent_sum,
    )
    # Plain gradient-descent updates for every parameter
    updates = [(param, param - learning_rate * T.grad(cost, param))
               for param in self.params]
    # Build the training function
    _train_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=cost,
        updates=updates,
    )

    validation_costs = []
    costs = []
    best_val_cost = numpy.inf
    best_weights = None
    last_best_iter = 0
    early_stop = False

    for i in range(iterations):
        new_best = False
        # Do an update for each minibatch
        training_cost = 0.
        training_points = 0
        for batch_xs, batch_ys in batch_getter(xs, ys, minibatch):
            # Randomize the order within the batch
            permutation = numpy.random.permutation(batch_xs.shape[0])
            batch_xs = batch_xs[permutation].copy()
            batch_ys = batch_ys[permutation].copy()
            # Update on the batch
            _train_fn(batch_xs, batch_ys)
            # Compute the training cost as we go
            training_cost += _summed_cost_fn(batch_xs, batch_ys)
            training_points += batch_xs.shape[0]
        # Mean training cost per data point
        training_cost /= training_points
        # Compute training set cost
        costs.append(training_cost)

        if validation_xs is not None:
            # Compute cost in val set (without regularization, of course)
            validation_cost = 0.
            validation_points = 0
            for val_batch_xs, val_batch_ys in batch_getter(
                    validation_xs, validation_ys, validation_batch_size):
                validation_cost += _summed_cost_fn(val_batch_xs, val_batch_ys)
                validation_points += val_batch_xs.shape[0]
            validation_cost /= validation_points
            validation_costs.append(validation_cost)
            # Check whether we've got a new set of best weights, according to
            # validation cost
            if i == 0 or validation_costs[-1] < best_val_cost:
                best_val_cost = validation_costs[-1]
                best_weights = self.get_weights()
                last_best_iter = i
                new_best = True
            # Test for the early stopping condition
            if i - last_best_iter >= early_stopping_iterations:
                # We've gone for enough iterations without an improvement in
                # validation cost
                # Give up and use best weights so far
                self.set_weights(best_weights)
                early_stop = True

        if iteration_callback is not None:
            # The empty lists are where val and training errors used to be,
            # left for backwards compat
            iteration_callback(i, costs, validation_costs, [], [], new_best)

        # Plot some graphs
        if plot_cost:
            plot_costs(plot_cost, (costs, "training cost"),
                       (validation_costs, "val cost"))

        if early_stop:
            # We've decided to give up here on the basis of validation cost
            return True
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=0.0005,
          learning_rate=0.1, regularization=0., class_weights_vector=None,
          corruption_level=0., continuous_corruption=False, loss="xent"):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    batch_iterator should be a repeatable iterator producing batches.

    iteration_callback is called after each iteration with args
    (iteration, error array).

    If a validation set (matrix) is given, it is used to compute an error
    after each iteration and to enforce a stopping criterion. The algorithm
    will terminate if it goes stopping_iterations iterations without an
    improvement in validation error.

    The algorithm will assume it has converged and stop early if the
    proportional change between successive training costs drops below
    training_cost_prop_change_threshold for five iterations in a row.

    Uses L2 regularization.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")
    log.info(
        "Training params: learning rate=%s, noise ratio=%.1f%% (%s), regularization=%s"
        % (learning_rate, corruption_level * 100.0,
           "continuous corruption" if continuous_corruption else
           "zeroing corruption", regularization))
    log.info("Training with SGD")

    ######## Compile functions
    # Prepare cost/update functions for training
    cost, updates = self.network.get_cost_updates(
        self.learning_rate, self.regularization,
        class_cost_weights=class_weights_vector,
        corruption_level=corruption_level,
        continuous_corruption=continuous_corruption,
        loss=loss)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=[self.network.x, Param(self.regularization, default=0.0)],
        outputs=cost,
    )
    train_fn = theano.function(
        inputs=[
            self.network.x,
            Param(self.learning_rate, default=0.1),
            Param(self.regularization, default=0.0)
        ],
        outputs=cost,
        updates=updates,
    )
    # Prepare a function to test how close to the identity function the
    # learned mapping is
    # A lower value indicates that it's generalizing more (though not
    # necessarily better)
    identity_ratio = T.mean(
        T.sum(self.network.get_prediction_dist() * (self.network.x > 0),
              axis=1))
    identity_ratio_fn = theano.function(inputs=[self.network.x],
                                        outputs=identity_ratio)
    ###########

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Keep a copy of the best weights so far
    val_cost = 0.
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        # BUG FIX: normalize by the number of validation samples so that this
        # initial best is comparable to the per-sample val costs computed in
        # the loop below (previously it was the un-normalized total cost, so
        # the first iteration always registered as an improvement)
        best_val_cost = cost_fn(validation_set) / validation_set.shape[0]

        log.info("Computing initial validation scores")
        f_score, precision, recall, f_score_classes = self.compute_f_scores(
            validation_set)
        log.info(
            "F-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
            (f_score * 100.0, f_score_classes, precision * 100.0,
             recall * 100.0))
        identity_ratio = identity_ratio_fn(validation_set)
        log.info("Identity ratio = %.4g" % identity_ratio)

    # Counts successive iterations with a tiny training-cost change
    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        batch_num = 0
        for batch_num, batch in enumerate(batch_iterator):
            # Shuffle the training data between iterations, as one should
            # with SGD
            # Just shuffle within batches
            shuffle = numpy.random.permutation(batch.shape[0])
            batch[:] = batch[shuffle]
            # Update the model with this batch's data
            err += train_fn(batch, learning_rate=learning_rate,
                            regularization=regularization)
        # BUG FIX: the mean over N batches is err/(batch_num+1), since
        # batch_num is the last 0-based index; the previous err/batch_num was
        # an off-by-one and raised ZeroDivisionError when the iterator
        # produced a single batch. (An empty iterator now records 0.0 rather
        # than crashing.)
        training_costs.append(err / (batch_num + 1))

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set) / validation_set.shape[0]
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same,
                # it's better to use the new set of weights (with, presumably,
                # a better training error)
                if val_cost == best_val_cost:
                    log.info(
                        "Same validation cost: %.4f, using new weights" %
                        val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations of increasing validation cost"
                    % stopping_iterations)
                break

        log.info(
            "COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g" %
            (i, training_costs[-1], val_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than
            # others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            # Add a line at the most recent best val cost
            ax.axvline(float(best_iter), color="b")
            ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                    "Best val cost", color="b")
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)

        if validation_set is not None:
            f_score, precision, recall, f_score_classes = self.compute_f_scores(
                validation_set)
            log.info(
                "Validation f-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%"
                % (f_score * 100.0, f_score_classes, precision * 100.0,
                   recall * 100.0))
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Validation identity ratio = %.4g" % identity_ratio)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_costs[-1], val_cost, 0.0,
                               best_iter)

        # Check the proportional change between this iteration's training cost
        # and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) /
                training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=0.0005,
          learning_rate=0.1, regularization=0., class_weights_vector=None,
          corruption_level=0., continuous_corruption=False, loss="xent"):
    """
    Train each layer of the stacked autoencoder in turn.

    See the single-autoencoder trainer: the same procedure is applied to each layer in
    sequence. Because the layers are already properly stacked, the cost/updates obtained
    for a layer are a function of the original network input, but only update that
    layer's own weights.

    If a validation set (matrix) is given, a per-example validation cost is computed
    after each iteration and a layer's training stops after stopping_iterations
    iterations without an improvement in validation cost.

    A layer's training also stops early once the proportional change between successive
    training costs stays below training_cost_prop_change_threshold for five iterations
    in a row.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")

    # Because the layers are all already properly stacked, when we get the cost/updates for a layer,
    # it's already a function of the original input, but only updates the layer itself
    for layer_num, layer in enumerate(self.network.layers):
        log.info("TRAINING LAYER %d" % layer_num)
        ## Compile functions
        # Prepare cost/update functions for training
        cost, updates = layer.get_cost_updates(
            self.learning_rate, self.regularization,
            class_cost_weights=class_weights_vector,
            corruption_level=corruption_level,
            continuous_corruption=continuous_corruption,
            loss=loss)
        # Prepare training functions
        # Note that these use the initial input, not the layer input
        cost_fn = theano.function(
            inputs=[self.input, Param(self.regularization, default=0.0)],
            outputs=cost,
        )
        train_fn = theano.function(
            inputs=[
                self.input,
                Param(self.learning_rate, default=0.1),
                Param(self.regularization, default=0.0)
            ],
            outputs=cost,
            updates=updates,
        )
        # Prepare a function to test how close to the identity function the learned mapping is
        # A lower value indicates that it's generalizing more (though not necessarily better)
        identity_ratio = T.mean(
            T.sum(layer.get_prediction_dist() * (layer.x > 0), axis=1))
        identity_ratio_fn = theano.function(inputs=[self.input],
                                            outputs=identity_ratio)

        # Keep a record of costs, so we can plot them
        val_costs = []
        training_costs = []

        # Keep a copy of the best weights so far
        val_cost = 0.
        best_weights = best_iter = best_val_cost = None
        if validation_set is not None:
            best_weights = layer.get_weights()
            best_iter = -1
            # FIX: normalize by the number of validation examples so the initial best
            # cost is comparable to the per-example val costs computed each iteration
            # (previously the initial value was the unnormalized total cost)
            best_val_cost = cost_fn(validation_set) / validation_set.shape[0]

            log.info("Computing initial validation scores")
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Identity ratio = %.4g" % identity_ratio)

        log.info("Computing initial training cost")
        batch_costs = [cost_fn(batch) for batch in batch_iterator]
        initial_cost = sum(batch_costs) / len(batch_costs)
        log.info("Cost = %g (%d batches)" % (initial_cost, len(batch_costs)))

        below_threshold_its = 0

        for i in range(iterations):
            err = 0.0
            batch_num = 0
            for batch_num, batch in enumerate(batch_iterator):
                # Shuffle the training data between iterations, as one should with SGD
                # Just shuffle within batches
                shuffle = numpy.random.permutation(batch.shape[0])
                batch[:] = batch[shuffle]
                # Update the model with this batch's data
                err += train_fn(batch,
                                learning_rate=learning_rate,
                                regularization=regularization)
            # FIX: batch_num is the index of the last batch, so the number of batches
            # is batch_num + 1 (the old division by batch_num was off by one and
            # divided by zero when there was exactly one batch)
            training_costs.append(err / (batch_num + 1))

            if validation_set is not None:
                # Compute the cost function on the validation set (per example)
                val_cost = cost_fn(validation_set) / validation_set.shape[0]
                val_costs.append(val_cost)
                if val_cost <= best_val_cost:
                    # We assume that, if the validation error remains the same, it's better to use the new set of
                    # weights (with, presumably, a better training error)
                    if val_cost == best_val_cost:
                        log.info("Same validation cost: %.4f, using new weights" % val_cost)
                    else:
                        log.info("New best validation cost: %.4f" % val_cost)
                    # Update our best estimate
                    best_weights = layer.get_weights()
                    best_iter = i
                    best_val_cost = val_cost
                if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                    # We've gone on long enough without improving validation error
                    # Time to call a halt and use the best validation error we got
                    log.info("Stopping after %d iterations of increasing validation cost" %
                             stopping_iterations)
                    break
                log.info("COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g" %
                         (i, training_costs[-1], val_cost))
            else:
                log.info("COMPLETED ITERATION %d: training cost=%.5g" %
                         (i, training_costs[-1]))

            if cost_plot_filename:
                # Plot the cost function as we train
                # Skip the first costs, as they're usually so much higher that the rest is indistinguishable
                columns = [(training_costs[1:], "Train cost")]
                if validation_set is not None:
                    columns.append((val_costs[1:], "Val cost"))
                ax = plot_costs(None, *columns)
                # FIX: best_iter/best_val_cost are only set when a validation set was
                # given; guard so plotting without one doesn't crash
                if best_iter is not None:
                    # Add a line at the most recent best val cost
                    ax.axvline(float(best_iter), color="b")
                    ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                            "Best val cost", color="b")
                from matplotlib import pyplot as plt
                plt.savefig(cost_plot_filename)

            if validation_set is not None:
                identity_ratio = identity_ratio_fn(validation_set)
                log.info("Validation identity ratio = %.4g" % identity_ratio)

            if iteration_callback is not None:
                # Not computing training error at the moment
                iteration_callback(i, training_costs[-1], val_cost, 0.0, best_iter)

            # Check the proportional change between this iteration's training cost and the last
            if len(training_costs) > 2:
                training_cost_prop_change = abs(
                    (training_costs[-2] - training_costs[-1]) / training_costs[-2])
                if training_cost_prop_change < training_cost_prop_change_threshold:
                    # Very small change in training cost - maybe we've converged
                    below_threshold_its += 1
                    if below_threshold_its >= 5:
                        # We've had enough iterations with very small changes: we've converged
                        log.info(
                            "Proportional change in training cost (%g) below %g for five successive iterations: "
                            "converged" % (training_cost_prop_change,
                                           training_cost_prop_change_threshold))
                        break
                    else:
                        log.info(
                            "Proportional change in training cost (%g) below %g for %d successive iterations: "
                            "waiting until it's been low for five iterations" %
                            (training_cost_prop_change,
                             training_cost_prop_change_threshold, below_threshold_its))
                else:
                    # Reset the below threshold counter
                    below_threshold_its = 0

        if best_weights is not None:
            # Use the weights that gave us the best error on the validation set
            layer.set_weights(best_weights)
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          validation_xs=None, validation_ys=None, validation_frequency=1,
          learning_rate=0.1, regularization=0.01, plot_errors=None, plot_cost=None):
    """
    Run full-batch gradient-descent training on data held in Theano tensors.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args (iteration, error array).
    """
    alpha_var = T.scalar("alpha")
    # One gradient-descent step: returns the unregularized cost and updates theta in place
    step = theano.function(
        inputs=[
            self.x, self.y,
            Param(alpha_var, default=0.1),
            Param(self.reg_coef, default=0.01)
        ],
        outputs=self._cost_without_reg,
        updates=[(self.theta, self.theta - alpha_var * self.gtheta)],
    )

    previous_val_error = numpy.inf
    # Histories accumulated for the optional plots
    val_error_history = []
    train_error_history = []
    cost_history = []

    for iteration in range(iterations):
        iteration_cost = step(xs, ys, alpha=learning_rate, reg=regularization)

        # Errors stay unset except on validation iterations
        val_error = train_error = val_change = None
        evaluate_now = (validation_xs is not None
                        and (iteration + 1) % validation_frequency == 0)
        if evaluate_now:
            # Error rate on the held-out set and on the training set
            val_error = self.error(validation_xs, validation_ys)
            train_error = self.error(xs, ys)
            # Percentage change relative to the previous validation error:
            # 0 when we improved, positive when we got worse
            if val_error < previous_val_error:
                val_change = 0.0
            else:
                val_change = (val_error - previous_val_error) / previous_val_error * 100.0
            previous_val_error = val_error

        if iteration_callback is not None:
            iteration_callback(iteration, iteration_cost, train_error,
                               val_error, val_change)

        # Plot some graphs
        if plot_cost:
            cost_history.append(iteration_cost)
            plot_costs(plot_cost, (cost_history, "training cost"))
        if plot_errors and val_error is not None:
            val_error_history.append(val_error)
            train_error_history.append(train_error)
            plot_costs(plot_errors,
                       (train_error_history, "training set error"),
                       (val_error_history, "val set error"))
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          learning_rate=None, regularization=None, batch_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=None):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    The input is given as an iterator over batches that should produce (x, y) pairs.

    E.g.
    xs = rng.randn(N, num_features)
    ys = rng.randint(size=N, low=0, high=2)

    iteration_callback is called after each iteration with args
    (iteration, training cost, val cost, best iteration).

    If a validation set (xs, ys) is given, it is used to compute an error after each
    iteration and to enforce a stopping criterion. The algorithm will terminate if it
    goes stopping_iterations iterations without an improvement in validation error.

    The algorithm will assume it has converged and stop early if the proportional change
    between successive training costs drops below training_cost_prop_change_threshold
    for five iterations in a row. If the threshold is given as None, this stopping
    condition will not be used.
    """
    if log is None:
        log = get_console_logger("MLP train")

    # plot_costs is None when matplotlib couldn't be imported at module load time
    if plot_costs is None and cost_plot_filename is not None:
        warnings.warn("disabling plotting, since matplotlib couldn't be loaded")
        cost_plot_filename = None
    elif cost_plot_filename is not None:
        log.info("Plotting costs to %s" % cost_plot_filename)

    # Only pass on the optimizer params the caller actually supplied, so the
    # compiled training function's own defaults apply otherwise
    kwargs = {}
    if learning_rate is not None:
        kwargs["learning_rate"] = learning_rate
    if regularization is not None:
        kwargs["reg_coef"] = regularization
    log.info("Training params: learning rate=%s, reg coef=%s, algorithm=%s" %
             (learning_rate, regularization, self.optimization))

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Compute costs using the initialized network
    initial_batch_costs = [self.compute_cost(xs, ys) for (xs, ys) in batch_iterator]
    training_cost = sum(initial_batch_costs) / len(initial_batch_costs)
    log.info("Initial training cost: %g" % training_cost)
    training_costs.append(training_cost)
    if validation_set is not None:
        val_cost = self.compute_cost(validation_set[0], validation_set[1])
        val_costs.append(val_cost)
    else:
        val_cost = None
    log.info("Training on %d batches" % len(initial_batch_costs))

    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost

    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        batch_num = 0
        for batch_num, (xs, ys) in enumerate(batch_iterator):
            # Shuffle the training data between iterations, as one should with SGD
            # We only do it within batches
            shuffle = numpy.random.permutation(xs.shape[0])
            xs[:] = xs[shuffle]
            ys[:] = ys[shuffle]
            # Update the model with this batch's data
            batch_err = self._train_fn(xs, ys, **kwargs)
            err += batch_err
            if batch_callback is not None:
                batch_callback(batch_num, batch_err)

        # Go back and compute training cost
        # FIX: batch_num is the index of the last batch, so there were batch_num + 1
        # batches; the old division by batch_num was off by one and raised
        # ZeroDivisionError when the iterator produced a single batch. The initial
        # cost above uses sum/len, confirming the mean-over-batches intent.
        training_cost = err / (batch_num + 1)
        training_costs.append(training_cost)

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = self.compute_cost(validation_set[0], validation_set[1])
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same, it's better to use the new set of
                # weights (with, presumably, a better training error)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info("Stopping after %d iterations without improving validation cost" %
                         stopping_iterations)
                break

            log.info("Completed iteration %d, training cost=%.5f, val cost=%.5f" %
                     (i, training_cost, val_cost))
            if best_iter < i:
                log.info("No improvement in validation cost")
        else:
            log.info("Completed iteration %d, training cost=%.5f" % (i, training_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Training cost is usually so high on the first iteration that it makes it impossible to see others
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax, fig = plot_costs(None, *columns, return_figure=True)
            if best_iter is not None:
                # Add a line at the most recent best val cost
                ax.axvline(float(best_iter + 1), color="b")
                ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                        "Best val cost", color="b")
            # Write out to a file
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)
            plt.close(fig)

        if iteration_callback is not None:
            iteration_callback(i, training_cost, val_cost, best_iter)

        # Check the proportional change between this iteration's training cost and the last
        if len(training_costs) > 2 and training_cost_prop_change_threshold is not None:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) / training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes: we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold, below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        # If val set wasn't given, the network just has the latest weights
        self.network.set_weights(best_weights)
def train(self, xs, iterations=10000, iteration_callback=None, batch_size=20,
          batch_callback=None, validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None, training_cost_prop_change_threshold=0.0005,
          learning_rate=0.1, regularization=None, class_weights=None,
          corruption_level=0., continuous_corruption=False, loss="xent"):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    xs are the vectors to train on. Targets needn't be given, since the input and output
    are the same in an autoencoder.

    iteration_callback is called after each iteration with args
    (iteration, training cost, val cost, 0.0, best iteration).

    If a validation set (matrix, or (matrix, ys) pair whose ys are discarded) is given,
    it is used to compute an error after each iteration and to enforce a stopping
    criterion. The algorithm will terminate if it goes stopping_iterations iterations
    without an improvement in validation error.

    The algorithm will assume it has converged and stop early if the proportional change
    between successive training costs drops below training_cost_prop_change_threshold
    for five iterations in a row.

    Uses L2 regularization.

    Several params are included just to implement the same interface as
    single_hidden_layer. Might want to change this later to be a bit neater.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")
    log.info("Training params: learning rate=%s, noise ratio=%.1f%% (%s), regularization=%.2f" %
             (learning_rate, self.network.corruption_level * 100.0,
              "continuous corruption" if self.network.continuous_corruption else "zeroing corruption",
              regularization))
    log.info("Training with SGD, batch size=%d" % batch_size)

    if class_weights is None:
        # Don't apply any weighting
        class_weights_vector = None
    elif class_weights == "freq":
        # Apply inverse frequency weighting
        class_counts = numpy.maximum(xs.sum(axis=0), 1.0)
        class_weights_vector = 1. / class_counts
        class_weights_vector *= xs.shape[1] / class_weights_vector.sum()
        log.info("Using inverse frequency class weighting in cost function")
    elif class_weights == "log":
        # Apply inverse log frequency weighting
        class_counts = numpy.maximum(xs.sum(axis=0), 1.0)
        class_weights_vector = 1. / (numpy.log(class_counts) + 1.)
        class_weights_vector *= xs.shape[1] / class_weights_vector.sum()
        log.info("Using inverse log frequency class weighting in cost function")
    else:
        raise ValueError("invalid class weighting '%s'" % class_weights)

    ######## Compile functions
    # Prepare cost/update functions for training
    cost, updates = self.network.get_cost_updates(
        self.learning_rate, self.regularization,
        class_cost_weights=class_weights_vector,
        corruption_level=corruption_level,
        continuous_corruption=continuous_corruption,
        loss=loss)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=[self.network.x, Param(self.regularization, default=0.0)],
        outputs=cost,
    )
    train_fn = theano.function(
        inputs=[
            self.network.x,
            Param(self.learning_rate, default=0.1),
            Param(self.regularization, default=0.0)
        ],
        outputs=cost,
        updates=updates,
    )
    # Prepare a function to test how close to the identity function the learned mapping is
    # A lower value indicates that it's generalizing more (though not necessarily better)
    identity_ratio = T.mean(
        T.sum(self.network.get_prediction_dist() * (self.network.x > 0), axis=1))
    identity_ratio_fn = theano.function(inputs=[self.network.x],
                                        outputs=identity_ratio)
    ###########

    # FIX: all of the validation-set preparation used to run unconditionally and
    # crashed with a TypeError when validation_set was None, even though the rest of
    # the method (and the sibling trainers) explicitly supports a missing val set
    prediction_targets = prediction_contexts = None
    if validation_set is not None:
        # Throw away ys in validation set
        validation_set = validation_set[0]
        # Prepare a prediction validation set by holding one event out of every chain in the val set
        prediction_targets = numpy.array(
            [random.choice(numpy.where(x_row > 0)[0]) for x_row in validation_set],
            dtype=numpy.int16)
        prediction_contexts = validation_set.copy()
        prediction_contexts[range(prediction_contexts.shape[0]), prediction_targets] = 0.
        # Sample so the expected class distribution is roughly balanced
        prediction_balanced_sample = balanced_array_sample(
            prediction_targets, balance_ratio=4., min_inclusion=1)
        prediction_targets = prediction_targets[prediction_balanced_sample]
        prediction_contexts = prediction_contexts[prediction_balanced_sample]
        log.info("Prepared roughly balanced prediction set from validation set with %d examples" %
                 prediction_contexts.shape[0])

    # Work out how many batches to do
    if batch_size is None or batch_size == 0:
        num_batches = 1
    else:
        # FIX: explicit floor division, so the count stays an int on Python 3 as well
        num_batches = xs.shape[0] // batch_size
        if xs.shape[0] % batch_size != 0:
            num_batches += 1

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Compute costs using the initialized network
    training_cost = cost_fn(xs)
    training_costs.append(training_cost)
    if validation_set is not None:
        val_cost = cost_fn(validation_set)
        val_costs.append(val_cost)

        log.info("Computing initial validation scores")
        f_score, precision, recall, f_score_classes = self.compute_f_scores(validation_set)
        log.info("F-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
                 (f_score * 100.0, f_score_classes, precision * 100.0, recall * 100.0))
        log_prob = self.network.prediction_log_prob(prediction_contexts, prediction_targets)
        log.info("Logprob = %.4g" % log_prob)
        gen_log_prob = self.network.generalization_log_prob(prediction_contexts, prediction_targets)
        log.info("Generalization logprob = %.4g" % gen_log_prob)
        identity_ratio = identity_ratio_fn(validation_set)
        log.info("Identity ratio = %.4g" % identity_ratio)
    else:
        val_cost = None

    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost

    below_threshold_its = 0

    for i in range(iterations):
        # Shuffle the training data between iterations, as one should with SGD
        shuffle = numpy.random.permutation(xs.shape[0])
        xs[:] = xs[shuffle]

        err = 0.0
        if num_batches > 1:
            for batch in range(num_batches):
                # Update the model with this batch's data
                batch_err = train_fn(xs[batch * batch_size:(batch + 1) * batch_size],
                                     learning_rate=learning_rate,
                                     regularization=regularization)
                err += batch_err
                if batch_callback is not None:
                    batch_callback(batch, num_batches, batch_err)
        else:
            # Batch training: no need to loop
            ### Always perform one batch iteration to start with to get us into a good part of the space
            train_fn(xs, learning_rate=learning_rate, regularization=regularization)

        # Go back and compute training cost
        training_cost = cost_fn(xs)
        training_costs.append(training_cost)

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set)
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same, it's better to use the new set of
                # weights (with, presumably, a better training error)
                if val_cost == best_val_cost:
                    log.info("Same validation cost: %.4f, using new weights" % val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info("Stopping after %d iterations of increasing validation cost" %
                         stopping_iterations)
                break
            log.info("COMPLETED ITERATION %d: training cost=%.5f, val cost=%.5f" %
                     (i, training_cost, val_cost))
        else:
            log.info("COMPLETED ITERATION %d: training cost=%.5f" % (i, training_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            # FIX: best_iter is only set when a val set was given; guard like the
            # sibling trainer so plotting without one doesn't crash
            if best_iter is not None:
                # Add a line at the most recent best val cost
                ax.axvline(float(best_iter), color="b")
                ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                        "Best val cost", color="b")
            # FIX: import locally before saving, as the sibling trainers do
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)

        if validation_set is not None:
            f_score, precision, recall, f_score_classes = self.compute_f_scores(validation_set)
            log.info("Validation f-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
                     (f_score * 100.0, f_score_classes, precision * 100.0, recall * 100.0))
            #log_prob = self.network.prediction_log_prob(prediction_contexts, prediction_targets)
            #log.info("Prediction logprob = %.4g" % log_prob)
            gen_log_prob = self.network.generalization_log_prob(prediction_contexts, prediction_targets)
            log.info("Generalization logprob = %.4g" % gen_log_prob)
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Validation identity ratio = %.4g" % identity_ratio)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_cost, val_cost, 0.0, best_iter)

        # Check the proportional change between this iteration's training cost and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) / training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes: we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold, below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)