class BasicConfig:
    ''' Basic Config '''
    # Running mode: debug or run
    mode = "debug"

    # region raw dataset control parameters
    cur_path = os.path.abspath(__file__)
    project_dir = cur_path[0:cur_path.index('Hashtag') + len('Hashtag')]

    # GPU: "int32"; CPU: "int64"
    int_type = "int32"

    batch_size = 32
    sort_batch_count = 20

    # Step rule
    step_rule = AdaDelta()

    # Measured by batches, e.g., valid every 1000 batches
    print_freq = 100
    save_freq = 1000

    # Measured by epoch
    valid_freq = 0.2
def test_adadelta():
    a = shared_floatx([3, 4])
    cost = (a ** 2).sum()
    steps, updates = AdaDelta(decay_rate=0.5, epsilon=1e-7).compute_steps(
        OrderedDict([(a, tensor.grad(cost, a))]))
    f = theano.function([], [steps[a]], updates=updates)
    assert_allclose(f()[0], [0.00044721, 0.00044721], rtol=1e-5)
    assert_allclose(f()[0], [0.0005164, 0.0005164], rtol=1e-5)
    assert_allclose(f()[0], [0.00056904, 0.00056904], rtol=1e-5)
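# The values asserted above can be checked by hand: AdaDelta implements
# Zeiler's ADADELTA recurrence. Below is a plain-NumPy sketch (illustrative
# names, not part of the test suite) that reproduces the first asserted value
# for a = [3, 4], decay_rate=0.5 and epsilon=1e-7.
import numpy as np


def adadelta_step(grad, acc_grad_sq, acc_step_sq, decay_rate=0.5, epsilon=1e-7):
    """Return one ADADELTA step and the updated accumulators."""
    acc_grad_sq = decay_rate * acc_grad_sq + (1 - decay_rate) * grad ** 2
    step = np.sqrt(acc_step_sq + epsilon) / np.sqrt(acc_grad_sq + epsilon) * grad
    acc_step_sq = decay_rate * acc_step_sq + (1 - decay_rate) * step ** 2
    return step, acc_grad_sq, acc_step_sq


a = np.array([3.0, 4.0])
grad = 2 * a                       # gradient of (a ** 2).sum()
acc_g = np.zeros_like(a)
acc_s = np.zeros_like(a)
step, acc_g, acc_s = adadelta_step(grad, acc_g, acc_s)
print(step)                        # ~[0.00044721, 0.00044721], matching the first assert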
def main(save_to, num_epochs,
         regularization=0.0003, subset=None, num_batches=None,
         histogram=None, resume=False):
    batch_size = 500
    output_size = 10

    convnet = create_lenet_5()
    layers = convnet.layers

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    components = (ComponentwiseCrossEntropy().apply(y.flatten(), probs)
                  .copy(name='components'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    confusion = (ConfusionMatrix().apply(y.flatten(), probs)
                 .copy(name='confusion'))
    confusion.tag.aggregation_scheme = Sum(confusion)

    cg = ComputationGraph([cost, error_rate, components])

    # Apply regularization to the cost
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    cost = cost + regularization * l2_norm
    cost.name = 'cost_with_regularization'

    if subset:
        start = 30000 - subset // 2
        mnist_train = MNIST(("train",), subset=slice(start, start + subset))
    else:
        mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train, iteration_scheme=ShuffledScheme(
            mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test, iteration_scheme=ShuffledScheme(
            mnist_test.num_examples, batch_size))

    # Train with SGD using the AdaDelta step rule
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=AdaDelta(decay_rate=0.99))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  DataStreamMonitoring(
                      [cost, error_rate, confusion],
                      mnist_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [cost, error_rate, l2_norm,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  ProgressBar(),
                  Printing()]

    if histogram:
        attribution = AttributionExtension(
            components=components,
            parameters=cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(cost)

    main_loop = MainLoop(
        algorithm, mnist_train_stream,
        model=model, extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def create_main_loop(save_to, num_epochs, unit_order=None,
                     batch_size=500, num_batches=None):
    image_size = (28, 28)
    output_size = 10

    convnet = create_lenet_5()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    case_costs = CasewiseCrossEntropy().apply(y.flatten(), probs)
    cost = case_costs.mean().copy(name='cost')
    # cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
    #         .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])

    # Apply regularization to the cost
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + sum([0.0003 * (W ** 2).sum() for W in weights])
    cost.name = 'cost_with_regularization'

    mnist_train = MNIST(("train",))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size))

    mnist_test = MNIST(("test",))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size))

    # Generate pics for biases
    biases = VariableFilter(roles=[BIAS])(cg.parameters)

    # Train with SGD using the AdaDelta step rule
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=AdaDelta())

    # Find layer outputs to probe
    outs = OrderedDict(
        reversed(
            list((get_brick(out).name, out)
                 for out in VariableFilter(
                     roles=[OUTPUT],
                     bricks=[Convolutional, Linear])(cg.variables))))

    actpic_extension = ActpicExtension(actpic_variables=outs,
                                       case_labels=y,
                                       pics=x,
                                       label_count=output_size,
                                       rectify=-1,
                                       data_stream=mnist_test_stream,
                                       after_batch=True)

    synpic_extension = SynpicExtension(synpic_parameters=biases,
                                       case_costs=case_costs,
                                       case_labels=y,
                                       pics=x,
                                       batch_size=batch_size,
                                       pic_size=image_size,
                                       label_count=output_size,
                                       after_batch=True)

    # Impose an ordering for the SaveImages extension
    if unit_order is not None:
        with open(unit_order, 'rb') as handle:
            histograms = pickle.load(handle)
        unit_order = compute_unit_order(histograms)

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        actpic_extension,
        synpic_extension,
        SaveImages(picsources=[synpic_extension, actpic_extension],
                   title="LeNet-5: batch {i}, " +
                         "cost {cost_with_regularization:.2f}, " +
                         "trainerr {error_rate:.3f}",
                   data=[cost, error_rate],
                   graph='error_rate',
                   graph_len=500,
                   unit_order=unit_order,
                   after_batch=True),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ], prefix="train", after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)

    return main_loop
    cg = apply_noise(cg, config.noise_inputs(cg), config.noise)

    cost = cg.outputs[0]
    cg = Model(cost)

    logger.info('# Parameter shapes:')
    parameters_size = 0
    for value in cg.parameters:
        logger.info(' %20s %s' % (value.get_value().shape, value.name))
        parameters_size += reduce(operator.mul, value.get_value().shape, 1)
    logger.info('Total number of parameters: %d in %d matrices' %
                (parameters_size, len(cg.parameters)))

    if hasattr(config, 'step_rule'):
        step_rule = config.step_rule
    else:
        step_rule = AdaDelta()

    logger.info("Fuel seed: %d" % fuel.config.default_seed)
    logger.info("Blocks seed: %d" % blocks.config.default_seed)

    params = cg.parameters
    algorithm = GradientDescent(cost=cost,
                                step_rule=CompositeRule([RemoveNotFinite(),
                                                         step_rule]),
                                parameters=params)

    plot_vars = [['valid_' + x.name for x in valid_monitored] +
                 ['train_' + x.name for x in valid_monitored]]
    logger.info('Plotted variables: %s' % str(plot_vars))

    dump_path = os.path.join('model_data', model_name) + '.pkl'
# How often (number of batches) to print / plot
monitor_freq = 20

batch_size = 200

# regularization: noise on the weights
weight_noise = 0.01
dropout = 0.2

# number of classes, a constant of the dataset
num_output_classes = 5

# the step rule (uncomment your favorite choice)
step_rule = CompositeRule([AdaDelta(), RemoveNotFinite()])
# step_rule = CompositeRule([Momentum(learning_rate=0.00001, momentum=0.99),
#                            RemoveNotFinite()])
# step_rule = CompositeRule([Momentum(learning_rate=0.1, momentum=0.9),
#                            RemoveNotFinite()])
# step_rule = CompositeRule([AdaDelta(), Scale(0.01), RemoveNotFinite()])
# step_rule = CompositeRule([RMSProp(learning_rate=0.1, decay_rate=0.95),
#                            RemoveNotFinite()])
# step_rule = CompositeRule([RMSProp(learning_rate=0.0001, decay_rate=0.95),
#                            BasicMomentum(momentum=0.9),
#                            RemoveNotFinite()])

# How the weights are initialized
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.001)

# ==========================================================================================
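# A minimal, self-contained sketch of how a step rule configured as above is
# handed to Blocks' GradientDescent; the toy parameter and cost below are
# illustrative stand-ins for the model built elsewhere alongside this config.
from blocks.algorithms import AdaDelta, CompositeRule, GradientDescent, RemoveNotFinite
from blocks.utils import shared_floatx

w = shared_floatx([1.0, -2.0], name='w')   # stand-in parameter
toy_cost = (w ** 2).sum()                  # stand-in cost

demo_step_rule = CompositeRule([AdaDelta(), RemoveNotFinite()])
demo_algorithm = GradientDescent(cost=toy_cost, parameters=[w],
                                 step_rule=demo_step_rule)
demo_algorithm.initialize()   # compiles the update function; steps come from AdaDelta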
    if training_optimizer == 'Adam':
        step_rules = [Adam(learning_rate=learning_rate),
                      StepClipping(step_clipping)]
        # , VariableClipping(threshold=max_norm_threshold)
    elif training_optimizer == 'RMSProp':
        step_rules = [RMSProp(learning_rate=learning_rate,
                              decay_rate=decay_rate),
                      StepClipping(step_clipping)]
    elif training_optimizer == 'Adagrad':
        step_rules = [AdaGrad(learning_rate=learning_rate),
                      StepClipping(step_clipping)]
    elif training_optimizer == 'Adadelta':
        step_rules = [AdaDelta(decay_rate=decay_rate),
                      StepClipping(step_clipping)]

    parameters_to_update = cg.parameters
    algorithm = GradientDescent(cost=cg.outputs[0],
                                parameters=parameters_to_update,
                                step_rule=CompositeRule(step_rules))
    algorithm.add_updates(extra_updates)

    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, step_rules[0].learning_rate,
                      gradient_norm, step_norm]

    test_monitor = DataStreamMonitoring(variables=[cost],
                                        after_epoch=True,
                                        before_first_epoch=True,
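# Note that CompositeRule applies its components in order, so placing
# StepClipping after the adaptive rule (as above) clips the adapted step
# rather than the raw gradient. A small sketch with illustrative thresholds:
from blocks.algorithms import AdaDelta, CompositeRule, StepClipping

# clips the steps produced by AdaDelta (the pattern used above)
clip_adapted_step = CompositeRule([AdaDelta(decay_rate=0.95), StepClipping(1.0)])

# clips the raw gradients before AdaDelta rescales them
clip_raw_gradient = CompositeRule([StepClipping(1.0), AdaDelta(decay_rate=0.95)])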
def train(self, cost, y_hat, train_stream, accuracy=None, prediction_cost=None, regularization_cost=None, params_to_optimize=None, valid_stream=None, extra_extensions=None, model=None, vars_to_monitor_on_train=None, vars_to_monitor_on_valid=None, step_rule=None, additional_streams=None, save_on_best=None, use_own_validation=False, objects_to_dump=None): """ Generic method for training models. It extends functionality already provided by Blocks. :param cost: Theano var with cost function :param y_hat: Theano var with predictions from the model :param train_stream: Fuel stream with training data :param accuracy: Theano var with accuracy :param prediction_cost: :param regularization_cost: :param params_to_optimize: :param valid_stream: Fuel stream with validation data :param extra_extensions: :param model: :param vars_to_monitor_on_train: :param vars_to_monitor_on_valid: :param step_rule: :param additional_streams: :param save_on_best: :param use_own_validation: :param objects_to_dump: :return: """ if not vars_to_monitor_on_valid: vars_to_monitor_on_valid = [(cost, min)] if accuracy: vars_to_monitor_on_valid.append((accuracy, max)) if not save_on_best: # use default metrics for saving the best model save_on_best = [(cost, min)] if accuracy: save_on_best.append((accuracy, max)) # setup the training algorithm ####################################### # step_rule = Scale(learning_rate=0.01) # step_rule = Adam() model_save_suffix = "" if self.args.append_metaparams: model_save_suffix = "." + get_current_metaparams_str( self.parser, self.args) # get a list of variables that will be monitored during training vars_to_monitor = [cost] if accuracy: vars_to_monitor.append(accuracy) if prediction_cost: vars_to_monitor.append(prediction_cost) if regularization_cost: vars_to_monitor.append(regularization_cost) theano_vars_to_monitor = [ var for var, comparator in vars_to_monitor_on_valid ] if not params_to_optimize: # use all parameters of the model for optimization cg = ComputationGraph(cost) params_to_optimize = cg.parameters self.print_parameters_info(params_to_optimize) if not model: if accuracy: model = MultiOutputModel([cost, accuracy, y_hat] + theano_vars_to_monitor) else: model = MultiOutputModel([cost, y_hat] + theano_vars_to_monitor) if not step_rule: step_rule = AdaDelta() # learning_rate=0.02, momentum=0.9) step_rules = [ StepClipping(self.args.gradient_clip), step_rule, RemoveNotFinite() ] # optionally add gradient noise if self.args.gradient_noise: step_rules = [ GradientNoise(self.args.gradient_noise, self.args.gn_decay) ] + step_rules algorithm = GradientDescent(cost=cost, parameters=params_to_optimize, step_rule=CompositeRule(step_rules), on_unused_sources="warn") # this variable aggregates all extensions executed periodically during training extensions = [] if self.args.epochs_max: # finis training after fixed number of epochs extensions.append(FinishAfter(after_n_epochs=self.args.epochs_max)) # training data monitoring def create_training_data_monitoring(): if "every_n_epochs" in self.args.evaluate_every_n: return TrainingDataMonitoring(vars_to_monitor, prefix='train', after_epoch=True) else: return TrainingDataMonitoring(vars_to_monitor, prefix='train', after_epoch=True, **self.args.evaluate_every_n) # add extensions that monitors progress of training on train set extensions.extend([create_training_data_monitoring()]) if not self.args.disable_progress_bar: extensions.append(ProgressBar()) def add_data_stream_monitor(data_stream, prefix): if not use_own_validation: extensions.append( 
DataStreamMonitoring(variables=theano_vars_to_monitor, data_stream=data_stream, prefix=prefix, before_epoch=False, **self.args.evaluate_every_n)) # additional streams that should be monitored if additional_streams: for stream_name, stream in additional_streams: add_data_stream_monitor(stream, stream_name) # extra extensions need to be called before Printing extension if extra_extensions: extensions.extend(extra_extensions) if valid_stream: # add validation set monitoring add_data_stream_monitor(valid_stream, 'valid') # add best val monitoring for var, comparator in vars_to_monitor_on_valid: extensions.append( TrackTheBest("valid_" + var.name, choose_best=comparator, **self.args.evaluate_every_n)) if self.args.patience_metric == 'cost': patience_metric_name = cost.name elif self.args.patience_metric == 'accuracy': patience_metric_name = accuracy.name else: print "WARNING: Falling back to COST function for patience." patience_metric_name = cost.name extensions.append( # "valid_cost_best_so_far" message will be entered to the main loop log by TrackTheBest extension FinishIfNoImprovementAfter( "valid_" + patience_metric_name + "_best_so_far", epochs=self.args.epochs_patience_valid)) if not self.args.do_not_save: # use user provided metrics for saving valid_save_extensions = map( lambda metric_comparator: SaveTheBest( "valid_" + metric_comparator[0].name, self.args.save_path + ".best." + metric_comparator[ 0].name + model_save_suffix, choose_best=metric_comparator[1], **self.args.evaluate_every_n), save_on_best) extensions.extend(valid_save_extensions) extensions.extend([ Timing(**self.args.evaluate_every_n), Printing(after_epoch=False, **self.args.evaluate_every_n), ]) if not self.args.do_not_save or self.args.save_only_best: extensions.append( Checkpoint(self.args.save_path + model_save_suffix, **self.args.save_every_n)) extensions.append(FlushStreams(**self.args.evaluate_every_n)) # main loop ########################################################## main_loop = MainLoop(data_stream=train_stream, model=model, algorithm=algorithm, extensions=extensions) sys.setrecursionlimit(1000000) main_loop.run()
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph, apply_noise, apply_dropout

from datastream import RandomTransposeIt
import ber as balanced_error_rate

step_rule_name = 'adadelta'
learning_rate = 0.1
momentum = 0.
decay_rate = 0.9

if step_rule_name == 'adadelta':
    step_rule = AdaDelta(decay_rate=decay_rate)
    step_rule_name = 'adadelta%s' % repr(decay_rate)
elif step_rule_name == 'rmsprop':
    step_rule = RMSProp()
elif step_rule_name == 'momentum':
    step_rule_name = "mom%s,%s" % (repr(learning_rate), repr(momentum))
    step_rule = Momentum(learning_rate=learning_rate, momentum=momentum)
else:
    raise ValueError("No such step rule: " + step_rule_name)

ibatchsize = None
iter_scheme = RandomTransposeIt(ibatchsize, False, None, False)
valid_iter_scheme = RandomTransposeIt(ibatchsize, False, None, False)

r_noise_std = 0.01
w_noise_std = 0.00
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. 
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( 
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1,
                 attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server, bokeh, use_load_ext, load_log, fast_start, recognizer, data, model, cg, regularized_cg, cost, train_cost, parameters, max_norm_rules, observables, batch_size, batch_cost, weights_entropy, labels_mask, labels, gradients=None): primary_observables = observables secondary_observables = [] validation_observables = [] root_path, extension = os.path.splitext(save_path) train_conf = config['training'] # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) if 'adam' in rule_names: assert len(rule_names) == 1 logger.info("Using Adam for training") core_rules.append( Adam(learning_rate=train_conf.get('scale', 0.002), beta1=train_conf.get('beta1', 0.1), beta2=train_conf.get('beta2', 0.001), epsilon=train_conf.get('epsilon', 1e-8), decay_factor=train_conf.get('decay_rate', (1 - 1e-8)))) burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)}) logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. 
result = [] for var in variables: if var.name.startswith('weights_entropy'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()), 'weights_entropy_per_label'+ recognizer.children[chld_id].names_postfix)) elif var.name.endswith('_nll'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var.sum(), labels_mask[chld_id].sum()), var.name+'_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) additional_patience_notifiers = [] uas = DependencyErrorRate(recognizer.children[0], data, **config['monitoring']['search']) las = AuxiliaryErrorRates(uas, name='LAS') lab = AuxiliaryErrorRates(uas, name='LAB') per_monitoring = DataStreamMonitoring( [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0], prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_uas = TrackTheBest( per_monitoring.record_name(uas)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_las = TrackTheBest( per_monitoring.record_name(las)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_lab = TrackTheBest( per_monitoring.record_name(lab)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_uas, track_the_best_las, track_the_best_lab, ] per = uas track_the_best_per = track_the_best_uas additional_patience_notifiers = [track_the_best_lab, track_the_best_las] track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500, num_stds=train_conf.get('clip_stds', 1.0))) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']), # .add_condition(["after_batch"], _gradient_norm_is_none), ] main_postfix = recognizer.children[0].names_postfix channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), 
average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'+main_postfix), validation._record_name('weights_entropy_per_label'+main_postfix)], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix), validation._record_name('weights_penalty_per_recording'+main_postfix)]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] + additional_patience_notifiers extensions.append(Patience(**patience_conf)) if train_conf.get('min_performance_stops'): extensions.append(EarlyTermination( param_name=track_the_best_per.best_name, min_performance_by_epoch=train_conf['min_performance_stops'])) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
n_epochs = 15
if "n_epochs" in config:
    n_epochs = int(config["n_epochs"])

params = cg.parameters
model = Model([cost])
print "model parameters:"
print model.get_parameter_dict()

if "adagrad" in config:
    print "using adagrad"
    thisRule = AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
    print "using adadelta"
    thisRule = AdaDelta()
elif "momentum" in config:
    print "using momentum"
    mWeight = float(config["momentum"])
    thisRule = Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
    print "using traditional SGD"
    thisRule = Scale(learning_rate=learning_rate)

if "gradientClipping" in config:
    threshold = float(config["gradientClipping"])
    print "using gradient clipping with threshold ", threshold
    thisRule = CompositeRule([StepClipping(threshold), thisRule])

# step_rule=CompositeRule([StepClipping(config['step_clipping']),
#                          eval(config['step_rule'])()])
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout, depth, max_grad, step_method, epsilon, sample): #---------------------------------------------------------------------- datasource = name def shnum(x): """ Convert a positive float into a short tag-usable string E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2 """ return '0' if x <= 0 else '%s%d' % ( ("%e" % x)[0], -np.floor(np.log10(x))) jobname = "%s-%dX%dm%dd%dr%sb%de%s" % ( datasource, depth, dim, mix_dim, int( dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon)) if max_length != 600: jobname += '-L%d' % max_length if GRU: jobname += 'g' if max_grad != 5.: jobname += 'G%g' % max_grad if step_method != 'adam': jobname += step_method if sample: print("Sampling") else: print("\nRunning experiment %s" % jobname) #---------------------------------------------------------------------- if depth > 1: transition = LSTMstack(dim=dim, depth=depth, name="transition", lstm_name="transition") assert not GRU elif GRU: transition = GatedRecurrent(dim=dim, name="transition") else: transition = LSTM(dim=dim, name="transition") emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter") readout = Readout(readout_dim=emitter.get_dim('inputs'), source_names=['states'], emitter=emitter, name="readout") normal_inputs = [ name for name in transition.apply.sequences if 'mask' not in name ] fork = Fork(normal_inputs, prototype=Linear(use_bias=True)) generator = SequenceGenerator(readout=readout, transition=transition, fork=fork) # Initialization settings generator.weights_init = OrthogonalGlorot() generator.biases_init = Constant(0) # Build the cost computation graph [steps,batch_size, 3] x = T.tensor3('features', dtype=floatX)[:max_length, :, :] x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32) cost = generator.cost(x) cost.name = "sequence_log_likelihood" # Give an idea of what's going on model = Model(cost) params = model.get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) model_size = 0 for v in params.itervalues(): s = v.get_value().shape model_size += s[0] * (s[1] if len(s) > 1 else 1) logger.info("Total number of parameters %d" % model_size) #------------------------------------------------------------ extensions = [] if old_model_name == 'continue': extensions.append(LoadFromDump(jobname)) elif old_model_name: # or you can just load the weights without state using: old_params = LoadFromDump(old_model_name).manager.load_parameters() model.set_param_values(old_params) else: # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() if sample: assert old_model_name and old_model_name != 'continue' Sample(generator, steps=max_length, path='.').do(None) exit(0) #------------------------------------------------------------ # Define the training algorithm. 
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    (activations,) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(datasource_fname,
                           # max_length=max_length,
                           which_set='train',
                           sources=('features',),
                           load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname,
                          # max_length=max_length,
                          which_set='test',
                          sources=('features',),
                          load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),

        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),

        DataStreamMonitoring([cost], test_stream,
                             prefix="test",
                             on_resumption=True,
                             after_epoch=False,  # by default this is True
                             every_n_batches=100),

        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),

        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),

        Sample(generator, steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),

        ProgressBar(),

        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[['cost']]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
def training(self, fea2obj, batch_size, learning_rate=0.005,
             steprule='adagrad', wait_epochs=5, kl_weight_init=None,
             klw_ep=50, klw_inc_rate=0, num_epochs=None):
    networkfile = self._config['net']

    n_epochs = num_epochs or int(self._config['nepochs'])
    reg_weight = float(self._config['loss_weight'])
    reg_type = self._config['loss_reg']
    numtrain = (int(self._config['num_train'])
                if 'num_train' in self._config else None)

    train_stream, num_samples_train = get_comb_stream(
        fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
    dev_stream, num_samples_dev = get_comb_stream(
        fea2obj, 'dev', batch_size=None, shuffle=False)
    logger.info('sources: %s -- number of train/dev samples: %d/%d',
                train_stream.sources, num_samples_train, num_samples_dev)

    t2idx = fea2obj['targets'].t2idx
    klw_init = (kl_weight_init or float(self._config['kld_weight'])
                if 'kld_weight' in self._config else 1)
    logger.info('kl_weight_init: %d', klw_init)
    kl_weight = shared_floatx(klw_init, 'kl_weight')
    entropy_weight = shared_floatx(1., 'entropy_weight')

    cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = \
        build_model_new(fea2obj, len(t2idx), self._config,
                        kl_weight, entropy_weight)

    cg = ComputationGraph(cost)
    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    logger.info('Model weights are: %s', weights)
    if 'L2' in reg_type:
        cost += reg_weight * l2_norm(weights)
        logger.info('applying %s with weight: %f ', reg_type, reg_weight)

    dropout = -0.1
    if dropout > 0:
        cg = apply_dropout(cg, weights, dropout)
        cost = cg.outputs[0]

    cost.name = 'cost'
    logger.info('Our Algorithm is : %s, and learning_rate: %f',
                steprule, learning_rate)
    if 'adagrad' in steprule:
        cnf_step_rule = AdaGrad(learning_rate)
    elif 'adadelta' in steprule:
        cnf_step_rule = AdaDelta(decay_rate=0.95)
    elif 'decay' in steprule:
        cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90)
        cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
    elif 'momentum' in steprule:
        cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
    elif 'adam' in steprule:
        cnf_step_rule = Adam(learning_rate=learning_rate)
    else:
        logger.info('The steprule param is wrong! which is: %s', steprule)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=cnf_step_rule,
                                on_unused_sources='warn')
    # algorithm.add_updates(updates)
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, gradient_norm, step_norm, p_at_1, KLD,
                      logpy_xz, kl_weight, pat1_recog]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')
    dev_monitor = DataStreamMonitoring(
        variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate],
        after_epoch=True, before_first_epoch=True,
        data_stream=dev_stream, prefix="dev")

    extensions = [
        dev_monitor,
        train_monitor,
        Timing(),
        TrackTheBest('dev_cost'),
        FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                   epochs=wait_epochs),
        Printing(after_batch=False),  # , ProgressBar()
        FinishAfter(after_n_epochs=n_epochs),
        saveload.Load(networkfile + '.toload.pkl'),
    ] + track_best('dev_cost', networkfile + '.best.pkl')

    # extensions.append(SharedVariableModifier(
    #     kl_weight,
    #     lambda n, klw: numpy.cast[theano.config.floatX](klw_inc_rate + klw),
    #     after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
    # extensions.append(SharedVariableModifier(
    #     entropy_weight,
    #     lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate),
    #     after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

    logger.info('number of parameters in the model: %d',
                tensor.sum([p.size for p in cg.parameters]).eval())
    logger.info('Lookup table sizes: %s',
                [p.size.eval() for p in cg.parameters if 'lt' in p.name])

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
######### training ##################
n_epochs = 15
if "n_epochs" in config:
    n_epochs = int(config["n_epochs"])

model = Model([cost])
print model.get_parameter_dict()

curStepRule = Scale(learning_rate=learning_rate)
if "adagrad" in config:
    print "using adagrad"
    curStepRule = AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
    print "using adadelta"
    curStepRule = AdaDelta()
elif "momentum" in config:
    print "using momentum"
    mWeight = float(config["momentum"])
    curStepRule = Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
    print "using traditional SGD"

algorithm = GradientDescent(cost=cost, parameters=params,
                            step_rule=curStepRule,
                            on_unused_sources='warn')

extensions = []
extensions.append(CheckpointAfterEpoch(path=networkfile, every_n_epochs=1))
extensions.append(FinishAfter(after_n_epochs=n_epochs))
extensions.append(
    decoder.initialize()

    cg = ComputationGraph(cost)

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    print('Parameter shapes')
    for shape, count in Counter(shapes).most_common():
        print(' {:15}: {}'.format(shape, count))

    # Set up training algorithm
    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([StepClipping(10), AdaDelta()]))

    # Train!
    main_loop = MainLoop(
        model=Model(cost),
        algorithm=algorithm,
        data_stream=masked_stream,
        extensions=[
            TrainingDataMonitoring([cost], after_every_batch=True),
            Plot('En-Fr', channels=[['decoder_cost_cost']],
                 after_every_batch=True),
            Printing(after_every_batch=True),
            Checkpoint('model.pkl', every_n_batches=2048)
        ])
    main_loop.run()
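# To resume training from the 'model.pkl' checkpoint written above, the Load
# extension (also used in some of the other examples here) can be prepended to
# the extensions list before building the MainLoop. A minimal sketch;
# `extensions` is assumed to be the list passed to MainLoop above.
from blocks.extensions.saveload import Load

extensions.insert(0, Load('model.pkl',
                          load_iteration_state=True,
                          load_log=True))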