def main(config, tr_stream, dev_stream):
    """Build the encoder-decoder, configure training and run the main loop.

    ``config`` is indexed by string keys for every size, depth, frequency
    and path used below.  ``tr_stream`` provides the training batches as
    well as ``trg_bos`` and ``space_idx['target']``.  ``dev_stream`` is
    accepted but not used anywhere in this function.
    """
    # Symbolic inputs of the graph.
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Encoder, decoder and the training cost.
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'],
        config['enc_embed'],
        config['src_dgru_nhids'],
        config['enc_nhids'],
        config['src_dgru_depth'],
        config['bidir_encoder_depth'])
    decoder = Decoder(
        config['trg_vocab_size'],
        config['dec_embed'],
        config['trg_dgru_nhids'],
        config['trg_igru_nhids'],
        config['dec_nhids'],
        config['enc_nhids'] * 2,
        config['transition_depth'],
        config['trg_igru_depth'],
        config['trg_dgru_depth'],
        target_space_idx,
        target_bos_idx)

    representation = encoder.apply(
        source_char_seq, source_sample_matrix, source_char_aux,
        source_word_mask)
    cost = decoder.cost(
        representation, source_word_mask, target_char_seq,
        target_sample_matrix, target_resample_matrix, target_char_aux,
        target_char_mask, target_word_mask, target_prev_char_seq,
        target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Default initialisation: one shared Gaussian scheme for the weights and
    # one shared constant scheme for the biases of both top-level bricks.
    logger.info('Initializing model')
    gaussian_init = IsotropicGaussian(config['weight_scale'])
    encoder.weights_init = gaussian_init
    decoder.weights_init = gaussian_init
    zero_init = Constant(0)
    encoder.biases_init = zero_init
    decoder.biases_init = zero_init
    encoder.push_initialization_config()
    decoder.push_initialization_config()

    def _use_orthogonal(brick):
        # Every recurrent brick gets its own Orthogonal() instance.
        brick.weights_init = Orthogonal()

    # Recurrent weights override the pushed default.
    _use_orthogonal(encoder.decimator.bidir_w.prototype.recurrent)
    for depth in range(config['src_dgru_depth']):
        _use_orthogonal(encoder.decimator.dgru.transitions[depth])
    for depth in range(config['bidir_encoder_depth']):
        _use_orthogonal(encoder.children[1 + depth].prototype.recurrent)
    if config['trg_igru_depth'] == 1:
        _use_orthogonal(decoder.interpolator.igru)
    else:
        for depth in range(config['trg_igru_depth']):
            _use_orthogonal(decoder.interpolator.igru.transitions[depth])
    for depth in range(config['trg_dgru_depth']):
        _use_orthogonal(
            decoder.interpolator.feedback_brick.dgru.transitions[depth])
    for depth in range(config['transition_depth']):
        _use_orthogonal(decoder.transition.transitions[depth])
    encoder.initialize()
    decoder.initialize()

    # Log how many parameters share each shape.
    shapes = [parameter.get_value().shape for parameter in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Log every named parameter of the encoder and the decoder.
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    logger.info("Building model")
    training_model = Model(cost)

    # Original author's note on the learning rate: 1e-4 could be used in
    # Adam directly, but decaying by hand is faster -- to 5e-4 after about
    # 30K, to 2e-4 after about 90K and finally to 1e-4 after about 180K.
    logger.info("Initializing training algorithm")
    step_rule = CompositeRule([
        StepClipping(config['step_clipping']),
        Adam(learning_rate=1e-3),
    ])
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters, step_rule=step_rule)

    # Monitoring, termination and checkpointing.
    logger.info("Initializing extensions")
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve(
        [cost, gradient_norm, step_norm],
        config=config,
        after_batch=True,
        before_first_epoch=True,
        prefix='tra')
    extensions = [train_monitor]
    extensions.append(Timing())
    extensions.append(Printing(every_n_batches=config['print_freq']))
    extensions.append(FinishAfter(after_n_batches=config['finish_after']))
    extensions.append(
        CheckpointNMT(saveto=config['saveto'],
                      dump_freq=config['dump_freq'],
                      every_n_batches=config['save_freq']))

    # The sampling graph and the Sampler extension are only built on request.
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        # generated[transition_depth] is next_outputs
        next_outputs = generated[config['transition_depth']]
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(next_outputs))

        logger.info("Building sampler")
        sampler = Sampler(
            model=search_model,
            data_stream=tr_stream,
            hook_samples=config['hook_samples'],
            transition_depth=config['transition_depth'],
            every_n_batches=config['sampling_freq'],
            src_vocab_size=config['src_vocab_size'])
        extensions.append(sampler)

    # Optionally resume from the files stored under config['saveto'].
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions)

    # Train!
    main_loop.run()
step_compute]), parameters=cg.parameters, cost=cost) extension_list = [] extension_list.append( SharedVariableModifier(step_compute.learning_rate, extensions.decay_learning_rate, after_batch=False, every_n_batches=batches_per_epoch, )) extension_list.append(FinishAfter(after_n_epochs=100001)) ## logging of test set performance extension_list.append(extensions.LogLikelihood(dpm, test_stream, scl, every_n_batches=args.ext_every_n*batches_per_epoch, before_training=False)) ## set up logging extension_list.extend([Timing(), Printing()]) model_dir = util.create_log_dir(args, dpm.name + '_' + args.dataset) model_save_name = os.path.join(model_dir, 'model.pkl') extension_list.append( Checkpoint(model_save_name, every_n_batches=args.ext_every_n*batches_per_epoch, save_separately=['log'])) # generate plots extension_list.append(extensions.PlotMonitors(model_dir, every_n_batches=args.ext_every_n*batches_per_epoch, before_training=args.plot_before_training)) test_batch = next(test_stream.get_epoch_iterator())[0] extension_list.append(extensions.PlotSamples(dpm, algorithm, test_batch, model_dir, every_n_batches=args.ext_every_n*batches_per_epoch, before_training=args.plot_before_training)) internal_state = dpm.internal_state(features) train_batch = next(train_stream.get_epoch_iterator())[0] # extension_list.append( # extensions.PlotInternalState(dpm, blocks_model, internal_state, features, train_batch, model_dir, # every_n_batches=args.ext_every_n*batches_per_epoch, before_training=args.plot_before_training))
predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1) cost = CategoricalCrossEntropy(name='Cross1').apply(y.flatten(), predict1).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict1) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### GET THE DATA ##################### stream_train = ServerDataStream(('image_features','targets'), False, port=5512, hwm=40) stream_valid = ServerDataStream(('image_features','targets'), False, port=5513, hwm=40) ########### DEFINE THE ALGORITHM ############# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum(learning_rate=0.01, momentum=0.9)) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), TrainingDataMonitoring([cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid", after_epoch=True), Checkpoint("catsVsDogs_fire2_new.pkl", after_epoch=True, save_separately=['log']), ProgressBar(), Printing()] #Adding a live plot with the bokeh server extensions.append(Plot( 'CatsVsDogs_Fire2', channels=[['train_error_rate', 'valid_error_rate'], ['train_cost', 'valid_cost'],
from blocks.extensions import Printing, Timing, FinishAfter
from blocks.extensions.training import TrackTheBest
from blocks.extensions.monitoring import TrainingDataMonitoring, DataStreamMonitoring
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.extensions.saveload import Checkpoint
from blocks_extras.extensions.plot import Plot
import socket
import datetime
import time

# Address of the plotting server handed to the Plot extension.
host_plot = 'http://tfjgeorge.com:5006'

# The monitoring channels below are derived from this name
# ('train_cost' / 'valid_cost').
cost.name = 'cost'

# Extensions are created one by one, in the order the main loop receives them.
extensions = []
extensions.append(Timing())
extensions.append(
    TrainingDataMonitoring([cost], after_epoch=True, prefix='train'))
extensions.append(
    DataStreamMonitoring(variables=[cost],
                         data_stream=valid_stream,
                         prefix="valid"))
# The plot title is "<hostname> <creation time>".
extensions.append(
    Plot('%s %s' % (socket.gethostname(), datetime.datetime.now(), ),
         channels=[['train_cost', 'valid_cost']],
         after_epoch=True,
         server_url=host_plot))
extensions.append(TrackTheBest('valid_cost'))
extensions.append(Checkpoint('train', save_separately=["model", "log"]))
# Stop once 'valid_cost_best_so_far' has not improved for 5 epochs.
extensions.append(
    FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5))
#FinishAfter(after_n_epochs=100),
extensions.append(Printing())


# In[7]:

from blocks.main_loop import MainLoop
valid_stream, every_n_batches=15 * n_batches, prefix="valid") def _is_nan(log): try: result = math.isnan(log.current_row['train_total_gradient_norm']) return result except: return False extensions = extensions = [ train_monitor, valid_monitor, Timing(every_n_batches=n_batches), Printing(every_n_batches=n_batches), WriteConditional(emit, every_n_batches=n_batches, save_name=save_dir + "samples/" + exp_name + ".png"), FinishAfter().add_condition(["after_batch"], _is_nan), ProgressBar(), Checkpoint(save_dir + "pkl/" + exp_name + ".pkl", after_epoch=True), SaveComputationGraph(emit), Plot(save_dir + exp_name + ".png", [['train_nll', 'valid_nll']], every_n_batches=5 * n_batches, email=False) ] main_loop = MainLoop(model=model, data_stream=data_stream,
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    """Assemble (but do not run) the Blocks main loop that trains an RNN-RBM.

    The function relies on the module-level names ``x``, ``x_mask``,
    ``rnnrbm_cdk`` and ``float32`` in addition to its arguments.

    Parameters
    ----------
    train
        Data stream given to ``MainLoop`` as the training stream.
    rnnrbm
        Model object; only ``rnnrbm.cost(examples=..., mask=..., k=...)``
        is called here.
    epochs
        Forwarded to ``FinishAfter(after_n_epochs=...)``.
    test
        Optional data stream.  When it is not ``None`` the cost and both
        error measures are also monitored on it under the ``test`` prefix.
    bokeh
        When true, a ``Plot`` extension is appended.
    load_path
        Accepted for interface compatibility; not used in this function.

    Returns
    -------
    The configured ``MainLoop``.
    """
    # Shared variables so that extensions can change them during training.
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)

    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"

    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"

    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])

    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite(),
    ])  # Scale(0.01)

    # The gradient is taken with the sampled values held constant.
    gradients = dict(
        equizip(cg.parameters,
                T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients,
                                cost=cost, params=cg.parameters)
    # The graph carries its own updates; they must run with every step.
    algorithm.add_updates(cg.updates)

    monitored = [cost, error_rate, mistake_rate]
    extensions = [
        # Replace cdk by the entry of rnnrbm_cdk for the current count ``n``
        # when that entry exists and is truthy; otherwise keep the value.
        SharedVariableModifier(
            parameter=cdk,
            function=lambda n, v: rnnrbm_cdk[n] if rnnrbm_cdk.get(n) else v),
        # Multiply the learning rate by 0.78 whenever ``n`` is a multiple
        # of 1000 (200 * 5).
        SharedVariableModifier(
            parameter=lr,
            function=lambda n, v: float32(0.78 * v) if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            monitored,
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)], #+ params,
            prefix="train", after_epoch=False, every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar(),
    ]

    if test is not None:
        extensions.append(
            DataStreamMonitoring(
                [cost, error_rate, mistake_rate],
                data_stream=test,
                updates=cg.updates,
                prefix="test", after_epoch=False, every_n_batches=40))

    if bokeh:
        extensions.append(
            Plot(
                'Training RNN-RBM',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    # The monitored cost is recorded as '<prefix>_<name>'.
                    # The former hard-coded 'train_final_cost' never matched
                    # the name given to the cost above, so derive it.
                    ['train_' + cost.name],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
def initialize_all(config, save_path, bokeh_name, params, bokeh_server,
                   bokeh, test_tag, use_load_ext, load_log, fast_start):
    """Build the recognizer, its training algorithm and all extensions.

    Parameters
    ----------
    config
        Nested configuration.  The sections read here are ``'data'``,
        ``'training'``, ``'monitoring'``, ``'net'`` and, optionally,
        ``'regularization'``.
    save_path
        Checkpoint path.  Its root and extension are also used to derive
        the ``_best`` and ``_best_ll`` checkpoint names.
    bokeh_name
        Plot title; when falsy the base name of ``save_path`` is used.
    params
        Path of a parameter file.  When truthy its values are loaded into
        the model; it is also the path given to ``Load`` / ``LoadLog``.
    bokeh_server
        Forwarded to ``Plot`` as ``server_url``.
    bokeh
        When true, a ``Plot`` extension is added.
    test_tag
        Forwarded to ``create_model``.
    use_load_ext
        Together with ``params``: add a ``Load`` extension.
    load_log
        Together with ``params``: add a ``LoadLog`` extension.
    fast_start
        When true, validation, search and checkpointing are not run before
        the first epoch.

    Returns
    -------
    tuple
        ``(model, algorithm, data, extensions)``.
    """
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Collect every ``*_init`` attribute of a brick and, recursively,
        # of its children, keyed by child name.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(
            rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(
            rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls
        # its output output_0
        applications=[r.bottom.apply], name_regex="output")(
            cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0],
                                 "max_attended_length")
    max_num_phonemes = rename(labels.shape[0],
                              "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(),
                           "mean_attended")
    mean_bottom_output = rename(abs(bottom_output).mean(),
                                "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask),
                             "weights_entropy")
    mask_density = rename(labels_mask.mean(),
                          "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        # NOTE(review): noise is applied to `cg`, not to `regularized_cg`,
        # so enabling both 'dropout' and 'noise' drops the dropout graph.
        # Behavior is left unchanged here -- confirm whether it is intended.
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        (train_cost, regularized_cg,
         gradients, noise_brick) = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # shared variables added by adaptive weight noise
        #
        # The parameter file is a binary archive, so it has to be opened
        # in binary mode (text mode breaks on Python 3).
        with open(params, 'rb') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape)
                     for key in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(
            Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in parameters.items():
        # numpy.prod replaces the numpy.product alias, which newer NumPy
        # releases no longer provide.
        num_elements = numpy.prod(param.get_value().shape)
        # Norms are divided by sqrt(number of elements) so that parameters
        # of different sizes are comparable.
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data,
                           **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs'))
        .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'),
         validation._record_name('weights_entropy_per_label')],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'),
         validation._record_name('weights_penalty_per_recording')]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        # Besides the regular checkpoint, a separate copy is written after
        # an epoch whenever either tracker logged a new best value.
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(
                labels, cg, recognizer.generator.readout.emitter, data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def main(dataset_name='sklearn', num_epochs=100):
    """Train a 2-input logistic regressor and plot costs and scored data.

    ``dataset_name`` is forwarded to ``build_2d_datasets``; ``num_epochs``
    is the epoch limit given to ``FinishAfter``.  The function runs the
    main loop and returns nothing.
    """
    features = tensor.matrix('features')
    targets = tensor.lmatrix('targets')

    # Model outputs and the two monitored scalars.
    logistic_regressor = LogisticRegressor(input_dim=2)
    probs = logistic_regressor.get_probs(features=features)
    params = logistic_regressor.get_params()

    cost = logistic_regressor.get_cost(probs=probs, targets=targets).mean()
    cost.name = 'cost'

    misclassification = logistic_regressor.get_misclassification(
        probs=probs, targets=targets).mean()
    misclassification.name = 'misclassification'

    train_dataset, test_dataset = build_2d_datasets(dataset_name=dataset_name)

    algorithm = GradientDescent(cost=cost,
                                params=params,
                                step_rule=Scale(learning_rate=0.1))

    def _sequential_stream(dataset, batch_size):
        # Sequential pass over the whole dataset, cast by ForceFloatX.
        scheme = SequentialScheme(examples=dataset.num_examples,
                                  batch_size=batch_size)
        return ForceFloatX(
            data_stream=DataStream(dataset=dataset, iteration_scheme=scheme))

    train_data_stream = _sequential_stream(train_dataset, 40)
    test_data_stream = _sequential_stream(test_dataset, 400)

    model = Model(cost)

    monitored = [cost, misclassification]
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(list(monitored), test_data_stream,
                             prefix='test'),
        TrainingDataMonitoring(list(monitored), prefix='train',
                               after_epoch=True),
    ]

    # Compiled function used to attach a 'scores' source to stream copies.
    scoring_function = theano.function([features], probs)

    cost_plotter = Plotter(
        channels=[[
            'test_cost', 'test_misclassification',
            'train_cost', 'train_misclassification'
        ]],
        titles=['Costs'])

    # Every consumer gets its own deep copy of the underlying streams.
    score_train_stream = Mapping(
        data_stream=copy.deepcopy(train_data_stream),
        mapping=TupleMapping(scoring_function),
        add_sources=('scores', ))
    score_test_stream = Mapping(
        data_stream=copy.deepcopy(test_data_stream),
        mapping=TupleMapping(scoring_function),
        add_sources=('scores', ))

    data_plotter = Display2DData(
        data_streams=[
            score_train_stream,
            copy.deepcopy(train_data_stream),
            score_test_stream,
            copy.deepcopy(test_data_stream)
        ],
        radius=0.01)

    plotters = [cost_plotter, data_plotter]
    extensions += [
        PlotManager('2D examples',
                    plotters=plotters,
                    after_epoch=False,
                    every_n_epochs=50,
                    after_training=True),
        Printing(),
    ]

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def train(self):
    """Configure the optimiser, the monitored channels and run training.

    Reads ``self.tagger`` (costs and errors), ``self.p`` (hyper-parameters),
    ``self.streams`` (``'train'``, ``'valid'`` and ``'test'``) and
    ``self.save_dir``.  Returns nothing; the main loop is run in place.
    """
    total_cost = self.tagger.total_cost
    to_train = ComputationGraph([total_cost]).parameters
    logger.info('Found the following parameters: %s' % str(to_train))

    training_algorithm = GradientDescent(
        cost=self.tagger.total_cost,
        parameters=to_train,
        step_rule=Adam(learning_rate=self.p.lr),
        on_unused_sources='warn',
        theano_func_kwargs={'on_unused_input': 'warn'}
    )

    # TRACKED GRAPH NODES
    corr = self.tagger.corr
    clean = self.tagger.clean

    train_params = {'Train_Denoising_Cost': corr.denoising_cost}
    valid_params = {'Validation_Denoising_Cost': corr.denoising_cost}
    test_params = {
        'Test_AMI_Score': clean.ami_score,
        'Test_Denoising_Cost': corr.denoising_cost,
    }

    # Classification channels are only tracked when the classification
    # cost takes part in training.
    if self.p.class_cost_x > 0:
        train_params['Train_Classification_Cost'] = corr.class_cost
        train_params['Train_Classification_Error'] = clean.class_error
        valid_params['Validation_Classification_Cost'] = corr.class_cost
        valid_params['Validation_Classification_Error'] = clean.class_error
        test_params['Test_Classification_Cost'] = corr.class_cost
        test_params['Test_Classification_Error'] = clean.class_error

    short_prints = {
        "train": train_params,
        "valid": valid_params,
        "test": test_params,
    }

    model = Model(self.tagger.total_cost)
    extensions = [
        FinishAfter(after_n_epochs=self.p.num_epochs),
        SaveParams(self.p.get('save_freq', 0), self.tagger, self.save_dir,
                   before_epoch=True),
        DataStreamMonitoring(
            valid_params.values(),
            self.streams['valid'],
            prefix="valid"),
        FinalTestMonitoring(
            test_params.values(),
            self.streams['train'],
            {'valid': self.streams['valid'], 'test': self.streams['test']},
            after_training=True),
        TrainingDataMonitoring(
            train_params.values(),
            prefix="train",
            after_epoch=True),
        SaveExpParams(self.p, self.save_dir, before_training=True),
        Timing(after_epoch=True),
        ShortPrinting(short_prints, after_epoch=True),
    ]

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        self.streams['train'],
        model=model,
        extensions=extensions)
    logger.info('Running the main loop')
    main_loop.run()
# Entry point of the "DvC" CNN training script: it fixes the hyper-parameters
# (hidden/filter/feature-map/pooling sizes, image size, learning rate, number
# of epochs), selects `host_plot` depending on the host name, and finally
# wraps `cost` in a Model, builds a MainLoop and runs it.
# NOTE(review): the text between the `host_plot` URL and the plot title looks
# redacted/truncated (the `*****` marker sits inside the string literal), so
# the definitions of `cost`, `algorithm`, `stream_data_train` and `extensions`
# are not visible here and the statement is not valid Python as it stands.
# Restore the missing part from the original file before touching this code.
def main(port_data): mlp_hiddens = [500] filter_sizes = [(3, 3), (3, 3)] feature_maps = [20, 20] pooling_sizes = [(3, 3), (2, 2)] save_to = "DvC.pkl" image_size = (128, 128) output_size = 2 learningRate = 0.1 num_epochs = 300 num_batches = None if socket.gethostname() == 'tim-X550JX': host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(name, dataset, epochs, batch_size, learning_rate, attention,
         n_iter, enc_dim, dec_dim, z_dim, oldmodel, live_plotting):
    """Train a DRAW model on ``dataset``, checkpointing after every epoch.

    Parameters
    ----------
    name : str or None
        Experiment name; used as prefix of the output subdirectory, as
        checkpoint file name and as plot title.  Defaults to ``dataset``.
    dataset : str
        Dataset identifier handed to ``datasets.get_data``.
    epochs : int
        Number of training epochs.
    batch_size : int
        Batch size of the train/valid/test streams.
    learning_rate : float
        Learning rate passed to Adam.
    attention : str
        ``""`` for the full (non-attentive) reader/writer, otherwise
        ``"read_N,write_N"`` with two integers.
    n_iter : int
        Number of DRAW iterations handed to ``DrawModel``.
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensions.
    oldmodel : str or None
        Path of a pickled model whose parameter values initialise this run.
    live_plotting : bool
        When true a ``Plot`` extension is added to the main loop.
    """
    image_size, channels, data_train, data_valid, data_test = \
        datasets.get_data(dataset)

    def flat_stream(data):
        """Return a flattened stream visiting ``data`` sequentially."""
        return Flatten(DataStream.default_stream(
            data,
            iteration_scheme=SequentialScheme(data.num_examples,
                                              batch_size)))

    train_stream = flat_stream(data_train)
    # Not monitored at the moment; kept so validation monitoring can be
    # switched on by adding one DataStreamMonitoring extension below.
    valid_stream = flat_stream(data_valid)
    test_stream = flat_stream(data_test)

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = channels * img_height * img_width

    # Recurrent and feed-forward bricks currently share the same scheme but
    # are configured separately so they can be tuned independently.
    rnninits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Configure attention mechanism
    if attention != "":
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * channels * read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 channels=channels, width=img_width,
                                 height=img_height, N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"

    # ----------------------------------------------------------------------
    # Experiment naming

    def lr_tag(value):
        """Convert a float into a short tag-usable string representation.

        E.g.:
            0.1   -> 11
            0.01  -> 12
            0.001 -> 13
            0.005 -> 53
        """
        exp = np.floor(np.log10(value))
        leading = ("%e" % value)[0]
        return "%s%d" % (leading, -exp)

    lr_str = lr_tag(learning_rate)

    subdir = name + "-" + time.strftime("%Y%m%d-%H%M%S")
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)

    print("\nRunning experiment %s" % longname)
    print(" dataset: %s" % dataset)
    print(" subdirectory: %s" % subdir)
    print(" learning rate: %g" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print(" batch size: %d" % batch_size)
    print(" epochs: %d" % epochs)
    print()

    # ----------------------------------------------------------------------
    # Model

    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # The MLPs output 4 * dim values: the width of the LSTM input they feed.
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(
        n_iter,
        reader=reader,
        encoder_mlp=encoder_mlp,
        encoder_rnn=encoder_rnn,
        sampler=q_sampler,
        decoder_mlp=decoder_mlp,
        decoder_rnn=decoder_rnn,
        writer=writer)
    draw.initialize()

    # ----------------------------------------------------------------------
    # Cost: reconstruction term plus the KL terms summed over iterations

    x = tensor.matrix('features')

    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    # ----------------------------------------------------------------------
    # Training algorithm

    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        parameters=params,
        step_rule=CompositeRule([
            StepClipping(10.),
            Adam(learning_rate),
        ])
    )

    # ----------------------------------------------------------------------
    # Setup monitors

    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        monitors += [kl_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    # ----------------------------------------------------------------------
    # Main loop

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    plotting_extensions = []
    if live_plotting:
        plotting_extensions = [Plot(name, channels=plot_channels)]

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            TrainingDataMonitoring(
                train_monitors,
                prefix="train",
                after_epoch=True),
            DataStreamMonitoring(
                monitors,
                test_stream,
                prefix="test"),
            Checkpoint("{}/{}".format(subdir, name),
                       save_main_loop=False,
                       before_training=True,
                       after_epoch=True,
                       save_separately=['log', 'model']),
            SampleCheckpoint(image_size=image_size[0],
                             channels=channels,
                             save_subdir=subdir,
                             before_training=True,
                             after_epoch=True),
            ProgressBar(),
            Printing()
        ] + plotting_extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        # NOTE: unpickling executes arbitrary code; only load checkpoints
        # that come from a trusted source.
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
        # The setter used below is the post-rename ("parameter") API, so the
        # matching getter is `get_parameter_values`.  The old spelling is
        # kept as a fallback for objects that only provide that one.
        read_values = getattr(oldmodel, "get_parameter_values", None)
        if read_values is None:
            read_values = oldmodel.get_param_values
        main_loop.model.set_parameter_values(read_values())
        del oldmodel

    main_loop.run()
def run(batch_size, classifier, oldmodel, monitor_every, checkpoint_every,
        final_epoch, dataset, color_convert, image_size, net_depth,
        allowed, stretch):
    """Train the batch-normalised network and checkpoint it to ``classifier``.

    Streams come from ``create_custom_streams`` and the graphs from
    ``create_training_computation_graphs``.  Training runs for
    ``final_epoch`` epochs with Adam; validation cost and accuracy are
    monitored every ``monitor_every`` epochs and a checkpoint is written
    every ``checkpoint_every`` epochs.  When ``oldmodel`` is given, the
    parameter values stored in that file are loaded before training.
    """
    streams = create_custom_streams(filename=dataset,
                                    training_batch_size=batch_size,
                                    monitoring_batch_size=batch_size,
                                    include_targets=True,
                                    color_convert=color_convert,
                                    allowed=allowed,
                                    stretch=stretch)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = (
        streams[0], streams[1], streams[2])

    cg, bn_dropout_cg = create_training_computation_graphs(image_size,
                                                           net_depth)
    training_cost = bn_dropout_cg.outputs[0]
    model = Model(training_cost)

    # Population statistics of the batch normalisation bricks follow an
    # exponential moving average of the minibatch statistics.
    decay_rate = 0.05
    extra_updates = []
    for population, minibatch in get_batch_normalization_updates(
            bn_dropout_cg):
        blended = minibatch * decay_rate + population * (1 - decay_rate)
        extra_updates.append((population, blended))

    # Prepare algorithm
    algorithm = GradientDescent(cost=training_cost,
                                parameters=bn_dropout_cg.parameters,
                                step_rule=Adam())
    algorithm.add_updates(extra_updates)

    # Training cost is evaluated once, after training, on the monitoring
    # stream (with the population statistics updates applied).
    training_cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [training_cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    # Validation uses the outputs of the other graph returned above.
    valid_cost, valid_accuracy = cg.outputs
    valid_cost.name = 'cost'
    valid_accuracy.name = 'accuracy'
    valid_monitoring = DataStreamMonitoring(
        [valid_cost, valid_accuracy], valid_monitor_stream, prefix="valid",
        before_first_epoch=True, after_epoch=False,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(classifier, every_n_epochs=checkpoint_every,
                            use_cpickle=True)

    main_loop = MainLoop(
        model=model,
        data_stream=main_loop_stream,
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=final_epoch),
            train_monitoring,
            valid_monitoring,
            checkpoint,
            Printing(),
            ProgressBar(),
        ])

    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        with open(oldmodel, 'rb') as src:
            saved_model = load(src)
            main_loop.model.set_parameter_values(
                saved_model.model.get_parameter_values())
            del saved_model

    main_loop.run()
def build_and_run(label, config):
    """Build a convnet + MLP classifier from ``config`` and train it.

    Parameters
    ----------
    label : str
        Run label; used for the checkpoint path ``models/best_<label>.tar``
        and as the title of the live plot.
    config : dict
        Must provide ``num_epochs``, ``num_batches``, ``num_channels``,
        ``image_shape``, ``filter_size``, ``num_filter``, ``pooling_sizes``,
        ``mlp_hiddens``, ``output_size``, ``batch_size``, ``activation`` and
        ``mlp_activation``.  The mere presence of a ``test`` key is passed
        on to ``get_stream`` as ``test=True``.

    Training minimises cross-entropy plus L1 and L2 penalties with Adam,
    keeps the checkpoint with the best validation error rate and stops when
    that error has not improved for 4 epochs.
    """
    # ------------------------- network definition -------------------------
    num_epochs = config['num_epochs']
    num_batches = config['num_batches']
    num_channels = config['num_channels']
    image_shape = config['image_shape']
    filter_size = config['filter_size']
    num_filter = config['num_filter']
    pooling_sizes = config['pooling_sizes']
    mlp_hiddens = config['mlp_hiddens']
    output_size = config['output_size']
    batch_size = config['batch_size']
    activation = config['activation']
    mlp_activation = config['mlp_activation']

    lambda_l1 = 0.000025
    lambda_l2 = 0.000025

    print("Building model")

    # Symbolic inputs
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')

    # One (convolution, activation, pooling) triple per configured layer.
    conv_layers = list(interleave([
        (Convolutional(filter_size=size, num_filters=count,
                       name='conv_{}'.format(i))
         for i, (size, count) in enumerate(zip(filter_size, num_filter))),
        activation,
        (MaxPooling(size, name='pool_{}'.format(i))
         for i, size in enumerate(pooling_sizes)),
    ]))

    conv_sequence = ConvolutionalSequence(conv_layers, num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    conv_sequence.initialize()

    # MLP on top of the flattened convolutional features
    top_mlp_dims = ([np.prod(conv_sequence.get_dim('output'))] +
                    mlp_hiddens + [output_size])
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation, top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    mlp.initialize()

    predict = mlp.apply(out)

    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)

    # Two differently named copies of the error rate so that it can be shown
    # in two separate plots (the same channel cannot be plotted twice).
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    # --------------------------- regularization ---------------------------
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    penalized = weights + biases

    # Every weight and bias gets the same penalty factor.
    l2_penalty = T.sum([lambda_l2 * (p ** 2).sum() for p in penalized])
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(p).sum() for p in penalized])
    l1_penalty.name = 'l1_penalty'

    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    # ----------------------------- algorithm ------------------------------
    algorithm = GradientDescent(cost=costreg,
                                parameters=cg.parameters,
                                step_rule=Adam())

    # ------------------------------- data ---------------------------------
    istest = 'test' in config
    train_stream, valid_stream, test_stream = get_stream(batch_size,
                                                         image_shape,
                                                         test=istest)

    # ----------------------------- extensions -----------------------------
    # Additionally save whenever TrackTheBest flags a new best validation
    # error rate in the log.
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(
        ['after_epoch'],
        predicate=OnLogRecord('valid_error_rate_best_so_far'))

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    # Live plot on the bokeh server.  Channel names are
    # "<monitoring prefix>_<variable name>", so the gradient norm renamed
    # just above is logged as "train_grad_norm".
    plot = Plot(
        label,
        channels=[
            ['train_error_rate', 'valid_error_rate'],
            ['valid_cost', 'valid_error_rate2'],
            ['train_costreg', 'train_grad_norm',
             'train_l2_penalty', 'train_l1_penalty'],
        ],
        server_url="http://hades.calculquebec.ca:5042")

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate, error_rate2],
                             valid_stream,
                             prefix="valid"),
        TrainingDataMonitoring(
            [costreg, error_rate, error_rate2, grad_norm,
             l2_penalty, l1_penalty],
            prefix="train",
            after_epoch=True),
        plot,
        ProgressBar(),
        Printing(),
        # TrackTheBest must run before the checkpoint so that the
        # "best so far" record is already in the log when it is checked.
        TrackTheBest('valid_error_rate', min),
        checkpoint,
        # Early stopping
        FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4),
    ]

    model = Model(cost)
    main_loop = MainLoop(algorithm,
                         data_stream=train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, n_epochs,
                        learning_rate, hyperparameters, **kwargs):
    """Assemble (but do not run) the training main loop for one task.

    Parameters
    ----------
    name, task_name : str
        Combined into ``"<name>_<task_name>"``, which is stored in
        ``hyperparameters["name"]`` and used as prefix for dump/log files.
    n_patches : int
        Number of glimpses: one initial state plus ``n_patches - 1`` calls
        of ``ram.apply``.
    n_epochs : int
        Training stops after this many epochs.
    learning_rate : float
        Learning rate passed to Adam (steps are clipped to norm 1 first).
    hyperparameters : dict
        Forwarded as keyword arguments to ``get_task``, ``construct_model``,
        ``task.get_emitter`` and ``construct_monitors``.  It is modified in
        place: ``"name"`` and ``"n_channels"`` are set here.
    patch_shape, batch_size, n_spatial_dims, **kwargs
        Accepted for the caller's convenience; not read directly here.

    Returns
    -------
    MainLoop
        The configured main loop; the caller is expected to run it.
    """
    name = "%s_%s" % (name, task_name)
    hyperparameters["name"] = name

    task = get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    # NOTE: this changes the process-wide Theano configuration.
    theano.config.compute_test_value = "warn"

    x, x_shape, y = task.get_variables()

    ram = construct_model(task=task, **hyperparameters)
    ram.initialize()

    # Unroll the attention model: each step consumes the previous state.
    states = [ram.compute_initial_state(x, x_shape, as_dict=True)]
    n_steps = n_patches - 1
    for _ in range(n_steps):
        states.append(ram.apply(x, x_shape, as_dict=True, **states[-1]))

    emitter = task.get_emitter(input_dim=ram.get_dim("states"),
                               **hyperparameters)
    emitter.initialize()
    # Only the final state is used to compute the cost.
    cost = emitter.cost(states[-1]["states"], y, n_patches)
    cost.name = "cost"

    # A call with a single parenthesised string prints the same text on
    # Python 2 and Python 3.
    print("setting up main loop...")
    graph = ComputationGraph(cost)
    model = Model(cost)
    algorithm = GradientDescent(
        cost=cost,
        parameters=graph.parameters,
        step_rule=CompositeRule([StepClipping(1.),
                                 Adam(learning_rate=learning_rate)]))
    monitors = construct_monitors(
        x=x, x_shape=x_shape, y=y, cost=cost,
        algorithm=algorithm, task=task, model=model,
        ram=ram, graph=graph, **hyperparameters)
    main_loop = MainLoop(
        data_stream=task.get_stream("train"),
        algorithm=algorithm,
        extensions=(monitors + [
            FinishAfter(after_n_epochs=n_epochs),
            DumpMinimum(name + '_best', channel_name='valid_error_rate'),
            Dump(name + '_dump', every_n_epochs=10),
            ProgressBar(),
            Timing(),
            Printing(),
            PrintingTo(name + "_log")]),
        model=model)
    return main_loop
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=500,
         num_batches=None):
    """Train a LeNet-style convnet on MNIST with plain SGD.

    Parameters
    ----------
    save_to : str
        Destination handed to the ``Checkpoint`` extension.
    num_epochs : int
        Training stops after this many epochs ...
    num_batches : int or None
        ... or after this many batches (both are passed to ``FinishAfter``).
    feature_maps : list of int, optional
        Feature maps per convolutional layer; default ``[20, 50]``.
    mlp_hiddens : list of int, optional
        Hidden layer sizes of the top MLP; default ``[500]``.
    conv_sizes : list of int, optional
        Side of the square filters per layer; default ``[5, 5]``.
    pool_sizes : list of int, optional
        Side of the square pooling windows per layer; default ``[2, 2]``.
    batch_size : int
        Batch size of the train and test streams.
    """
    if feature_maps is None:
        feature_maps = [20, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    # The (size, size) pairs are materialised as lists: on Python 3 a bare
    # zip() is a one-shot iterator, which is not a safe thing to hand to a
    # brick as configuration.
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=list(zip(conv_sizes, conv_sizes)),
                    feature_maps=feature_maps,
                    pooling_sizes=list(zip(pool_sizes, pool_sizes)),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    # NOTE: the indices below assume at least two entries in
    # `convnet.layers` and two linear transformations in the top MLP, as is
    # the case with the default arguments.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        logging.info("Layer {} ({}) dim: {} {} {}".format(
            i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Apply the convnet to the input as it comes from the stream
    probs = convnet.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(),
                                           probs).copy(name='cost')
    error_rate = MisclassificationRate().apply(
        y.flatten(), probs).copy(name='error_rate')

    cg = ComputationGraph([cost, error_rate])

    mnist_train = MNIST(("train", ))
    mnist_train_stream = DataStream.default_stream(
        mnist_train,
        iteration_scheme=ShuffledScheme(mnist_train.num_examples,
                                        batch_size))

    mnist_test = MNIST(("test", ))
    mnist_test_stream = DataStream.default_stream(
        mnist_test,
        iteration_scheme=ShuffledScheme(mnist_test.num_examples,
                                        batch_size))

    # Train with simple SGD
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        DataStreamMonitoring([cost, error_rate],
                             mnist_test_stream,
                             prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing(),
    ]

    model = Model(cost)

    main_loop = MainLoop(algorithm,
                         mnist_train_stream,
                         model=model,
                         extensions=extensions)
    main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False):
    """Build and train an encoder-decoder with an initial decoder context.

    Parameters
    ----------
    config : dict
        Experiment configuration.  Keys read here: ``src_vocab_size``,
        ``trg_vocab_size``, ``enc_embed``, ``dec_embed``, ``enc_nhids``,
        ``dec_nhids``, ``context_dim``, ``weight_scale``, ``dropout``,
        ``weight_noise_ff``, ``saveto``, ``config_file``, ``finish_after``,
        ``save_freq``, ``hook_samples``, ``sampling_freq``, ``bleu_script``,
        ``normalized_bleu``, ``bleu_val_freq``, ``reload``,
        ``step_clipping``, ``step_rule`` and, when plotting,
        ``model_save_directory``.
    tr_stream
        Training data stream; also used by the sampler.
    dev_stream
        Development data stream used by the BLEU validator.
    use_bokeh : bool
        Plot cost and BLEU when true and ``BOKEH_AVAILABLE`` is set.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    # The bidirectional encoder yields 2 * enc_nhids features per position.
    decoder = InitialContextDecoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['context_dim'])

    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        initial_context)
    cost.name = 'decoder_cost'

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent weights override the pushed default.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        # Dropout is applied to the output of maxout.  The configured value
        # is the probability of dropping a unit, so it should usually be
        # <= 0.5.
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        noisy_bricks = [
            encoder.lookup,
            encoder.fwd_fork,
            encoder.back_fork,
            decoder.sequence_generator.readout,
            decoder.sequence_generator.fork,
            decoder.transition.initial_transformer,
        ]
        # Collected into a real list: dict views cannot be concatenated.
        noisy_params = []
        for brick in noisy_bricks:
            noisy_params.extend(Selector(brick).get_parameters().values())
        cg = apply_noise(cg, noisy_params, config['weight_noise_ff'])

    # TODO: weight noise for recurrent params isn't currently implemented --
    # see config['weight_noise_rec']

    # Print shapes.  Shapes are tuples, which do not accept a width in their
    # format spec, so they are converted to str first.
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Create the training directory, and copy this config there if the
    # directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Theano variables needed for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # NOTE(review): `source_vocab` and `target_vocab` are not defined in this
    # function; presumably they are module-level globals -- confirm, because
    # otherwise the two branches below raise NameError.

    # Add sampling
    # TODO: the sampler still has to be modified to use the contexts
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab=source_vocab,
                    trg_vocab=target_vocab,
                    src_vocab_size=config['src_vocab_size'],
                    ))

    # TODO: add sampling_context to BleuValidator and Sampler
    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, sampling_context, samples=samples,
                          config=config,
                          model=search_model, data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost', 'validation_set_bleu_score']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # With dropout or weight noise the cost to optimise is the output of the
    # modified graph, not the original cost variable.
    regularized = config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0
    training_cost = cg.outputs[0] if regularized else cost
    # SECURITY NOTE: config['step_rule'] is evaluated as Python code to look
    # up the step rule class.  Only run this with configuration files you
    # trust.
    step_rule_class = eval(config['step_rule'])
    algorithm = GradientDescent(
        cost=training_cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 step_rule_class()])
    )

    # Enrich the logged information
    extensions.append(
        Timing(every_n_batches=100)
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
def main(num_epochs=100):
    """Train a two-layer convnet on CIFAR-10 with live image diagnostics.

    Besides cost/misclassification monitoring on the test, validation and
    training data, a ``PlotManager`` extension shows training examples, the
    convolutional weights, feature maps, a sensitivity (saliency) map and a
    deconvolution view.  Training stops after ``num_epochs`` epochs.
    """
    rescale_axes = (0, 'x', 'x', 'x')

    def rescaled_stream(dataset, scheme):
        """Stream ``dataset`` under ``scheme`` as floatX, rescaled."""
        return Mapping(
            data_stream=ForceFloatX(
                data_stream=DataStream(
                    dataset=dataset,
                    iteration_scheme=scheme,
                )
            ),
            mapping=Rescale(scale=1/127.5, shift=-127.5)
        )

    def peak_normalized(images):
        """Divide each leading-axis entry by its largest absolute value."""
        peak = abs(images).max(axis=(1, 2, 3))
        return images / peak.dimshuffle(rescale_axes)

    def image_display(stream, shape):
        """Wrap ``stream`` in an image display for ``shape`` images."""
        return ImageDataStreamDisplay(
            data_stream=stream,
            image_shape=shape,
            axes=('c', 0, 1),
            shift=0,
            rescale=1.,
        )

    # ------------------------------ model ---------------------------------
    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    num_filters = 64
    first_conv = ConvolutionalLayer(filter_size=(8, 8),
                                    num_filters=num_filters,
                                    num_channels=3,
                                    step=(1, 1),
                                    border_mode='valid')
    pooling = MaxPooling(pooling_size=(2, 2))
    second_conv = ConvolutionalLayer(filter_size=(5, 5),
                                     num_filters=num_filters,
                                     num_channels=num_filters,
                                     step=(1, 1),
                                     border_mode='valid')
    convnet = ConvolutionalNetwork(layers=[first_conv, pooling, second_conv])
    output_shape = convnet.get_output_shape((32, 32))
    convnet.set_input_shape((32, 32))

    fc_net = NeuralSoftmax(input_dim=numpy.prod(output_shape) * num_filters,
                           n_classes=10, n_hidden=[500])
    convnet_features = convnet.apply(x)
    probs = fc_net.get_probs(features=convnet_features.flatten(2))
    params = convnet.get_params() + fc_net.get_params()
    weights = convnet.get_weights()

    cost = fc_net.get_cost(probs=probs, targets=y).mean()
    cost.name = 'cost'
    misclassification = fc_net.get_misclassification(
        probs=probs, targets=y
    ).mean()
    misclassification.name = 'misclassification'

    # ------------------------------ data ----------------------------------
    train_dataset = CIFAR10('train', flatten=False)
    test_dataset = CIFAR10('test', flatten=False)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Momentum(learning_rate=.01, momentum=0.1))

    train_data_stream = rescaled_stream(
        train_dataset,
        ShuffledScheme(examples=range(50000), batch_size=100))
    # Validation examples are drawn from the training set file.
    valid_data_stream = rescaled_stream(
        train_dataset,
        SequentialScheme(examples=range(40000, 50000), batch_size=1000))
    test_data_stream = rescaled_stream(
        test_dataset,
        SequentialScheme(examples=test_dataset.num_examples, batch_size=100))

    model = Model(cost)

    # --------------------------- monitoring -------------------------------
    tracked = [cost, misclassification]
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(list(tracked), test_data_stream,
                             prefix='test'),
        DataStreamMonitoring(list(tracked), valid_data_stream,
                             prefix='valid'),
        TrainingDataMonitoring(list(tracked), prefix='train',
                               after_epoch=True),
    ]

    plotters = [Plotter(
        channels=[['test_cost', 'test_misclassification',
                   'train_cost', 'train_misclassification']],
        titles=['Costs'])]

    # ---------------------------- displays --------------------------------
    display_train = image_display(copy.deepcopy(train_data_stream),
                                  (3, 32, 32))

    weight_display = WeightDisplay(
        weights=weights,
        transpose=(0, 1, 2, 3),
        image_shape=(3, 8, 8),
        axes=('c', 0, 1),
        shift=-0.5,
        rescale=2.,
    )

    # Feature maps
    one_example_train_data_stream = rescaled_stream(
        train_dataset,
        ShuffledScheme(examples=train_dataset.num_examples, batch_size=1))

    # Swap the first two axes so that each feature map becomes one image.
    displayable_convnet_features = peak_normalized(
        convnet_features.dimshuffle((1, 0, 2, 3)))
    get_displayable_convnet_features = theano.function(
        [x], displayable_convnet_features
    )
    display_feature_maps = image_display(
        Mapping(
            data_stream=one_example_train_data_stream,
            mapping=TupleMapping(get_displayable_convnet_features,
                                 same_len_out=True)
        ),
        (1, ) + output_shape)

    # Saliency map
    displayable_saliency_map = peak_normalized(tensor.grad(cost, x))
    get_displayable_saliency_map = theano.function(
        [x, y], displayable_saliency_map
    )
    display_saliency_map = image_display(
        Mapping(
            data_stream=copy.deepcopy(train_data_stream),
            mapping=TupleMapping(get_displayable_saliency_map,
                                 same_len_out=True,
                                 same_len_in=True)
        ),
        (3, 32, 32))

    # Deconvolution
    x_repeated = x.repeat(num_filters, axis=0)
    convnet_features_repeated = convnet.apply(x_repeated)
    filter_selector = tensor.eye(num_filters).repeat(
        x.shape[0], axis=0
    ).dimshuffle((0, 1, 'x', 'x'))
    convnet_features_selected = convnet_features_repeated * filter_selector
    displayable_deconvolution = peak_normalized(
        tensor.grad(misclassification, x_repeated,
                    known_grads={
                        convnet_features_selected:
                            convnet_features_selected
                    }))
    get_displayable_deconvolution = theano.function(
        [x], displayable_deconvolution
    )
    display_deconvolution = image_display(
        Mapping(
            data_stream=one_example_train_data_stream,
            mapping=TupleMapping(get_displayable_deconvolution,
                                 same_len_out=True)
        ),
        (3, 32, 32))

    plotters.append(DisplayImage(
        image_getters=[display_train, weight_display, display_feature_maps,
                       display_saliency_map, display_deconvolution],
        titles=['Training examples', 'Convolutional weights', 'Feature maps',
                'Sensitivity', 'Deconvolution']
    ))

    extensions.append(PlotManager('CIFAR-10 convnet examples',
                                  plotters=plotters,
                                  after_epoch=False,
                                  every_n_epochs=10,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model, data_stream=train_data_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
def main(name, epochs, batch_size, learning_rate,
         attention, n_iter, enc_dim, dec_dim, z_dim):
    """Assemble a ``DrawModel`` for binarized MNIST and run its main loop.

    ``attention`` selects the reader/writer pair: the empty string picks
    the plain ``Reader``/``Writer``; anything else is parsed as
    ``"<read_N>,<write_N>"`` and picks ``AttentionReader`` /
    ``AttentionWriter`` with those ``N`` values.  ``name`` is extended with
    a tag built from the other arguments, and the extended name is used for
    the checkpoint file and the plot.
    """
    img_height, img_width = (28, 28)
    x_dim = 28 * 28

    # Two independent dicts (and initializer objects): one is handed to the
    # LSTMs, the other to every remaining brick.  The author left
    # Orthogonal() as a commented-out alternative for 'weights_init'.
    rnninits = dict(weights_init=IsotropicGaussian(0.01),
                    biases_init=Constant(0.))
    inits = dict(weights_init=IsotropicGaussian(0.01),
                 biases_init=Constant(0.))

    if attention == "":
        # No attention: reader and writer work on the whole image.
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)
        attention_tag = "full"
    else:
        read_N, write_N = [int(size) for size in attention.split(',')]
        read_dim = 2 * read_N**2
        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **inits)
        attention_tag = "r%d-w%d" % (read_N, write_N)

    def lr_tag(value):
        """Encode a float as '<leading digit><negated decimal exponent>'.

        Examples: 0.1 -> "11", 0.01 -> "12", 0.001 -> "13", 0.005 -> "53".
        """
        first_digit = ("%e" % value)[0]
        exponent = np.floor(np.log10(value))
        return "%s%d" % (first_digit, -exponent)

    name = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        name, attention_tag, n_iter, enc_dim, dec_dim, z_dim,
        lr_tag(learning_rate))

    # Echo the experiment configuration.
    print("\nRunning experiment %s" % name)
    for template, value in (
            (" learning rate: %5.3f", learning_rate),
            (" attention: %s", attention),
            (" n_iterations: %d", n_iter),
            (" encoder dimension: %d", enc_dim),
            (" z dimension: %d", z_dim),
            (" decoder dimension: %d", dec_dim)):
        print(template % value)
    print()

    # ---- bricks ---------------------------------------------------------
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    # ---- cost -----------------------------------------------------------
    x = tensor.matrix('features')
    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    # Sum kl_terms over axis 0, then average the result.
    kl_total = kl_terms.sum(axis=0).mean()
    cost = recons_term + kl_total
    cost.name = "nll_bound"

    # ---- training algorithm ---------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # RMSProp(learning_rate) and Momentum(learning_rate, momentum=0.95)
    # were left by the author as commented-out alternatives.
    step_rule = CompositeRule([
        StepClipping(10.),
        Adam(learning_rate),
    ])
    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)

    # ---- monitored quantities -------------------------------------------
    monitors = [cost]
    for t in range(n_iter):
        step_kl = kl_terms[t, :].mean()
        step_kl.name = "kl_term_%d" % t
        monitors.append(step_kl)

    # Training additionally tracks gradient and step norms.
    train_monitors = monitors + [
        aggregation.mean(algorithm.total_gradient_norm),
        aggregation.mean(algorithm.total_step_norm),
    ]

    # Channel groups for live plotting.
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    # ---- data -----------------------------------------------------------
    def sequential_stream(dataset):
        # One sequential pass over the whole dataset in `batch_size` chunks.
        return DataStream(dataset,
                          iteration_scheme=SequentialScheme(
                              dataset.num_examples, batch_size))

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_valid = BinarizedMNIST("valid", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    train_stream = sequential_stream(mnist_train)
    # Built but not consumed: monitoring on the validation stream is
    # switched off.
    valid_stream = sequential_stream(mnist_valid)
    test_stream = sequential_stream(mnist_test)

    # ---- main loop ------------------------------------------------------
    model = Model(cost)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            train_monitors,
            prefix="train",
            after_every_epoch=True),
        DataStreamMonitoring(
            monitors,
            test_stream,
            prefix="test"),
        Checkpoint(name + ".pkl", save_separately=['log', 'model']),
        Plot(name, channels=plot_channels),
        ProgressBar(),
        Printing(),
    ]
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
def training(self, fea2obj, batch_size, learning_rate=0.005,
             steprule='adagrad', wait_epochs=5, kl_weight_init=None,
             klw_ep=50, klw_inc_rate=0, num_epochs=None):
    """Build the model described by ``self._config`` and run its main loop.

    Args:
        fea2obj: Feature container passed to ``get_comb_stream`` and
            ``build_model_new``; ``fea2obj['targets'].t2idx`` supplies the
            number of targets.
        batch_size: Batch size of the training stream (the dev stream is
            requested with ``batch_size=None``).
        learning_rate: Learning rate for every step rule except AdaDelta,
            which is constructed without one.
        steprule: Substring-matched, in this order, against 'adagrad',
            'adadelta', 'decay' (RMSProp followed by step clipping),
            'momentum' and 'adam'.
        wait_epochs: Number of epochs without improvement of ``dev_cost``
            after which training stops.
        kl_weight_init: Initial value of the shared ``kl_weight``.  When
            ``None``, the config entry ``kld_weight`` is used, or 1 if the
            config has no such entry.
        klw_ep: Currently unused; only the disabled weight-schedule
            extensions referred to it.
        klw_inc_rate: Currently unused, see ``klw_ep``.
        num_epochs: Maximum number of epochs; falls back to the config
            entry ``nepochs`` when falsy.

    Raises:
        ValueError: If ``steprule`` matches none of the known step rules.
    """
    networkfile = self._config['net']
    n_epochs = num_epochs or int(self._config['nepochs'])
    reg_weight = float(self._config['loss_weight'])
    reg_type = self._config['loss_reg']
    numtrain = (int(self._config['num_train'])
                if 'num_train' in self._config else None)

    train_stream, num_samples_train = get_comb_stream(
        fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
    dev_stream, num_samples_dev = get_comb_stream(
        fea2obj, 'dev', batch_size=None, shuffle=False)
    logger.info('sources: %s -- number of train/dev samples: %d/%d',
                train_stream.sources, num_samples_train, num_samples_dev)

    t2idx = fea2obj['targets'].t2idx

    # Explicit argument first, then the config, then the default of 1.
    # The former one-liner `a or b if c else 1` parsed as
    # `(a or b) if c else 1`: it dropped an explicit argument whenever the
    # config had no 'kld_weight', and treated an explicit 0 as "not given".
    if kl_weight_init is not None:
        klw_init = kl_weight_init
    elif 'kld_weight' in self._config:
        klw_init = float(self._config['kld_weight'])
    else:
        klw_init = 1
    # %s rather than %d: the weight may be fractional and %d truncates it.
    logger.info('kl_weight_init: %s', klw_init)
    kl_weight = shared_floatx(klw_init, 'kl_weight')
    entropy_weight = shared_floatx(1., 'entropy_weight')

    (cost, p_at_1, _, KLD, logpy_xz, pat1_recog,
     misclassify_rate) = build_model_new(
        fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

    cg = ComputationGraph(cost)
    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    logger.info('Model weights are: %s', weights)
    if 'L2' in reg_type:
        cost += reg_weight * l2_norm(weights)
        logger.info('applying %s with weight: %f ', reg_type, reg_weight)

    # Dropout is switched off by the negative rate.
    # NOTE(review): if it is re-enabled, `cg` was built before the L2 term
    # was added, so `cg.outputs[0]` would drop that term.
    dropout = -0.1
    if dropout > 0:
        cg = apply_dropout(cg, weights, dropout)
        cost = cg.outputs[0]

    cost.name = 'cost'

    logger.info('Our Algorithm is : %s, and learning_rate: %f',
                steprule, learning_rate)
    if 'adagrad' in steprule:
        cnf_step_rule = AdaGrad(learning_rate)
    elif 'adadelta' in steprule:
        cnf_step_rule = AdaDelta(decay_rate=0.95)
    elif 'decay' in steprule:
        cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90)
        cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
    elif 'momentum' in steprule:
        cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
    elif 'adam' in steprule:
        cnf_step_rule = Adam(learning_rate=learning_rate)
    else:
        # Previously this was only logged, and the function then crashed
        # on the unbound `cnf_step_rule` a few lines later.
        raise ValueError(
            'The steprule param is wrong! which is: %s' % steprule)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=cnf_step_rule,
                                on_unused_sources='warn')

    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
        pat1_recog
    ]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')

    dev_monitor = DataStreamMonitoring(
        variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog,
                   misclassify_rate],
        after_epoch=True,
        before_first_epoch=True,
        data_stream=dev_stream,
        prefix="dev")

    # dev_monitor comes before TrackTheBest, which reads 'dev_cost'.
    extensions = [
        dev_monitor,
        train_monitor,
        Timing(),
        TrackTheBest('dev_cost'),
        FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                   epochs=wait_epochs),
        Printing(after_batch=False),
        FinishAfter(after_n_epochs=n_epochs),
        saveload.Load(networkfile + '.toload.pkl'),
    ] + track_best('dev_cost', networkfile + '.best.pkl')

    # Disabled by the author: SharedVariableModifier schedules that moved
    # kl_weight / entropy_weight by klw_inc_rate every klw_ep epochs.

    logger.info('number of parameters in the model: %d',
                tensor.sum([p.size for p in cg.parameters]).eval())
    logger.info('Lookup table sizes: %s',
                [p.size.eval() for p in cg.parameters if 'lt' in p.name])

    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         model=Model(cost), extensions=extensions)
    main_loop.run()
def main():
    """Train a one-unit linear scorer on IMDB text served by Fuel servers.

    The features are summed over axis 1, fed through a single ``Linear``
    brick and a sigmoid, and trained with binary cross-entropy.  Two child
    processes serve the GloVe-transformed, padded train and test streams on
    ports selected by ``theano.config.device``; they are shut down when the
    main loop returns or raises.

    Raises:
        KeyError: If ``theano.config.device`` has no entry in the port
            table (only 'gpu0' and 'gpu1' are listed).
    """
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    embedding_size = 300
    glove_version = "glove.6B.300d.txt"
    # Smaller alternative left by the author:
    # embedding_size = 50
    # glove_version = "vectors.6B.50d.txt"

    # `m.mean() * 0` adds nothing numerically; it only makes the mask an
    # input of the graph.
    o = x.sum(axis=1) + m.mean() * 0

    score_layer = Linear(
        # Was a hard-coded 300, which would silently disagree with
        # `embedding_size` as soon as the alternative above is selected.
        input_dim=embedding_size,
        output_dim=1,
        weights_init=IsotropicGaussian(std=0.02),
        biases_init=Constant(0.),
        name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)
    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================
    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cg.outputs[0],
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=4),
            AdaM(),
        ])
    )

    # ========
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("setting up data")
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 16

    def start_server(port, which_set):
        """Serve the padded, GloVe-transformed `which_set` stream on `port`."""
        fuel.server.logger.setLevel('WARN')
        dataset = IMDBText(which_set, sorted=True)

        n_train = dataset.num_examples
        # scheme = ShuffledScheme(examples=n_train, batch_size=batch_size)
        scheme = BatchwiseShuffledScheme(examples=n_train,
                                         batch_size=batch_size)

        stream = DataStream(
            dataset=dataset,
            iteration_scheme=scheme)
        print("loading glove")
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
        )

        fuel.server.start_server(padded, port=port, hwm=20)

    # Resolve both ports before spawning anything, so an unsupported
    # device fails before a server process exists.
    train_port = ports[theano.config.device + '_train']
    test_port = ports[theano.config.device + '_test']

    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    try:
        train_stream = ServerDataStream(
            ('features', 'features_mask', 'targets'), port=train_port)
        # Only needed by the disabled test monitoring below.
        test_stream = ServerDataStream(
            ('features', 'features_mask', 'targets'), port=test_port)

        print("setting up model")

        n_examples = 25000
        # ======
        model = Model(cost)
        extensions = []
        extensions.append(
            EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
        extensions.append(TrainingDataMonitoring(
            [
                cost,
                misclassification,
            ],
            prefix='train',
            after_epoch=True
        ))

        # Disabled by the author:
        # extensions.append(DataStreamMonitoring(
        #     [cost, misclassification],
        #     data_stream=test_stream,
        #     prefix='test',
        #     after_epoch=True
        # ))
        extensions.append(Timing())
        extensions.append(Printing())

        extensions.append(Plot(
            theano.config.device + "_result",
            channels=[['train_cost']],
            after_epoch=True
        ))

        main_loop = MainLoop(
            model=model,
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=extensions)
        main_loop.run()
    finally:
        # The server processes were previously never stopped or joined.
        for server in (train_p, test_p):
            server.terminate()
            server.join()
def run_pretrain(model, hyper_params, cost, train_data, valid_data=None,
                 extra_costs=None):
    """Train ``model`` by gradient descent on ``cost`` with a Blocks main loop.

    Args:
        model: Passed unchanged to ``MainLoop`` as its ``model``.
        hyper_params: Mapping that is read for 'l1wdecay', 'step_rule'
            ('Adam', 'RMSProp', anything else selects ``Scale``),
            'learning_rate', 'max_epochs', 'batch_size',
            'blocks_extensions_train_monitoring_channels',
            'blocks_extensions_valid_monitoring_channels' (only with
            ``valid_data``), 'blocks_extensions_printing',
            'blocks_extensions_progressbar', 'blocks_extensions_bokeh'
            (plus '..._bokeh_plot_title' and '..._bokeh_channels' when
            plotting) and, optionally, 'blocks_print_variable_names'.
        cost: Symbolic cost to minimise; an L1 weight penalty is added
            when ``hyper_params['l1wdecay'] > 0``.
        train_data: Dataset iterated with a ``ShuffledScheme``.
        valid_data: Optional dataset monitored with a ``SequentialScheme``.
            When given, the best parameters are selected on
            'cost_valid', otherwise on 'cost_train'.
        extra_costs: Optional list of further variables to monitor.

    Returns:
        A tuple ``(best_params.values, best_value)`` where ``best_value``
        is ``main_loop.status['best_' + channel]`` for the selection
        channel.
    """
    from fuel.streams import DataStream
    from fuel.schemes import SequentialScheme, ShuffledScheme

    from blocks.filter import VariableFilter
    from blocks.graph import ComputationGraph
    from blocks.roles import WEIGHT
    from blocks.algorithms import GradientDescent, Adam, RMSProp, Scale
    from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
    from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
    from blocks.extensions.predicates import OnLogRecord
    from blocks.monitoring import aggregation
    from blocks.main_loop import MainLoop
    from blocks.extensions.training import TrackTheBest
    from deepthought.extensions.parameters import BestParams

    if extra_costs is None:
        extra_costs = []

    cg = ComputationGraph([cost])

    # TODO: more hyper-params for regularization
    # L1 regularization
    if hyper_params['l1wdecay'] > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cost = cost + hyper_params['l1wdecay'] * sum(
            [abs(W).sum() for W in weights])

    # The channel names 'cost_train' / 'cost_valid' below depend on this.
    cost.name = 'cost'

    # set up step_rule
    if hyper_params['step_rule'] == 'Adam':
        step_rule = Adam(learning_rate=hyper_params['learning_rate'])
    elif hyper_params['step_rule'] == 'RMSProp':
        step_rule = RMSProp(learning_rate=hyper_params['learning_rate'])
    else:
        step_rule = Scale(learning_rate=hyper_params['learning_rate'])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    # All prints use the single-argument call form, which produces the
    # same output on Python 2 and Python 3.
    if ('blocks_print_variable_names' in hyper_params
            and hyper_params['blocks_print_variable_names']):
        print('cg.variables: %s' % (cg.variables,))

    def select_named_variables(names):
        """Collect graph variables whose name is in *names*, in name order."""
        selected = []
        for var_name in names:
            for v in cg.variables:
                if v.name == var_name:
                    print('Monitoring variable: %s' % (v,))
                    selected.append(v)
        return selected

    train_monitoring_vars = (
        [cost] + extra_costs
        + [aggregation.mean(algorithm.total_gradient_norm)])
    train_monitoring_vars.extend(select_named_variables(
        hyper_params['blocks_extensions_train_monitoring_channels']))

    # default extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=hyper_params['max_epochs']),
                  TrainingDataMonitoring(
                      train_monitoring_vars,
                      suffix="train",
                      after_epoch=True)
                  ]

    # additional stuff if validation set is used
    if valid_data is not None:
        valid_monitoring_vars = [cost] + extra_costs
        valid_monitoring_vars.extend(select_named_variables(
            hyper_params['blocks_extensions_valid_monitoring_channels']))

        extensions.append(
            DataStreamMonitoring(
                valid_monitoring_vars,
                DataStream.default_stream(
                    valid_data,
                    iteration_scheme=SequentialScheme(
                        valid_data.num_examples,
                        hyper_params['batch_size'])),
                suffix="valid"))

        best_channel = 'cost_valid'
        print('#train: %s #valid: %s' % (train_data.num_examples,
                                         valid_data.num_examples))
    else:
        best_channel = 'cost_train'
        print('#train: %s' % (train_data.num_examples,))

    # tracking of the best
    best_params = BestParams()
    best_params.add_condition(
        ['after_epoch'],
        predicate=OnLogRecord(best_channel + '_best_so_far'))
    extensions.append(TrackTheBest(best_channel))
    extensions.append(best_params)  # after TrackTheBest!

    # printing and plotting
    if hyper_params['blocks_extensions_printing'] is True:
        extensions.append(Printing())  # optional
    if hyper_params['blocks_extensions_progressbar'] is True:
        extensions.append(ProgressBar())

    if hyper_params['blocks_extensions_bokeh'] is True:
        try:
            from blocks_extras.extensions.plot import Plot
            bokeh_available = True
        except ImportError:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; plotting stays optional when the package is
            # missing.
            bokeh_available = False
        print('bokeh available:  %s' % (bokeh_available,))

        if bokeh_available:
            extensions.append(Plot(
                hyper_params['blocks_extensions_bokeh_plot_title'],
                channels=hyper_params['blocks_extensions_bokeh_channels'],
            ))

    main_loop = MainLoop(
        algorithm,
        DataStream.default_stream(
            train_data,
            iteration_scheme=ShuffledScheme(
                train_data.num_examples, hyper_params['batch_size'])),
        model=model,
        extensions=extensions)

    main_loop.run()

    return best_params.values, main_loop.status['best_' + best_channel]
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True):
    """Build (but do not run) the main loop that pre-trains the RNN part.

    The symbolic inputs ``x``, ``x_mask``, ``y`` and ``y_mask`` are not
    created here; they are taken from the enclosing module scope.

    Args:
        train: Data stream the main loop trains on.
        rnnrbm: Model whose ``rnn_pretrain_pred(x, x_mask)`` provides the
            predicted probabilities (first of its four return values).
        test: Optional data stream monitored every 40 batches.
        epochs: Number of epochs after which training finishes.
        bokeh: Whether to add the ``Plot`` extension.

    Returns:
        The configured ``MainLoop``; the caller is expected to run it.
    """
    lr = theano.shared(float32(0.1))

    probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask)
    cost = NegativeLogLikelihood().apply(y, probs, y_mask)

    error_rate = MismulitclassificationRate().apply(y, probs, y_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'final_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])

    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite(),
    ])
    algorithm = GradientDescent(step_rule=step_rule, cost=cost,
                                params=cg.parameters)

    extensions = [
        # Multiply the learning rate by 0.7 whenever the counter `n`
        # handed to the callback is a multiple of 700.
        SharedVariableModifier(
            parameter=lr,
            function=lambda n, v: float32(0.7 * v) if n % 700 == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate],
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar(),
    ]

    if test is not None:
        extensions.append(
            DataStreamMonitoring(
                [cost, error_rate, mistake_rate],
                data_stream=test,
                updates=cg.updates,
                prefix="test",
                after_epoch=False,
                every_n_batches=40))

    if bokeh:
        extensions.append(
            Plot(
                'Pretrain RNN',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note',
                    ],
                    # The cost is named 'final_cost' and monitored with
                    # prefix "train"; the former 'train_rbm_cost' is never
                    # written by any monitor set up in this function.
                    ['train_final_cost'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
def add_timing(self, **kwargs):
    """Append a ``Timing`` extension built from ``kwargs`` to ``self.extensions``."""
    timing_extension = Timing(**kwargs)
    self.extensions.append(timing_extension)
from blocks_extras.extensions.synchronization import ( Synchronize, SynchronizeWorker) from platoon.param_sync import ASGD sync_rule = ASGD() worker = SynchronizeWorker( sync_rule, control_port=args.platoon_port, socket_timeout=2000) extensions = [] if args.load_experiment and (not worker or worker.is_main_worker): extensions += [Load(os.path.join( save_dir, "pkl", "best_" + args.load_experiment + ".tar"))] extensions += [ Timing(every_n_batches=args.save_every), train_monitor] if not worker or worker.is_main_worker: extensions += [ valid_monitor, TrackTheBest( 'valid_nll', every_n_batches=args.save_every, before_first_epoch=True), Plot( os.path.join(save_dir, "progress", exp_name + ".png"), [['train_nll', 'valid_nll']], every_n_batches=args.save_every, email=False), Checkpoint(
def main(name, epochs, batch_size, learning_rate):
    """Train an attention read -> MLP -> attention write model on MNIST.

    Two small MLPs map the input image to the five attention parameters of
    a reading and a writing ``ZoomableAttentionWindow``; a third MLP maps
    the read window to the written one.  The written canvas is divided by
    the writer's ``gamma`` and passed through a sigmoid, and the result is
    scored against the input with binary cross-entropy.  ``name`` defaults
    to "att-rw" and names the serialized main loop.
    """
    name = "att-rw" if name is None else name

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print()

    # ---- bricks ---------------------------------------------------------
    img_height, img_width = 28, 28
    read_N, write_N = 12, 14
    x_dim = img_height * img_width

    # One initialization config shared by all three MLPs; Orthogonal() was
    # left by the author as a commented-out alternative for the weights.
    inits = dict(weights_init=IsotropicGaussian(0.001),
                 biases_init=Constant(0.))

    def tanh_mlp(dims, brick_name):
        # MLP with a Tanh layer followed by a linear (Identity) layer.
        return MLP(activations=[Tanh(), Identity()], dims=dims,
                   name=brick_name, **inits)

    reader = ZoomableAttentionWindow(img_height, img_width, read_N)
    writer = ZoomableAttentionWindow(img_height, img_width, write_N)

    # Networks producing the attention parameters of reader and writer.
    mlpr = tanh_mlp([x_dim, 50, 5], "RMLP")
    mlpw = tanh_mlp([x_dim, 50, 5], "WMLP")
    # Network between the read window and the write window.
    mlp = tanh_mlp([read_N**2, 300, write_N**2], "MLP")

    for brick in (mlpr, mlpw, mlp):
        brick.allocate()
        brick.initialize()

    # ---- cost -----------------------------------------------------------
    x = tensor.matrix('features')

    read_params = mlpr.apply(x)
    write_params = mlpw.apply(x)

    # The reader's gamma is not used; only the writer's scales the canvas.
    center_y, center_x, delta, sigma, gamma = reader.nn2att(read_params)
    glimpse = reader.read(x, center_y, center_x, delta, sigma)

    hidden = mlp.apply(glimpse)

    center_y, center_x, delta, sigma, gamma = writer.nn2att(write_params)
    canvas = writer.write(hidden, center_y, center_x, delta, sigma) / gamma
    x_recons = T.nnet.sigmoid(canvas)

    cost = BinaryCrossEntropy().apply(x, x_recons)
    cost.name = "cost"

    # ---- training algorithm ---------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    # RMSProp(learning_rate) and Momentum(learning_rate, momentum=0.95)
    # were left by the author as commented-out alternatives.
    step_rule = CompositeRule([
        RemoveNotFinite(),
        Adam(learning_rate),
        StepClipping(3.),
    ])
    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)

    # ---- monitored quantities -------------------------------------------
    monitors = [cost]
    # Training additionally tracks gradient and step norms.
    train_monitors = monitors + [
        aggregation.mean(algorithm.total_gradient_norm),
        aggregation.mean(algorithm.total_step_norm),
    ]

    # Only referenced by the Plot extension, which is switched off below.
    plot_channels = [
        ["cost"],
    ]

    # ---- data -----------------------------------------------------------
    def floatx_stream(dataset):
        # Sequential pass over `dataset` in `batch_size` chunks, wrapped in
        # ForceFloatX.
        return ForceFloatX(
            DataStream(dataset,
                       iteration_scheme=SequentialScheme(
                           dataset.num_examples, batch_size)))

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])

    # ---- main loop ------------------------------------------------------
    model = Model(cost)
    train_stream = floatx_stream(mnist_train)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=epochs),
        DataStreamMonitoring(
            monitors,
            floatx_stream(mnist_test),
            prefix="test"),
        TrainingDataMonitoring(
            train_monitors,
            prefix="train",
            after_every_epoch=True),
        SerializeMainLoop(name + ".pkl"),
        # Plot(name, channels=plot_channels) is disabled.
        ProgressBar(),
        Printing(),
    ]
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # add the tags from this function to the IMT datastream # prediction function signature # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask] prediction_function = get_prediction_function(exp_config=config) tr_stream = Mapping( tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 6]), #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]), add_sources=('predictions', 'orig_readouts', 'prediction_tags')) # now datastream has 11 things import ipdb ipdb.set_trace() # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffs # the only difference is the index of the suffix tr_stream = Mapping(tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 8]), add_sources=('dummy_predictions', 'readouts', 'dummy_prediction_tags')) import ipdb ipdb.set_trace() # Create the prediction confidence model # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') # symbolic variable which tags each timestep as GOOD/BAD # Note: later this might be tags for a hypothesis i.e. 
from TER(p), right now the timesteps are actually determined by the reference # By zipping the confidence model output with the reference, we get the model's confidence that this reference word # will be predicted correctly prediction_tags = tensor.matrix('prediction_tags') readouts = tensor.tensor3('readouts') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' cost = decoder.confidence_cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask, readouts, prediction_tags) # WORKING: add l2 regularization logger.info('Creating computational graph') # working: implement cost for confidence model cg = ComputationGraph(cost) # INITIALIZATION logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() #cost_cg = ComputationGraph(cost) if config['l2_reg']: l2_reg_alpha = config['l2_reg_alpha'] model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # do we need to name the cost variable again? 
cost.name = 'cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name in set([ 'confidence_model1_apply_output', 'confidence_model2_apply_output', 'confidence_model3_apply_output' ]) ] # if x.name == 'maxout_apply_output'] # if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # WORKING: implement confidence -- remove all params except output model cost_model = Model(cost) model_params = cost_model.get_parameter_dict() trainable_params = cg.parameters import ipdb ipdb.set_trace() print('trainable params') #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k] #for p in params_to_remove: # trainable_params.remove(p) # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W'] # trainable_params.remove(source_embeddings) # trainable_params.remove(target_embeddings) # END WORKING: implement confidence -- remove all params except output model # TODO: fixed dropout mask for recurrent params? 
# Print shapes # shapes = [param.get_value().shape for param in cg.parameters] # logger.info("Parameter shapes: ") # for shape, count in Counter(shapes).most_common(): # logger.info(' {:15}: {}'.format(shape, count)) # logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names # enc_dec_param_dict = merge(Selector(encoder).get_parameters(), # Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}" # .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), # Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # WORKING: confidence prediction #monitor everything that could possibly be relevant # Set up the sampling graph for validation during training # Theano variables for the sampling graph # Note this also loads the model parameters sampling_vars = load_params_and_get_beam_search(config, encoder=encoder, decoder=decoder) beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab=source_vocab, # trg_vocab=target_vocab, # src_vocab_size=config['src_vocab_size'])) # 
Add early stopping based on bleu #if config['bleu_script'] is not None: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # TODO: add first-word accuracy validation # TODO: add IMT meteor early stopping #if config.get('imt_f1_validation', None) is not None: # logger.info("Building imt F1 validator") # extensions.append( # IMT_F1_Validator(sampling_input, sampling_prefix, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense # confidence_predictions = decoder.get_confidence(readouts) # confidence_prediction_model = Model(confidence_predictions) # # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None) # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values) # # confidence_prediction_func = confidence_prediction_model.get_theano_function() # import ipdb; ipdb.set_trace() # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']], Plot(config['model_save_directory'], channels=[['cost']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # WORKING: implement confidence model # if there is dropout or 
random noise, we need to use the output of the modified graph algorithm = GradientDescent( cost=cg.outputs[0], parameters=trainable_params, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), # eval(config['step_rule'])(), RemoveNotFinite()]), # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), on_unused_sources='warn') #if config['dropout'] < 1.0: # algorithm = GradientDescent( # cost=cg.outputs[0], parameters=trainable_params, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])(), RemoveNotFinite()]), # # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), # on_unused_sources='warn' # ) #else: # algorithm = GradientDescent( # cost=cost, parameters=cg.parameters, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()]), # on_unused_sources='warn' # ) # END WORKING: implement confidence model import ipdb ipdb.set_trace() # enrich the logged information extensions.append(Timing(every_n_batches=100)) # WORKING: debugging confidence # get theano function from model # WORKING: implement word-level confidence cost # @application(inputs=['representation', 'source_sentence_mask', # 'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'], # outputs=['cost']) # def confidence_cost(self, representation, source_sentence_mask, # target_sentence, target_sentence_mask, target_prefix, target_prefix_mask): logger.info('Creating theano variables') # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs # This is equivalent to forced alignment --> confidence scores # Note: but this section should probably be in "evaluate" mode, not here in "train" # source_sentence = tensor.lmatrix('source') # source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference #target_sentence = 
tensor.lmatrix('target_suffix') #target_sentence_mask = tensor.matrix('target_suffix_mask') # TODO: change names back to *_suffix, there is currently a theano function name error # TODO: in the GradientDescent Algorithm #target_prefix = tensor.lmatrix('target_prefix') #target_prefix_mask = tensor.matrix('target_prefix_mask') # confidence_output = decoder.confidence_cost( # encoder.apply(source_sentence, source_sentence_mask), # source_sentence_mask, target_sentence, target_sentence_mask, # target_prefix, target_prefix_mask) # confidence_model = Model(confidence_output) # t_cost_func = confidence_model.get_theano_function() # inputs # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix] #import ipdb;ipdb.set_trace() # get the right args from the datastream # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1' # for the_file in os.listdir(OUTPUT_DIR): # file_path = os.path.join(OUTPUT_DIR, the_file) # try: # if os.path.isfile(file_path): # os.unlink(file_path) # except Exception as e: # print(e) # # def write_file_truncate_mask(filename, data, mask, mode='a'): # ''' data is list of list ''' # # assert len(data) == len(mask) # with codecs.open(filename, mode, encoding='utf8') as out: # for l, m in zip(data, mask): # output = u' '.join(l[:int(m.sum())]) + u'\n' # out.write(output) # logger.info('Wrote file: {}'.format(filename)) # # # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()} # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()} # import ipdb; ipdb.set_trace() # tag_ivocab = {1: 'True', 0: 'False'} # # test_iter = tr_stream.get_epoch_iterator() # it = 0 # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter: # if it <= 1000: # it += 1 # t_cost 
= t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # readouts = t_cost[0] # preds = readouts.argmax(axis=2) # correct = preds.T == t_target_suffix # # # source_output = os.path.join(OUTPUT_DIR,'sources.en') # prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de') # suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de') # prediction_output = os.path.join(OUTPUT_DIR,'predictions.de') # correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out') # # source_text = [[source_ivocab[w] for w in s] for s in t_source] # prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix] # suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix] # pred_text = [[target_ivocab[w] for w in s] for s in preds.T] # correct_text = [[tag_ivocab[w] for w in s] for s in correct] # # # for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output], # [source_text, prefix_text, suffix_text, pred_text, correct_text], # [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]): # write_file_truncate_mask(*triple) # else: # break # # import ipdb; ipdb.set_trace() #t_cost = t_cost_func(t_source, t_target_prefix) #t_cost = t_cost_func(t_target_suffix, t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask) #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # return confidence_cost, flat_y, confidence_logits, readouts #predictions = t_cost[0].argmax(axis=2) # TODO: next step -- print gradients and weights during training find out where nan is coming from # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html # TODO: function which adds right/wrong tags for model predictions to the datastream. 
In this case we can learn a simple linear model as a baseline # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order # import ipdb;ipdb.set_trace() # from blocks reverse_words example # observables = [ # cost, min_energy, max_energy, mean_activation, # batch_size, max_length, cost_per_character, # algorithm.total_step_norm, algorithm.total_gradient_norm] # for name, parameter in trainable_params.items(): # observables.append(parameter.norm(2).copy(name + "_norm")) # observables.append(algorithm.gradients[parameter].norm(2).copy( # name + "_grad_norm")) for i, (k, v) in enumerate(algorithm.updates): v.name = k.name + '_{}'.format(i) aux_vars = [v for v in cg.auxiliary_variables[-3:]] # import ipdb; ipdb.set_trace() extensions.extend([ TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True), # TrainingDataMonitoring(aux_vars, after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True) ]) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) import ipdb ipdb.set_trace() # Train! main_loop.run()
def train_extractive_qa(new_training_job, config, save_path, params,
                        fast_start, fuel_server, seed):
    """Build the extractive QA training main loop and run it.

    Parameters
    ----------
    new_training_job
        When falsy, the saved state at ``<save_path>/training_state.tar``
        is loaded before training (resuming a previous job).
    config
        Mapping of experiment options; read by key throughout this function.
    save_path
        Directory used for the training state archives, the pickled
        stream and the dumped predictions / summaries.
    params
        Optional path to a parameter archive loaded into the model
        before training. Ignored when falsy.
    fast_start
        When truthy, the before-training validation, prediction dump,
        checkpoint and retrieval statistics are skipped.
    fuel_server
        When truthy, training batches are read through a
        ``ServerDataStream`` and a fuel server is started before training.
    seed
        When truthy, used as the default seed for both fuel and blocks.
    """
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    root_path = os.path.join(save_path, 'training_state')
    extension = '.tar'
    tar_path = root_path + extension
    best_tar_path = root_path + '_best' + extension

    c = config
    data, qam = initialize_data_and_model(c)

    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', shuffle=True, batch_size=4,
                            max_length=5).get_epoch_iterator(as_dict=True))
        for var in qam.input_vars.values():
            var.tag.test_value = test_value_data[var.name]

    costs = qam.apply_with_default_vars()
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        # The parameter archive is binary data: open it in binary mode so
        # that it is read unmodified on every platform / Python version.
        with open(params, 'rb') as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(qam.contexts.shape[1], 'length')
    batch_size = rename(qam.contexts.shape[0], 'batch_size')
    predicted_begins, = VariableFilter(name='predicted_begins')(cg)
    predicted_ends, = VariableFilter(name='predicted_ends')(cg)
    exact_match, = VariableFilter(name='exact_match')(cg)
    exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio')
    context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg)
    monitored_vars = [
        length, batch_size, cost, exact_match_ratio, context_unk_ratio
    ]
    if c['dict_path']:
        def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg)
        num_definitions = rename(qam.input_vars['defs'].shape[0],
                                 'num_definitions')
        max_definition_length = rename(qam.input_vars['defs'].shape[1],
                                       'max_definition_length')
        monitored_vars.extend(
            [def_unk_ratio, num_definitions, max_definition_length])
        if c['def_word_gating'] == 'self_attention':
            # NOTE(review): the two-argument tensor.minimum/maximum calls
            # below only work when exactly two 'def_gates' variables are
            # found -- confirm against the model definition.
            def_gates = VariableFilter(name='def_gates')(cg)
            def_gates_min = tensor.minimum(*[x.min() for x in def_gates])
            def_gates_max = tensor.maximum(*[x.max() for x in def_gates])
            monitored_vars.extend([
                rename(def_gates_min, 'def_gates_min'),
                rename(def_gates_max, 'def_gates_max')
            ])
    text_match_ratio = TextMatchRatio(
        data_path=os.path.join(fuel.config.data_path[0],
                               'squad/dev-v1.1.json'),
        requires=[
            predicted_begins, predicted_ends,
            tensor.ltensor3('contexts_text'),
            tensor.lmatrix('q_ids')
        ],
        name='text_match_ratio')

    parameters = cg.get_parameter_dict()
    # Materialize the values: the collection is filtered, searched and
    # handed to several consumers below, so it must be a real sequence.
    trained_parameters = list(parameters.values())
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters if not p == qam.embeddings_var()
        ]
    if c['train_only_def_part']:
        def_reading_parameters = qam.def_reading_parameters()
        trained_parameters = [
            p for p in trained_parameters if p in def_reading_parameters
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape), 'trained'
             if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ], width=120))

    # apply dropout to the training cost and to all the variables
    # that we monitor during training
    train_cost = cost
    train_monitored_vars = list(monitored_vars)
    if c['dropout']:
        regularized_cg = ComputationGraph([cost] + train_monitored_vars)
        # Dima: the dropout that I implemented first
        bidir_outputs, = VariableFilter(bricks=[Bidirectional],
                                        roles=[OUTPUT])(cg)
        readout_layers = VariableFilter(bricks=[Rectifier],
                                        roles=[OUTPUT])(cg)
        dropout_vars = [bidir_outputs] + readout_layers
        logger.debug("applying dropout to {}".format(", ".join(
            [v.name for v in dropout_vars])))
        regularized_cg = apply_dropout(regularized_cg, dropout_vars,
                                       c['dropout'])
        # a new dropout with exactly same mask at different steps
        emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg)
        emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout'])
        if c['emb_dropout_type'] == 'same_mask':
            regularized_cg = apply_dropout2(regularized_cg, emb_vars,
                                            c['emb_dropout'],
                                            dropout_mask=emb_dropout_mask)
        elif c['emb_dropout_type'] == 'regular':
            regularized_cg = apply_dropout(regularized_cg, emb_vars,
                                           c['emb_dropout'])
        else:
            raise ValueError("unknown dropout type {}".format(
                c['emb_dropout_type']))
        train_cost = regularized_cg.outputs[0]
        train_monitored_vars = regularized_cg.outputs[1:]

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=train_cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)
    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      shuffle=True,
                                      max_length=c['max_length'])
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    extensions = [
        LoadNoUnpickling(tar_path, load_iteration_state=True,
                         load_log=True).set_conditions(
                             before_training=not new_training_job),
        StartFuelServer(original_training_stream,
                        os.path.join(save_path, 'stream.pkl'),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train']),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
    ]
    validation = DataStreamMonitoring(
        [text_match_ratio] + monitored_vars,
        data.get_stream('dev', batch_size=c['batch_size_valid'],
                        raw_text=True, q_ids=True),
        prefix="dev").set_conditions(before_training=not fast_start,
                                     after_epoch=True)
    dump_predictions = DumpPredictions(save_path, text_match_ratio,
                                       before_training=not fast_start,
                                       after_epoch=True)
    track_the_best_exact = TrackTheBest(
        validation.record_name(exact_match_ratio),
        choose_best=max).set_conditions(before_training=True,
                                        after_epoch=True)
    track_the_best_text = TrackTheBest(
        validation.record_name(text_match_ratio),
        choose_best=max).set_conditions(before_training=True,
                                        after_epoch=True)
    extensions.extend([
        validation, dump_predictions, track_the_best_exact,
        track_the_best_text
    ])
    # We often use pretrained word embeddings and we don't want
    # to load and save them every time. To avoid that, we use
    # save_main_loop=False, we only save the trained parameters,
    # and we save the log and the iterations state separately
    # in the tar file.
    extensions.extend([
        Checkpoint(tar_path,
                   parameters=trained_parameters,
                   save_main_loop=False,
                   save_separately=['log', 'iteration_state'],
                   before_training=not fast_start,
                   every_n_epochs=c['save_freq_epochs'],
                   every_n_batches=c['save_freq_batches'],
                   after_training=not fast_start).add_condition(
                       ['after_batch', 'after_epoch'],
                       OnLogRecord(track_the_best_text.notification_name),
                       (best_tar_path, )),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        RetrievalPrintStats(retrieval=data._retrieval,
                            every_n_batches=c['mon_freq_train'],
                            before_training=not fast_start),
        Printing(after_epoch=True, every_n_batches=c['mon_freq_train']),
        FinishAfter(after_n_batches=c['n_batches'],
                    after_n_epochs=c['n_epochs']),
        Annealing(c['annealing_learning_rate'],
                  after_n_epochs=c['annealing_start_epoch']),
        LoadNoUnpickling(best_tar_path,
                         after_n_epochs=c['annealing_start_epoch'])
    ])

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    """Train a linear 784 -> 10 MLP on MNIST with the selected cost.

    After training, a JSON summary (final/best train and test cost and
    error rate) is written to standard output.

    Parameters
    ----------
    save_to
        Destination passed to the ``Checkpoint`` extension.
    cost_name
        One of ``'lr'``, ``'mse'``, ``'perceptron'``, ``'minmin'``,
        ``'minmin_cut'``, ``'minmin2'``, ``'direct'`` or ``'svm'``.
    learning_rate
        Step size for the momentum rule; ``None`` selects 0.0001.
    momentum
        Momentum coefficient; ``None`` selects 0.0.
    num_epochs
        Number of epochs after which training stops.

    Raises
    ------
    ValueError
        If ``cost_name`` is not one of the supported names.
    """
    # Validate the requested cost before any model or graph is built, so a
    # typo fails immediately instead of after initialization.
    known_costs = ('lr', 'mse', 'perceptron', 'minmin', 'minmin_cut',
                   'minmin2', 'direct', 'svm')
    if cost_name not in known_costs:
        raise ValueError("Unknown cost {}".format(cost_name))

    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    # One-hot encoding of the targets: 1 at the ground-truth class, else 0.
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()],
        1)
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff**2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its
        # target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its
        # actual score
        cost += tensor.maximum(
            0, score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[indices, y.flatten()])**2).mean()
        cost += ((score_diff[indices, scores.argmax(axis=1)])**2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (-scores[indices,
                        (scores + epsilon * target_scores).argmax(axis=1)]
                + scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)]
                - scores[indices, y.flatten()]).mean()
    else:
        # Defensive guard: only reachable if `known_costs` and this chain
        # get out of sync.
        raise ValueError("Unknown cost {}".format(cost_name))

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))

    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=rule)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(
            [cost, error_rate],
            Flatten(DataStream.default_stream(
                mnist_test,
                iteration_scheme=SequentialScheme(
                    mnist_test.num_examples, 500)),
                    which_sources=('features', )),
            prefix="test"),
        # CallbackExtension(
        #     lambda: rule.learning_rate.set_value(
        #         rule.learning_rate.get_value() * 0.9),
        #     after_epoch=True),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm),
            rule.learning_rate
        ], prefix="train", after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_cost', 'test_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(DataStream.default_stream(
            mnist_train,
            iteration_scheme=SequentialScheme(
                mnist_train.num_examples, 50)),
                which_sources=('features', )),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()

    # Summarize the training log: last and best values of each channel.
    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {
        'cost': cost_name,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'train_cost': df.train_cost.iloc[-1],
        'test_cost': df.test_cost.iloc[-1],
        'best_test_cost': df.test_cost.min(),
        'train_error': df.train_error_rate.iloc[-1],
        'test_error': df.test_error_rate.iloc[-1],
        'best_test_error': df.test_error_rate.min()
    }
    # numpy arrays are not JSON serializable; convert them to plain floats.
    res = {
        k: float(v) if isinstance(v, numpy.ndarray) else v
        for k, v in res.items()
    }
    json.dump(res, sys.stdout)
    sys.stdout.flush()
def main(mode, save_path, num_batches, data_path=None):
    """Train or interactively test the word-reversing sequence model.

    Parameters
    ----------
    mode
        ``"train"`` builds the cost graph and runs the training main loop;
        ``"test"`` loads saved parameters and enters an interactive
        sampling loop. Any other value only builds the bricks.
    save_path
        Where the main loop is serialized during training and where the
        parameters are loaded from in test mode.
    num_batches
        Number of batches after which training finishes.
    data_path
        Optional text file used as training data; when falsy the
        ``OneBillionWord`` training set is used instead.
    """
    # Experiment configuration
    dimension = 100
    readout_dimension = len(char2code)

    # Build bricks
    encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()),
                            weights_init=Orthogonal())
    fork = Fork(
        [name for name in encoder.prototype.apply.sequences
         if name != 'mask'],
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0))
    fork.input_dim = dimension
    fork.output_dims = {name: dimension for name in fork.input_names}
    lookup = LookupTable(readout_dimension, dimension,
                         weights_init=IsotropicGaussian(0.1))
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension,
                                 name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        sequence_dim=2 * dimension,
        match_dim=dimension,
        name="attention")
    readout = LinearReadout(
        readout_dim=readout_dimension,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(readout_dimension, dimension),
        name="readout")
    generator = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    # Set after the push so the recurrent transition keeps an orthogonal
    # initialization instead of the generator-wide Gaussian one.
    transition.weights_init = Orthogonal()

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = DataStreamMapping(
            mapping=_transpose,
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=_filter_long,
                            data_stream=dataset.get_default_stream())))))

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True),
                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        (energies, ) = VariableFilter(
            application=readout.readout,
            name="output")(cg.variables)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(
            abs(activations).mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # More variables for debugging
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm
        ]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                average_monitoring,
                # Stop after `num_batches`, or as soon as `_is_nan` fires.
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        model = Model(generated)
        model.set_param_values(load_parameter_values(save_path))
        # Reported only once the parameters have actually been loaded.
        logger.info("Model is loaded")
        sample_function = model.get_theano_function()
        # Use the module logger (not the root `logging` functions) so the
        # message is handled like every other message of this script.
        logger.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            # The same encoded sentence is repeated `batch_size` times along
            # the second axis, giving one column per requested sample.
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                # Cut the sample right after the first end-of-sequence code.
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Highest cost first, so the lowest-cost sample is printed last.
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def test_timing():
    """Drive ``Timing`` with a fake clock through training, epochs and resumes.

    The clock is a one-element list read by the callable given to
    ``Timing``; every step of the scenario advances or resets it explicitly
    and then checks the durations recorded in the log.
    """
    class StubMainLoop():
        def __init__(self):
            self.log = TrainingLog()

    clock = [0]

    def advance(seconds):
        clock[0] += seconds

    # Build the main loop
    timing = Timing(lambda: clock[0])
    loop = StubMainLoop()
    timing.main_loop = loop
    log = loop.log

    def run_batch(duration):
        # One training iteration lasting `duration` ticks of the fake clock.
        timing.before_batch(None)
        advance(duration)
        log.status.iterations_done += 1
        timing.after_batch(None)

    def close_epoch():
        log.status.epochs_done += 1
        timing.after_epoch()

    # Start training
    advance(1)
    timing.before_training()

    # Start epoch 1
    advance(2)
    timing.before_epoch()
    assert log[0].initialization_took == 2
    log.status._epoch_started = True

    # Batch 1
    run_batch(7)
    assert log[1].iteration_took == 7

    # Batch 2
    run_batch(8)
    assert log[2].iteration_took == 8

    # Epoch 1 is done
    close_epoch()
    assert log[2].epoch_took == 15
    assert log[2].total_took == 17

    # Finish training
    advance(1)
    timing.after_training()
    assert log[2].final_total_took == 18

    # Resume training
    clock[0] = 0
    timing.on_resumption()

    # Start epoch 2
    timing.before_epoch()
    assert log[2].initialization_took is None

    # Batch 3
    run_batch(6)
    assert log[3].iteration_took == 6
    assert log[3].total_took == 24

    # Finish training before the end of the current epoch
    timing.after_training()

    # Resume training
    clock[0] = 2
    timing.on_resumption()

    # Batch 4
    run_batch(2)
    assert log[4].iteration_took == 2
    assert log[4].total_took == 26

    # Epoch 2 is done
    close_epoch()
    assert log[4].epoch_took == 8

    # Start epoch 3
    timing.before_epoch()

    # Batch 5
    run_batch(5)
    assert log[5].iteration_took == 5
    assert log[5].total_took == 31

    # Epoch 3 is done
    close_epoch()
    assert log[5].epoch_took == 5
def main(save_to, num_epochs, regularization=0.0001, subset=None,
         num_batches=None, batch_size=None, histogram=None, resume=False):
    """Train the residual convnet on CIFAR-10 and dump the training log.

    Parameters
    ----------
    save_to
        Checkpoint destination; also used in the plot titles and as the
        source for ``Load`` when resuming.
    num_epochs, num_batches
        Stopping criteria passed to ``FinishAfter``.
    regularization
        Coefficient applied to the summed squared weights that is added
        to both the training and the test cost.
    subset
        Not used by this function.
    batch_size
        Training batch size passed to the shuffled iteration scheme.
    histogram
        When truthy, an ``AttributionExtension`` is attached and its
        attributions are saved to this filename after training.
    resume
        When truthy, a ``Load(save_to, True, True)`` extension is appended.
    """
    output_size = 10
    convnet = create_res_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #         roles=[OUTPUT], bricks=[Convolutional],
    #         theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)

    # train_cost, train_error_rate, train_components = train_cg.outputs

    # The training graph is built a second time, with the network applied
    # inside the batch-normalization context.
    with batch_normalization(convnet):
        train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost,
                                 train_error_rate, train_components])
    population_updates = get_batch_normalization_updates(train_cg)
    # Exponential moving average of the statistics, with decay `bn_alpha`.
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # Apply regularization to the cost
    # `biases` is only referenced by the commented-out Caffe-style step
    # rule further down; it is kept so that rule can be re-enabled.
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 500
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test,
        iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with momentum SGD; the learning rate is driven by the
    # EpochSchedule extension below.
    algorithm = GradientDescent(
        cost=train_cost, parameters=train_cg.parameters,
        step_rule=momentum)
    algorithm.add_updates(extra_updates)

    # ,
    #    theano_func_kwargs={
    #        'mode': NanGuardMode(
    #            nan_is_error=True, inf_is_error=True, big_is_error=True)})

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (1, 0.1),      # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  DataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       momentum.learning_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                      # after_epoch=True),
                  Plot('Training performance for ' + save_to,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + save_to,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  Checkpoint(save_to, use_cpickle=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        # The components come from the training graph, so its parameters
        # are the ones handed to the extension. (There is no plain `cg`
        # in this function; referencing it raised a NameError.)
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)

    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)