def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(
            [cost, error_rate],
            Flatten(
                DataStream.default_stream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(
                        mnist_test.num_examples, 500)),
                which_sources=('features',)),
            prefix="test"),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            after_epoch=True),
        Checkpoint(save_to),
        Printing()]
    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_final_cost',
                            'test_misclassificationrate_apply_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()
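# The examples in this file all follow the same wiring: monitoring extensions
# write log channels named '<prefix>_<variable.name>', and the blocks-extras
# Plot extension groups those channel names into Bokeh figures (one inner
# list per figure). Below is a stripped-down, self-contained sketch of just
# that wiring -- a toy cost and dataset, not any of the real models in the
# surrounding examples; it assumes blocks, fuel and blocks_extras are
# installed and a Bokeh plot server is reachable at the default URL.
import numpy
import theano
from theano import tensor
from blocks.algorithms import GradientDescent, Scale
from blocks.extensions import FinishAfter, Printing
from blocks.extensions.monitoring import TrainingDataMonitoring
from blocks.main_loop import MainLoop
from blocks_extras.extensions.plot import Plot
from fuel.datasets import IterableDataset
from fuel.streams import DataStream

x = tensor.vector('features')
w = theano.shared(numpy.zeros(10, dtype=theano.config.floatX), name='w')
cost = ((x - w) ** 2).mean()
cost.name = 'cost'

dataset = IterableDataset(
    {'features': numpy.random.rand(100, 10).astype(theano.config.floatX)})
algorithm = GradientDescent(cost=cost, parameters=[w],
                            step_rule=Scale(learning_rate=0.1))
main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=DataStream(dataset),
    extensions=[
        TrainingDataMonitoring([cost], prefix='train', after_epoch=True),
        # Channel names are '<prefix>_<variable.name>'; each inner list is one figure.
        Plot('toy example', channels=[['train_cost']], after_epoch=True),
        FinishAfter(after_n_epochs=3),
        Printing()])
main_loop.run()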
def train_base_model(self, train_data, test_data, input_dim):
    x = T.matrix('features')
    y = T.matrix('targets')
    mlp, cost, mis_cost = self.create_base_model(x, y, input_dim)

    cg = ComputationGraph([cost])
    inputs = VariableFilter(roles=[INPUT])(cg.variables)
    cg = apply_dropout(cg, inputs, 0.2)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))

    data_stream = train_data
    data_stream_test = test_data

    monitor = DataStreamMonitoring(variables=[mis_cost],
                                   data_stream=data_stream_test,
                                   prefix="test")
    plot_ext = Plot('F1-measure',
                    channels=[['test_MisclassificationRate']],
                    after_batch=True)

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         extensions=[monitor,
                                     FinishAfter(after_n_epochs=50),
                                     Printing(),
                                     plot_ext])
    main_loop.run()
    return mlp
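# Note that the snippet above applies apply_dropout to the graph but still
# hands the original `cost` to GradientDescent; the NMT examples further down
# instead train on cg.outputs[0], the cost variable of the transformed graph,
# so that dropout actually reaches the gradients. A small self-contained
# sketch of that pattern, with a throwaway MLP standing in for the real model:
from theano import tensor
from blocks.bricks import MLP, Tanh
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph, apply_dropout
from blocks.filter import VariableFilter
from blocks.roles import INPUT

x = tensor.matrix('features')
mlp = MLP([Tanh(), Tanh()], [10, 20, 1],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cost = mlp.apply(x).sum()
cost.name = 'cost'

cg = ComputationGraph(cost)
# Drop 20% of every application input, as in the snippet above.
dropout_cg = apply_dropout(cg, VariableFilter(roles=[INPUT])(cg.variables), 0.2)
# The transformed graph has its own cost variable; pass this one to
# GradientDescent when dropout is supposed to influence training.
training_cost = dropout_cg.outputs[0]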
def run(model_name): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = 100 if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss', 'valid_loss_test'], ['valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('train2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def test_plot():

    class Writer(SimpleExtension):
        def do(self, *args, **kwargs):
            self.main_loop.log.current_row['channel'] = (
                self.main_loop.status['iterations_done']**2)

    main_loop = MockMainLoop(extensions=[
        Writer(after_batch=True),
        Plot('test', [['channel']]).set_conditions(after_batch=True),
        FinishAfter(after_n_batches=11)])
    main_loop.run()
def train(self, data_file, output_data_file, n_epochs=0):
    training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',))
    test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',))

    session = Session(root_url='http://localhost:5006')

    if self.MainLoop is None:
        step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95),
                      StepClipping(1)]
        algorithm = GradientDescent(
            cost=self.Cost,
            parameters=self.ComputationGraph.parameters,
            step_rule=CompositeRule(step_rules),
            on_unused_sources='ignore')

        train_stream = DataStream.default_stream(
            training_data,
            iteration_scheme=SequentialScheme(
                training_data.num_examples, batch_size=100))
        test_stream = DataStream.default_stream(
            test_data,
            iteration_scheme=SequentialScheme(
                test_data.num_examples, batch_size=100))

        self.MainLoop = MainLoop(
            model=Model(self.Cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(after_n_epochs=n_epochs),
                Printing(),
                Checkpoint(output_data_file, every_n_epochs=50),
                TrainingDataMonitoring([self.Cost],
                                       after_batch=True,
                                       prefix='train'),
                DataStreamMonitoring([self.Cost],
                                     after_batch=True,
                                     data_stream=test_stream,
                                     prefix='test'),
                Plot(output_data_file,
                     channels=[['train_cost', 'test_cost']])])

    self.MainLoop.run()
def main(): feature_maps = [20, 50] mlp_hiddens = [50] conv_sizes = [5, 5] pool_sizes = [3, 3] save_to = "DvC.pkl" batch_size = 500 image_size = (32, 32) output_size = 2 learningRate = 0.1 num_epochs = 10 num_batches = None host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['valid_cost', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(): mlp_hiddens = [1000] filter_sizes = [(9, 9), (5, 5), (5, 5)] feature_maps = [80, 50, 20] pooling_sizes = [(3, 3), (2, 2), (2, 2)] save_to = "DvC.pkl" image_size = (128, 128) output_size = 2 learningRate = 0.1 num_epochs = 300 num_batches = None if socket.gethostname() == 'tim-X550JX': host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def run(model_name, port_train, port_valid): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = (100, 100) if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('/tmp/train_bn2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions, model=model) main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions)

    # Train!
    main_loop.run()
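# The NMT scripts above and below compose their step rule from StepClipping
# plus a rule class named in the config and instantiated via
# eval(config['step_rule'])(). A minimal sketch of the same composition with
# the class referenced directly; AdaDelta is just an assumed stand-in for
# whatever the config names, and the cost here is a toy one.
import numpy
import theano
from theano import tensor
from blocks.algorithms import (GradientDescent, CompositeRule,
                               StepClipping, AdaDelta)

w = theano.shared(numpy.ones(5, dtype=theano.config.floatX), name='w')
x = tensor.vector('features')
cost = ((x - w) ** 2).sum()

algorithm = GradientDescent(
    cost=cost, parameters=[w],
    step_rule=CompositeRule([
        StepClipping(1.0),   # rescale any step whose norm exceeds the threshold
        AdaDelta()           # stand-in for eval(config['step_rule'])()
    ]))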
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # add the tags from this function to the IMT datastream # prediction function signature # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask] prediction_function = get_prediction_function(exp_config=config) tr_stream = Mapping( tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 6]), #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]), add_sources=('predictions', 'orig_readouts', 'prediction_tags')) # now datastream has 11 things import ipdb ipdb.set_trace() # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffs # the only difference is the index of the suffix tr_stream = Mapping(tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 8]), add_sources=('dummy_predictions', 'readouts', 'dummy_prediction_tags')) import ipdb ipdb.set_trace() # Create the prediction confidence model # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') # symbolic variable which tags each timestep as GOOD/BAD # Note: later this might be tags for a hypothesis i.e. from TER(p), right now the timesteps are actually determined by the reference # By zipping the confidence model output with the reference, we get the model's confidence that this reference word # will be predicted correctly prediction_tags = tensor.matrix('prediction_tags') readouts = tensor.tensor3('readouts') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' cost = decoder.confidence_cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask, readouts, prediction_tags) # WORKING: add l2 regularization logger.info('Creating computational graph') # working: implement cost for confidence model cg = ComputationGraph(cost) # INITIALIZATION logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() #cost_cg = ComputationGraph(cost) if config['l2_reg']: l2_reg_alpha = config['l2_reg_alpha'] model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # do we need to name the cost variable again? 
cost.name = 'cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name in set([ 'confidence_model1_apply_output', 'confidence_model2_apply_output', 'confidence_model3_apply_output' ]) ] # if x.name == 'maxout_apply_output'] # if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # WORKING: implement confidence -- remove all params except output model cost_model = Model(cost) model_params = cost_model.get_parameter_dict() trainable_params = cg.parameters import ipdb ipdb.set_trace() print('trainable params') #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k] #for p in params_to_remove: # trainable_params.remove(p) # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W'] # trainable_params.remove(source_embeddings) # trainable_params.remove(target_embeddings) # END WORKING: implement confidence -- remove all params except output model # TODO: fixed dropout mask for recurrent params? # Print shapes # shapes = [param.get_value().shape for param in cg.parameters] # logger.info("Parameter shapes: ") # for shape, count in Counter(shapes).most_common(): # logger.info(' {:15}: {}'.format(shape, count)) # logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names # enc_dec_param_dict = merge(Selector(encoder).get_parameters(), # Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}" # .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), # Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # WORKING: confidence prediction #monitor everything that could possibly be relevant # Set up the sampling graph for validation during training # Theano variables for the sampling graph # Note this also loads the model parameters sampling_vars = load_params_and_get_beam_search(config, encoder=encoder, decoder=decoder) beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab=source_vocab, # trg_vocab=target_vocab, # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu #if config['bleu_script'] is not None: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config, # model=search_model, 
data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # TODO: add first-word accuracy validation # TODO: add IMT meteor early stopping #if config.get('imt_f1_validation', None) is not None: # logger.info("Building imt F1 validator") # extensions.append( # IMT_F1_Validator(sampling_input, sampling_prefix, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense # confidence_predictions = decoder.get_confidence(readouts) # confidence_prediction_model = Model(confidence_predictions) # # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None) # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values) # # confidence_prediction_func = confidence_prediction_model.get_theano_function() # import ipdb; ipdb.set_trace() # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']], Plot(config['model_save_directory'], channels=[['cost']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # WORKING: implement confidence model # if there is dropout or random noise, we need to use the output of the modified graph algorithm = GradientDescent( cost=cg.outputs[0], parameters=trainable_params, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), # eval(config['step_rule'])(), RemoveNotFinite()]), # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), on_unused_sources='warn') #if config['dropout'] < 1.0: # algorithm = GradientDescent( # cost=cg.outputs[0], parameters=trainable_params, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])(), RemoveNotFinite()]), # # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), # on_unused_sources='warn' # ) #else: # algorithm = GradientDescent( # cost=cost, parameters=cg.parameters, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()]), # on_unused_sources='warn' # ) # END WORKING: implement confidence model import ipdb ipdb.set_trace() # enrich the logged information extensions.append(Timing(every_n_batches=100)) # WORKING: debugging confidence # get theano function from model # WORKING: implement word-level confidence cost # @application(inputs=['representation', 'source_sentence_mask', # 'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'], # outputs=['cost']) # def confidence_cost(self, representation, source_sentence_mask, # target_sentence, target_sentence_mask, target_prefix, target_prefix_mask): logger.info('Creating theano variables') # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs # This is equivalent to forced alignment --> confidence scores # Note: but this section should probably be in "evaluate" mode, not here in "train" # source_sentence = 
tensor.lmatrix('source') # source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference #target_sentence = tensor.lmatrix('target_suffix') #target_sentence_mask = tensor.matrix('target_suffix_mask') # TODO: change names back to *_suffix, there is currently a theano function name error # TODO: in the GradientDescent Algorithm #target_prefix = tensor.lmatrix('target_prefix') #target_prefix_mask = tensor.matrix('target_prefix_mask') # confidence_output = decoder.confidence_cost( # encoder.apply(source_sentence, source_sentence_mask), # source_sentence_mask, target_sentence, target_sentence_mask, # target_prefix, target_prefix_mask) # confidence_model = Model(confidence_output) # t_cost_func = confidence_model.get_theano_function() # inputs # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix] #import ipdb;ipdb.set_trace() # get the right args from the datastream # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1' # for the_file in os.listdir(OUTPUT_DIR): # file_path = os.path.join(OUTPUT_DIR, the_file) # try: # if os.path.isfile(file_path): # os.unlink(file_path) # except Exception as e: # print(e) # # def write_file_truncate_mask(filename, data, mask, mode='a'): # ''' data is list of list ''' # # assert len(data) == len(mask) # with codecs.open(filename, mode, encoding='utf8') as out: # for l, m in zip(data, mask): # output = u' '.join(l[:int(m.sum())]) + u'\n' # out.write(output) # logger.info('Wrote file: {}'.format(filename)) # # # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()} # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()} # import ipdb; ipdb.set_trace() # tag_ivocab = {1: 'True', 0: 'False'} # # test_iter = tr_stream.get_epoch_iterator() # it = 0 # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter: # if it <= 1000: # it += 1 # t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # readouts = t_cost[0] # preds = readouts.argmax(axis=2) # correct = preds.T == t_target_suffix # # # source_output = os.path.join(OUTPUT_DIR,'sources.en') # prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de') # suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de') # prediction_output = os.path.join(OUTPUT_DIR,'predictions.de') # correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out') # # source_text = [[source_ivocab[w] for w in s] for s in t_source] # prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix] # suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix] # pred_text = [[target_ivocab[w] for w in s] for s in preds.T] # correct_text = [[tag_ivocab[w] for w in s] for s in correct] # # # for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output], # [source_text, prefix_text, suffix_text, pred_text, correct_text], # [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]): # write_file_truncate_mask(*triple) # else: # break # # import ipdb; ipdb.set_trace() #t_cost = t_cost_func(t_source, t_target_prefix) #t_cost = t_cost_func(t_target_suffix, t_source_mask, 
t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask) #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # return confidence_cost, flat_y, confidence_logits, readouts #predictions = t_cost[0].argmax(axis=2) # TODO: next step -- print gradients and weights during training find out where nan is coming from # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html # TODO: function which adds right/wrong tags for model predictions to the datastream. In this case we can learn a simple linear model as a baseline # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order # import ipdb;ipdb.set_trace() # from blocks reverse_words example # observables = [ # cost, min_energy, max_energy, mean_activation, # batch_size, max_length, cost_per_character, # algorithm.total_step_norm, algorithm.total_gradient_norm] # for name, parameter in trainable_params.items(): # observables.append(parameter.norm(2).copy(name + "_norm")) # observables.append(algorithm.gradients[parameter].norm(2).copy( # name + "_grad_norm")) for i, (k, v) in enumerate(algorithm.updates): v.name = k.name + '_{}'.format(i) aux_vars = [v for v in cg.auxiliary_variables[-3:]] # import ipdb; ipdb.set_trace() extensions.extend([ TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True), # TrainingDataMonitoring(aux_vars, after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True) ]) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) import ipdb ipdb.set_trace() # Train! main_loop.run()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') initial_context = tensor.matrix('initial_context') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) # let user specify the target transition class name in config, # eval it and pass to decoder target_transition_name = config.get( 'target_transition', 'GRUInitialStateWithInitialStateSumContext') target_transition = eval(target_transition_name) logger.info('Using target transition: {}'.format(target_transition_name)) decoder = InitialContextDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['context_dim'], target_transition) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, initial_context) cost.name = 'decoder_cost' # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING # TODO: validate performance with/without regularization if config.get('l2_regularization', False) is True: l2_reg_alpha = config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to name the cost variable? Where did the original name come from? 
cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions # TODO: add checking for existing model and loading logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Create the theano variables that we need for the sampling graph sampling_input = tensor.lmatrix('input') sampling_context = tensor.matrix('context_input') # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config.get('bleu_script', None) is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation, sampling_context) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler( model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab=source_vocab, trg_vocab=target_vocab, src_vocab_size=config['src_vocab_size'], )) # Add early stopping based on bleu if config.get('bleu_script', None) is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, sampling_context, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Add early stopping based on Meteor if config.get('meteor_directory', None) is not None: logger.info("Building meteor validator") extensions.append( MeteorValidator(sampling_input, sampling_context, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: 
extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[[ 'decoder_cost', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
                                            batch_size=128))

test_set = H5PYDataset('mushrooms.hdf5', which_sets=('test',))
test_stream = DataStream.default_stream(
    test_set,
    iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=128))

main = MainLoop(
    model=Model(cost),
    data_stream=train_stream,
    algorithm=algorithm,
    extensions=[
        FinishAfter(after_n_epochs=10),
        Printing(),
        TrainingDataMonitoring([cost, error_rate],
                               after_batch=True,
                               prefix='train'),
        DataStreamMonitoring([cost, error_rate],
                             after_batch=True,
                             data_stream=test_stream,
                             prefix='test'),
        Plot('Train',
             channels=[['train_cost', 'test_cost'],
                       ['train_error_rate', 'test_error_rate']])
    ])
main.run()

hinton(W1.get_value())
hinton(W2.get_value())
import socket
import datetime
import time

host_plot = 'http://tfjgeorge.com:5006'

cost.name = 'cost'
valid_cost.name = 'valid_cost'

extensions = [
    Timing(),
    TrainingDataMonitoring([cost], after_epoch=True, prefix='train'),
    DataStreamMonitoring(variables=[valid_cost], data_stream=valid_stream),
    Plot('%s %s' % (socket.gethostname(), datetime.datetime.now(),),
         channels=[['train_cost', 'valid_cost']],
         after_epoch=True,
         server_url=host_plot),
    TrackTheBest('valid_cost'),
    Checkpoint('model', save_separately=["model", "log"]),
    FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5),
    #FinishAfter(after_n_epochs=100),
    Printing()
]

from blocks.main_loop import MainLoop

main_loop = MainLoop(model=model,
                     data_stream=train_stream,
                     algorithm=algorithm,
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True): # def setup_model_and_stream(exp_config, source_vocab, target_vocab): # def setup_model_and_stream(exp_config, source_vocab, target_vocab): train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream = setup_model_and_stream( exp_config, source_vocab, target_vocab) cost = create_model(train_encoder, train_decoder, exp_config.get('imt_smoothing_constant', 0.005)) # Set up training model logger.info("Building model") train_model = Model(cost) # Set the parameters from a trained models (.npz file) logger.info("Loading parameters from model: {}".format( exp_config['saved_parameters'])) # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=exp_config.get('brick_delimiter', None)) LoadNMT.set_model_parameters(train_model, param_values) logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING if exp_config.get('l2_regularization', False) is True: l2_reg_alpha = exp_config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to rename the cost variable? Where did the original name come from? cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization # Note dropout variables are hard-coded here if exp_config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, exp_config['dropout']) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(exp_config['saveto']): os.makedirs(exp_config['saveto']) # TODO: mv the actual config file once we switch to .yaml for min-risk shutil.copy(exp_config['config_file'], exp_config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=exp_config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(exp_config['saveto'], every_n_batches=exp_config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary # TODO: change the if statement here if exp_config['hook_samples'] >= 1 or exp_config['bleu_script'] is not None: logger.info("Building sampling model") search_model = Model(generated) _, samples = VariableFilter( bricks=[train_decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling -- TODO: sampling is broken for min-risk #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu # TODO: use multimodal meteor and BLEU validator # TODO: add 'validator' key to IMT config # Add early stopping based on bleu if exp_config.get('bleu_script', None) is not None: 
logger.info("Building bleu validator") extensions.append( BleuValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=exp_config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=exp_config['normalized_bleu'], every_n_batches=exp_config['bleu_val_freq'])) if exp_config.get('imt_f1_validation', False) is not False: logger.info("Building imt F1 validator") extensions.append( IMT_F1_Validator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=exp_config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=exp_config['normalized_bleu'], every_n_batches=exp_config['bleu_val_freq'])) # Add early stopping based on Meteor # if exp_config.get('meteor_directory', None) is not None: # logger.info("Building meteor validator") # extensions.append( # MeteorValidator(theano_sampling_source_input, theano_sampling_context_input, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=src_vocab, # trg_vocab=trg_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if exp_config['reload']: extensions.append(LoadNMT(exp_config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(exp_config['model_save_directory'], channels=[[ 'decoder_cost_cost', 'validation_set_imt_f1_score', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph # WORKING: try to catch and fix nan if exp_config['dropout'] < 1.0: if exp_config.get('nan_guard', False): from theano.compile.nanguardmode import NanGuardMode algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn', theano_func_kwargs={ 'mode': NanGuardMode(nan_is_error=True, inf_is_error=True) }) else: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn') else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn') # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=train_model, algorithm=algorithm, data_stream=masked_stream, extensions=extensions) # Train! main_loop.run()
def main(model, cost, config, tr_stream, dev_stream, use_bokeh=False): # Set the parameters from a trained models (.npz file) logger.info("Loading parameters from model: {}".format( exp_config['saved_parameters'])) # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=exp_config.get('brick_delimiter', None)) LoadNMT.set_model_parameters(model, param_values) logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING if config.get('l2_regularization', False) is True: l2_reg_alpha = config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to rename the cost variable? Where did the original name come from? cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization # Note dropout variables are hard-coded here if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) # TODO: mv the actual config file once we switch to .yaml for min-risk # shutil.copy(config['config_file'], config['saveto']) # shutil.copy(config['config_file'], config['saveto']) # TODO: this breaks when we directly reference a class in the config obj instead of using reflection with codecs.open(os.path.join(config['saveto'], 'config.yaml'), 'w', encoding='utf8') as yaml_out: yaml_out.write(yaml.dump(config)) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary # TODO: change the if statement here if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = train_encoder.apply( theano_sampling_source_input, tensor.ones(theano_sampling_source_input.shape)) # TODO: the generated output actually contains several more values, ipdb to see what they are generated = train_decoder.generate(theano_sampling_source_input, sampling_representation, theano_sampling_context_input) search_model = Model(generated) _, samples = VariableFilter( bricks=[train_decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling -- TODO: sampling is broken for min-risk #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu # TODO: use multimodal meteor and BLEU validator # Add early stopping based on bleu if config.get('bleu_script', None) 
is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=src_vocab, trg_vocab=trg_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Add early stopping based on Meteor if config.get('meteor_directory', None) is not None: logger.info("Building meteor validator") extensions.append( MeteorValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=src_vocab, trg_vocab=trg_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[[ 'decoder_cost_cost', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') #algorithm = GradientDescent( # cost=cost, parameters=cg.parameters, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()], # ), # on_unused_sources='warn' #) # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
def main(save_to, num_epochs, regularization=0.001, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 convnet = create_all_conv_net() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) test_components = (ComponentwiseCrossEntropy().apply( y.flatten(), probs).copy(name='components')) test_error_rate = (MisclassificationRate().apply( y.flatten(), probs).copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), probs).copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate, test_components]) # Apply dropout to all layer outputs except final softmax dropout_vars = VariableFilter( roles=[OUTPUT], bricks=[Convolutional], theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(drop_cg.variables) # train_cg = apply_dropout(drop_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(drop_cg, [x], 0.2) train_cg = drop_cg # train_cg = test_cg train_cost, train_error_rate, train_components = train_cg.outputs # Apply regularization to the cost biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) l2_norm = sum([(W**2).sum() for W in weights]) l2_norm.name = 'l2_norm' l2_regularization = regularization * l2_norm l2_regularization.name = 'l2_regularization' test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + regularization * l2_norm train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train", )) #cifar10_train_stream = RandomPadCropFlip( # NormalizeBatchLevels(DataStream.default_stream( # cifar10_train, iteration_scheme=ShuffledScheme( # cifar10_train.num_examples, batch_size)), # which_sources=('features',)), # (32, 32), pad=5, which_sources=('features',)) cifar10_train_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme(cifar10_train.num_examples, batch_size)), which_sources=('features', )) test_batch_size = 1000 cifar10_test = CIFAR10(("test", )) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme(cifar10_test.num_examples, test_batch_size)), which_sources=('features', )) momentum = Momentum(0.002, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # step_rule = CompositeRule([StepClipping(100), momentum]) step_rule = momentum # Train with simple SGD algorithm = GradientDescent(cost=train_cost, parameters=train_cg.parameters, step_rule=step_rule) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [(1, 0.005), (3, 0.01), (5, 0.02), (200, 0.002), (250, 0.0002), (300, 0.00002)]), DataStreamMonitoring([test_cost, test_error_rate, test_confusion], cifar10_test_stream, prefix="test"), TrainingDataMonitoring([ train_cost, train_error_rate, train_cost_without_regularization, l2_regularization, momentum.learning_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", every_n_batches=10), # after_epoch=True), Plot('Training performance for ' + save_to, channels=[ [ 'train_cost_with_regularization', 'train_cost_without_regularization', 'train_l2_regularization' ], ['train_error_rate'], ['train_total_gradient_norm'], ], every_n_batches=10), # after_batch=True), Plot('Test performance for ' + save_to, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing() ] if histogram: attribution = AttributionExtension(components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(save_to, True, True)) model = Model(train_cost) main_loop = MainLoop(algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') # TODO: change names back to *_suffix, there is currently a theano function name error # TODO: in the GradientDescent Algorithm target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' # TODO: change the name of `target_sentence` to `target_suffix` for more clarity cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # INITIALIZATION logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) trainable_params = cg.parameters # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W'] # trainable_params.remove(source_embeddings) # trainable_params.remove(target_embeddings) # TODO: fixed dropout mask for recurrent params? 
# Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up the sampling graph for validation during training # Theano variables for the sampling graph sampling_vars = load_params_and_get_beam_search(config, encoder=encoder, decoder=decoder) beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab=source_vocab, trg_vocab=target_vocab, src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # TODO: add first-word accuracy validation # TODO: add IMT meteor early stopping if config.get('imt_f1_validation', None) is not None: logger.info("Building imt F1 validator") extensions.append( IMT_F1_Validator(sampling_input, sampling_prefix, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost_cost'], [ 'validation_set_bleu_score', 'validation_set_imt_f1_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # WORKING: implement confidence model # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent( cost=cg.outputs[0], parameters=trainable_params, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), on_unused_sources='warn') else: algorithm = 
GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') # END WORKING: implement confidence model # enrich the logged information extensions.append(Timing(every_n_batches=100)) # for i, (k,v) in enumerate(algorithm.updates): # v.name = k.name + '_{}'.format(i) # # aux_vars = [v for v in cg.auxiliary_variables[-3:]] # import ipdb; ipdb.set_trace() extensions.extend([ TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True), # TrainingDataMonitoring(aux_vars, after_batch=True), TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True) ]) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
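Both branches above instantiate the step rule with eval(config['step_rule'])(). If you prefer not to eval configuration text, an explicit whitelist gives the same flexibility; the following is a minimal sketch (the helper name and config keys are assumptions, not part of the original code):

# Hypothetical helper: resolve a config string to a Blocks step rule without eval().
from blocks.algorithms import AdaDelta, Adam, RMSProp, Scale

STEP_RULES = {'AdaDelta': AdaDelta, 'Adam': Adam, 'RMSProp': RMSProp, 'Scale': Scale}

def make_step_rule(name, **kwargs):
    """Return an instance of the named Blocks step rule, e.g. make_step_rule('Adam')."""
    try:
        return STEP_RULES[name](**kwargs)
    except KeyError:
        raise ValueError('Unknown step rule: {}'.format(name))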
import time host_plot = 'http://tfjgeorge.com:5006' cost_dropout.name = 'cost' extensions = [ Timing(every_n_batches=50), TrainingDataMonitoring([cost_dropout], prefix='train'), DataStreamMonitoring(variables=[cost_dropout], data_stream=valid_stream, prefix="valid", every_n_batches=50), Plot('%s %s' % ( socket.gethostname(), datetime.datetime.now(), ), channels=[['train_cost', 'valid_cost']], every_n_batches=50, server_url=host_plot), TrackTheBest('valid_cost'), Checkpoint('train', save_separately=["model", "log"]), FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5), # FinishAfter(after_n_epochs=100), Printing(every_n_batches=50) ] # MAIN LOOP from blocks.main_loop import MainLoop main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
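This fragment relies on TrackTheBest writing a 'valid_cost_best_so_far' notification that FinishIfNoImprovementAfter then watches. A minimal sketch of that early-stopping pairing on its own (the monitored record name is taken from the snippet; everything else is illustrative):

# TrackTheBest posts '<record>_best_so_far'; FinishIfNoImprovementAfter watches it.
from blocks.extensions.training import TrackTheBest
from blocks.extensions.stopping import FinishIfNoImprovementAfter

track_valid_cost = TrackTheBest('valid_cost')  # notification: 'valid_cost_best_so_far'
early_stopping = FinishIfNoImprovementAfter(track_valid_cost.notification_name, epochs=5)
# Add the tracker before the stopper in the extensions list, as in the snippet above,
# so the notification is already written when the stopper checks the log.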
def build_and_run(save_to,modelconfig,experimentconfig): """Part of this is adapted from the Lasagne tutorial.""" n, num_filters, image_size, num_blockstack = modelconfig['depth'], modelconfig['num_filters'], modelconfig['image_size'], modelconfig['num_blockstack'] print("Number of bottlenecks: %d" % n) # Prepare Theano variables for inputs and targets input_var = T.tensor4('image_features') #target_value = T.ivector('targets') target_var = T.lmatrix('targets') target_vec = T.extra_ops.to_one_hot(target_var[:,0],2) #target_var = T.matrix('targets') # Create residual net model print("Building model...") network = build_cnn(input_var, image_size, n, num_blockstack, num_filters) get_info(network) prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network)) test_prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network,deterministic=True)) # Loss function -> The objective to minimize print("Instantiation of loss function...") #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction) #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(), test_prediction) # loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()).mean() # test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var.flatten()).mean() loss = lasagne.objectives.squared_error(prediction,target_vec).mean() test_loss = lasagne.objectives.squared_error(test_prediction,target_vec).mean() # loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean() # test_loss = tensor.nnet.binary_crossentropy(test_prediction, target_var).mean() test_loss.name = "loss" # loss.name = 'x-ent_error' # loss.name = 'sqr_error' layers = lasagne.layers.get_all_layers(network) #l1 and l2 regularization #pondlayers = {x:0.000025 for i,x in enumerate(layers)} #l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2) #l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 25e-6 #reg_penalty = l1_penality + l2_penality #reg_penalty.name = 'reg_penalty' #loss = loss + reg_penalty loss.name = 'reg_loss' error_rate = MisclassificationRate().apply(target_var.flatten(), test_prediction).copy( name='error_rate') # Load the dataset print("Loading data...") istest = 'test' in experimentconfig.keys() if istest: print("Using test stream") train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=istest) # Defining step rule and algorithm if 'step_rule' in experimentconfig.keys() and experimentconfig['step_rule'] is not None: step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate']) else: step_rule=Scale(learning_rate=experimentconfig['learning_rate']) params = map(lasagne.utils.as_theano_expression,lasagne.layers.get_all_params(network, trainable=True)) print("Initializing algorithm") algorithm = GradientDescent( cost=loss, gradients={var:T.grad(loss,var) for var in params},#parameters=cg.parameters, #params step_rule=step_rule) #algorithm.add_updates(extra_updates) grad_norm = aggregation.mean(algorithm.total_gradient_norm) grad_norm.name = "grad_norm" print("Initializing extensions...") plot = Plot(save_to, channels=[['train_loss','valid_loss'], ['train_grad_norm'], #['train_grad_norm','train_reg_penalty'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042') checkpoint = Checkpoint('models/best_'+save_to+'.tar')
checkpoint.add_condition(['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far')) # Defining extensions extensions = [Timing(), FinishAfter(after_n_epochs=experimentconfig['num_epochs'], after_n_batches=experimentconfig['num_batches']), TrainingDataMonitoring([test_loss, error_rate, grad_norm], # reg_penalty], prefix="train", after_epoch=True), #after_n_epochs=1 DataStreamMonitoring([test_loss, error_rate],valid_stream,prefix="valid", after_epoch=True), #after_n_epochs=1 plot, #Checkpoint(save_to,after_n_epochs=5), #ProgressBar(), # Plot(save_to, channels=[['train_loss','valid_loss'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042'), #'grad_norm' # after_batch=True), Printing(after_epoch=True), TrackTheBest('valid_error_rate', choose_best=min), checkpoint, # Keep and save the best model FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)] # Early-stopping # model = Model(loss) # print("Model",model) main_loop = MainLoop( algorithm, train_stream, # model=model, extensions=extensions) print("Starting main loop...") main_loop.run()
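The corrected add_condition call above is the usual Blocks pattern for saving only best-so-far models. A compact, stand-alone sketch of that pattern (record and file names are assumptions):

# Hypothetical stand-alone version of the best-model checkpoint pattern.
from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.training import TrackTheBest

track_best = TrackTheBest('valid_error_rate', choose_best=min)
best_checkpoint = Checkpoint('models/best_model.tar')
best_checkpoint.add_condition(
    ['after_epoch'], predicate=OnLogRecord(track_best.notification_name))
# extensions = [..., track_best, best_checkpoint, ...]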
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 prior_noise_level = -10 noise_step_rule = Scale(1e-6) noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX)) convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True, noise_rate=noise_rate, prior_noise_level=prior_noise_level) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet test_probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs) .copy(name='cost')) test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs) .copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs) .copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg # train_cg = apply_batch_normalization(test_cg) # train_cost, train_error_rate, train_components = train_cg.outputs with batch_normalization(convnet): with training_noise(convnet): train_probs = convnet.apply(x) train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs) .copy(name='cost')) train_components = (ComponentwiseCrossEntropy().apply(y.flatten(), train_probs).copy(name='components')) train_error_rate = (MisclassificationRate().apply(y.flatten(), train_probs).copy(name='error_rate')) train_cg = ComputationGraph([train_cost, train_error_rate, train_components]) population_updates = get_batch_normalization_updates(train_cg) bn_alpha = 0.9 extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha)) for p, m in population_updates] # for annealing nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX)) nit_penalty.name = 'nit_penalty' # Compute noise rates for training graph train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables) train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean() train_mean_log_sigma.name = 'mean_log_sigma' train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables) train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean() train_nit_rate.name = 'nit_rate' train_nit_regularization = nit_penalty * train_nit_rate train_nit_regularization.name = 'nit_regularization' # Apply regularization to the cost trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])( train_cg.parameters) mask_parameters = [p for p in trainable_parameters if get_brick(p).name == 'mask'] noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters) biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) nonmask_weights = [p for p in weights if get_brick(p).name != 'mask'] l2_norm = sum([(W ** 2).sum() for W in nonmask_weights]) l2_norm.name = 'l2_norm' 
l2_regularization = weight_decay * l2_norm l2_regularization.name = 'l2_regularization' # testversion test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + l2_regularization + train_nit_regularization train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train",)) cifar10_train_stream = RandomPadCropFlip( NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features',)), (32, 32), pad=4, which_sources=('features',)) test_batch_size = 128 cifar10_test = CIFAR10(("test",)) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme( cifar10_test.num_examples, test_batch_size)), which_sources=('features',)) momentum = Momentum(0.01, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # Create a step rule that reduces the learning rate of noise scale_mask = Restrict(noise_step_rule, mask_parameters) step_rule = CompositeRule([scale_mask, momentum]) # from theano.compile.nanguardmode import NanGuardMode # Train with simple SGD algorithm = GradientDescent( cost=train_cost, parameters=trainable_parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) #, # theano_func_kwargs={ # 'mode': NanGuardMode( # nan_is_error=True, inf_is_error=True, big_is_error=True)}) exp_name = save_to.replace('.%d', '') # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [ (0, 0.01), # Warm up with 0.01 learning rate (50, 0.1), # Then go back to 0.1 (100, 0.01), (150, 0.001) # (83, 0.01), # Follow the schedule in the paper # (125, 0.001) ]), EpochSchedule(noise_step_rule.learning_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4) ]), EpochSchedule(noise_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4), # (6, 3e-4), # (8, 1e-3), # Causes nit rate to jump # (10, 3e-3), # (12, 1e-2), # (15, 3e-2), # (19, 1e-1), # (24, 3e-1), # (30, 1) ]), NoiseExtension( noise_parameters=noise_parameters), NoisyDataStreamMonitoring( [test_cost, test_error_rate, test_confusion], cifar10_test_stream, noise_parameters=noise_parameters, prefix="test"), TrainingDataMonitoring( [train_cost, train_error_rate, train_nit_rate, train_cost_without_regularization, l2_regularization, train_nit_regularization, momentum.learning_rate, train_mean_log_sigma, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=17), # after_epoch=True), Plot('Training performance for ' + exp_name, channels=[ ['train_cost_with_regularization', 'train_cost_without_regularization', 'train_nit_regularization', 'train_l2_regularization'], ['train_error_rate'], ['train_total_gradient_norm'], ['train_mean_log_sigma'], ], every_n_batches=17), Plot('Test performance for ' + exp_name, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(exp_name, True, True)) model = Model(train_cost) main_loop = MainLoop( algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
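The commented-out Caffe-style bias trick and the Restrict(noise_step_rule, mask_parameters) rule in this example both use Restrict to apply a step rule to a subset of parameters. A short sketch of that mechanism, assuming a biases list collected with a VariableFilter as in the code above:

# Restrict applies the wrapped rule only to the listed variables; CompositeRule
# then chains it with the shared Momentum rule.
from blocks.algorithms import CompositeRule, Momentum, Restrict, Scale

momentum = Momentum(learning_rate=0.01, momentum=0.9)
scale_bias = Restrict(Scale(2.0), biases)  # roughly doubles the bias learning rate
step_rule = CompositeRule([scale_bias, momentum])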
x1 = encoder.apply(f1) x2 = encoder.apply(f2) from cost import ContrastiveLoss cost = ContrastiveLoss(q=dims[-1]).apply(x1=x1, x2=x2, y1=y1, y2=y2) cost.name = 'contrastive_loss' cost_test = cost.copy('contrastive_loss_test') cg = ComputationGraph(cost) algorithm = GradientDescent(cost=cost, step_rule=AdaDelta(), parameters=cg.parameters) main_loop = MainLoop(algorithm=algorithm, data_stream=train_stream, extensions=[ TrainingDataMonitoring(variables=[cost], after_epoch=True), DataStreamMonitoring(data_stream=test_stream, variables=[cost_test], after_epoch=True), Printing(after_epoch=True), FinishAfter(after_n_epochs=50), Plot(document='siamese network larger', channels=[[cost.name, cost_test.name]], start_server=True, after_epoch=True) ])
def main(config, tr_stream, dev_stream, use_bokeh=False, src_vocab=None, trg_vocab=None): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING # TODO: allow user to remove some params from the graph, for example if embeddings should be kept static if config.get('l2_regularization', False) is True: l2_reg_alpha = config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to name the cost variable? Where did the original name come from? 
cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # allow user to externally initialize some params model_params = training_model.get_parameter_dict() if config.get('external_embeddings', None) is not None: for key in config['external_embeddings']: path_to_params = config['external_embeddings'][key] logger.info( 'Replacing {} parameters with external params at: {}'.format( key, path_to_params)) external_params = numpy.load(path_to_params) len_external_idx = external_params.shape[0] print(external_params.shape) # Working: look in the dictionary and overwrite the correct rows existing_params = model_params[key].get_value() if key == '/bidirectionalencoder/embeddings.W': vocab = src_vocab elif key == '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W': vocab = trg_vocab else: raise KeyError( 'Unknown embedding parameter key: {}'.format(key)) for k, i in vocab.items(): if i < len_external_idx: existing_params[i] = external_params[i] # model_params_shape = model_params[key].get_value().shape # assert model_params[key].get_value().shape == external_params.shape, ("Parameter dims must not change," # "shapes {} and {} do not match". 
# format(model_params_shape, # external_params.shape)) model_params[key].set_value(existing_params) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) # note that generated containes several different outputs generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling # Note: this is broken for unicode chars #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # WORKING: remove these validators in favor of Async # TODO: implement burn-in in the validation extension (don't fire until we're past the burn-in iteration) # Add early stopping based on bleu # if config.get('bleu_script', None) is not None: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Add early stopping based on Meteor # if config.get('meteor_directory', None) is not None: # logger.info("Building meteor validator") # extensions.append( # MeteorValidator(sampling_input, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Set up training algorithm logger.info("Initializing training algorithm") # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # enrich the logged information extensions.extend([ Timing(every_n_batches=100), FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ]) # External non-blocking validation extensions.append( RunExternalValidation(config=config, every_n_batches=config['bleu_val_freq'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost_cost'], ['validation_set_bleu_score'], ['validation_set_meteor_score']], every_n_batches=1)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, 
algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
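The external-embeddings block in this example overwrites individual rows of an existing lookup-table parameter with pretrained vectors. A hedged, generic version of that loop (the helper name, path, and vocabulary are assumptions):

# Hypothetical helper mirroring the row-wise overwrite above.
import numpy

def overwrite_embedding_rows(shared_param, pretrained_path, vocab):
    """Copy pretrained rows into an existing embedding matrix, indexed by vocab id."""
    pretrained = numpy.load(pretrained_path)
    values = shared_param.get_value()
    for word, idx in vocab.items():
        if idx < pretrained.shape[0]:
            values[idx] = pretrained[idx]
    shared_param.set_value(values)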
def _train_model(self, train_stream, valid_stream, load_from, save_to, *args, **kwargs): # Build model self.model = self.config.Model(self.config, self.dataset) # with word2id cg = Model(self.model.cg_generator) algorithm = GradientDescent(cost=self.model.cg_generator, step_rule=self.config.step_rule, parameters=cg.parameters, on_unused_sources='ignore') if plot_avail: extensions = [ FinishAfter(after_n_epochs=1), TrainingDataMonitoring( [v for l in self.model.monitor_train_vars for v in l], prefix='train', every_n_batches=self.config.print_freq), Plot('Training Process', channels=[ v.name for l in self.model.monitor_train_vars for v in l ], after_batch=True) ] else: extensions = [ TrainingDataMonitoring( [v for l in self.model.monitor_train_vars for v in l], prefix='train', every_n_batches=self.config.print_freq) ] saver_loader = self.model_save_loader(load_from=load_from, save_to=save_to, model=cg, dataset=self.dataset) saver_loader.do_load() n_batches = numpy.ceil(self.n_samples / self.config.batch_size).astype('int32') n_valid_batches = numpy.ceil(n_batches * self.config.valid_freq).astype('int32') extensions += [ EvaluatorWithEarlyStop( coverage=1., tolerate_time=self.config.tolerate_time, variables=[ v for l in self.model.monitor_valid_vars for v in l ], monitor_variable=self.model.stop_monitor_var, data_stream=valid_stream, saver=saver_loader, prefix='valid', every_n_batches=n_valid_batches) ] extensions += [ Printing(every_n_batches=self.config.print_freq, after_epoch=True), ProgressBar() ] extensions += [EpochMonitor(1)] main_loop = MainLoop(model=cg, data_stream=train_stream, algorithm=algorithm, extensions=extensions) # Run the model! main_loop.run()
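If the Plot used here is the stock blocks-extras extension, its channels argument is documented as a list of lists, with each sublist drawn as one figure; the call above passes a flat list of names instead. A sketch of the documented layout, with placeholder channel names:

# One sublist per figure; each entry is a log record name to draw as a curve.
from blocks_extras.extensions.plot import Plot

plot = Plot('Training Process',
            channels=[['train_cost'],                             # figure 1
                      ['train_error_rate', 'valid_error_rate']],  # figure 2
            after_batch=True)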
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server, bokeh, use_load_ext, load_log, fast_start, recognizer, data, model, cg, regularized_cg, cost, train_cost, parameters, max_norm_rules, observables, batch_size, batch_cost, weights_entropy, labels_mask, labels, gradients=None): primary_observables = observables secondary_observables = [] validation_observables = [] root_path, extension = os.path.splitext(save_path) train_conf = config['training'] # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) if 'adam' in rule_names: assert len(rule_names) == 1 logger.info("Using Adam for training") core_rules.append( Adam(learning_rate=train_conf.get('scale', 0.002), beta1=train_conf.get('beta1', 0.1), beta2=train_conf.get('beta2', 0.001), epsilon=train_conf.get('epsilon', 1e-8), decay_factor=train_conf.get('decay_rate', (1 - 1e-8)))) burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)}) logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. 
result = [] for var in variables: if var.name.startswith('weights_entropy'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()), 'weights_entropy_per_label'+ recognizer.children[chld_id].names_postfix)) elif var.name.endswith('_nll'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var.sum(), labels_mask[chld_id].sum()), var.name+'_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) additional_patience_notifiers = [] uas = DependencyErrorRate(recognizer.children[0], data, **config['monitoring']['search']) las = AuxiliaryErrorRates(uas, name='LAS') lab = AuxiliaryErrorRates(uas, name='LAB') per_monitoring = DataStreamMonitoring( [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0], prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_uas = TrackTheBest( per_monitoring.record_name(uas)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_las = TrackTheBest( per_monitoring.record_name(las)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_lab = TrackTheBest( per_monitoring.record_name(lab)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_uas, track_the_best_las, track_the_best_lab, ] per = uas track_the_best_per = track_the_best_uas additional_patience_notifiers = [track_the_best_lab, track_the_best_las] track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500, num_stds=train_conf.get('clip_stds', 1.0))) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']), # .add_condition(["after_batch"], _gradient_norm_is_none), ] main_postfix = recognizer.children[0].names_postfix channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), 
average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'+main_postfix), validation._record_name('weights_entropy_per_label'+main_postfix)], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix), validation._record_name('weights_penalty_per_recording'+main_postfix)]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] + additional_patience_notifiers extensions.append(Patience(**patience_conf)) if train_conf.get('min_performance_stops'): extensions.append(EarlyTermination( param_name=track_the_best_per.best_name, min_performance_by_epoch=train_conf['min_performance_stops'])) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid", after_epoch=True), Checkpoint("google_Ortho2_pretrain2_l0001.pkl", after_epoch=True), ProgressBar(), Printing() ] #Adding a live plot with the bokeh server extensions.append( Plot('CatsVsDogs160_GoogleNet_Reload2_l0001', channels=[['train_error_rate', 'valid_error_rate'], ['valid_cost', 'valid_error_rate2'], ['train_total_gradient_norm']], after_batch=True)) params = load_parameter_values('GoogleParameters.pkl') model = Model(cost) model.set_parameter_values(params) main_loop = MainLoop(algorithm, data_stream=stream_train, model=model, extensions=extensions) main_loop.run()
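The reload step here pairs load_parameter_values with Model.set_parameter_values. A stripped-down sketch of just that warm-start step (the pickle filename is the one from the snippet; cost is assumed to be the model's cost variable):

# Warm-start a Blocks model from previously saved parameter values.
from blocks.model import Model
from blocks.serialization import load_parameter_values

model = Model(cost)
model.set_parameter_values(load_parameter_values('GoogleParameters.pkl'))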
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight penalty ' 'or weight decay is probably a bad idea') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that does not have # shared variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm.
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( 
LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
algorithm = GradientDescent(cost=cost, parameters=cg.parameters, on_unused_sources='ignore', step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ])) main_loop = MainLoop(model=Model(cost), data_stream=data_stream_train, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=n_epochs), monitor, monitor_val, monitor_test, saveSnapshot( '/home/xuehongyang/checkpoints_read/snapshot', save_main_loop=False, after_epoch=True, save_separately=['log', 'model']), ProgressBar(), Printing(every_n_batches=500), Plot('videoqa_open_rereading', channels=[['train_cost']], every_n_batches=500, after_batch=True) ]) print('starting...') main_loop.run()
def run(get_model, model_name): train_stream = ServerDataStream( ('cases', 'image_features', 'image_targets', 'multiplier'), False, hwm=10) valid_stream = ServerDataStream( ('cases', 'image_features', 'image_targets', 'multiplier'), False, hwm=10, port=5558) input_var = tensor.tensor4('image_features') target_var = tensor.tensor4('image_targets') multiply_var = tensor.matrix('multiplier') multiply_var = T.addbroadcast(multiply_var, 1) test_prediction, prediction, params = get_model(input_var, target_var, multiply_var) loss = binary_crossentropy(prediction, target_var).mean() loss.name = 'loss' valid_error = T.neq((test_prediction > 0.5) * 1., target_var).mean() valid_error.name = 'error' scale = Scale(0.1) algorithm = GradientDescent( cost=loss, parameters=params, step_rule=scale, #step_rule=Adam(), on_unused_sources='ignore') host_plot = 'http://localhost:5006' extensions = [ Timing(), TrainingDataMonitoring([loss], after_epoch=True), DataStreamMonitoring(variables=[loss, valid_error], data_stream=valid_stream, prefix="valid"), Plot('%s %s %s' % (model_name, datetime.date.today(), time.strftime('%H:%M')), channels=[['loss', 'valid_loss'], ['valid_error']], after_epoch=True, server_url=host_plot), Printing(), # Checkpoint('train'), FinishAfter(after_n_epochs=10) ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions) cg = ComputationGraph(test_prediction) while True: main_loop.run() scale.learning_rate.set_value( numpy.float32(scale.learning_rate.get_value() * 0.7)) numpy.savez('best_weights.npz', [param.get_value() for param in cg.shared_variables])
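This example decays the learning rate by re-running the main loop inside while True. An alternative, sketched below, is Blocks' SharedVariableModifier, which can anneal the same shared variable within a single run (the decay factor is illustrative):

# Anneal Scale's learning rate inside one MainLoop run instead of restarting it.
import numpy
from blocks.extensions.training import SharedVariableModifier

lr_decay = SharedVariableModifier(
    scale.learning_rate,
    lambda n_iterations, old_value: numpy.float32(old_value * 0.9999))
# extensions.append(lr_decay)  # the modifier fires after every batch by default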
extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint("catsVsDogs256.pkl"), ProgressBar(), Printing() ] #Adding a live plot with the bokeh server extensions.append( Plot('CatsVsDogs_256', channels=[['train_error_rate', 'valid_error_rate'], ['valid_cost', 'valid_error_rate2'], ['train_total_gradient_norm']], after_epoch=True)) model = Model(cost) main_loop = MainLoop(algorithm, stream_train, model=model, extensions=extensions) main_loop.run()
def run(get_model, model_name): train_stream = ServerDataStream(('cases', 'image_position', 'multiplier', 'sax', 'sax_features', 'targets'), False, hwm=10) valid_stream = ServerDataStream(('cases', 'image_position', 'multiplier', 'sax', 'sax_features', 'targets'), False, hwm=10, port=5558) ftensor5 = tensor.TensorType('float32', (False, ) * 5) input_var = ftensor5('sax_features') target_var = tensor.matrix('targets') multiply_var = tensor.matrix('multiplier') multiply_var = T.addbroadcast(multiply_var, 1) prediction, test_prediction, test_pred_mid, params_bottom, params_top = get_model( input_var, multiply_var) # load parameters cg = ComputationGraph(test_pred_mid) params_val = numpy.load('sunnybrook/best_weights.npz') for p, value in zip(cg.shared_variables, params_val['arr_0']): p.set_value(value) crps = tensor.abs_(test_prediction - target_var).mean() loss = squared_error(prediction, target_var).mean() loss.name = 'loss' crps.name = 'crps' algorithm = GradientDescent(cost=loss, parameters=params_top, step_rule=Adam(), on_unused_sources='ignore') host_plot = 'http://localhost:5006' extensions = [ Timing(), TrainingDataMonitoring([loss], after_epoch=True), DataStreamMonitoring(variables=[crps, loss], data_stream=valid_stream, prefix="valid"), Plot('%s %s %s' % (model_name, datetime.date.today(), time.strftime('%H:%M')), channels=[['loss', 'valid_loss'], ['valid_crps']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('train'), FinishAfter(after_n_epochs=20) ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()