def main(config, tr_stream, dev_stream):
    """Build, initialize and train the character-level RNN encoder-decoder.

    Parameters
    ----------
    config : dict
        Hyper-parameter / path dictionary (vocab sizes, layer sizes/depths,
        step rule, checkpoint and BLEU-validation settings, ...).
    tr_stream : training data stream; also supplies ``trg_bos`` and
        ``space_idx`` used as special symbols by the decoder.
    dev_stream : development stream used by the BLEU validator.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    # Character-level inputs plus the sampling/aux matrices that map
    # characters to words (source side) and back (target side).
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    # Special target-side indices come from the training stream itself.
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'],
                      config['dec_embed'],
                      config['trg_dgru_nhids'],
                      config['trg_igru_nhids'],
                      config['dec_nhids'],
                      config['enc_nhids'] * 2,
                      config['transition_depth'],
                      config['trg_igru_depth'],
                      config['trg_dgru_depth'],
                      target_space_idx,
                      target_bos_idx)
    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent transition matrices get orthogonal init instead of the
    # Gaussian default, at every depth of every recurrent sub-brick.
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        # children[0] is the decimator; bidirectional layers start at index 1.
        encoder.children[
            1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    if config['trg_igru_depth'] == 1:
        # Depth-1 IGRU is a single brick, not a stack of transitions.
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[
                layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[
            layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): eval() executes arbitrary text from the config file;
    # configs must be trusted. Consider a getattr-based lookup instead.
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm],
                              config=config,
                              after_batch=True,
                              before_first_epoch=True,
                              prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[config['transition_depth']])
        )  # generated[transition_depth] is next_outputs

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        transition_depth=config['transition_depth'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(source_char_seq, source_sample_matrix,
                              source_char_aux, source_word_mask,
                              samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def main(config, tr_stream, dev_stream):
    """Sanity-check script: build the word-level encoder-decoder, overwrite
    its parameters with a pre-trained GroundHog model, then run a one-batch
    main loop (sampling + BLEU validation) to compare implementations.

    Parameters
    ----------
    config : dict
        Hyper-parameters and special-symbol indices (``src_eos_idx`` etc.).
    tr_stream / dev_stream : training and development data streams.

    Fixes vs. the original:
    * the two ``print`` *statements* were Python-2-only syntax errors in a
      file that elsewhere uses ``print(..., file=...)``; now function calls;
    * shape tuples are passed through ``str()`` before formatting — a bare
      tuple with a ``{:15}`` width spec raises ``TypeError``.
    """
    # Create Theano variables
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Test values
    '''
    theano.config.compute_test_value = 'warn'
    source_sentence.tag.test_value = numpy.random.randint(10, size=(10, 10))
    target_sentence.tag.test_value = numpy.random.randint(10, size=(10, 10))
    source_sentence_mask.tag.test_value = \
        numpy.random.rand(10, 10).astype('float32')
    target_sentence_mask.tag.test_value = \
        numpy.random.rand(10, 10).astype('float32')
    sampling_input.tag.test_value = numpy.random.randint(10, size=(10, 10))
    '''

    # Construct model
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model (values will be clobbered by the GroundHog load below,
    # but initialization still allocates the shared variables).
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cg = ComputationGraph(cost)

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    print('Parameter shapes')
    for shape, count in Counter(shapes).most_common():
        # str() is required: a tuple rejects the ':15' width format spec.
        print(' {:15}: {}'.format(str(shape), count))

    # Set up training algorithm
    # NOTE(review): eval() executes arbitrary config text; configs must be
    # trusted.
    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()]))

    # Set up beam search and sampling computation graphs
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    samples, = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(
            generated[1]))  # generated[1] is the next_outputs

    # Set up training model
    training_model = Model(cost)

    enc_param_dict = Selector(encoder).get_params()
    dec_param_dict = Selector(decoder).get_params()

    # NOTE(review): hard-coded cluster path; parameterize via config if this
    # script is ever reused outside the original environment.
    gh_model_name = '/data/lisatmp3/firatorh/nmt/wmt15/trainedModels/blocks/sanity/refGHOG_adadelta_40k_best_bleu_model.npz'

    tmp_file = numpy.load(gh_model_name)
    gh_model = dict(tmp_file)
    tmp_file.close()

    for key in enc_param_dict:
        print('{:15}: {}'.format(str(enc_param_dict[key].get_value().shape), key))
    for key in dec_param_dict:
        print('{:15}: {}'.format(str(dec_param_dict[key].get_value().shape), key))

    # --- Copy GroundHog parameters into the Blocks bricks, one by one. ---
    # Encoder: embeddings, forward GRU, forward input forks.
    enc_param_dict['/bidirectionalencoder/embeddings.W'].set_value(
        gh_model['W_0_enc_approx_embdr'])
    enc_param_dict[
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_state'].set_value(
        gh_model['W_enc_transition_0'])
    enc_param_dict[
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_update'].set_value(
        gh_model['G_enc_transition_0'])
    enc_param_dict[
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_reset'].set_value(
        gh_model['R_enc_transition_0'])
    enc_param_dict['/bidirectionalencoder/fwd_fork/fork_inputs.W'].set_value(
        gh_model['W_0_enc_input_embdr_0'])
    enc_param_dict['/bidirectionalencoder/fwd_fork/fork_inputs.b'].set_value(
        gh_model['b_0_enc_input_embdr_0'])
    enc_param_dict[
        '/bidirectionalencoder/fwd_fork/fork_update_inputs.W'].set_value(
        gh_model['W_0_enc_update_embdr_0'])
    enc_param_dict[
        '/bidirectionalencoder/fwd_fork/fork_reset_inputs.W'].set_value(
        gh_model['W_0_enc_reset_embdr_0'])
    # Encoder: backward GRU and backward input forks.
    enc_param_dict[
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_state'].set_value(
        gh_model['W_back_enc_transition_0'])
    enc_param_dict[
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_update'].set_value(
        gh_model['G_back_enc_transition_0'])
    enc_param_dict[
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_reset'].set_value(
        gh_model['R_back_enc_transition_0'])
    enc_param_dict['/bidirectionalencoder/back_fork/fork_inputs.W'].set_value(
        gh_model['W_0_back_enc_input_embdr_0'])
    enc_param_dict['/bidirectionalencoder/back_fork/fork_inputs.b'].set_value(
        gh_model['b_0_back_enc_input_embdr_0'])
    enc_param_dict[
        '/bidirectionalencoder/back_fork/fork_update_inputs.W'].set_value(
        gh_model['W_0_back_enc_update_embdr_0'])
    enc_param_dict[
        '/bidirectionalencoder/back_fork/fork_reset_inputs.W'].set_value(
        gh_model['W_0_back_enc_reset_embdr_0'])

    # Decoder: readout (feedback lookup, maxout, deep softmax, merge).
    dec_param_dict[
        '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W'].set_value(
        gh_model['W_0_dec_approx_embdr'])
    #dec_param_dict['/decoder/sequencegenerator/readout/lookupfeedback/lookuptable.W'].set_value(gh_model['W_0_dec_approx_embdr'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b'].set_value(
        gh_model['b_0_dec_hid_readout_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax0.W'].set_value(
        gh_model['W1_dec_deep_softmax'])  # Missing W1
    dec_param_dict[
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W'].set_value(
        gh_model['W2_dec_deep_softmax'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b'].set_value(
        gh_model['b_dec_deep_softmax'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/merge/transform_states.W'].set_value(
        gh_model['W_0_dec_hid_readout_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/merge/transform_feedback.W'].set_value(
        gh_model['W_0_dec_prev_readout_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/merge/transform_weighted_averages.W'].set_value(
        gh_model['W_0_dec_repr_readout'])
    dec_param_dict[
        '/decoder/sequencegenerator/readout/merge/transform_weighted_averages.b'].set_value(
        gh_model['b_0_dec_repr_readout'])
    # Decoder: input forks.
    dec_param_dict['/decoder/sequencegenerator/fork/fork_inputs.b'].set_value(
        gh_model['b_0_dec_input_embdr_0'])
    dec_param_dict['/decoder/sequencegenerator/fork/fork_inputs.W'].set_value(
        gh_model['W_0_dec_input_embdr_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/fork/fork_update_inputs.W'].set_value(
        gh_model['W_0_dec_update_embdr_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/fork/fork_reset_inputs.W'].set_value(
        gh_model['W_0_dec_reset_embdr_0'])
    # Decoder: attention distribute forks (context "inputter").
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/distribute/fork_inputs.W'].set_value(
        gh_model['W_0_dec_dec_inputter_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/distribute/fork_inputs.b'].set_value(
        gh_model['b_0_dec_dec_inputter_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/distribute/fork_update_inputs.W'].set_value(
        gh_model['W_0_dec_dec_updater_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/distribute/fork_update_inputs.b'].set_value(
        gh_model['b_0_dec_dec_updater_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/distribute/fork_reset_inputs.W'].set_value(
        gh_model['W_0_dec_dec_reseter_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/distribute/fork_reset_inputs.b'].set_value(
        gh_model['b_0_dec_dec_reseter_0'])
    # Decoder: GRU transition and attention energies.
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/decoder.state_to_state'].set_value(
        gh_model['W_dec_transition_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/decoder.state_to_update'].set_value(
        gh_model['G_dec_transition_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/decoder.state_to_reset'].set_value(
        gh_model['R_dec_transition_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W'].set_value(
        gh_model['B_dec_transition_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/attention/preprocess.W'].set_value(
        gh_model['A_dec_transition_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W'].set_value(
        gh_model['D_dec_transition_0'])
    # Decoder: initial-state transformer.
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/decoder/state_initializer/linear_0.W'].set_value(
        gh_model['W_0_dec_initializer_0'])
    dec_param_dict[
        '/decoder/sequencegenerator/att_trans/decoder/state_initializer/linear_0.b'].set_value(
        gh_model['b_0_dec_initializer_0'])

    # Validate immediately, without any burn-in.
    config['val_burn_in'] = -1

    # Initialize main loop
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=[
            FinishAfter(after_n_batches=1),
            Sampler(model=search_model, config=config, data_stream=tr_stream,
                    every_n_batches=config['sampling_freq']),
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          src_eos_idx=config['src_eos_idx'],
                          trg_eos_idx=config['trg_eos_idx'],
                          before_training=True, before_batch=True),
                          #every_n_batches=config['bleu_val_freq']),
            TrainingDataMonitoring([cost], after_batch=True),
            #Plot('En-Fr', channels=[['decoder_cost_cost']],
            #     after_batch=True),
            Printing(after_batch=True)
        ])

    # Train!
    main_loop.run()
# NOTE(review): fragment — the enclosing function definition is not visible
# in this chunk; these statements appear to set up sampling/validation helpers
# and data streams for a trainer (compare main(configuration, is_chief) below).
enc_dec.build_sampler()

if configuration['reload']:
    enc_dec.load()

# Greedy/stochastic single-beam search used only for monitoring samples.
sample_search = BeamSearch(enc_dec=enc_dec,
                           configuration=configuration,
                           beam_size=1,
                           maxlen=configuration['seq_len_src'],
                           stochastic=True)
# Full-width deterministic beam search for BLEU validation; translations may
# be up to 3x the source length.
valid_search = BeamSearch(enc_dec=enc_dec,
                          configuration=configuration,
                          beam_size=configuration['beam_size'],
                          maxlen=3 * configuration['seq_len_src'],
                          stochastic=False)

sampler = Sampler(sample_search, **configuration)
bleuvalidator = BleuValidator(valid_search, **configuration)

# train function
train_fn = enc_dec.train_fn
if configuration.get('with_layernorm', False):
    # Extra update function only exists for the layer-norm variant.
    update_fn = enc_dec.update_fn

# train data
ds = DStream(**configuration)

# valid data
vs = get_devtest_stream(data_type='valid', input_file=None, **configuration)

# main_loop
# modified by Zhaopeng Tu, 2016-07-14
# to continue training
def main(mode, config, use_bokeh=False):
    """Entry point for the context-aware NMT model.

    Parameters
    ----------
    mode : str
        One of ``"train"`` (fit the model), ``"ppl"`` (compute validation
        perplexity for a saved model) or ``"translate"`` (beam-search decode
        the validation set and dump attention weights).
    config : dict
        Hyper-parameters, vocabularies and file paths.
    use_bokeh : bool
        If True (and bokeh is available), live-plot the training cost.

    Fix vs. the original: the parameter-shape logging passed raw shape
    tuples to a ``{:15}`` width spec, which raises ``TypeError`` on
    Python 3; the tuples are now converted with ``str()`` first (matching
    the char-level trainer earlier in this file).
    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'],
        name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context

    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target = tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model: stack the source representation and every
        # context-sentence representation along a new leading axis.
        sentence_representations_list = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)

        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence, target_sentence_mask)
        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask,
             source_sentence, source_sentence_mask],
            (perplexity))
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes (str() is required: tuples reject the ':15' spec).
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info(' {:15}: {}'.format(str(shape), count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info(' {:15}: {}'.format(str(value.get_value().shape),
                                            name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))

            # Add sampling
            if config['hook_samples'] >= 1:
                logger.info("Building sampler")
                extensions.append(
                    Sampler(model=search_model, data_stream=tr_stream,
                            model_name=config['model_name'],
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size']))

            # Add early stopping based on bleu
            # NOTE(review): deliberately disabled (`if False`); perplexity
            # validation below is used instead.
            if False:
                logger.info("Building bleu validator")
                extensions.append(
                    BleuValidator(sampling_input, samples=samples,
                                  config=config,
                                  model=search_model, data_stream=dev_stream,
                                  normalize=config['normalized_bleu'],
                                  every_n_batches=config['bleu_val_freq'],
                                  n_best=3,
                                  track_n_models=6))

        logger.info("Building perplexity validator")
        extensions.append(
            pplValidation(dev_source, dev_target, config=config,
                          model=costs_computer, data_stream=dev_stream,
                          model_name=config['model_name'],
                          every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Resume the learning rate from the training log when reloading.
        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # NOTE(review): eval() executes arbitrary config text; configs must
        # be trusted.
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'ppl':
        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get training and development set streams
        #tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model (same graph construction as in training).
        sentence_representations_list = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)

        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list,
                            target_sentence, target_sentence_mask)
        logger.info('Creating computational graph')
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask,
             source_sentence, source_sentence_mask],
            (cost))

        logger.info("Loading the model..")
        model = Model(cost)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started Validation: ")
        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens = 0.0
        # NOTE(review): epoch iterators have no len(); the bar is capped at
        # an arbitrary 10000 updates.
        #pbar = ProgressBar(max_value=len(ts)).start()#modified
        pbar = ProgressBar(max_value=10000).start()
        for i, (ctx_0, ctx_0_mask, ctx_1, ctx_1_mask, ctx_2, ctx_2_mask,
                src, src_mask, trg, trg_mask) in enumerate(ts):
            # Argument order must match the `function` inputs above:
            # contexts, context masks, then target/source pairs.
            costs = costs_computer(*[ctx_0, ctx_1, ctx_2,
                                     ctx_0_mask, ctx_1_mask, ctx_2_mask,
                                     trg, trg_mask, src, src_mask])
            cost = costs.sum()
            total_cost += cost
            total_tokens += trg_mask.sum()
            pbar.update(i + 1)
        total_cost /= total_tokens
        pbar.finish()
        #dev_stream.reset()

        # run afterprocess
        # self.ap.main()
        total_cost = 2 ** total_cost  # per-token cost -> perplexity (base 2)
        print("Average validation cost: " + str(total_cost))

    elif mode == 'translate':
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        sentence_representations_list = encoder.apply(source_sentence,
                                                      source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)

        generated = decoder.generate(sentence_representations_list,
                                     sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs = []
            input_ = []
            input_mask = []
            # Replicate each context/source sentence across the beam.
            for j in range(config['ctx_num'] + 1):
                seqs.append(sutils._oov_to_unk(
                    line[2 * j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2 * j + 1][0],
                                             (config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j], (config['beam_size'], 1)))
            #v=costs_computer(input_[0]);

            # draw sample, checking to ensure we don't get an empty string
            # back; entry ctx_num (== 3 here) is the source sentence itself.
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={source_sentence: input_[3],
                                  source_sentence_mask: input_mask[3],
                                  context_sentences[0]: input_[0],
                                  context_sentence_masks[0]: input_mask[0],
                                  context_sentences[1]: input_[1],
                                  context_sentence_masks[1]: input_mask[1],
                                  context_sentences[2]: input_[2],
                                  context_sentence_masks[2]: input_mask[2]},
                    max_length=3 * len(seqs[2]),
                    eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            b = numpy.argsort(costs)[0]
            #best=numpy.argsort(costs)[0:config['beam_size']];
            #for b in best:
            # NOTE(review): if ValueError fires on the very first sentence,
            # `weight` below is unbound (NameError) — confirm intent.
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen = 4 * len(line[0][0])
                #weight = weights[b][:, :totalLen]
                weight = weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights,
                     open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        ap = afterprocesser(config)
        ap.main()
def main(configuration, is_chief=False):
    """Build and train the (TensorFlow/Keras) encoder-decoder NMT model.

    Parameters
    ----------
    configuration : dict
        Hyper-parameters, regularization weights, schedule settings and
        checkpoint paths.
    is_chief : bool
        In a distributed run, only the chief worker initializes variables
        and reloads saved model state.

    Notes
    -----
    Reads the module-level globals ``args`` (for the starting update count)
    and ``logger``; saves checkpoints and validation outputs to disk.
    """
    l1_reg_weight = configuration['l1_reg_weight']
    l2_reg_weight = configuration['l2_reg_weight']
    # time_steps*nb_samples
    src = K.placeholder(shape=(None, None), dtype='int32')
    src_mask = K.placeholder(shape=(None, None))
    trg = K.placeholder(shape=(None, None), dtype='int32')
    trg_mask = K.placeholder(shape=(None, None))
    # for fast training of new parameters
    ite = K.placeholder(ndim=0)

    enc_dec = EncoderDecoder(**configuration)
    softmax_output_num_sampled = configuration['softmax_output_num_sampled']
    enc_dec.build_trainer(
        src, src_mask, trg, trg_mask, ite,
        l1_reg_weight=l1_reg_weight,
        l2_reg_weight=l2_reg_weight,
        softmax_output_num_sampled=softmax_output_num_sampled)
    enc_dec.build_sampler()

    # Chief is responsible for initializing and loading model states
    if is_chief:
        # NOTE(review): tf.initialize_all_variables() is the long-deprecated
        # pre-1.0 API — confirm the pinned TF version still provides it.
        init_op = tf.initialize_all_variables()
        init_fn = K.function(inputs=[], outputs=[init_op])
        init_fn([])
        if configuration['reload']:
            enc_dec.load()

    # Stochastic single-beam search for monitoring samples during training.
    sample_search = BeamSearch(enc_dec=enc_dec,
                               configuration=configuration,
                               beam_size=1,
                               maxlen=configuration['seq_len_src'],
                               stochastic=True)
    # Deterministic full-width beam search for BLEU validation.
    valid_search = BeamSearch(enc_dec=enc_dec,
                              configuration=configuration,
                              beam_size=configuration['beam_size'],
                              maxlen=3 * configuration['seq_len_src'],
                              stochastic=False)

    sampler = Sampler(sample_search, **configuration)
    bleuvalidator = BleuValidator(valid_search, **configuration)

    # train function
    train_fn = enc_dec.train_fn
    if configuration['with_reconstruction'] and configuration[
            'with_fast_training']:
        fast_train_fn = enc_dec.fast_train_fn

    # train data
    ds = DStream(**configuration)

    # valid data
    vs = get_devtest_stream(data_type='valid', input_file=None,
                            **configuration)

    # `args` is presumably the parsed CLI namespace — defined at module
    # level, not visible in this chunk.
    iters = args.start
    valid_bleu_best = -1
    epoch_best = -1
    iters_best = -1
    max_epochs = configuration['finish_after']

    # TODO: use global iter and only the chief can save the model
    for epoch in range(max_epochs):
        for x, x_mask, y, y_mask in ds.get_iterator():
            last_time = time.time()
            # During the fast-training warm-up, only the new (reconstruction)
            # parameters are updated via fast_train_fn.
            if configuration['with_reconstruction'] and configuration[
                    'with_fast_training'] and iters < configuration[
                    'fast_training_iterations']:
                if configuration['fix_base_parameters'] and not configuration[
                        'with_tied_weights']:
                    tc = fast_train_fn([x.T, x_mask.T, y.T, y_mask.T])
                else:
                    # This variant additionally conditions on the update count.
                    tc = fast_train_fn([x.T, x_mask.T, y.T, y_mask.T, iters])
            else:
                tc = train_fn([x.T, x_mask.T, y.T, y_mask.T])
            cur_time = time.time()
            iters += 1
            logger.info(
                'epoch %d \t updates %d train cost %.4f use time %.4f'
                % (epoch, iters, tc[0], cur_time - last_time))

            if iters % configuration['save_freq'] == 0:
                enc_dec.save()

            if iters % configuration['sample_freq'] == 0:
                sampler.apply(x, y)

            # No validation until the burn-in period has passed.
            if iters < configuration['val_burn_in']:
                continue

            # Validate at a coarse frequency early on, finer later.
            if (iters <= configuration['val_burn_in_fine'] and
                iters % configuration['valid_freq'] == 0) \
               or (iters > configuration['val_burn_in_fine'] and
                   iters % configuration['valid_freq_fine'] == 0):
                valid_bleu = bleuvalidator.apply(vs,
                                                 configuration['valid_out'])
                # Archive the validation output and checkpoints per update.
                os.system('mkdir -p results/%d' % iters)
                os.system('mv %s* %s results/%d' % (
                    configuration['valid_out'], configuration['saveto'],
                    iters))
                logger.info(
                    'valid_test \t epoch %d \t updates %d valid_bleu %.4f'
                    % (epoch, iters, valid_bleu))
                if valid_bleu > valid_bleu_best:
                    valid_bleu_best = valid_bleu
                    epoch_best = epoch
                    iters_best = iters
                    enc_dec.save(path=configuration['saveto_best'])

    logger.info('final result: epoch %d \t updates %d valid_bleu_best %.4f'
                % (epoch_best, iters_best, valid_bleu_best))
def main(config, tr_stream, dev_stream):
    """Build, regularize, and train a Blocks/Theano encoder-decoder NMT
    model, running the Blocks MainLoop with sampling and BLEU-validation
    extensions.

    Parameters
    ----------
    config : dict
        Hyper-parameters: vocabulary/embedding/hidden sizes, dropout and
        weight-noise levels, step rule, clipping, frequencies, and paths.
    tr_stream : fuel-style stream
        Training data stream fed to the MainLoop and the Sampler.
    dev_stream : fuel-style stream
        Development data stream used by the BleuValidator.
    """
    # Create Theano variables
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent weights get orthogonal initialization after the pushed
    # Gaussian defaults, matching the usual RNN training recipe.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cg = ComputationGraph(cost)

    # apply dropout for regularization
    # NOTE(review): config['dropout'] appears to be a keep-probability
    # (dropout active when < 1.0) — confirm against the config definition.
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization (feed-forward parameters only)
    if config['weight_noise_ff'] > 0.0:
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(
            decoder.transition.initial_transformer).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])

    # The regularized graph's output replaces the raw cost from here on.
    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # str(shape): a tuple cannot be passed to a width format spec
        # ('{:15}') directly — it raises TypeError on Python 3.
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_params(),
                               Selector(decoder).get_params())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training algorithm
    # `args` is a module-level global (presumably argparse results).
    if args.subtensor_fix:
        # The subtensor fix only implements AdaDelta updates.
        assert config['step_rule'] == 'AdaDelta'
        from subtensor_gradient import GradientDescent_SubtensorFix, AdaDelta_SubtensorFix, subtensor_params
        lookups = subtensor_params(cg, [
            encoder.lookup,
            decoder.sequence_generator.readout.feedback_brick.lookup
        ])
        algorithm = GradientDescent_SubtensorFix(
            subtensor_params=lookups,
            cost=cost,
            params=cg.parameters,
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                RemoveNotFinite(0.9),
                AdaDelta_SubtensorFix(subtensor_params=lookups)
            ]))
    else:
        # NOTE(review): eval() on a config string — acceptable only because
        # the config is trusted local input; do not feed untrusted configs.
        algorithm = GradientDescent(cost=cost,
                                    params=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(
                                            config['step_clipping']),
                                        RemoveNotFinite(0.9),
                                        eval(config['step_rule'])()
                                    ]))

    # Set up beam search and sampling computation graphs
    sampling_representation = encoder.apply(sampling_input,
                                            tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    samples, = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(
            generated[1]))  # generated[1] is the next_outputs

    # Set up training model
    training_model = Model(cost)

    # Set extensions
    extensions = [
        Sampler(model=search_model,
                config=config,
                data_stream=tr_stream,
                src_eos_idx=config['src_eos_idx'],
                trg_eos_idx=config['trg_eos_idx'],
                every_n_batches=config['sampling_freq']),
        BleuValidator(sampling_input,
                      samples=samples,
                      config=config,
                      model=search_model,
                      data_stream=dev_stream,
                      src_eos_idx=config['src_eos_idx'],
                      trg_eos_idx=config['trg_eos_idx'],
                      every_n_batches=config['bleu_val_freq']),
        TrainingDataMonitoring([cost], after_batch=True),
        #Plot('En-Fr', channels=[['decoder_cost_cost']],
        #     after_batch=True),
        Printing(after_batch=True),
        Dump(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Reload model if necessary
    if config['reload']:
        extensions += [LoadFromDumpWMT15(config['saveto'])]

    # Initialize main loop
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()