def test_beam_search():
    """Check beam search costs against the model's own cost computation.

    Uses the model from the reverse_words demo. Ideally this test should
    be done with a trained model, but so far only a randomly initialized
    one is used. So it does not really test the ability to find the best
    output sequence, only the correctness of the returned costs.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    reverser = WordReverser(10, alphabet_size)
    reverser.weights_init = reverser.biases_init = IsotropicGaussian(0.5)
    reverser.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
        bricks=[reverser.generator], name="outputs")(
            ComputationGraph(reverser.generate(inputs)))

    # One random sequence repeated `beam_size` times, then transposed to
    # shape (length, beam_size).
    input_vals = numpy.tile(rng.randint(alphabet_size, size=(length,)),
                            (beam_size, 1)).T

    # Use the same `beam_size` the inputs were tiled with instead of
    # repeating the literal value.
    search = BeamSearch(beam_size, samples)
    results, mask, costs = search.search({inputs: input_vals},
                                         0, 3 * length)

    # Recompute the costs of the found sequences with the model itself
    # and compare them with what beam search reported.
    true_costs = reverser.cost(
        input_vals, numpy.ones((length, beam_size), dtype=floatX),
        results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs, true_costs, rtol=1e-5)
def test_beam_search():
    """Run beam search on a model resembling the reverse_words demo.

    A trained model would make for a better test; for now the model is
    only randomly initialized. As a result this does not verify that the
    best output sequence is found, just that the returned costs are
    correct.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    generator_brick = SimpleGenerator(10, alphabet_size, seed=1234)
    generator_brick.weights_init = IsotropicGaussian(0.5)
    generator_brick.biases_init = IsotropicGaussian(0.5)
    generator_brick.initialize()

    inputs = tensor.lmatrix('inputs')
    output_filter = VariableFilter(
        applications=[generator_brick.generator.generate],
        name="outputs")
    samples, = output_filter(
        ComputationGraph(generator_brick.generate(inputs)))

    # The same random sequence in every column: shape (length, beam_size)
    sequence = rng.randint(alphabet_size, size=(length, ))
    input_vals = numpy.tile(sequence, (beam_size, 1)).T

    search = BeamSearch(samples)
    results, mask, costs = search.search(
        {inputs: input_vals}, 0, 3 * length, as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    # Costs recomputed by the model for the sequences beam search found
    all_ones = numpy.ones((length, beam_size),
                          dtype=theano.config.floatX)
    step_costs = generator_brick.cost(
        input_vals, all_ones, results, mask).eval()
    expected_costs = (step_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), expected_costs, rtol=1e-5)

    # Default call (no `as_arrays`): results come back as trimmed lists
    results2, _ = search.search({inputs: input_vals}, 0, 3 * length)
    for i, trimmed in enumerate(results2):
        assert trimmed == list(results.T[i, :mask.T[i].sum()])
def test_beam_search():
    """Exercise beam search with a reverse_words-like generator.

    The generator is randomly initialized rather than trained, so the
    test cannot tell whether the best sequence is found. What it does
    check is that the costs reported by beam search agree with the costs
    the model computes for the same sequences.

    """
    rng = numpy.random.RandomState(1234)
    alphabet_size = 20
    beam_size = 10
    length = 15

    model = SimpleGenerator(10, alphabet_size, seed=1234)
    model.weights_init = IsotropicGaussian(0.5)
    model.biases_init = IsotropicGaussian(0.5)
    model.initialize()

    inputs = tensor.lmatrix('inputs')
    samples, = VariableFilter(
        applications=[model.generator.generate],
        name="outputs")(ComputationGraph(model.generate(inputs)))

    # Tile one random sequence so that every beam sees the same input
    input_vals = numpy.tile(
        rng.randint(alphabet_size, size=(length,)), (beam_size, 1)).T

    search = BeamSearch(samples)
    search_args = ({inputs: input_vals}, 0, 3 * length)

    results, mask, costs = search.search(*search_args, as_arrays=True)
    # Just check sum
    assert results.sum() == 2816

    input_mask = numpy.ones((length, beam_size),
                            dtype=theano.config.floatX)
    true_costs = model.cost(input_vals, input_mask, results, mask).eval()
    true_costs = (true_costs * mask).sum(axis=0)
    assert_allclose(costs.sum(axis=0), true_costs, rtol=1e-5)

    # The list-returning form must match the masked array form
    results2, _costs2 = search.search(*search_args)
    for index, as_list in enumerate(results2):
        valid_steps = mask.T[index].sum()
        assert as_list == list(results.T[index, :valid_steps])
def generate(input_):
    """Produce output sequences for an input sequence.

    Hides most of the difference between sampling and beam search.

    Returns
    -------
    outputs : list of lists
        Output sequences cut right after the first `</S>` symbol.
    costs : list
        The negative log-likelihood of generating the respective
        sequences.

    """
    if mode != "beam_search":
        _1, outputs, _2, _3, costs = (
            model.get_theano_function()(input_))
        # Transposed so that it is indexed by sequence first, like
        # `outputs` after the transposition below.
        costs = costs.T
    else:
        samples, = VariableFilter(
            bricks=[reverser.generator], name="outputs")(
                ComputationGraph(generated[1]))
        # NOTE: this will recompile beam search functions
        # every time user presses Enter. Do not create
        # a new `BeamSearch` object every time if
        # speed is important for you.
        beam_search = BeamSearch(input_.shape[1], samples)
        outputs, _, costs = beam_search.search(
            {chars: input_}, char2code['</S>'],
            3 * input_.shape[0])

    outputs = list(outputs.T)
    costs = list(costs)
    for position, raw_sequence in enumerate(outputs):
        sequence = list(raw_sequence)
        try:
            # Keep the end-of-sequence symbol itself
            true_length = sequence.index(char2code['</S>']) + 1
        except ValueError:
            # No end-of-sequence symbol: keep everything
            true_length = len(sequence)
        outputs[position] = sequence[:true_length]
        if mode == "sample":
            costs[position] = costs[position][:true_length].sum()
    return outputs, costs
def generate(input_):
    """Produce output sequences for an input sequence.

    Hides most of the difference between sampling and beam search.

    Returns
    -------
    outputs : list of lists
        Trimmed output sequences.
    costs : list
        The negative log-likelihood of generating the respective
        sequences.

    """
    if mode != "beam_search":
        _1, outputs, _2, _3, costs = (
            model.get_theano_function()(input_))
        outputs = list(outputs.T)
        costs = list(costs.T)
        for position, raw_sequence in enumerate(outputs):
            sequence = list(raw_sequence)
            try:
                # Keep the end-of-sequence symbol itself
                true_length = sequence.index(char2code['</S>']) + 1
            except ValueError:
                # No end-of-sequence symbol: keep everything
                true_length = len(sequence)
            outputs[position] = sequence[:true_length]
            # Collapse the per-step costs of the kept prefix
            costs[position] = costs[position][:true_length].sum()
    else:
        samples, = VariableFilter(
            bricks=[reverser.generator], name="outputs")(
                ComputationGraph(generated[1]))
        # NOTE: this will recompile beam search functions
        # every time user presses Enter. Do not create
        # a new `BeamSearch` object every time if
        # speed is important for you.
        beam_search = BeamSearch(input_.shape[1], samples)
        outputs, costs = beam_search.search(
            {chars: input_}, char2code['</S>'],
            3 * input_.shape[0])
    return outputs, costs
# Translate every sentence of the validation stream with beam search and
# compute an alignment for the best translation.

# Special-token indices: the unknown-word id comes from the config and the
# end-of-sequence ids are the last index of each vocabulary.
unk_idx = config['unk_id']
src_eos_idx = config['src_vocab_size'] - 1
trg_eos_idx = config['trg_vocab_size'] - 1
# NOTE(review): hard-coded absolute output paths. The third positional
# argument of the built-in `open` is the buffering mode, but for
# `gzip.open` it is the compression level -- confirm that 0 is intended
# for the gzip file.
ftrans = open('/Users/lqy/Documents/transout.txt','w',0)
falign = gzip.open('/Users/lqy/Documents/alignmentout','w',0)

for i, line in enumerate(validate_stream.get_epoch_iterator()):
    source_line = line[0]
    #line_tok = mergeSplit(source_token[i])
    # Replace out-of-vocabulary source ids with the unknown-word id
    seq = nmt._oov_to_unk(line[0], config['src_vocab_size'], unk_idx)
    # Repeat the source sequence once per beam, giving a matrix of shape
    # (beam_size, len(seq)) whose rows are all the same sequence
    input_ = numpy.tile(seq, (config['beam_size'], 1))
    #print "input_: ",input_[3]
    # NOTE(review): the source-side EOS index is passed as `eol_symbol`
    # and `trg_eos_idx` is unused in this fragment -- presumably the two
    # vocabularies have the same size; verify.
    trans,costs = beam_search.search(input_values={source_sentence: input_[:]},max_length=3*len(seq), eol_symbol=src_eos_idx,ignore_first_eol=True)
    # Length-normalize the costs, then pick the cheapest hypothesis
    lengths = numpy.array([len(s) for s in trans])
    costs = costs / lengths
    best = numpy.argsort(costs)[0]
    trans_out = trans[best]
    # Convert source and translation from indices to space-separated words
    source_word = nmt._idx_to_word(line[0],nmt.src_ivocab)
    trans_out_word = nmt._idx_to_word(trans_out, nmt.trg_ivocab)
    trans_out_word_str = trans_out_word.split(" ")
    source_word_str = source_word.split(" ")
    # `[None, :]` adds a leading axis of size one to both sequences
    alignment = numpy.asarray(getAlignment(numpy.array(source_line)[None, :],numpy.array(trans_out)[None, :]))
class IMT_F1_Validator(SimpleExtension, SamplingBase):
    """Implements early stopping based on the IMT F1 score.

    Every time the extension fires, the validation stream is translated
    with beam search (conditioned on a target prefix), the hypotheses are
    scored with `imt_f1_from_files` and the model parameters are saved if
    the score is good enough to enter the list of tracked best models.

    Parameters
    ----------
    source_sentence, target_prefix
        Variables used as keys of the `input_values` given to beam search.
    samples
        Passed to `BeamSearch`.
    model
        Stored on the instance.
    data_stream
        Validation stream; each item is indexed as
        (source, target, prefix, suffix).
    config : dict
        Keys read by this class: 'val_set_out' (optional), 'target_lang',
        'saveto', 'reload', 'val_burn_in', 'val_set_grndtruth',
        'src_vocab_size' and 'beam_size'.
    src_vocab, trg_vocab
        Vocabularies; `trg_vocab` must contain '<UNK>' and '</S>'.
    n_best : int
        Number of lowest-cost hypotheses considered per input; only the
        best one is written out.
    track_n_models : int
        Number of best models kept on disk.
    normalize : bool
        Divide hypothesis costs by hypothesis lengths before ranking.

    """

    def __init__(self, source_sentence, target_prefix, samples, model,
                 data_stream, config, src_vocab=None, trg_vocab=None,
                 n_best=1, track_n_models=1, normalize=True, **kwargs):
        super(IMT_F1_Validator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Path of the optional translation dump doubles as a verbosity flag
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_imt_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Target language code from the config
        self.target_language = self.config['target_lang']

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                imt_f1_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_imt_f1_scores.npz'))
                self.val_imt_f1_curve = (
                    imt_f1_score['imt_f1_scores'].tolist())

                # Track n best previous scores
                for i, imt_f1_val in enumerate(
                        sorted(self.val_imt_f1_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(
                            ModelInfo(imt_f1_val, key='IMT_F1'))
                # `_save_model` evicts `best_models[0]` as the worst model,
                # so the list has to be kept in ascending score order.
                self.best_models.sort(key=operator.attrgetter('score'))
                logger.info("IMT_F1_Scores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file is not
                # fatal. `Exception` (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                logger.info("IMT_F1_Scores not found")

    def do(self, which_callback, *args):
        """Validate the model, log the score and save if it improved."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= self.config[
                'val_burn_in']:
            return

        # Evaluate the model
        imt_f1_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_imt_f1_score'] = imt_f1_score
        # save if necessary
        self._save_model(imt_f1_score)

    # TODO: if we are evaluating both BLEU and METEOR, we shouldn't need
    # to translate twice!!
    def _evaluate_model(self):
        """Translate the validation set and return its IMT F1 score."""
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()

        ref_file = self.config['val_set_grndtruth']
        # Only the path is needed: close the handle right away so it is
        # not leaked, and remove the file once it has been scored.
        trg_hyp_file = tempfile.NamedTemporaryFile(delete=False)
        trg_hyp_file.close()
        try:
            total_cost = self._translate_to_file(trg_hyp_file.name)
            logger.info(
                "Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

            imt_f1_score, imt_precision, imt_recall = imt_f1_from_files(
                trg_hyp_file.name, ref_file)
        finally:
            if os.path.exists(trg_hyp_file.name):
                os.remove(trg_hyp_file.name)

        logger.info("IMT F1 Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        logger.info("IMT F1: {}, Precision: {}, Recall: {}".format(
            imt_f1_score, imt_precision, imt_recall))

        # Record the score so that the curve written by `_save_model`
        # (and read back on reload) actually contains the history.
        self.val_imt_f1_curve.append(imt_f1_score)
        return imt_f1_score

    def _translate_to_file(self, hyp_path):
        """Write the best hypothesis per input to `hyp_path`.

        Returns the summed cost of the considered hypotheses.
        """
        total_cost = 0.0
        ftrans = None
        try:
            if self.verbose:
                ftrans = codecs.open(self.config['val_set_out'], 'w',
                                     encoding='utf8')
            with codecs.open(hyp_path, 'w', encoding='utf8') as hyps_out:
                for i, line in enumerate(
                        self.data_stream.get_epoch_iterator()):
                    # Load the sentence, retrieve the sample, write to file
                    # TODO: the section with beam search and translation
                    # is shared by all validators
                    # WORKING: switch this to IMT prefix validation
                    # Note that the indices of source and target in the
                    # datastream are hard-coded
                    # currently our datastream is
                    # (source,target,prefix,suffix)
                    seq = self._oov_to_unk(line[0],
                                           self.config['src_vocab_size'],
                                           self.unk_idx)
                    target_prefix = line[2]

                    # One copy of the source and of the prefix per beam
                    input_ = numpy.tile(seq,
                                        (self.config['beam_size'], 1))
                    prefix_input_ = numpy.tile(
                        target_prefix, (self.config['beam_size'], 1))

                    # draw sample, checking to ensure we don't get an
                    # empty string back
                    # beam search param names come from WHERE??
                    trans, costs = self.beam_search.search(
                        input_values={
                            self.source_sentence: input_,
                            self.target_prefix: prefix_input_
                        },
                        max_length=3 * len(seq),
                        eol_symbol=self.eos_idx,
                        ignore_first_eol=False)

                    # normalize costs according to the sequence lengths
                    if self.normalize:
                        lengths = numpy.array([len(s) for s in trans])
                        costs = costs / lengths

                    nbest_idx = numpy.argsort(costs)[:self.n_best]
                    for j, best in enumerate(nbest_idx):
                        try:
                            total_cost += costs[best]
                            trans_out = trans[best]

                            # convert idx to words
                            trans_out = self._idx_to_word(
                                trans_out, self.trg_ivocab)
                        except ValueError:
                            logger.info(
                                "Can NOT find a translation for line: {}"
                                .format(i + 1))
                            trans_out = '<UNK>'

                        if j == 0:
                            # Write to subprocess and file if it exists
                            hyps_out.write(
                                trans_out.decode('utf8') + '\n')
                            if self.verbose:
                                print(trans_out.decode('utf8'),
                                      file=ftrans)

                    if i != 0 and i % 100 == 0:
                        logger.info(
                            "Translated {} lines of validation set..."
                            .format(i))
        finally:
            if ftrans is not None:
                ftrans.close()
        return total_cost

    def _is_valid_to_save(self, imt_f1_score):
        """Return True if the score beats the worst tracked model."""
        if not self.best_models or min(
                self.best_models,
                key=operator.attrgetter('score')).score < imt_f1_score:
            return True
        return False

    def _save_model(self, imt_f1_score):
        """Save the parameters if the score enters the n-best list."""
        if self._is_valid_to_save(imt_f1_score):
            model = ModelInfo(imt_f1_score, self.config['saveto'],
                              key='IMT_F1')

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('score'))

            # Save the model here; SIGINT is ignored while writing and the
            # previous handler is restored even if saving fails.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            try:
                logger.info("Saving new model {}".format(model.path))
                SaveLoadUtils.save_parameter_values(
                    self.main_loop.model.get_parameter_values(),
                    model.path)
                numpy.savez(os.path.join(self.config['saveto'],
                                         'val_imt_f1_scores.npz'),
                            imt_f1_scores=self.val_imt_f1_curve)
            finally:
                signal.signal(signal.SIGINT, s)
class BeamSearchEvaluator(object):
    """Decode a data stream with beam search and report the error rate.

    The constructor builds and compiles a `BeamSearch` object; `evaluate`
    decodes every example of a stream and accumulates `Evaluation.wer`
    between the decoded and the reference phoneme sequences.
    """

    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None):
        # Symbols in the black list are dropped from predictions and
        # targets before scoring.
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        # `x` and `x_mask` are used as keys of the beam search inputs
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        # Maps phoneme indices to their symbols; `evaluate` indexes it
        # unconditionally, so it must not be left as None.
        self.phoneme_dict = phoneme_dict

    def evaluate(self, data_stream, train=False, file_pred=None,
                 file_targets=None):
        """Decode `data_stream` and return `{'per': error_rate}`.

        With `train=True` only the first batch is evaluated. When given,
        `file_pred` and `file_targets` receive one line per example with
        the example counter appended in parentheses.
        """
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print 'Train evaluation started'
        # Running example counter written to the output files
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            # Axis 1 of the features is iterated as the batch axis
            for batch_ind in xrange(inputs['features'].shape[1]):
                # Repeat the single example `beam_size` times along a new
                # batch axis, for 2-d and 3-d feature arrays alike.
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                                            (1, self.beam_size))
                else:
                    input_beam = numpy.tile(x_val[:, batch_ind, :][:, None, :],
                                            (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(x_mask_val[:, batch_ind][:, None],
                                             (1, self.beam_size))
                # 100 is the maximum decoded length
                predictions, _ = self.beam_search.search(
                    {self.x: input_beam, self.x_mask: input_mask_beam},
                    self.eol_symbol, 100)
                # Take the first returned hypothesis, map it to symbols,
                # drop black-listed ones, then strip the first and last
                # remaining symbol.
                predictions = [self.phoneme_dict[phone_ind] for phone_ind
                               in predictions[0]
                               if self.phoneme_dict[phone_ind] not in
                               self.black_list][1:-1]

                # The reference is cut to the number of unmasked steps
                targets = y_val[:sum(y_mask_val[:, batch_ind]), batch_ind]
                targets = [self.phoneme_dict[phone_ind] for phone_ind
                           in targets
                           if self.phoneme_dict[phone_ind] not in
                           self.black_list][1:-1]
                # Collapse runs of identical consecutive symbols
                predictions = [x[0] for x in groupby(predictions)]
                targets = [x[0] for x in groupby(targets)]
                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' %i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

            # Show the last decoded example of the batch
            print '.. found sequence example:', ' '.join(predictions)
            print '.. real output was: ', ' '.join(targets)
            if train:
                break
        if train:
            print 'Train evaluation finished'
        # NOTE(review): `loss` starts as a Python float, so `.sum()`
        # presumably relies on `Evaluation.wer` returning an array; an
        # empty stream would fail here -- confirm.
        per = loss.sum() / num_examples
        return {'per': per}
def main(mode, config, use_bokeh=False):
    """Train an RNN encoder-decoder or translate a test set with it.

    Parameters
    ----------
    mode : str
        "train" builds the cost graph, applies the configured
        regularization, attaches the extensions and runs the main loop.
        'translate' loads saved parameters and writes the best beam
        search hypothesis for every line of the test set to
        `config['test_set'] + '.trans.out'`. Any other value only
        constructs the encoder and decoder.
    config : dict
        Experiment configuration; the keys read are visible in the body.
    use_bokeh : bool
        Add a live cost plot when bokeh is available.

    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        # Recurrent weights override the pushed default
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params+dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # NOTE(security): `config['step_rule']` is evaluated as Python
        # code; the configuration must come from a trusted source.
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        # NOTE(security): unpickling executes arbitrary code; only load
        # vocabulary files from a trusted source.
        with open(config['trg_vocab'], 'rb') as vocab_file:
            raw_trg_vocab = pickle.load(vocab_file)
        trg_vocab = _ensure_special_tokens(
            raw_trg_vocab, bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        # The context manager closes the output file even when
        # translation fails half-way.
        with open(config['test_set'] + '.trans.out', 'w') as ftrans:
            for i, line in enumerate(test_stream.get_epoch_iterator()):

                seq = sutils._oov_to_unk(
                    line[0], config['src_vocab_size'], unk_idx)
                # One copy of the source sequence per beam
                input_ = numpy.tile(seq, (config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                # NOTE(review): the source-side EOS index is used as
                # `eol_symbol` -- presumably both vocabularies have the
                # same size; verify.
                trans, costs = \
                    beam_search.search(
                        input_values={sampling_input: input_},
                        max_length=3*len(seq), eol_symbol=src_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if config['normalized_bleu']:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                best = numpy.argsort(costs)[0]
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}"
                        .format(i+1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of test set...".format(i))

            logger.info("Total cost of the test: {}".format(total_cost))
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Two sequence generators are built on top of the shared encoder:
    `generators[0]` is used for generation and beam search, while
    `generators[1]` is evaluated on the time-reversed sequences in
    `cost`, together with an L2 penalty tying the two generators'
    states through `forward_to_backward`.

    Parameters
    ----------
    All defining the structure and the dimensions of the model.
    Typically receives everything from the "net" section of the config.

    """

    def __init__(
            self,
            input_dims,
            input_num_chars,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            lm=None,
            character_map=None,
            bidir=True,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=1,
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        # NOTE: `pop` removes 'bottom_class' from the caller's dict.
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_dims=input_dims,
                              input_num_chars=input_num_chars,
                              name='bottom',
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        generators = [None, None]
        for i in range(2):
            # The top part, on top of BiRNN but before the attention
            if dims_top:
                top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                          name="top{}".format(i))
            else:
                top = Identity(name='top{}'.format(i))

            if dec_stack == 1:
                transition = self.dec_transition(
                    dim=dim_dec, activation=Tanh(),
                    name="transition{}".format(i))
            else:
                transitions = [
                    self.dec_transition(
                        dim=dim_dec, activation=Tanh(),
                        name="transition_{}_{}".format(i, trans_level))
                    for trans_level in xrange(dec_stack)
                ]
                transition = RecurrentStack(transitions=transitions,
                                            skip_connections=True)
            # Choose attention mechanism according to the configuration
            if attention_type == "content":
                # The index is formatted into the name; concatenating the
                # integer directly to the string raises TypeError.
                attention = SequenceContentAttention(
                    state_names=transition.apply.states,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    name="cont_att{}".format(i))
            elif attention_type == "content_and_conv":
                attention = SequenceContentAndConvAttention(
                    state_names=transition.apply.states,
                    conv_n=conv_n,
                    conv_num_filters=conv_num_filters,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    prior=prior,
                    energy_normalizer=energy_normalizer,
                    name="conv_att{}".format(i))
            else:
                raise ValueError(
                    "Unknown attention type {}".format(attention_type))
            if embed_outputs:
                feedback = LookupFeedback(
                    num_phonemes + 1, dim_dec
                    if dim_output_embedding is None else dim_output_embedding)
            else:
                feedback = OneOfNFeedback(num_phonemes + 1)
            if criterion['name'] == 'log_likelihood':
                emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                         name="emitter{}".format(i))
                if lm:
                    # In case we use LM it is Readout that is responsible
                    # for normalization.
                    emitter = LMEmitter()
            elif criterion['name'].startswith('mse'):
                emitter = RewardRegressionEmitter(
                    criterion['name'], eos_label, num_phonemes,
                    criterion.get('min_reward', -1.0),
                    name="emitter")
            else:
                raise ValueError("Unknown criterion {}".format(
                    criterion['name']))
            readout_config = dict(
                readout_dim=num_phonemes,
                source_names=(transition.apply.states
                              if use_states_for_readout else []) +
                [attention.take_glimpses.outputs[0]],
                emitter=emitter,
                feedback_brick=feedback,
                name="readout{}".format(i))
            if post_merge_dims:
                readout_config['merged_dim'] = post_merge_dims[0]
                readout_config['post_merge'] = InitializableSequence(
                    [
                        Bias(post_merge_dims[0]).apply,
                        post_merge_activation.apply,
                        MLP(
                            [post_merge_activation] *
                            (len(post_merge_dims) - 1) + [Identity()],
                            # MLP was designed to support Maxout is activation
                            # (because Maxout in a way is not one). However
                            # a single layer Maxout network works with the
                            # trick below. For deeper Maxout network one has
                            # to use the Sequence brick.
                            [
                                d // getattr(post_merge_activation,
                                             'num_pieces', 1)
                                for d in post_merge_dims
                            ] + [num_phonemes]).apply,
                    ],
                    name='post_merge{}'.format(i))
            readout = Readout(**readout_config)

            language_model = None
            if lm and lm.get('path'):
                # NOTE(review): these `pop` calls mutate `lm`, so on the
                # second pass of this loop every option below falls back
                # to its default (e.g. `lm_weight` becomes 0.0 for
                # generator 1). Left unchanged; confirm this is intended.
                lm_weight = lm.pop('weight', 0.0)
                normalize_am_weights = lm.pop('normalize_am_weights', True)
                normalize_lm_weights = lm.pop('normalize_lm_weights', False)
                normalize_tot_weights = lm.pop('normalize_tot_weights',
                                               False)
                am_beta = lm.pop('am_beta', 1.0)
                if (normalize_am_weights + normalize_lm_weights +
                        normalize_tot_weights < 1):
                    logger.warn(
                        "Beam search is prone to fail with no log-prob normalization"
                    )
                language_model = LanguageModel(nn_char_map=character_map,
                                               **lm)
                readout = ShallowFusionReadout(
                    lm_costs_name='lm_add',
                    lm_weight=lm_weight,
                    normalize_am_weights=normalize_am_weights,
                    normalize_lm_weights=normalize_lm_weights,
                    normalize_tot_weights=normalize_tot_weights,
                    am_beta=am_beta,
                    **readout_config)

            generators[i] = SequenceGenerator(
                readout=readout,
                transition=transition,
                attention=attention,
                language_model=language_model,
                name="generator{}".format(i))

        self.generator = generators[0]

        # Maps forward generator states to predictions of the backward
        # generator states (see `cost`).
        self.forward_to_backward = Linear(dim_dec, dim_dec)

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        # Only the `top` brick created in the last loop pass is kept
        self.top = top
        self.generators = generators
        self.children = [self.forward_to_backward, encoder, top, bottom
                         ] + generators

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')

    def push_initialization_config(self):
        """Push the default config, then the recurrent-specific overrides."""
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init
            }
            global_push_initialization_config(self, rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, application_call, **kwargs):
        """Return forward cost + backward cost + 1.5 * L2 state penalty.

        Expects 'inputs_mask', 'labels' and 'labels_mask' in `kwargs`;
        all remaining keyword arguments are passed to the bottom brick.
        """
        # pop inputs we know about
        inputs_mask = kwargs.pop('inputs_mask')
        labels = kwargs.pop('labels')
        labels_mask = kwargs.pop('labels_mask')

        # the rest is for bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(input_=bottom_processed,
                                                   mask=inputs_mask)
        encoded = self.top.apply(encoded)
        outs_forward = self.generators[0].evaluate(
            labels, labels_mask,
            attended=encoded,
            attended_mask=encoded_mask)
        costs_forward, states_forward, _, _, _, _ = outs_forward

        # The second generator sees labels and encoder outputs reversed
        # along the first axis.
        outs_backward = self.generators[1].evaluate(
            labels[::-1],
            labels_mask[::-1] if labels_mask else None,
            attended=encoded[::-1],
            attended_mask=encoded_mask[::-1])
        costs_backward, states_backward, _, _, _, _ = outs_backward
        # Flip back so that both directions are aligned step by step
        costs_backward = costs_backward[::-1]
        states_backward = states_backward[::-1]

        # Apply the linear map to all (step, batch) positions at once
        states_shape = states_forward.shape
        backward_predicted = self.forward_to_backward.apply(
            states_forward.reshape((states_shape[0] * states_shape[1], -1)))
        backward_predicted = backward_predicted.reshape(states_shape)
        backward_predicted = backward_predicted * labels_mask[:, :, None]
        # No gradient flows into the backward states through the penalty
        states_backward = gradient.disconnected_grad(states_backward)
        states_backward = states_backward * labels_mask[:, :, None]
        l2_cost = ((backward_predicted - states_backward)**2).mean(axis=2)
        l2_cost.name = 'l2_cost_aux'
        application_call.add_auxiliary_variable(
            l2_cost.sum(axis=0).mean().copy(name='l2_cost_aux'))
        costs_forward_aux = (costs_forward.sum(axis=0).mean()).copy(
            name='costs_forward_aux')
        application_call.add_auxiliary_variable(costs_forward_aux)
        return costs_forward + costs_backward + 1.5 * l2_cost

    @application
    def generate(self, **kwargs):
        """Generate with the forward generator; returns a dict.

        Expects 'inputs_mask' and 'n_steps' in `kwargs`; the rest goes to
        the bottom brick. `self.n_steps` is used when 'n_steps' is None.
        """
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(**kwargs), mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=n_steps if n_steps is not None else self.n_steps,
            batch_size=encoded.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from `path` into the generation graph."""
        generated = self.get_generate_graph()
        with open(path, 'r') as src:
            param_values = load_parameters(src)
        Model(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None):
        """Apply `generate` to the batch input variables."""
        inputs_mask = None
        if use_mask:
            inputs_mask = self.inputs_mask
        bottom_inputs = self.inputs
        return self.generate(n_steps=n_steps,
                             inputs_mask=inputs_mask,
                             **bottom_inputs)

    def get_cost_graph(self, batch=True, prediction=None,
                       prediction_mask=None):
        """Build the computation graph of `cost`.

        With `batch=False` the single-example variables are used and
        expanded with a batch axis. `prediction` and `prediction_mask`
        default to the groundtruth and its mask.
        """
        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        if not prediction:
            prediction = groundtruth
        if not prediction_mask:
            prediction_mask = groundtruth_mask

        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            # Substitute the 'groundtruth' placeholder found in the graph
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg

    def analyze(self, inputs, groundtruth, prediction=None):
        """Compute cost and alignment.

        The Theano function is compiled on the first call and cached in
        `self._analyze`; whether it takes a 'prediction' input is thus
        decided by the first call.
        """
        input_values_dict = dict(inputs)
        input_values_dict['groundtruth'] = groundtruth
        if prediction is not None:
            input_values_dict['prediction'] = prediction
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels.copy(name='groundtruth'))

            prediction_variable = tensor.lvector('prediction')
            if prediction is not None:
                input_variables.append(prediction_variable)
                cg = self.get_cost_graph(
                    batch=False, prediction=prediction_variable[:, None])
            else:
                cg = self.get_cost_graph(batch=False)
            cost = cg.outputs[0]

            weights, = VariableFilter(bricks=[self.generator],
                                      name="weights")(cg)

            # Energies are optional; zeros shaped like `weights` stand in
            energies = VariableFilter(bricks=[self.generator],
                                      name="energies")(cg)
            energies_output = [
                energies[0][:, 0, :]
                if energies else tensor.zeros_like(weights)
            ]

            states, = VariableFilter(applications=[self.encoder.apply],
                                     roles=[OUTPUT],
                                     name="encoded")(cg)

            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

            self._analyze = theano.function(
                input_variables,
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output,
                on_unused_input='warn')
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(applications=[self.generator.generate],
                                  name="outputs")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        """Run beam search on one example; returns (outputs, costs).

        `init_beam_search` must have been called at least once so that
        `self.beam_size` exists.
        """
        # When a recognizer is unpickled, self.beam_size is available
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)
        inputs = dict(inputs)
        max_length = int(
            self.bottom.num_time_steps(**inputs) /
            self.max_decoded_length_scale)
        search_inputs = {}
        for var in self.inputs.values():
            # Insert a batch axis of size one at position 1
            search_inputs[var] = inputs.pop(var.name)[:, numpy.newaxis, ...]
        if inputs:
            raise Exception('Unknown inputs passed to beam search: {}'.format(
                inputs.keys()))
        outputs, search_costs = self._beam_search.search(
            search_inputs,
            self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        """Compile the sampling function into `self._do_generate`."""
        generated = self.get_generate_graph(use_mask=False)
        cg = ComputationGraph(generated['outputs'])
        self._do_generate = cg.get_theano_function()

    def sample(self, inputs, n_steps=None):
        """Sample an output sequence for a single example."""
        if not hasattr(self, '_do_generate'):
            self.init_generate()
        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        batch['n_steps'] = n_steps if n_steps is not None \
            else int(self.bottom.num_time_steps(**batch) /
                     self.max_decoded_length_scale)
        return self._do_generate(**batch)[0]

    def __getstate__(self):
        """Return the instance dict without the compiled functions."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except Exception:
            # Best effort: not every emitter has a `_theano_rng`.
            # `Exception` (rather than a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """

    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed
                                   through from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        # Dense representation of the source-side end-of-sentence symbol;
        # appended to every source sentence in ``decode``.
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        Sets ``self.nmt_model``, ``self.src_sparse_feat_map``,
        ``self.trg_sparse_feat_map`` and ``self.beam_search``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()

        # Fall back to the flat (word id) feature map when no sparse
        # feature map is configured.
        self.src_sparse_feat_map = (self.config['src_sparse_feat_map']
                                    or FlatSparseFeatMap())
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances, in the order
            returned by the beam search.
        """
        beam_size = self.config['beam_size']
        seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
            src_sentence,
            self.config['src_vocab_size'])) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(np.tile(seq, (beam_size, 1, 1)),
                                  (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (beam_size, 1))
        trans, costs = self.beam_search.search(
            input_values={self.nmt_model.sampling_input: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)

        hypos = []
        max_len = 0
        for hypo_trans, cost in zip(trans, costs):
            n_words = len(hypo_trans)
            max_len = max(max_len, n_words)
            hypo = Hypothesis(hypo_trans, -cost)
            # The sentence-level score is attributed to the first word and
            # all other words get a neutral entry. An empty hypothesis gets
            # an empty breakdown instead of raising IndexError, and every
            # entry is its own list so later in-place edits do not alias.
            hypo.score_breakdown = [[(0.0, 1.0)] for _ in range(n_words)]
            if n_words:
                hypo.score_breakdown[0] = [(-cost, 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * beam_size
        return hypos

    def has_predictors(self):
        """Always returns true. """
        return True
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config.

    """

    def __init__(self, input_dims, input_num_chars, eos_label,
                 num_phonemes, dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, character_map=None,
                 bidir=True,
                 subsample=None,
                 dims_top=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN.
        # Work on a copy so that the caller's config dict keeps its
        # 'bottom_class' entry and can be reused for another recognizer.
        bottom_config = dict(bottom)
        bottom_class = bottom_config.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom_config)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [
                self.dec_transition(
                    dim=dim_dec, activation=Tanh(),
                    name="transition_{}".format(trans_level))
                for trans_level in range(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1,
                dim_dec if dim_output_embedding is None
                else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))

        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states if use_states_for_readout
                          else []) + [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                    + [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below. For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            # Pop from a copy: popping from the caller's dict would make a
            # second construction from the same config silently fall back
            # to the default weight and normalization flags.
            lm_config = dict(lm)
            lm_weight = lm_config.pop('weight', 0.0)
            normalize_am_weights = lm_config.pop('normalize_am_weights', True)
            normalize_lm_weights = lm_config.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm_config.pop('normalize_tot_weights',
                                                  False)
            am_beta = lm_config.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights
                    + normalize_tot_weights) < 1:
                logger.warning("Beam search is prone to fail with no "
                               "log-prob normalization")
            language_model = LanguageModel(nn_char_map=character_map,
                                           **lm_config)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')

    def push_initialization_config(self):
        """Push initialization settings to children.

        On top of the parent behaviour, pushes ``rec_weights_init`` to
        ``BaseRecurrent`` bricks and ``initial_states_init`` to all bricks
        when those attributes are set.
        """
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self,
                                              rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, **kwargs):
        """Build the generator's cost matrix for the given labels.

        Expects the keyword arguments ``inputs_mask``, ``labels`` and
        ``labels_mask``; every remaining keyword argument is forwarded
        to the bottom brick.
        """
        # pop inputs we know about
        inputs_mask = kwargs.pop('inputs_mask')
        labels = kwargs.pop('labels')
        labels_mask = kwargs.pop('labels_mask')

        # the rest is for bottom
        bottom_processed = self.bottom.apply(**kwargs)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, **kwargs):
        """Build the generation graph.

        Expects the keyword arguments ``inputs_mask`` and ``n_steps``
        (``None`` selects the symbolic ``self.n_steps``); the rest is
        forwarded to the bottom brick. Returns the generator outputs as
        a dictionary (``as_dict=True``).
        """
        inputs_mask = kwargs.pop('inputs_mask')
        n_steps = kwargs.pop('n_steps')

        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(**kwargs),
            mask=inputs_mask)
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=n_steps if n_steps is not None else self.n_steps,
            batch_size=encoded.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from ``path`` into the generation graph."""
        generated = self.get_generate_graph()
        # The parameter file is a binary archive, so open it in binary
        # mode; text mode corrupts it on platforms that translate
        # newlines and fails on Python 3.
        with open(path, 'rb') as src:
            param_values = load_parameters(src)
        Model(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self, use_mask=True, n_steps=None):
        """Apply ``generate`` to the batch input variables.

        The input mask is passed only when ``use_mask`` is true.
        """
        inputs_mask = None
        if use_mask:
            inputs_mask = self.inputs_mask
        bottom_inputs = self.inputs
        return self.generate(n_steps=n_steps,
                             inputs_mask=inputs_mask,
                             **bottom_inputs)

    def get_cost_graph(self, batch=True,
                       prediction=None, prediction_mask=None):
        """Return a ``ComputationGraph`` of the cost.

        With ``batch=True`` the batch input and label variables are
        used; otherwise the single-example variables are expanded to a
        batch of one. ``prediction`` and ``prediction_mask`` default to
        the groundtruth and its mask.
        """
        if batch:
            inputs = self.inputs
            inputs_mask = self.inputs_mask
            groundtruth = self.labels
            groundtruth_mask = self.labels_mask
        else:
            inputs, inputs_mask = self.bottom.single_to_batch_inputs(
                self.single_inputs)
            groundtruth = self.single_labels[:, None]
            groundtruth_mask = None

        # Compare against None explicitly: the arguments are symbolic
        # variables, whose truth value is not a reliable "was it given"
        # test.
        if prediction is None:
            prediction = groundtruth
        if prediction_mask is None:
            prediction_mask = groundtruth_mask
        cost = self.cost(inputs_mask=inputs_mask,
                         labels=prediction,
                         labels_mask=prediction_mask,
                         **inputs)
        cost_cg = ComputationGraph(cost)
        if self.criterion['name'].startswith("mse"):
            placeholder, = VariableFilter(theano_name='groundtruth')(cost_cg)
            cost_cg = cost_cg.replace({placeholder: groundtruth})
        return cost_cg

    def analyze(self, inputs, groundtruth, prediction=None):
        """Compute cost and aligment."""

        input_values_dict = dict(inputs)
        input_values_dict['groundtruth'] = groundtruth
        if prediction is not None:
            input_values_dict['prediction'] = prediction
        # The Theano function is compiled on first use and cached, so the
        # presence of ``prediction`` on the first call fixes its signature.
        if not hasattr(self, "_analyze"):
            input_variables = list(self.single_inputs.values())
            input_variables.append(self.single_labels.copy(name='groundtruth'))

            prediction_variable = tensor.lvector('prediction')
            if prediction is not None:
                input_variables.append(prediction_variable)
                cg = self.get_cost_graph(
                    batch=False, prediction=prediction_variable[:, None])
            else:
                cg = self.get_cost_graph(batch=False)
            cost = cg.outputs[0]

            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)

            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            energies_output = [energies[0][:, 0, :] if energies
                               else tensor.zeros_like(weights)]

            # Unpacking also checks that exactly one "encoded" output
            # exists; ``states`` itself is only needed by the disabled
            # CTC code below.
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)

            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]

            self._analyze = theano.function(
                input_variables,
                [cost[:, 0], weights[:, 0, :]] + energies_output
                + ctc_matrix_output,
                on_unused_input='warn')
        return self._analyze(**input_values_dict)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        if hasattr(self, '_beam_search') and self.beam_size == beam_size:
            # Only recompile if the user wants a different beam size
            return
        self.beam_size = beam_size
        generated = self.get_generate_graph(use_mask=False, n_steps=3)
        cg = ComputationGraph(generated.values())
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(cg)
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, inputs, **kwargs):
        """Run beam search for a single example.

        ``inputs`` maps input variable names to values; a beam axis is
        inserted at position 1 of every value. Unknown names raise an
        ``Exception``. Returns ``(outputs, search_costs)`` as given by
        the underlying beam search.
        """
        # When a recognizer is unpickled, self.beam_size is available
        # but beam search has to be recompiled.
        self.init_beam_search(self.beam_size)
        inputs = dict(inputs)
        max_length = int(self.bottom.num_time_steps(**inputs) /
                         self.max_decoded_length_scale)
        search_inputs = {}
        for var in self.inputs.values():
            search_inputs[var] = inputs.pop(var.name)[:, numpy.newaxis, ...]
        if inputs:
            raise Exception(
                'Unknown inputs passed to beam search: {}'.format(
                    inputs.keys()))
        outputs, search_costs = self._beam_search.search(
            search_inputs, self.eos_label,
            max_length,
            ignore_first_eol=self.data_prepend_eos,
            **kwargs)
        return outputs, search_costs

    def init_generate(self):
        """Compile the Theano function used by ``sample``."""
        generated = self.get_generate_graph(use_mask=False)
        cg = ComputationGraph(generated['outputs'])
        self._do_generate = cg.get_theano_function()

    def sample(self, inputs, n_steps=None):
        """Generate outputs for a single example.

        When ``n_steps`` is ``None`` it is derived from the number of
        input time steps divided by ``max_decoded_length_scale``.
        """
        if not hasattr(self, '_do_generate'):
            self.init_generate()
        batch, unused_mask = self.bottom.single_to_batch_inputs(inputs)
        batch['n_steps'] = n_steps if n_steps is not None \
            else int(self.bottom.num_time_steps(**batch) /
                     self.max_decoded_length_scale)
        return self._do_generate(**batch)[0]

    def __getstate__(self):
        """Return the instance state without the compiled functions."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the state and drop the emitter's cached Theano RNG."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Best effort: there is nothing to drop if the attribute (or
            # any brick on the way to it) does not exist.
            pass
class AccValidator(SimpleExtension):
    """Implements early stopping based on accuracy score.

    The score is obtained by decoding the development set with beam
    search and running the external script ``config['bleu_script']`` on
    the output; checkpoints are written for the best scoring models.
    """

    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, store_full_main_loop=False, **kwargs):
        """Creates a new extension which adds model selection based on
        the accuracy score to the training main loop.

        Args:
            source_sentence (Variable): Input variable to the sampling
                                        computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development
                                      set
            config (dict): NMT configuration
            n_best (int): beam size
            track_n_models (int): Number of n-best models for which to
                                  create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
                                            in the old style of
                                            Blocks 0.1. Not recommended
        """
        super(AccValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.best_models = []
        self.val_bleu_curve = []
        self.src_sparse_feat_map = (config['src_sparse_feat_map']
                                    or FlatSparseFeatMap())
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                                     'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores. Keep the list in
                # ascending order, as ``_save_model`` does, so that index 0
                # is always the worst tracked model.
                top_scores = sorted(self.val_bleu_curve,
                                    reverse=True)[:max(self.track_n_models, 0)]
                for bleu in reversed(top_scores):
                    self.best_models.append(ModelInfo(bleu))
                logging.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file only
                # means that training starts without a score history.
                logging.info("BleuScores not Found")

        self.verbose = self.config.get('val_set_out', None)
        utils.load_trg_wmap(self.config['trg_wmap'])
        self.trg_wmap = utils.trg_wmap

    def do(self, which_callback, *args):
        """Decodes the dev set and stores checkpoints in case the BLEU
        score has improved.
        """
        # Burn-in is counted in epochs here (not in iterations).
        if self.main_loop.status['epochs_done'] <= self.config['val_burn_in']:
            return
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Decode the validation set and return its score.

        Writes the best translation of every line to
        ``config['val_set_out']`` (if configured), runs the scoring
        script and appends the resulting score to ``val_bleu_curve``.
        """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        total_cost = 0.0
        beam_size = self.config['beam_size']

        ftrans = None
        if self.verbose:
            ftrans = codecs.open(self.config['val_set_out'], 'w', 'utf-8')
        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                    line[0], self.config['src_vocab_size']))
                if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                    input_ = numpy.transpose(
                        numpy.tile(seq, (beam_size, 1, 1)),
                        (2, 0, 1))
                else:  # word ids on the source side
                    input_ = numpy.tile(seq, (beam_size, 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3 * len(line[0]),
                    eol_symbol=utils.EOS_ID,
                    ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    # Use a separate name for the selected hypothesis:
                    # rebinding ``trans`` would make every later n-best
                    # entry index into a single translation.
                    try:
                        total_cost += costs[best]
                        best_trans = trans[best]
                        if best_trans and best_trans[-1] == utils.EOS_ID:
                            best_trans = best_trans[:-1]
                    except ValueError:
                        logging.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        # NOTE(review): the original fallback value is kept;
                        # confirm that utils.apply_trg_wmap accepts it.
                        best_trans = 0

                    # Only the best hypothesis is written out.
                    if j == 0 and ftrans is not None:
                        print(utils.apply_trg_wmap(best_trans, self.trg_wmap),
                              file=ftrans)

                if i != 0 and i % 100 == 0:
                    logging.info(
                        "Translated {} lines of validation set...".format(i))

            logging.info(
                "Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()
        finally:
            # Close (and flush) the output even if decoding fails; the
            # scoring script below reads this file.
            if ftrans is not None:
                ftrans.close()
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))

        logger.info("{} {} {} {}".format(self.config['bleu_script'],
                                         self.config['val_set_out'],
                                         self.config['val_set_grndtruth'],
                                         self.config['results_out']))
        # NOTE(review): the command line is built from config values and
        # executed through the shell, so those values must be trusted.
        bleu_score = float(subprocess.check_output(
            "python2.7 {} {} {} {}".format(self.config['bleu_script'],
                                           self.config['val_set_out'],
                                           self.config['val_set_grndtruth'],
                                           self.config['results_out']),
            shell=True).decode("utf-8"))
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if ``bleu_score`` beats the worst tracked model
        (or if no model is tracked yet).
        """
        if not self.best_models:
            return True
        worst = min(self.best_models,
                    key=operator.attrgetter('bleu_score'))
        return worst.bleu_score < bleu_score

    def save_parameter_values(self, param_values, path):
        """Save ``param_values`` to ``path`` with ``numpy.savez``,
        replacing "/" by "-" in the parameter names.

        This method is copied from blocks.machine_translation.checkpoint
        """
        param_values = {name.replace("/", "-"): param
                        for name, param in param_values.items()}
        numpy.savez(path, **param_values)

    def _save_model(self, bleu_score):
        """Checkpoint the current model if its score qualifies.

        Evicts the worst tracked model when the list is full, writes the
        parameters and the score curve, and ignores SIGINT while
        writing.
        """
        if not self._is_valid_to_save(bleu_score):
            return
        model = ModelInfo(bleu_score, self.config['saveto'])

        # Manage n-best model list first
        if len(self.best_models) >= self.track_n_models:
            # Pick the worst model explicitly instead of relying on the
            # list order.
            old_model = min(self.best_models,
                            key=operator.attrgetter('bleu_score'))
            if old_model.path and os.path.isfile(old_model.path):
                logging.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('bleu_score'))

        # Save the model here
        previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            # fs439: introduce store_full_main_loop and
            # storing best_bleu_params_* files
            if self.store_full_main_loop:
                logging.info(
                    "Saving full main loop model {}".format(model.path))
                numpy.savez(model.path,
                            **self.main_loop.model.get_parameter_dict())
            else:
                logging.info(
                    "Saving model parameters {}".format(model.path))
                params_to_save = self.main_loop.model.get_parameter_values()
                self.save_parameter_values(params_to_save, model.path)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
        finally:
            # Always restore the handler, otherwise a failed save would
            # leave the process deaf to Ctrl-C.
            signal.signal(signal.SIGINT, previous_handler)
class F1Validator(SimpleExtension, SamplingBase):
    # TODO: a lot has been changed in NMT, sync respectively
    """Implements early stopping based on F1 score."""

    def __init__(self, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        """Store the configuration, build the beam search and, if
        ``config['reload']`` is set, reload previous F1 scores from
        ``val_f1_scores.npz`` in ``config['saveto']``.
        """
        # TODO: change config structure
        super(F1Validator, self).__init__(**kwargs)
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = config["src_vocab"]
        self.unk_sym = config["unk_token"]
        self.eos_sym = config["eos_token"]
        self.trg_vocab = config["trg_vocab"]
        self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
        self.trg_eos_idx = self.trg_vocab[config["eos_token"]]
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_f1_curve = []
        self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                f1_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_f1_scores.npz'))
                self.val_f1_curve = f1_score['f1_scores'].tolist()

                # Track n best previous f1 scores. Keep the list in
                # ascending order, as ``_save_model`` does, so that index 0
                # is always the worst tracked model.
                top_scores = sorted(self.val_f1_curve,
                                    reverse=True)[:max(self.track_n_models, 0)]
                for f1 in reversed(top_scores):
                    self.best_models.append(ModelInfo(f1))
                logger.info("F1Scores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file only
                # means that training starts without a score history.
                logger.info("F1Scores not Found")

    def do(self, which_callback, *args):
        """Evaluate the model and checkpoint it once burn-in is over."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Decode the validation set and return its F1 score.

        Counts correct (C), substituted (S), inserted (I) and deleted
        (D) punctuation keywords by comparing the best hypothesis of
        every line with the reference, token by token.
        """
        logger.info("Started Validation: ")
        val_start_time = time.time()
        total_cost = 0.0

        def tile(x, beam_size):
            # Repeat ``x`` ``beam_size`` times along a new leading axis.
            return numpy.tile(x, (beam_size,) + (1,) * x.ndim)

        keywords = ['<FULL_STOP>', '<COMMA>', '<QUESTION_MARK>',
                    '<EXCLAMATION_MARK>', '<DOTS>']
        beam_size = self.config['beam_size']

        C = 0
        S = 0
        I = 0
        D = 0
        # Keep ``i`` defined for the summary log even if the stream is
        # empty.
        i = 0

        ftrans = None
        if self.verbose:
            ftrans = open(self.config['val_set_out'], 'w')
        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                available_inputs = dict(
                    zip(["sampling_%s" % x
                         for x in self.data_stream.sources],
                        line))
                input_values = OrderedDict(
                    [(variable,
                      tile(available_inputs[variable.name], beam_size))
                     for variable in self.model.inputs])

                seq = available_inputs["sampling_words"]
                reference = available_inputs["sampling_punctuation_marks"]

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = self.beam_search.search(
                    input_values=input_values,
                    max_length=len(seq), eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                # Convert the reference once per line. Converting it inside
                # the n-best loop would feed the already converted text
                # back into the converter for every further hypothesis.
                reference_words = self._idx_to_word(reference,
                                                    self.trg_ivocab)

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        # convert idx to words
                        trans_out = self._idx_to_word(trans[best],
                                                      self.trg_ivocab)
                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Compute F-Measure
                        ref_tokens = reference_words.split()
                        hyp_tokens = trans_out.split()
                        for (x, y) in zip(ref_tokens, hyp_tokens):
                            if x == y:
                                if x in keywords:
                                    C += 1
                            else:
                                if x in keywords and y in keywords:
                                    S += 1
                                elif x not in keywords:
                                    I += 1
                                elif y not in keywords:
                                    D += 1

                        # If beam returns too short answer, every keyword
                        # in the uncovered reference tail is a deletion.
                        # Lengths are compared in tokens, not characters.
                        if len(ref_tokens) > len(hyp_tokens):
                            D += len([w for w in ref_tokens[len(hyp_tokens):]
                                      if w in keywords])

                        if ftrans is not None:
                            print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    f1_score = self.compute_f1_score(C, S, I, D)
                    logger.info(
                        "Translated {} lines of validation set... F1 = {}, {}, {}, {}, {}"
                        .format(i, f1_score, C, S, I, D))

            # extract the score
            f1_score = self.compute_f1_score(C, S, I, D)
            self.val_f1_curve.append(f1_score)
            logger.info(f1_score)

            logger.info(
                "Total cost of the validation: {}".format(total_cost))
            logger.info(
                "Translated {} lines of validation set... F1 = {}, {}, {}, {}, {}"
                .format(i, f1_score, C, S, I, D))
            self.data_stream.reset()
        finally:
            # Do not leak the output file if decoding fails.
            if ftrans is not None:
                ftrans.close()
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))

        return f1_score

    def compute_f1_score(self, C, S, I, D):
        """Return the F1 score for the given counts of correct (C),
        substituted (S), inserted (I) and deleted (D) items.
        """
        # Smoothing keeps the denominators non-zero when nothing matched.
        C += 0.0001
        precision = float(C) / (C + S + I)
        recall = float(C) / (C + S + D)
        f1 = (2.0 * precision * recall) / (precision + recall)
        return f1

    def _is_valid_to_save(self, f1_score):
        """Return True if ``f1_score`` beats the worst tracked model
        (or if no model is tracked yet).
        """
        if not self.best_models:
            return True
        worst = min(self.best_models,
                    key=operator.attrgetter('f1_score'))
        return worst.f1_score < f1_score

    def _save_model(self, f1_score):
        """Checkpoint the current model if its score qualifies.

        Evicts the worst tracked model when the list is full, writes the
        parameters and the score curve, and ignores SIGINT while
        writing.
        """
        if not self._is_valid_to_save(f1_score):
            return
        model = ModelInfo(f1_score, self.config['saveto'])

        # Manage n-best model list first
        if len(self.best_models) >= self.track_n_models:
            # Pick the worst model explicitly instead of relying on the
            # list order.
            old_model = min(self.best_models,
                            key=operator.attrgetter('f1_score'))
            if old_model.path and os.path.isfile(old_model.path):
                logger.info("Deleting old model %s" % old_model.path)
                os.remove(old_model.path)
            self.best_models.remove(old_model)

        self.best_models.append(model)
        self.best_models.sort(key=operator.attrgetter('f1_score'))

        # Save the model here
        previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        try:
            logger.info("Saving new model {}".format(model.path))
            params_to_save = self.main_loop.model.get_parameter_values()
            param_values = {name.replace("/", BRICK_DELIMITER): param
                            for name, param in params_to_save.items()}
            numpy.savez(model.path, **param_values)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_f1_scores.npz'),
                f1_scores=self.val_f1_curve)
        finally:
            # Always restore the handler, otherwise a failed save would
            # leave the process deaf to Ctrl-C.
            signal.signal(signal.SIGINT, previous_handler)
# Select the sampled-output variable from the generation graph and build
# the beam search on top of it.
generation_graph = ComputationGraph(generated[1])
outputs_filter = VariableFilter(bricks=[decoder.sequence_generator],
                                name="outputs")
_, samples = outputs_filter(generation_graph)
beam_search = BeamSearch(samples=samples)

# Read from standard input
stream = get_stdin_stream(**config)

# Target vocabulary and its inverse (id -> token) for printing.
vocab = get_vocab(config['trg_vocab'], config['trg_vocab_size'],
                  config['unk_id'], config['eos_id'], config['bos_id'])
inv_vocab = dict((index, token) for token, index in vocab.iteritems())

unk_id = config['unk_id']
eos_id = config['eos_id']

for sample in stream.get_epoch_iterator():
    seq = sample[0]
    # One copy of the source sequence per beam entry.
    input_ = np.tile(seq, (config['beam_size'], 1))

    trans, costs = beam_search.search(
        input_values={sampling_input: input_},
        max_length=3 * len(seq),
        eol_symbol=eos_id,
        ignore_first_eol=True)

    # Only the first hypothesis is printed; remove </S> from output
    words = []
    for idx in trans[0]:
        if idx == eos_id:
            continue
        words.append(inv_vocab.get(idx, config['unk_token']))
    trans_out = ' '.join(words)
    print(trans_out)
class BlocksVanillaDecoder(cam.sgnmt.decoding.core.Decoder):
    """Adaptor class for blocks.search.BeamSearch.

    We implement the ``Decoder`` class but ignore functionality for
    predictors or heuristics. Instead, we pass through decoding directly
    to the blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system NMT
    decoding.
    """

    def __init__(self, nmt_model_path, config):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string): Path to the NMT model file (.npz)
            config (dict): NMT configuration
        """
        super(BlocksVanillaDecoder, self).__init__()
        self.config = config
        self.set_up_decoder(nmt_model_path)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        On return the following attributes are set: ``source_sentence``,
        ``samples``, ``model``, ``normalize``, ``verbose``,
        ``best_models``, ``val_bleu_curve`` and ``beam_search``.

        Args:
            nmt_model_path (string): Path to the NMT model file (.npz)
        """
        # Create Theano variables
        logging.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Construct model
        logging.info('Building RNN encoder-decoder')
        encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                       self.config['enc_embed'],
                                       self.config['enc_nhids'])
        decoder = Decoder(self.config['trg_vocab_size'],
                          self.config['dec_embed'],
                          self.config['dec_nhids'],
                          self.config['enc_nhids'] * 2)
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logging.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model (TODO: do i really need this?)
        logging.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            self.config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization (TODO: remove?)
        if self.config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logging.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

        # Apply weight noise for regularization (TODO: remove?)
        if self.config['weight_noise_ff'] > 0.0:
            logging.info('Applying weight noise to ff layers')
            # Use the same Selector accessor (``get_parameters``) as the
            # parameter-name listing below, and collect into a real list so
            # this does not depend on ``dict.values()`` returning a list.
            ff_bricks = [
                encoder.lookup,
                encoder.fwd_fork,
                encoder.back_fork,
                decoder.sequence_generator.readout,
                decoder.sequence_generator.fork,
                decoder.state_init,
            ]
            noise_params = []
            for brick in ff_bricks:
                noise_params.extend(Selector(brick).get_parameters().values())
            cg = apply_noise(cg, noise_params, self.config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logging.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logging.info(' {:15}: {}'.format(shape, count))
        logging.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logging.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logging.info(' {:15}: {}'.format(value.get_value().shape, name))
        logging.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logging.info("Building model")

        # Set extensions
        logging.info("Initializing extensions")

        # Set up beam search and sampling computation graphs if necessary
        logging.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        # Compare with blocks.machine_translation.BleuValidator.__init__
        self.source_sentence = sampling_input
        self.samples = samples
        self.model = search_model
        self.normalize = True
        self.verbose = self.config.get('val_set_out', None)

        # Reload model if necessary
        if self.config['reload']:
            loader = LoadNMT(nmt_model_path,
                             self.config['saveto'],
                             search_model)
            loader.load_weights()

        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances, in the order in
            which the beam search returned them.
        """
        seq = self._oov_to_unk(src_sentence,
                               self.config['src_vocab_size'],
                               utils.UNK_ID) + [utils.EOS_ID]
        # One copy of the source sequence per beam entry.
        input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.source_sentence: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for idx, hypo_ids in enumerate(trans):
            score = -costs[idx]
            max_len = max(max_len, len(hypo_ids))
            hypo = Hypothesis(hypo_ids, score)
            # Only a sentence-level score exists: it is attached to the
            # first position, all other positions get a neutral entry.
            hypo.score_breakdown = len(hypo_ids) * [[(0.0, 1.0)]]
            hypo.score_breakdown[0] = [(score, 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def _oov_to_unk(self, seq, vocab_size, unk_idx):
        """Return a copy of ``seq`` in which every id that is not smaller
        than ``vocab_size`` is replaced by ``unk_idx``.
        """
        return [x if x < vocab_size else unk_idx for x in seq]

    def has_predictors(self):
        """Always returns true. """
        return True
class BleuValidator(SimpleExtension, SamplingBase):
    """Implements early stopping based on BLEU score.

    The validation set is decoded with beam search, the best hypothesis
    per line is piped to an external BLEU script, and a checkpoint is
    written whenever the score beats the worst of the tracked models.
    """
    # TODO: a lot has been changed in NMT, sync respectively

    def __init__(self, source_sentence, samples, model, data_stream,
                 config, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        """Store the sampling graph handles and prepare BLEU tracking.

        ``config`` is read for the keys ``val_set_out`` (optional),
        ``bleu_script``, ``val_set_grndtruth``, ``saveto`` and ``reload``
        here; further keys are read during validation.
        """
        # TODO: change config structure
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Path of the translation dump, or None to disable it.
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['val_set_grndtruth'], '<']

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                # _save_model evicts best_models[0] and therefore needs the
                # list in ascending order (worst model first).
                self.best_models.sort(key=operator.attrgetter('bleu_score'))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file only means
                # that tracking starts from scratch.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Run validation and checkpointing once the burn-in is over."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Decode the validation stream and return the BLEU score parsed
        from the first output line of the external script.

        Raises:
            AssertionError: if that line does not start with ``BLEU = ``.
        """
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        sources = self._get_attr_rec(self.main_loop, 'data_stream')
        trg_vocab = sources.data_streams[1].dataset.dictionary
        self.trg_ivocab = {v: k for k, v in trg_vocab.items()}
        trg_eos_sym = sources.data_streams[1].dataset.eos_token
        self.trg_eos_idx = trg_vocab[trg_eos_sym]

        ftrans = open(self.config['val_set_out'], 'w') if self.verbose \
            else None
        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file
                seq = self._oov_to_unk(
                    line[0], self.config['src_vocab_size'], self.unk_idx)
                input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = \
                    self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3*len(seq), eol_symbol=self.trg_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words
                        trans_out = self._idx_to_word(trans_out,
                                                      self.trg_ivocab)

                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i+1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        if ftrans is not None:
                            print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logger.info("Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()
        finally:
            # Do not leak the dump file if decoding fails half-way.
            if ftrans is not None:
                ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if nothing is tracked yet or ``bleu_score`` beats
        the lowest tracked score.
        """
        if not self.best_models or min(
                self.best_models,
                key=operator.attrgetter('bleu_score')).bleu_score < bleu_score:
            return True
        return False

    def _save_model(self, bleu_score):
        """Checkpoint the current parameters if ``bleu_score`` qualifies,
        evicting the worst tracked model when the list is full, and store
        the BLEU curve next to the checkpoints.
        """
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))

            # Save the model here; SIGINT is ignored while writing so the
            # checkpoint files are not cut off by Ctrl-C.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(
                model.path, **self.main_loop.model.get_parameter_dict())
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
class BleuValidator(SimpleExtension):
    """Implements early stopping based on BLEU score. This class is
    still very similar to the ``BleuValidator`` in the NMT Blocks
    example.

    TODO: Refactor, make this more similar to the rest of SGNMT, use
    vanilla_decoder.py
    """

    def __init__(self,
                 source_sentence,
                 samples,
                 model,
                 data_stream,
                 config,
                 n_best=1,
                 track_n_models=1,
                 normalize=True,
                 store_full_main_loop=False,
                 **kwargs):
        """Creates a new extension which adds model selection based on
        the BLEU score to the training main loop.

        Args:
            source_sentence (Variable): Input variable to the sampling
                                        computation graph
            samples (Variable): Samples variable of the CG
            model (NMTModel): See the model module
            data_stream (DataStream): Data stream to the development
                                      set
            config (dict): NMT configuration
            n_best (int): beam size
            track_n_models (int): Number of n-best models for which to
                                  create checkpoints.
            normalize (boolean): Enables length normalization
            store_full_main_loop (boolean): Stores the iteration state
                                            in the old style of
                                            Blocks 0.1. Not recommended
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.store_full_main_loop = store_full_main_loop
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        self.best_models = []
        self.val_bleu_curve = []
        # config['bleu_script'] is a %-template that receives the path of
        # the reference file; the result is split into an argv list.
        self.multibleu_cmd = (self.config['bleu_script'] %
                              self.config['val_set_grndtruth']).split()
        logging.debug("BLEU command: %s" % self.multibleu_cmd)

        self.src_sparse_feat_map = config['src_sparse_feat_map'] \
            if config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=samples)

        # Create saving directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(os.path.join(self.config['saveto'],
                                        'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()
                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                # _save_model evicts best_models[0] and therefore needs the
                # list in ascending order (worst model first).
                self.best_models.sort(key=operator.attrgetter('bleu_score'))
                logging.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file only means
                # that tracking starts from scratch.
                logging.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Decodes the dev set and stores checkpoints in case the BLEU
        score has improved.
        """
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Evaluate model and store checkpoints.

        Decodes every line of the development stream, pipes the best
        hypothesis (as space separated word ids) to the BLEU command and
        mirrors it to ``<saveto>/validation_out.txt``.

        Returns:
            float. BLEU score parsed from the first output line of the
            BLEU command.

        Raises:
            AssertionError: if that line does not start with ``BLEU = ``.
        """
        logging.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        with open(self.config['saveto'] + '/validation_out.txt',
                  'w') as ftrans:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                seq = self.src_sparse_feat_map.words2dense(utils.oov_to_unk(
                    line[0], self.config['src_vocab_size']))
                if self.src_sparse_feat_map.dim > 1:  # sparse src feats
                    input_ = numpy.transpose(
                        numpy.tile(seq, (self.config['beam_size'], 1, 1)),
                        (2, 0, 1))
                else:  # word ids on the source side
                    input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = \
                    self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3*len(seq), eol_symbol=utils.EOS_ID,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if self.normalize:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                nbest_idx = numpy.argsort(costs)[:self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        # Keep the hypothesis in its own variable: rebinding
                        # ``trans`` here would break every later iteration
                        # of this loop when n_best > 1.
                        best_trans = trans[best]
                        if best_trans and best_trans[-1] == utils.EOS_ID:
                            best_trans = best_trans[:-1]
                        trans_out = ' '.join([str(w) for w in best_trans])
                    except ValueError:
                        logging.info(
                            "Can NOT find a translation for line: {}".format(
                                i+1))
                        trans_out = '<UNK>'

                    if j == 0:
                        # Write to subprocess and file if it exists
                        print(trans_out, file=mb_subprocess.stdin)
                        print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logging.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logging.info(
                "Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logging.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logging.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logging.info(bleu_score)
        mb_subprocess.terminate()
        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if nothing is tracked yet or ``bleu_score`` beats
        the lowest tracked score.
        """
        if not self.best_models or min(
                self.best_models,
                key=operator.attrgetter('bleu_score')).bleu_score < bleu_score:
            return True
        return False

    def save_parameter_values(self, param_values, path):
        ''' This method is copied from
        blocks.machine_translation.checkpoint

        Stores ``param_values`` with ``numpy.savez`` under ``path`` after
        replacing every "/" in the parameter names by "-".
        '''
        param_values = {name.replace("/", "-"): param
                        for name, param in param_values.items()}
        numpy.savez(path, **param_values)

    def _save_model(self, bleu_score):
        """Checkpoint the current parameters if ``bleu_score`` qualifies,
        evicting the worst tracked model when the list is full, and store
        the BLEU curve next to the checkpoints.
        """
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logging.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('bleu_score'))

            # Save the model here; SIGINT is ignored while writing so the
            # checkpoint files are not cut off by Ctrl-C.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            # fs439: introduce store_full_main_loop and
            # storing best_bleu_params_* files
            if self.store_full_main_loop:
                logging.info(
                    "Saving full main loop model {}".format(model.path))
                numpy.savez(model.path,
                            **self.main_loop.model.get_parameter_dict())
            else:
                logging.info(
                    "Saving model parameters {}".format(model.path))
                params_to_save = self.main_loop.model.get_parameter_values()
                self.save_parameter_values(params_to_save, model.path)
            numpy.savez(
                os.path.join(self.config['saveto'], 'val_bleu_scores.npz'),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
class BleuValidator(SimpleExtension, SamplingBase):
    """Implements early stopping based on BLEU score.

    The validation set is decoded with beam search, the best hypothesis
    per line is piped to an external BLEU script, and a checkpoint is
    written whenever the score beats the worst of the tracked models.
    """

    def __init__(
        self,
        source_sentence,
        samples,
        model,
        data_stream,
        config,
        n_best=1,
        track_n_models=1,
        trg_ivocab=None,
        src_eos_idx=-1,
        trg_eos_idx=-1,
        **kwargs
    ):
        """Store the sampling graph handles and prepare BLEU tracking.

        ``trg_ivocab`` may be left as None, in which case it is derived
        from the main loop's data stream on the first validation run.
        ``src_eos_idx`` overwrites the last source token of every
        validation line; ``trg_eos_idx`` is the end-of-line symbol handed
        to the beam search.
        """
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        # Path of the translation dump, or None to disable it.
        self.verbose = config.get("val_set_out", None)
        self.src_eos_idx = src_eos_idx
        self.trg_eos_idx = trg_eos_idx

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        self.trg_ivocab = trg_ivocab
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.src_eos_idx  # self.vocab[self.eos_sym]
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(beam_size=self.config["beam_size"],
                                      samples=samples)
        self.multibleu_cmd = ["perl", self.config["bleu_script"],
                              self.config["val_set_grndtruth"], "<"]

        # Create saving directory if it does not exist
        if not os.path.exists(self.config["saveto"]):
            os.makedirs(self.config["saveto"])

        if self.config["reload"]:
            try:
                bleu_score = numpy.load(os.path.join(self.config["saveto"],
                                                     "val_bleu_scores.npz"))
                self.val_bleu_curve = bleu_score["bleu_scores"].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu))
                # _save_model evicts best_models[0] and therefore needs the
                # list in ascending order (worst model first).
                self.best_models.sort(key=operator.attrgetter("bleu_score"))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Best effort: a missing or unreadable score file only means
                # that tracking starts from scratch.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Run validation and checkpointing once the burn-in is over."""
        # Track validation burn in
        if self.main_loop.status["iterations_done"] <= \
                self.config["val_burn_in"]:
            return

        # Get current model parameters
        self.model.set_param_values(self.main_loop.model.get_param_values())

        # Evaluate and save if necessary
        self._save_model(self._evaluate_model())

    def _evaluate_model(self):
        """Decode the validation stream and return the BLEU score parsed
        from the first output line of the external script.

        Raises:
            AssertionError: if that line does not start with ``BLEU = ``.
        """
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        total_cost = 0.0

        # Get target vocabulary
        if not self.trg_ivocab:
            sources = self._get_attr_rec(self.main_loop, "data_stream")
            trg_vocab = sources.data_streams[1].dataset.dictionary
            self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

        ftrans = open(self.config["val_set_out"], "w") if self.verbose \
            else None
        try:
            for i, line in enumerate(self.data_stream.get_epoch_iterator()):
                # Load the sentence, retrieve the sample, write to file.
                # The last source token is overwritten in place with the
                # configured source EOS index.
                line[0][-1] = self.src_eos_idx
                seq = self._oov_to_unk(line[0])
                input_ = numpy.tile(seq, (self.config["beam_size"], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back
                trans, costs = self.beam_search.search(
                    input_values={self.source_sentence: input_},
                    max_length=3 * len(seq),
                    eol_symbol=self.trg_eos_idx,
                    ignore_first_eol=True,
                )

                nbest_idx = numpy.argsort(costs)[: self.n_best]
                for j, best in enumerate(nbest_idx):
                    try:
                        total_cost += costs[best]
                        trans_out = trans[best]

                        # convert idx to words (last symbol is dropped)
                        trans_out = self._idx_to_word(trans_out[:-1],
                                                      self.trg_ivocab)

                    except ValueError:
                        logger.info(
                            "Can NOT find a translation for line: {}".format(
                                i + 1))
                        trans_out = "<UNK>"

                    if j == 0:
                        # Write to subprocess and file if it exists
                        mb_subprocess.stdin.write(trans_out + "\n")
                        if ftrans is not None:
                            ftrans.write(trans_out + "\n")

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of validation set...".format(i))

                mb_subprocess.stdin.flush()

            logger.info("Total cost of the validation: {}".format(total_cost))
            self.data_stream.reset()
        finally:
            # Do not leak the dump file if decoding fails half-way.
            if ftrans is not None:
                ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r"BLEU = [-.0-9]+", stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.0))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        self.val_bleu_curve.append(bleu_score)
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if nothing is tracked yet or ``bleu_score`` beats
        the lowest tracked score.
        """
        if not self.best_models or min(
                self.best_models,
                key=operator.attrgetter("bleu_score")).bleu_score < bleu_score:
            return True
        return False

    def _save_model(self, bleu_score):
        """Checkpoint the current parameters if ``bleu_score`` qualifies,
        evicting the worst tracked model when the list is full, and store
        the BLEU curve next to the checkpoints.
        """
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config["saveto"])

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                old_model = self.best_models[0]
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter("bleu_score"))

            # Save the model here; SIGINT is ignored while writing so the
            # checkpoint files are not cut off by Ctrl-C.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            logger.info("Saving new model {}".format(model.path))
            numpy.savez(model.path, **self.main_loop.model.get_param_values())
            numpy.savez(
                os.path.join(self.config["saveto"], "val_bleu_scores.npz"),
                bleu_scores=self.val_bleu_curve)
            signal.signal(signal.SIGINT, s)
class BleuTester(TrainingExtension, SamplingBase):
    # TODO: a lot has been changed in NMT, sync respectively
    """Implements Testing BLEU score.

    Decodes the test data stream with beam search, pipes the best
    hypothesis of every line to an external Perl BLEU script
    (``config['bleu_script']``) and logs the resulting score. Unlike a
    validator, nothing is checkpointed.
    """

    def __init__(self, source_char_seq, source_sample_matrix, source_char_aux,
                 source_word_mask, samples, model, data_stream,
                 config, testing_model, n_best=1, track_n_models=1,
                 normalize=True, **kwargs):
        """Store the sampling graph inputs and build the beam search.

        The four ``source_*`` arguments are used as keys of the
        ``input_values`` dict handed to the beam search. ``testing_model``
        is used as the directory into which the translations are written
        (joined with ``config['test_set_out']``). ``normalize`` enables
        dividing hypothesis costs by their lengths.
        """
        # TODO: change config structure
        super(BleuTester, self).__init__(**kwargs)
        self.source_char_seq = source_char_seq
        self.source_sample_matrix = source_sample_matrix
        self.source_char_aux = source_char_aux
        self.source_word_mask = source_word_mask
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.testing_model = testing_model
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Translations are always written to a file in this class.
        self.verbose = True

        # Helpers
        self.vocab = data_stream.dataset.dictionary
        # Inverse source vocabulary (index -> symbol), used for printing.
        self.src_ivocab = {v: k for k, v in self.vocab.items()}
        self.unk_sym = data_stream.dataset.unk_token
        self.eos_sym = data_stream.dataset.eos_token
        self.unk_idx = self.vocab[self.unk_sym]
        self.eos_idx = self.vocab[self.eos_sym]
        self.beam_search = BeamSearch(samples=samples)
        # argv of the BLEU subprocess; the trailing '<' is passed as a
        # literal argument because Popen is called without a shell.
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              self.config['test_set_grndtruth'], '<']

    def before_training(self):
        """Run the test-set evaluation once.

        NOTE(review): presumably invoked by the training main loop before
        the first iteration -- confirm against ``TrainingExtension``.
        """
        self._evaluate_model()

    def _evaluate_model(self):
        """Translate the test stream and return its BLEU score.

        Returns the float parsed from the first output line of the BLEU
        subprocess. Raises ``AssertionError`` if that line does not start
        with ``BLEU = ``.
        """
        logger.info("Started Test: ")
        test_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE, universal_newlines=True)
        total_cost = 0.0

        # Get target vocabulary
        trg_vocab = self.data_stream.trg_vocab
        self.trg_vocab = trg_vocab
        self.trg_ivocab = {v: k for k, v in trg_vocab.items()}
        trg_eos_sym = self.data_stream.eos_token
        self.trg_eos_idx = trg_vocab[trg_eos_sym]

        if self.verbose:
            ftrans = open(os.path.join(self.testing_model, self.config['test_set_out']), 'w')

        for i, line in enumerate(self.data_stream.get_epoch_iterator()):
            """ Load the sentence, retrieve the sample, write to file """

            # Replace out-of-vocabulary source ids by the unknown index.
            seq = self._oov_to_unk(
                line[0], self.config['src_vocab_size'], self.unk_idx)
            # NOTE(review): build_input_dict is not defined in this class;
            # it is expected to return a pair whose second element is a dict
            # with the four 'source_*' keys used below -- confirm.
            _, input_dict = self.build_input_dict(numpy.asarray(seq), self.vocab, self.config['beam_size'])

            # draw sample, checking to ensure we don't get an empty string back
            result = \
                self.beam_search.search(
                    input_values={self.source_char_seq: input_dict['source_char_seq'],
                                  self.source_sample_matrix: input_dict['source_sample_matrix'],
                                  self.source_word_mask: input_dict['source_word_mask'],
                                  self.source_char_aux: input_dict['source_char_aux']},
                    max_length=3 * len(seq), eol_symbol=self.trg_eos_idx, as_arrays=True,
                    ignore_first_eol=False)
            # NOTE(review): result_to_lists presumably converts the array
            # result into (list of hypotheses, costs) -- confirm.
            trans, costs = result_to_lists(result)

            # normalize costs according to the sequence lengths
            if self.normalize:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            # Indices of the n_best lowest-cost hypotheses, best first.
            nbest_idx = numpy.argsort(costs)[:self.n_best]
            for j, best in enumerate(nbest_idx):
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    try:
                        # Cut the hypothesis right before the first '</S>'.
                        sample_length = trans_out.index(self.trg_vocab['</S>'])
                    except ValueError:
                        # NOTE(review): without '</S>' the hypothesis is cut
                        # to the SOURCE length, not its own length -- confirm
                        # that this is intended.
                        sample_length = len(seq)
                    trans_out = trans_out[:sample_length]
                    trans_out = self._idx_to_word(trans_out, self.trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'

                # Only the best hypothesis is reported and scored.
                if j == 0:
                    # Write to subprocess and file if it exists
                    print("Line:", i)
                    print("Input : ", self._idx_to_word(line[0], self.src_ivocab))
                    print("Sample: ", trans_out)
                    print("Error:", costs[best])
                    print()
                    print(trans_out, file=mb_subprocess.stdin)
                    if self.verbose:
                        print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

            mb_subprocess.stdin.flush()

        logger.info("Total cost of the test: {}".format(total_cost))
        self.data_stream.reset()
        if self.verbose:
            ftrans.close()

        # send end of file, read output.
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Test Took: {} minutes".format(
            float(time.time() - test_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        # The match starts with the 7 characters "BLEU = "; [6:] keeps the
        # leading space, which float() tolerates.
        bleu_score = float(out_parse.group()[6:])
        logger.info(bleu_score)
        mb_subprocess.terminate()

        return bleu_score
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config. A few that
    have non-trivial handling in the constructor:

    attention_type : str
        Either ``"content"`` or ``"content_and_conv"``; any other value
        raises ``ValueError``.
    lm : dict, optional
        When truthy, a language model is attached. The keys ``weight``,
        ``normalize_am_weights``, ``normalize_lm_weights``,
        ``normalize_tot_weights`` and ``am_beta`` are consumed here and
        the remaining entries are forwarded to ``LanguageModel``. The
        caller's dict is left untouched.
    subsample : list, optional
        Defaults to ``[1] * len(dims_bidir)``.
    dim_matcher : int, optional
        Defaults to ``dim_dec``.
    bottom_activation, post_merge_activation : brick, optional
        Default to ``Tanh()``.

    """
    def __init__(self, recordings_source, labels_source, eos_label,
                 num_features, num_phonemes, dim_dec, dims_bidir,
                 dims_bottom, enc_transition, dec_transition,
                 use_states_for_readout, attention_type,
                 lm=None, character_map=None,
                 subsample=None, dims_top=None,
                 prior=None, conv_n=None,
                 bottom_activation=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in
                 # SequenceContentAndConvAttention
                 energy_normalizer=None,
                 **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        # Truthiness test (rather than len()) so that dims_bottom=None is
        # handled the same way as in the branch above.
        encoder = Encoder(self.enc_transition, dims_bidir,
                          dims_bottom[-1] if dims_bottom else num_features,
                          subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            # NOTE(review): a single activation is supplied for
            # len(dims_top) + 1 layers -- confirm MLP accepts this.
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [
                self.dec_transition(
                    dim=dim_dec, activation=Tanh(),
                    name="transition_{}".format(trans_level))
                for trans_level in range(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")

        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) +
                    [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below. For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ], name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            # Work on a copy: the pops below must not strip keys from the
            # configuration dict owned by the caller.
            lm = dict(lm)
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights) < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob "
                    "normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        # The second entry is the mask *variable*; the source name string
        # is not a graph input.
        self.batch_inputs = [self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        """Push the optional recurrent/initial-state initializers down."""
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self, rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        """Return the generator's cost matrix for the given batch."""
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        """Run the generator over the encoded recordings (no input mask).

        The number of generation steps equals ``recordings.shape[0]``.
        """
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=recordings.shape[0],
            batch_size=recordings.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from `path` into the generation graph."""
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        """Build the generation graph on the batch recordings variable."""
        result = self.generate(self.recordings)
        return result

    def get_cost_graph(self, batch=True):
        """Build the cost graph.

        With ``batch=True`` the batch input variables are used. Otherwise
        the single-example variables are given a batch axis of size one,
        an all-ones recordings mask is used and no labels mask is passed.
        """
        if batch:
            return self.cost(
                self.recordings, self.recordings_mask,
                self.labels, self.labels_mask)
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(
            recordings, tensor.ones_like(recordings[:, :, 0]),
            labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and aligment for a recording/transcription pair."""
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            # Fall back to zeros when the graph exposes no "energies".
            energies_output = [
                energies[0][:, 0, :] if energies
                else tensor.zeros((self.single_transcription.shape[0],
                                   self.single_recording.shape[0]))]
            # Only used by the disabled CTC output below; the unpacking
            # still requires exactly one matching variable.
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(
                ComputationGraph(generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        """Run beam search on one recording.

        The search is compiled lazily with the stored ``beam_size`` and
        is limited to a third of the number of input frames.
        """
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        # Floor division keeps the length limit an integer on Python 3 too.
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_}, self.eos_label,
            input_.shape[0] // 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        """Return the instance dict without the compiled-function caches."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the instance dict and drop the emitter's cached RNG."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing to drop: the attribute chain or the cached RNG is
            # absent.
            pass
def main(mode, config, use_bokeh=False):
    """Train the RNN encoder-decoder or translate a test set with it.

    Parameters
    ----------
    mode : str
        ``"train"`` builds the cost graph, the extensions and runs the
        main loop; ``"translate"`` loads saved parameters and writes the
        best beam-search hypothesis for every line of
        ``config['test_set']`` to ``config['test_set'] + '.trans.out'``.
        Any other value only constructs the encoder and decoder.
    config : dict
        Experiment configuration; the keys read are visible in the body.
    use_bokeh : bool
        Add a Bokeh ``Plot`` extension when Bokeh is available.

    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            # list() so that the concatenations below also work when
            # .values() returns a view instead of a list.
            enc_params = list(Selector(encoder.lookup).get_params().values())
            enc_params += list(
                Selector(encoder.fwd_fork).get_params().values())
            enc_params += list(
                Selector(encoder.back_fork).get_params().values())
            dec_params = list(Selector(
                decoder.sequence_generator.readout).get_params().values())
            dec_params += list(Selector(
                decoder.sequence_generator.fork).get_params().values())
            dec_params += list(
                Selector(decoder.state_init).get_params().values())
            cg = apply_noise(
                cg, enc_params + dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            # !s: a tuple does not accept a width specification itself.
            logger.info('    {!s:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info(
                '    {!s:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        # NOTE(review): `cost` was built before dropout / weight noise were
        # applied to `cg`; confirm that training on it (here and in
        # GradientDescent below) is intended.
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # SECURITY NOTE: config['step_rule'] is passed to eval(); the
        # configuration must come from a trusted source.
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()])
        )

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        # SECURITY NOTE: the vocabulary is unpickled; only point
        # config['trg_vocab'] at trusted files.
        with open(config['trg_vocab']) as vocab_file:
            raw_trg_vocab = pickle.load(vocab_file)
        trg_vocab = _ensure_special_tokens(
            raw_trg_vocab, bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        # The context manager closes the output file even when
        # translation fails part-way through.
        with open(config['test_set'] + '.trans.out', 'w') as ftrans:
            for i, line in enumerate(test_stream.get_epoch_iterator()):

                seq = sutils._oov_to_unk(
                    line[0], config['src_vocab_size'], unk_idx)
                input_ = numpy.tile(seq, (config['beam_size'], 1))

                # draw sample, checking to ensure we don't get an empty
                # string back.  The search generates target symbols, so it
                # must stop on the target-side end-of-sequence index.
                trans, costs = \
                    beam_search.search(
                        input_values={sampling_input: input_},
                        max_length=3 * len(seq), eol_symbol=trg_eos_idx,
                        ignore_first_eol=True)

                # normalize costs according to the sequence lengths
                if config['normalized_bleu']:
                    lengths = numpy.array([len(s) for s in trans])
                    costs = costs / lengths

                best = numpy.argsort(costs)[0]
                try:
                    total_cost += costs[best]
                    trans_out = trans[best]

                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'

                print(trans_out, file=ftrans)

                if i != 0 and i % 100 == 0:
                    logger.info(
                        "Translated {} lines of test set...".format(i))

            logger.info("Total cost of the test: {}".format(total_cost))
class BeamSearchEvaluator(object):
    """Decode a data stream with beam search and report the error rate.

    Parameters
    ----------
    eol_symbol
        End-of-sequence symbol handed to the beam search.
    beam_size : int
        Beam width; every example is tiled this many times.
    x, x_mask
        Input variables used as keys of the beam-search input dict.
    samples
        Variable handed to the beam-search constructor.
    phoneme_dict : mapping, optional
        Maps output indices to the symbols that are compared and printed.
    black_list : list, optional
        Symbols dropped from predictions and targets; defaults to ``[]``.
    language_model : bool
        When true a ``TrigramLanguageModel`` backed ``BeamSearchLM`` is
        used instead of the plain ``BeamSearch``.

    """
    def __init__(self, eol_symbol, beam_size, x, x_mask, samples,
                 phoneme_dict=None, black_list=None, language_model=False):
        if black_list is None:
            self.black_list = []
        else:
            self.black_list = black_list
        self.x = x
        self.x_mask = x_mask
        self.eol_symbol = eol_symbol
        self.beam_size = beam_size
        if language_model:
            lm = TrigramLanguageModel()
            ind_to_word = dict(enumerate(lm.unigrams))
            self.beam_search = BeamSearchLM(
                lm, 1., ind_to_word, beam_size, samples)
        else:
            self.beam_search = BeamSearch(beam_size, samples)
        self.beam_search.compile()
        self.phoneme_dict = phoneme_dict

    def _to_symbols(self, indices):
        """Map indices to symbols, drop black-listed ones, trim, collapse.

        The first and last remaining symbols are cut off and runs of
        identical neighbours are reduced to one symbol.
        """
        symbols = [self.phoneme_dict[ind] for ind in indices]
        kept = [sym for sym in symbols if sym not in self.black_list][1:-1]
        return [key for key, _ in groupby(kept)]

    def evaluate(self, data_stream, train=False, file_pred=None,
                 file_targets=None):
        """Decode `data_stream` and return ``{'per': error_rate}``.

        Parameters
        ----------
        data_stream
            Stream providing the sources ``features``, ``features_mask``,
            ``phonemes`` and ``phonemes_mask``.
        train : bool
            When true only the first batch is evaluated.
        file_pred, file_targets : file-like, optional
            When given, every prediction / target is written as one line
            followed by a running ``(index)``.

        Raises
        ------
        ValueError
            If the stream yields no examples.

        """
        loss = 0.
        num_examples = 0
        iterator = data_stream.get_epoch_iterator()
        if train:
            print('Train evaluation started')
        i = 0
        for inputs in iterator:
            inputs = dict(zip(data_stream.sources, inputs))
            x_mask_val = inputs['features_mask']
            x_val = inputs['features']
            y_val = inputs['phonemes']
            y_mask_val = inputs['phonemes_mask']
            for batch_ind in range(inputs['features'].shape[1]):
                # Replicate the single example along the batch axis so it
                # fills the whole beam.
                if x_val.ndim == 2:
                    input_beam = numpy.tile(x_val[:, batch_ind][:, None],
                                            (1, self.beam_size))
                else:
                    input_beam = numpy.tile(
                        x_val[:, batch_ind, :][:, None, :],
                        (1, self.beam_size, 1))
                input_mask_beam = numpy.tile(
                    x_mask_val[:, batch_ind][:, None],
                    (1, self.beam_size))
                predictions, _ = self.beam_search.search(
                    {self.x: input_beam,
                     self.x_mask: input_mask_beam},
                    self.eol_symbol, 100)
                predictions = self._to_symbols(predictions[0])

                # The mask may be floating point; a slice bound has to be
                # an integer.
                target_length = int(y_mask_val[:, batch_ind].sum())
                targets = self._to_symbols(
                    y_val[:target_length, batch_ind])

                i += 1
                if file_pred:
                    file_pred.write(' '.join(predictions) + '(%d)\n' % i)
                if file_targets:
                    file_targets.write(' '.join(targets) + '(%d)\n' % i)

                loss += Evaluation.wer([predictions], [targets])
                num_examples += 1

            # Single-argument print calls produce the same output under
            # Python 2 and Python 3.
            print('.. found sequence example: ' + ' '.join(predictions))
            print('.. real output was:  ' + ' '.join(targets))
            if train:
                break
        if train:
            print('Train evaluation finished')
        if num_examples == 0:
            raise ValueError("data stream yielded no examples to evaluate")
        per = loss.sum() / num_examples
        return {'per': per}
class SpeechRecognizer(Initializable):
    """Encapsulate all reusable logic.

    This class plays a few roles: (a) it's a top brick that knows
    how to combine bottom, bidirectional and recognizer network, (b)
    it has the inputs variables and can build whole computation graphs
    starting with them (c) it hides compilation of Theano functions
    and initialization of beam search. I find it simpler to have it all
    in one place for research code.

    Parameters
    ----------
    All defining the structure and the dimensions of the model. Typically
    receives everything from the "net" section of the config. A few that
    have non-trivial handling in the constructor:

    attention_type : str
        Either ``"content"`` or ``"content_and_conv"``; any other value
        raises ``ValueError``.
    lm : dict, optional
        When truthy, a language model is attached. The keys ``weight``,
        ``normalize_am_weights``, ``normalize_lm_weights``,
        ``normalize_tot_weights`` and ``am_beta`` are consumed here and
        the remaining entries are forwarded to ``LanguageModel``. The
        caller's dict is left untouched.
    subsample : list, optional
        Defaults to ``[1] * len(dims_bidir)``.
    dim_matcher : int, optional
        Defaults to ``dim_dec``.
    bottom_activation, post_merge_activation : brick, optional
        Default to ``Tanh()``.

    """
    def __init__(self, recordings_source, labels_source, eos_label,
                 num_features, num_phonemes, dim_dec, dims_bidir,
                 dims_bottom, enc_transition, dec_transition,
                 use_states_for_readout, attention_type,
                 lm=None, character_map=None,
                 subsample=None, dims_top=None,
                 prior=None, conv_n=None,
                 bottom_activation=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in
                 # SequenceContentAndConvAttention
                 energy_normalizer=None,
                 **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        # Truthiness test (rather than len()) so that dims_bottom=None is
        # handled the same way as in the branch above.
        encoder = Encoder(self.enc_transition, dims_bidir,
                          dims_bottom[-1] if dims_bottom else num_features,
                          subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            # NOTE(review): a single activation is supplied for
            # len(dims_top) + 1 layers -- confirm MLP accepts this.
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            transitions = [
                self.dec_transition(
                    dim=dim_dec, activation=Tanh(),
                    name="transition_{}".format(trans_level))
                for trans_level in range(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")

        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) +
                    [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below. For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ], name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            # Work on a copy: the pops below must not strip keys from the
            # configuration dict owned by the caller.
            lm = dict(lm)
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights) < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob "
                    "normalization")
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            language_model=language_model,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        # The second entry is the mask *variable*; the source name string
        # is not a graph input.
        self.batch_inputs = [self.recordings, self.recordings_mask,
                             self.labels, self.labels_mask]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)

    def push_initialization_config(self):
        """Push the optional recurrent/initial-state initializers down."""
        super(SpeechRecognizer, self).push_initialization_config()
        if self.rec_weights_init:
            rec_weights_config = {
                'weights_init': self.rec_weights_init,
                'recurrent_weights_init': self.rec_weights_init}
            global_push_initialization_config(self, rec_weights_config,
                                              BaseRecurrent)
        if self.initial_states_init:
            global_push_initialization_config(
                self, {'initial_states_init': self.initial_states_init})

    @application
    def cost(self, recordings, recordings_mask, labels, labels_mask):
        """Return the generator's cost matrix for the given batch."""
        bottom_processed = self.bottom.apply(recordings)
        encoded, encoded_mask = self.encoder.apply(
            input_=bottom_processed,
            mask=recordings_mask)
        encoded = self.top.apply(encoded)
        return self.generator.cost_matrix(
            labels, labels_mask,
            attended=encoded, attended_mask=encoded_mask)

    @application
    def generate(self, recordings):
        """Run the generator over the encoded recordings (no input mask).

        The number of generation steps equals ``recordings.shape[0]``.
        """
        encoded, encoded_mask = self.encoder.apply(
            input_=self.bottom.apply(recordings))
        encoded = self.top.apply(encoded)
        return self.generator.generate(
            n_steps=recordings.shape[0],
            batch_size=recordings.shape[1],
            attended=encoded,
            attended_mask=encoded_mask,
            as_dict=True)

    def load_params(self, path):
        """Load parameter values from `path` into the generation graph."""
        generated = self.get_generate_graph()
        param_values = load_parameter_values(path)
        SpeechModel(generated['outputs']).set_parameter_values(param_values)

    def get_generate_graph(self):
        """Build the generation graph on the batch recordings variable."""
        result = self.generate(self.recordings)
        return result

    def get_cost_graph(self, batch=True):
        """Build the cost graph.

        With ``batch=True`` the batch input variables are used. Otherwise
        the single-example variables are given a batch axis of size one,
        an all-ones recordings mask is used and no labels mask is passed.
        """
        if batch:
            return self.cost(
                self.recordings, self.recordings_mask,
                self.labels, self.labels_mask)
        recordings = self.single_recording[:, None, :]
        labels = self.single_transcription[:, None]
        return self.cost(
            recordings, tensor.ones_like(recordings[:, :, 0]),
            labels, None)

    def analyze(self, recording, transcription):
        """Compute cost and aligment for a recording/transcription pair."""
        if not hasattr(self, "_analyze"):
            cost = self.get_cost_graph(batch=False)
            cg = ComputationGraph(cost)
            energies = VariableFilter(
                bricks=[self.generator], name="energies")(cg)
            # Fall back to zeros when the graph exposes no "energies".
            energies_output = [
                energies[0][:, 0, :] if energies
                else tensor.zeros((self.single_transcription.shape[0],
                                   self.single_recording.shape[0]))]
            # Only used by the disabled CTC output below; the unpacking
            # still requires exactly one matching variable.
            states, = VariableFilter(
                applications=[self.encoder.apply], roles=[OUTPUT],
                name="encoded")(cg)
            ctc_matrix_output = []
            # Temporarily disabled for compatibility with LM code
            # if len(self.generator.readout.source_names) == 1:
            #    ctc_matrix_output = [
            #        self.generator.readout.readout(weighted_averages=states)[:, 0, :]]
            weights, = VariableFilter(
                bricks=[self.generator], name="weights")(cg)
            self._analyze = theano.function(
                [self.single_recording, self.single_transcription],
                [cost[:, 0], weights[:, 0, :]] + energies_output +
                ctc_matrix_output)
        return self._analyze(recording, transcription)

    def init_beam_search(self, beam_size):
        """Compile beam search and set the beam size.

        See Blocks issue #500.

        """
        self.beam_size = beam_size
        generated = self.get_generate_graph()
        samples, = VariableFilter(
            applications=[self.generator.generate], name="outputs")(
                ComputationGraph(generated['outputs']))
        self._beam_search = BeamSearch(beam_size, samples)
        self._beam_search.compile()

    def beam_search(self, recording, char_discount=0.0):
        """Run beam search on one recording.

        The search is compiled lazily with the stored ``beam_size`` and
        is limited to a third of the number of input frames.
        """
        if not hasattr(self, '_beam_search'):
            self.init_beam_search(self.beam_size)
        input_ = recording[:, numpy.newaxis, :]
        # Floor division keeps the length limit an integer on Python 3 too.
        outputs, search_costs = self._beam_search.search(
            {self.recordings: input_}, self.eos_label,
            input_.shape[0] // 3,
            ignore_first_eol=self.data_prepend_eos,
            char_discount=char_discount)
        return outputs, search_costs

    def __getstate__(self):
        """Return the instance dict without the compiled-function caches."""
        state = dict(self.__dict__)
        for attr in ['_analyze', '_beam_search']:
            state.pop(attr, None)
        return state

    def __setstate__(self, state):
        """Restore the instance dict and drop the emitter's cached RNG."""
        self.__dict__.update(state)
        # To use bricks used on a GPU first on a CPU later
        try:
            emitter = self.generator.readout.emitter
            del emitter._theano_rng
        except AttributeError:
            # Nothing to drop: the attribute chain or the cached RNG is
            # absent.
            pass
class BleuValidator(SimpleExtension, SamplingBase):
    """Implements early stopping based on BLEU score.

    On each callback (after the burn-in period) the validation stream is
    decoded with beam search, the 1-best output of every line is piped to
    an external BLEU script, and the model parameters are saved when the
    score qualifies for the tracked n-best list.

    Configuration keys read from ``config``: ``bleu_script``,
    ``val_set_grndtruth``, ``saveto``, ``reload``, ``val_burn_in``,
    ``src_vocab_size``, ``beam_size`` and, optionally, ``val_set_out``.
    """

    def __init__(self, source_sentence, target_prefix, samples, model,
                 data_stream, config, src_vocab=None, trg_vocab=None,
                 n_best=1, track_n_models=1, normalize=True, **kwargs):
        super(BleuValidator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.target_prefix = target_prefix
        self.src_vocab = src_vocab
        self.trg_vocab = trg_vocab
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.track_n_models = track_n_models
        self.normalize = normalize
        # Doubles as a flag: a truthy output path enables writing the
        # translations to that file.
        self.verbose = config.get('val_set_out', None)

        # Helpers
        self.best_models = []
        self.val_bleu_curve = []
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = [
            'perl', self.config['bleu_script'],
            self.config['val_set_grndtruth'], '<'
        ]

        # Create save directory if it does not exist
        if not os.path.exists(self.config['saveto']):
            os.makedirs(self.config['saveto'])

        if self.config['reload']:
            try:
                bleu_score = numpy.load(
                    os.path.join(self.config['saveto'],
                                 'val_bleu_scores.npz'))
                self.val_bleu_curve = bleu_score['bleu_scores'].tolist()

                # Track n best previous bleu scores
                for i, bleu in enumerate(
                        sorted(self.val_bleu_curve, reverse=True)):
                    if i < self.track_n_models:
                        self.best_models.append(ModelInfo(bleu, key='BLEU'))
                logger.info("BleuScores Reloaded")
            except Exception:
                # Reloading is best effort, but KeyboardInterrupt and
                # SystemExit must still propagate.
                logger.info("BleuScores not Found")

    def do(self, which_callback, *args):
        """Validate, log the BLEU score and save the model if it qualifies."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= self.config[
                'val_burn_in']:
            return

        # Evaluate the model
        bleu_score = self._evaluate_model()
        # add an entry to the log
        self.main_loop.log.current_row[
            'validation_set_bleu_score'] = bleu_score
        # save if necessary
        self._save_model(bleu_score)

    def _evaluate_model(self):
        """Decode the validation stream and return its BLEU score.

        Raises ``AssertionError`` when the first line printed by the BLEU
        script does not start with ``BLEU = <number>``.
        """
        # Set in the superclass -- SamplingBase
        if not hasattr(self, 'target_dataset'):
            self._initialize_dataset_info()

        # self.unk_sym = self.target_dataset.unk_token
        # self.eos_sym = self.target_dataset.eos_token
        self.unk_sym = '<UNK>'
        self.eos_sym = '</S>'
        self.unk_idx = self.trg_vocab[self.unk_sym]
        self.eos_idx = self.trg_vocab[self.eos_sym]

        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        try:
            total_cost = 0.0

            ftrans = None
            if self.verbose:
                ftrans = open(self.config['val_set_out'], 'w')
            try:
                # NOTE: this walks the whole stream once just to count it.
                print('LENGTH OF DEV STREAM: {}'.format(
                    len(list(self.data_stream.get_epoch_iterator()))))

                for i, line in enumerate(
                        self.data_stream.get_epoch_iterator()):
                    # Load the sentence, retrieve the sample, write to file

                    # Note that the indices of source and target in the
                    # datastream are hard-coded; currently our datastream
                    # is (source,target,prefix,suffix)
                    seq = self._oov_to_unk(line[0],
                                           self.config['src_vocab_size'],
                                           self.unk_idx)
                    target_prefix = line[2]

                    input_ = numpy.tile(seq, (self.config['beam_size'], 1))
                    prefix_input_ = numpy.tile(
                        target_prefix, (self.config['beam_size'], 1))

                    # draw sample, checking to ensure we don't get an
                    # empty string back
                    # (keyword names are those of ``BeamSearch.search``)
                    trans, costs = self.beam_search.search(
                        input_values={
                            self.source_sentence: input_,
                            self.target_prefix: prefix_input_
                        },
                        max_length=3 * len(seq),
                        eol_symbol=self.eos_idx,
                        ignore_first_eol=False)

                    # normalize costs according to the sequence lengths
                    if self.normalize:
                        lengths = numpy.array([len(s) for s in trans])
                        costs = costs / lengths

                    nbest_idx = numpy.argsort(costs)[:self.n_best]
                    for j, best in enumerate(nbest_idx):
                        try:
                            total_cost += costs[best]
                            trans_out = trans[best]

                            # convert idx to words
                            #print('input_seq: {}'.format(seq))
                            #print('input_prefix: {}'.format(target_prefix))
                            #print('trans_out_raw: {}'.format(trans_out))
                            trans_out = self._idx_to_word(trans_out,
                                                          self.trg_ivocab)
                            #print('trans_out_text: {}'.format(trans_out))

                        except ValueError:
                            logger.info(
                                "Can NOT find a translation for line: {}"
                                .format(i + 1))
                            trans_out = '<UNK>'

                        if j == 0:
                            # Write to subprocess and file if it exists
                            print(trans_out, file=mb_subprocess.stdin)
                            if self.verbose:
                                print(trans_out, file=ftrans)

                    if i != 0 and i % 100 == 0:
                        logger.info(
                            "Translated {} lines of validation set..."
                            .format(i))

                    mb_subprocess.stdin.flush()

                logger.info(
                    "Total cost of the validation: {}".format(total_cost))
                self.data_stream.reset()
            finally:
                # Close the translation file even when decoding fails.
                if ftrans is not None:
                    ftrans.close()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            logger.info(stdout)
            out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
            logger.info("Validation Took: {} minutes".format(
                float(time.time() - val_start_time) / 60.))
            assert out_parse is not None

            # extract the score
            bleu_score = float(out_parse.group()[6:])
            self.val_bleu_curve.append(bleu_score)
            logger.info(bleu_score)
        finally:
            # Never leave the scoring process running, error or not.
            mb_subprocess.terminate()

        return bleu_score

    def _is_valid_to_save(self, bleu_score):
        """Return True if ``bleu_score`` beats the worst tracked model."""
        if not self.best_models:
            return True
        worst = min(self.best_models, key=operator.attrgetter('score'))
        return bool(worst.score < bleu_score)

    def _save_model(self, bleu_score):
        """Save the parameters and score curve if the score qualifies."""
        if self._is_valid_to_save(bleu_score):
            model = ModelInfo(bleu_score, self.config['saveto'], key='BLEU')

            # Manage n-best model list first
            if len(self.best_models) >= self.track_n_models:
                # Evict the lowest-scoring entry.  After a reload the
                # list is filled in descending order, so index 0 is not
                # guaranteed to be the worst; once sorted (below) the
                # minimum is the first element, as before.
                old_model = min(self.best_models,
                                key=operator.attrgetter('score'))
                if old_model.path and os.path.isfile(old_model.path):
                    logger.info("Deleting old model %s" % old_model.path)
                    os.remove(old_model.path)
                self.best_models.remove(old_model)

            self.best_models.append(model)
            self.best_models.sort(key=operator.attrgetter('score'))

            # Save the model here; ignore Ctrl-C while files are being
            # written and always restore the previous handler.
            s = signal.signal(signal.SIGINT, signal.SIG_IGN)
            try:
                logger.info("Saving new model {}".format(model.path))
                SaveLoadUtils.save_parameter_values(
                    self.main_loop.model.get_parameter_values(), model.path)
                numpy.savez(
                    os.path.join(self.config['saveto'],
                                 'val_bleu_scores.npz'),
                    bleu_scores=self.val_bleu_curve)
            finally:
                signal.signal(signal.SIGINT, s)
class BleuEvaluator(SimpleExtension, SamplingBase):
    """Decode a data stream with beam search and score it with BLEU.

    The 1-best translation of every line is piped to an external BLEU
    script (``config['bleu_script']``) that compares it against
    ``ground_truth``.  When ``val_out`` is given the translations are also
    written there, and copied to ``val_best_out`` whenever the score
    equals the best one seen so far.

    Configuration keys read from ``config``: ``unk_id``, ``eos_id``,
    ``bleu_script``, ``val_burn_in``, ``src_vocab_size``, ``beam_size``,
    ``remove_eos`` and ``remove_unk``.
    """

    def __init__(self, source_sentence, samples, model, data_stream,
                 ground_truth, config, val_out=None, val_best_out=None,
                 n_best=1, normalize=True, **kwargs):
        # TODO: change config structure
        super(BleuEvaluator, self).__init__(**kwargs)
        self.source_sentence = source_sentence
        self.samples = samples
        self.model = model
        self.data_stream = data_stream
        self.config = config
        self.n_best = n_best
        self.normalize = normalize
        self.val_out = val_out
        # A "best" copy only makes sense when there is an output file.
        self.val_best_out = val_out and val_best_out
        self.bleu_scores = []

        self.trg_ivocab = None
        self.unk_id = config['unk_id']
        self.eos_id = config['eos_id']
        self.beam_search = BeamSearch(samples=samples)
        self.multibleu_cmd = ['perl', self.config['bleu_script'],
                              ground_truth, '<']

    def do(self, which_callback, *args):
        """Run the evaluation once the burn-in period is over."""
        # Track validation burn in
        if self.main_loop.status['iterations_done'] <= \
                self.config['val_burn_in']:
            return

        self._evaluate_model()

    def _evaluate_model(self):
        """Decode the data stream and return its BLEU score.

        Raises ``AssertionError`` when the first line printed by the BLEU
        script does not start with ``BLEU = <number>``.
        """
        logger.info("Started Validation: ")
        val_start_time = time.time()
        mb_subprocess = Popen(self.multibleu_cmd, stdin=PIPE, stdout=PIPE)
        try:
            total_cost = 0.0

            # Build the inverse target vocabulary on first use.
            if self.trg_ivocab is None:
                sources = self._get_attr_rec(self.main_loop, 'data_stream')
                trg_vocab = sources.data_streams[1].dataset.dictionary
                self.trg_ivocab = {v: k for k, v in trg_vocab.items()}

            output_file = None
            if self.val_out:
                output_file = open(self.val_out, 'w')
            try:
                for i, line in enumerate(
                        self.data_stream.get_epoch_iterator()):
                    # Load the sentence, retrieve the sample, write to file

                    seq = self._oov_to_unk(
                        line[0], self.config['src_vocab_size'], self.unk_id)
                    input_ = numpy.tile(seq, (self.config['beam_size'], 1))

                    # draw sample, checking to ensure we don't get an
                    # empty string back
                    trans, costs = self.beam_search.search(
                        input_values={self.source_sentence: input_},
                        max_length=3 * len(seq), eol_symbol=self.eos_id,
                        ignore_first_eol=True)

                    # normalize costs according to the sequence lengths
                    if self.normalize:
                        lengths = numpy.array([len(s) for s in trans])
                        costs = costs / lengths

                    nbest_idx = numpy.argsort(costs)[:self.n_best]
                    for j, best in enumerate(nbest_idx):
                        try:
                            total_cost += costs[best]
                            trans_out = trans[best]

                            # keeping eos tokens reduces BLEU score
                            if self.config['remove_eos']:
                                trans_out = [idx for idx in trans_out
                                             if idx != self.eos_id]

                            # however keeping unk tokens might be a good
                            # idea (avoids brevity penalty)
                            if self.config['remove_unk']:
                                trans_out = [idx for idx in trans_out
                                             if idx != self.unk_id]

                            # convert idx to words
                            trans_out = self._idx_to_word(trans_out,
                                                          self.trg_ivocab)

                        except ValueError:
                            logger.info(
                                "Can NOT find a translation for line: {}"
                                .format(i + 1))
                            trans_out = '<UNK>'

                        if j == 0:
                            # Write to subprocess and file if it exists
                            print(trans_out, file=mb_subprocess.stdin)
                            if self.val_out:
                                print(trans_out, file=output_file)

                    if i != 0 and i % 100 == 0:
                        logger.info(
                            "Translated {} lines of validation set..."
                            .format(i))

                    mb_subprocess.stdin.flush()

                logger.info(
                    "Total cost of the validation: {}".format(total_cost))
                self.data_stream.reset()
            finally:
                # Close the translation file even when decoding fails.
                if output_file is not None:
                    output_file.close()

            # send end of file, read output.
            mb_subprocess.stdin.close()
            stdout = mb_subprocess.stdout.readline()
            logger.info(stdout)
            out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
            logger.info("Validation Took: {} minutes".format(
                float(time.time() - val_start_time) / 60.))
            assert out_parse is not None

            # extract the score
            bleu_score = float(out_parse.group()[6:])
            logger.info(bleu_score)
        finally:
            # Never leave the scoring process running, error or not.
            mb_subprocess.terminate()

        self.bleu_scores.append(bleu_score)
        if self.val_best_out and bleu_score == max(self.bleu_scores):
            shutil.copy(self.val_out, self.val_best_out)

        return bleu_score
class BlocksNMTVanillaDecoder(Decoder):
    """Adaptor class for blocks.search.BeamSearch. We implement the
    ``Decoder`` class but ignore functionality for predictors or
    heuristics. Instead, we pass through decoding directly to the
    blocks beam search module. This is fast, but breaks with the
    predictor framework. It can only be used for pure single system
    NMT decoding. Note that this decoder supports sparse feat maps
    on both source and target side.
    """

    def __init__(self, nmt_model_path, config, decoder_args):
        """Set up the NMT model used by the decoder.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
            config (dict): NMT configuration
            decoder_args (object): Decoder configuration passed through
                                   from configuration API.
        """
        super(BlocksNMTVanillaDecoder, self).__init__(decoder_args)
        self.config = config
        self.set_up_decoder(nmt_model_path)
        self.src_eos = self.src_sparse_feat_map.word2dense(utils.EOS_ID)

    def set_up_decoder(self, nmt_model_path):
        """This method uses the NMT configuration in ``self.config`` to
        initialize the NMT model. This method basically corresponds to
        ``blocks.machine_translation.main``.

        Args:
            nmt_model_path (string):  Path to the NMT model file (.npz)
        """
        self.nmt_model = NMTModel(self.config)
        self.nmt_model.set_up()
        loader = LoadNMTUtils(nmt_model_path,
                              self.config['saveto'],
                              self.nmt_model.search_model)
        loader.load_weights()

        self.src_sparse_feat_map = self.config['src_sparse_feat_map'] \
            if self.config['src_sparse_feat_map'] else FlatSparseFeatMap()
        if self.config['trg_sparse_feat_map']:
            self.trg_sparse_feat_map = self.config['trg_sparse_feat_map']
            self.beam_search = SparseBeamSearch(
                samples=self.nmt_model.samples,
                trg_sparse_feat_map=self.trg_sparse_feat_map)
        else:
            self.trg_sparse_feat_map = FlatSparseFeatMap()
            self.beam_search = BeamSearch(samples=self.nmt_model.samples)

    def decode(self, src_sentence):
        """Decodes a single source sentence with the original blocks
        beam search decoder. Does not use predictors. Note that the
        score breakdowns in returned hypotheses are only on the
        sentence level, not on the word level. For finer grained NMT
        scores you need to use the nmt predictor. ``src_sentence`` is a
        list of source word ids representing the source sentence without
        <S> or </S> symbols. As blocks expects to see </S>, this method
        adds it automatically.

        Args:
            src_sentence (list): List of source word ids without <S> or
                                 </S> which make up the source sentence

        Returns:
            list. A list of ``Hypothesis`` instances, one per sequence
            returned by the beam search, in the order the search
            returned them.
        """
        seq = self.src_sparse_feat_map.words2dense(
            utils.oov_to_unk(src_sentence,
                             self.config['src_vocab_size'])) + [self.src_eos]
        if self.src_sparse_feat_map.dim > 1:  # sparse src feats
            input_ = np.transpose(
                np.tile(seq, (self.config['beam_size'], 1, 1)),
                (2, 0, 1))
        else:  # word ids on the source side
            input_ = np.tile(seq, (self.config['beam_size'], 1))
        trans, costs = self.beam_search.search(
            input_values={self.nmt_model.sampling_input: input_},
            max_length=3 * len(src_sentence),
            eol_symbol=utils.EOS_ID,
            ignore_first_eol=True)
        hypos = []
        max_len = 0
        for tokens, cost in zip(trans, costs):
            max_len = max(max_len, len(tokens))
            hypo = Hypothesis(tokens, -cost)
            # One independent entry per position; ``n * [[...]]`` would
            # make every position alias the same inner list.
            hypo.score_breakdown = [[(0.0, 1.0)] for _ in tokens]
            # The whole sentence score is attributed to the first
            # position; an empty hypothesis has no position to carry it.
            if hypo.score_breakdown:
                hypo.score_breakdown[0] = [(-cost, 1.0)]
            hypos.append(hypo)
        self.apply_predictors_count = max_len * self.config['beam_size']
        return hypos

    def has_predictors(self):
        """Always returns true. """
        return True
# Load the saved parameter values into the search model.
params = search_model.get_parameter_dict()
param_values = SaveLoadUtils().load_parameter_values(
    os.path.join(config['saveto'], 'params.npz'))
for k in params:
    params[k].set_value(param_values[k])

# The filter is expected to match exactly two "outputs" variables; the
# second one is used as the beam-search samples.
_, samples = VariableFilter(bricks=[decoder.sequence_generator],
                            name="outputs")(ComputationGraph(generated[1]))
beam_search = BeamSearch(samples=samples)

# Read from standard input
stream = get_stdin_stream(**config)

vocab = get_vocab(config['trg_vocab'], config['trg_vocab_size'],
                  config['unk_id'], config['eos_id'], config['bos_id'])
# ``items()`` instead of the Python-2-only ``iteritems()``.
inv_vocab = {v: k for k, v in vocab.items()}

unk_id = config['unk_id']
eos_id = config['eos_id']

for sample in stream.get_epoch_iterator():
    seq = sample[0]
    # The same source sequence is repeated once per beam entry.
    input_ = np.tile(seq, (config['beam_size'], 1))

    trans, costs = beam_search.search(
        input_values={sampling_input: input_},
        max_length=3 * len(seq), eol_symbol=eos_id,
        ignore_first_eol=True)

    # Only the first returned sequence is printed.
    trans_indices = [idx for idx in trans[0]
                     if idx != eos_id]  # remove </S> from output
    trans_out = ' '.join(inv_vocab.get(idx, config['unk_token'])
                         for idx in trans_indices)
    # Single-argument call form: valid on Python 2 and Python 3 alike.
    print(trans_out)