def decode():
    # Load model config
    config = load_config(FLAGS)

    # Load source data to decode
    test_set = TextIterator(source=config['decode_input'],
                            batch_size=config['decode_batch_size'],
                            source_dict=config['source_vocabulary'],
                            maxlen=None,
                            n_words_source=config['num_encoder_symbols'])

    # Load inverse dictionary used in decoding
    target_inverse_dict = data_utils.load_inverse_dict(config['target_vocabulary'])

    # Initiate TF session
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:

        # Reload existing checkpoint
        model = load_model(sess, config)
        try:
            print('Decoding {}..'.format(FLAGS.decode_input))
            if FLAGS.write_n_best:
                fout = [data_utils.fopen('%s_%d' % (FLAGS.decode_output, k), 'w')
                        for k in range(FLAGS.beam_width)]
            else:
                fout = [data_utils.fopen(FLAGS.decode_output, 'w')]

            # TextIterator yields one source batch per iteration
            for idx, source_seq in enumerate(test_set):
                source, source_len = prepare_batch(source_seq)

                # predicted_ids: GreedyDecoder; [batch_size, max_time_step, 1]
                #                BeamSearchDecoder; [batch_size, max_time_step, beam_width]
                predicted_ids = model.predict(sess,
                                              encoder_inputs=source,
                                              encoder_inputs_length=source_len)

                # Write decoding results: one output file per beam if n-best requested
                for k, f in reversed(list(enumerate(fout))):
                    for seq in predicted_ids:
                        f.write(data_utils.seq2words(seq[:, k], target_inverse_dict) + '\n')
                    if not FLAGS.write_n_best:
                        break
                print('  {} lines decoded'.format((idx + 1) * FLAGS.decode_batch_size))

            print('Decoding terminated')
        except IOError:
            pass
        finally:
            for f in fout:
                f.close()
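# A minimal sketch of the prepare_batch helper called above, assuming it
# right-pads a list of token-id sequences to a common length and returns the
# padded batch plus the true lengths. The pad id of 0, the function name, and
# the numpy layout are illustrative assumptions, not the repository's code.
import numpy as np

def prepare_batch_sketch(seqs, pad_id=0):
    lengths = [len(s) for s in seqs]
    batch = np.full((len(seqs), max(lengths)), pad_id, dtype='int32')
    for i, s in enumerate(seqs):
        batch[i, :len(s)] = s  # copy the sequence, leaving padding after it
    return batch, np.asarray(lengths, dtype='int32')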
def __init__(self, source, source_dict,
             batch_size=128,
             n_words_source=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=False,
             maxibatch_size=20):
    if shuffle_each_epoch:
        # Shuffle the corpus on disk and read from the shuffled copy
        self.source_orig = source
        self.source = shuffle.main([self.source_orig], temporary=True)
    else:
        self.source = data_utils.fopen(source, 'r')
    self.source_dict = load_dict(source_dict)

    self.batch_size = batch_size
    self.skip_empty = skip_empty

    self.n_words_source = n_words_source
    if self.n_words_source > 0:
        # Restrict the source vocabulary to the first n_words_source ids;
        # iterate over a copy so entries can be deleted safely
        for key, idx in list(self.source_dict.items()):
            if idx >= self.n_words_source:
                del self.source_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.label_buffer = []
    # Read batch_size * maxibatch_size sentences at a time for sorting
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
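# A hedged sketch of the next() method that typically pairs with the
# __init__ above, assuming the usual maxibatch pattern: read up to self.k
# sentences into a buffer, optionally sort them by length, then serve
# batch_size sentences per call. The unk id of 1 and the whitespace
# tokenization are illustrative assumptions, not the repository's code.
def next(self):
    if not self.source_buffer:
        # Refill the maxibatch buffer with up to self.k sentences
        for _ in range(self.k):
            line = self.source.readline()
            if line == '':
                self.end_of_data = True
                break
            tokens = line.strip().split()
            if self.skip_empty and not tokens:
                continue
            self.source_buffer.append(tokens)
        if self.sort_by_length:
            # Batches then contain sentences of similar length
            self.source_buffer.sort(key=len)
    if not self.source_buffer:
        raise StopIteration
    batch = []
    while self.source_buffer and len(batch) < self.batch_size:
        tokens = self.source_buffer.pop()
        batch.append([self.source_dict.get(w, 1) for w in tokens])
    return batch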
def decode(config):
    model, config = load_model(config)

    # Load source data to decode
    test_set = TextIterator(source=config['decode_input'],
                            source_dict=config['src_vocab'],
                            batch_size=config['batch_size'],
                            n_words_source=config['num_enc_symbols'],
                            maxlen=None)
    target_inv_dict = load_inv_dict(config['tgt_vocab'])

    if use_cuda:
        print('Using gpu..')
        model = model.cuda()

    try:
        print('Decoding starts..')
        fout = fopen(config['decode_output'], 'w')
        for idx, source_seq in enumerate(test_set):
            source, source_len = prepare_batch(source_seq)

            # preds_prev holds the tokens fed to the decoder at each step,
            # starting from the start token; preds collects the outputs
            preds_prev = torch.zeros(config['batch_size'], config['max_decode_step']).long()
            preds_prev[:, 0] += data_utils.start_token
            preds = torch.zeros(config['batch_size'], config['max_decode_step']).long()

            if use_cuda:
                source = Variable(source.cuda())
                source_len = Variable(source_len.cuda())
                preds_prev = Variable(preds_prev.cuda())
                preds = preds.cuda()
            else:
                source = Variable(source)
                source_len = Variable(source_len)
                preds_prev = Variable(preds_prev)

            states, memories = model.encode(source, source_len)

            # Greedy step-by-step decoding: feed the argmax of step t back
            # in as the decoder input of step t + 1
            for t in range(config['max_decode_step']):
                # logits: [batch_size x max_decode_step, tgt_vocab_size]
                _, logits = model.decode(preds_prev, None, memories, keep_len=True)
                # outputs: [batch_size, max_decode_step]
                outputs = torch.max(logits, dim=1)[1].view(config['batch_size'], -1)
                preds[:, t] = outputs[:, t].data
                if t < config['max_decode_step'] - 1:
                    preds_prev[:, t + 1] = outputs[:, t]

            for i in range(len(preds)):
                fout.write(seq2words(preds[i], target_inv_dict) + '\n')
                fout.flush()
            print('  {} lines decoded'.format((idx + 1) * config['batch_size']))

        print('Decoding terminated')
    except IOError:
        pass
    finally:
        fout.close()
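# A minimal sketch of the seq2words helper used in the decoders above,
# assuming the inverse dictionary maps id -> token and that decoding stops
# at an end-of-sequence token; the end id of 2, the '<unk>' fallback, and
# the function name are assumptions for illustration.
def seq2words_sketch(seq, inverse_dict, end_id=2):
    words = []
    for token_id in seq:
        token_id = int(token_id)  # works for numpy scalars and torch elements
        if token_id == end_id:
            break
        words.append(inverse_dict.get(token_id, '<unk>'))
    return ' '.join(words)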
def decode():
    # Load model config
    config = load_config(FLAGS)
    print(config['source_vocabulary'])

    # Load source data to decode
    test_set = TextIterator(source=config['decode_input'],
                            batch_size=config['decode_batch_size'],
                            source_dict=config['source_vocabulary'])

    # Initiate TF session
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:

        # Reload existing checkpoint
        model = load_model(sess, config)
        try:
            if FLAGS.write_n_best:
                fout = [data_utils.fopen('%s_%d' % (FLAGS.decode_output, k), 'w')
                        for k in range(FLAGS.beam_width)]
            else:
                fout = [data_utils.fopen(FLAGS.decode_output, 'w')]

            for source_seq, label in test_set:
                # label = test_labels[idx]
                source, source_len = prepare_batch(source_seq,
                                                   batch_size=config['decode_batch_size'],
                                                   stride=config['max_seq_length'],
                                                   maxlen=config['max_seq_length'])

                # predicted_ids: GreedyDecoder; [batch_size, max_time_step, 1]
                #                BeamSearchDecoder; [batch_size, max_time_step, beam_width]
                predicted_ids = model.predict(sess,
                                              encoder_inputs=source,
                                              encoder_inputs_length=source_len)

                # Write decoding results: the source sequence followed by the
                # k-th beam hypothesis of every sequence in the batch
                for k, f in reversed(list(enumerate(fout))):
                    f.write(str(source_seq) + '\t\t\t')
                    res = [list(seq[:, k]) for seq in predicted_ids]
                    f.write(str(res) + '\n')
                    if not FLAGS.write_n_best:
                        break

            print('Decoding terminated')
        except IOError:
            pass
        finally:
            for f in fout:
                f.close()
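# A small self-contained illustration of the predicted_ids slicing used in
# both TF decoders above: a BeamSearchDecoder output has shape
# [batch_size, max_time_step, beam_width], so seq[:, k] selects the k-th
# beam hypothesis of one sequence. All shapes and values here are made up.
import numpy as np

batch_size, max_time_step, beam_width = 2, 4, 3
predicted_ids = np.arange(batch_size * max_time_step * beam_width).reshape(
    batch_size, max_time_step, beam_width)
for seq in predicted_ids:   # seq: [max_time_step, beam_width]
    print(seq[:, 0])        # token ids of the top-scoring (k = 0) beam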