def load_data(config):
    print >> sys.stderr, 'Reading data...',
    text_iterator = TextIterator(source=config.source_dataset,
                                 target=config.target_dataset,
                                 source_dicts=[config.source_vocab],
                                 target_dict=config.target_vocab,
                                 batch_size=config.batch_size,
                                 maxlen=config.maxlen,
                                 n_words_source=config.source_vocab_size,
                                 n_words_target=config.target_vocab_size,
                                 skip_empty=True,
                                 shuffle_each_epoch=config.shuffle_each_epoch,
                                 sort_by_length=config.sort_by_length,
                                 maxibatch_size=config.maxibatch_size,
                                 keep_data_in_memory=config.keep_train_set_in_memory)
    if config.validFreq:
        valid_text_iterator = TextIterator(source=config.valid_source_dataset,
                                           target=config.valid_target_dataset,
                                           source_dicts=[config.source_vocab],
                                           target_dict=config.target_vocab,
                                           batch_size=config.valid_batch_size,
                                           maxlen=config.validation_maxlen,
                                           n_words_source=config.source_vocab_size,
                                           n_words_target=config.target_vocab_size,
                                           shuffle_each_epoch=False,
                                           sort_by_length=True,
                                           maxibatch_size=config.maxibatch_size)
    else:
        valid_text_iterator = None
    print >> sys.stderr, 'Done'
    return text_iterator, valid_text_iterator
def main(model, dictionary, dictionary_target, source, target, context, outfile,
         wordbyword):
    # load model options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    valid_noshuf = TextIterator(source, target, context,
                                dictionary, dictionary_target,
                                n_words_source=options['n_words_src'],
                                n_words_target=options['n_words'],
                                batch_size=options['valid_batch_size'],
                                maxlen=2000,
                                shuffle=False,
                                tc=options['kwargs'].get('tc', False))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, xc, xc_mask, \
        opt_ret, \
        cost, cost_, xc_mask_2, xc_mask_3 = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask, xc, xc_mask, xc_mask_2, xc_mask_3]

    f_log_probs = theano.function(inps, cost, profile=profile)

    valid_errs = pred_probs(f_log_probs, prepare_data, options, valid_noshuf,
                            verbose=True)
    numpy.save(outfile, valid_errs)
def score_model(source_file, target_file, scorer_settings, options):
    scores = []
    for option in options:
        g = tf.Graph()
        with g.as_default():
            with tf.Session() as sess:
                model, saver = nmt.create_model(option, sess)
                text_iterator = TextIterator(
                    source=source_file.name,
                    target=target_file.name,
                    source_dicts=option.source_dicts,
                    target_dict=option.target_dict,
                    batch_size=scorer_settings.b,
                    maxlen=float('inf'),
                    source_vocab_sizes=option.source_vocab_sizes,
                    target_vocab_size=option.target_vocab_size,
                    use_factor=(option.factors > 1),
                    sort_by_length=False)
                losses = nmt.calc_loss_per_sentence(
                    option, sess, text_iterator, model,
                    normalization_alpha=scorer_settings.normalization_alpha)
                scores.append(losses)
    return scores
def score_model(source_file, target_file, scorer_settings, options):
    scores = []
    for option in options:
        with tf.Session() as sess:
            model, saver = create_model(option, sess)
            valid_text_iterator = TextIterator(
                source=source_file.name,
                target=target_file.name,
                source_dicts=option.source_dicts,
                target_dict=option.target_dict,
                batch_size=scorer_settings.b,
                maxlen=float('inf'),
                source_vocab_sizes=option.source_vocab_sizes,
                target_vocab_size=option.target_vocab_size,
                use_factor=(option.factors > 1),
                sort_by_length=False)
            score = validate(
                option, sess, valid_text_iterator, model,
                normalization_alpha=scorer_settings.normalization_alpha)
            scores.append(score)
    return scores
def score_model(source_file, target_file, scorer_settings, options):
    scores = []
    for option in options:
        g = tf.Graph()
        with g.as_default():
            tf_config = tf.ConfigProto()
            tf_config.allow_soft_placement = True
            with tf.Session(config=tf_config) as sess:
                logging.info('Building model...')
                model = rnn_model.RNNModel(option)
                saver = model_loader.init_or_restore_variables(option, sess)
                text_iterator = TextIterator(
                    source=source_file.name,
                    target=target_file.name,
                    source_dicts=option.source_dicts,
                    target_dict=option.target_dict,
                    batch_size=scorer_settings.minibatch_size,
                    maxlen=float('inf'),
                    source_vocab_sizes=option.source_vocab_sizes,
                    target_vocab_size=option.target_vocab_size,
                    use_factor=(option.factors > 1),
                    sort_by_length=False)
                losses = nmt.calc_loss_per_sentence(
                    option, sess, text_iterator, model,
                    normalization_alpha=scorer_settings.normalization_alpha)
                scores.append(losses)
    return scores
def load_data(config):
    logging.info('Reading data...')
    text_iterator = TextIterator(
        source=config.source_dataset,
        target=config.target_dataset,
        source_dicts=config.source_dicts,
        target_dict=config.target_dict,
        pretrain_dict=config.pretrain_vocab,
        model_type=config.model_type,
        batch_size=config.batch_size,
        maxlen=config.maxlen,
        source_vocab_sizes=config.source_vocab_sizes,
        target_vocab_size=config.target_vocab_size,
        skip_empty=True,
        shuffle_each_epoch=config.shuffle_each_epoch,
        sort_by_length=config.sort_by_length,
        use_factor=(config.factors > 1),
        utf8_type=config.utf8_type,
        maxibatch_size=config.maxibatch_size,
        token_batch_size=config.token_batch_size,
        keep_data_in_memory=config.keep_train_set_in_memory)
    if config.valid_freq and config.valid_source_dataset and config.valid_target_dataset:
        valid_text_iterator = TextIterator(
            source=config.valid_source_dataset,
            target=config.valid_target_dataset,
            source_dicts=config.source_dicts,
            target_dict=config.target_dict,
            pretrain_dict=config.pretrain_vocab,
            model_type=config.model_type,
            batch_size=config.valid_batch_size,
            maxlen=config.maxlen,
            source_vocab_sizes=config.source_vocab_sizes,
            target_vocab_size=config.target_vocab_size,
            shuffle_each_epoch=False,
            sort_by_length=True,
            use_factor=(config.factors > 1),
            utf8_type=config.utf8_type,
            maxibatch_size=config.maxibatch_size,
            token_batch_size=config.valid_token_batch_size)
    else:
        logging.info('no validation set loaded')
        valid_text_iterator = None
    logging.info('Done')
    return text_iterator, valid_text_iterator
def decode():
    # Load model config
    config = load_config(FLAGS)

    # Load source data to decode
    test_set = TextIterator(source=config['decode_input'],
                            batch_size=config['decode_batch_size'],
                            source_dict=config['source_vocabulary'],
                            maxlen=None,
                            n_words_source=config['num_encoder_symbols'])

    # Load inverse dictionary used in decoding
    target_inverse_dict = data_utils.load_inverse_dict(
        config['target_vocabulary'])

    # Initiate TF session
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement,
            gpu_options=tf.GPUOptions(allow_growth=True))) as sess:

        # Reload existing checkpoint
        model = load_model(sess, config)
        try:
            print('Decoding {}..'.format(FLAGS.decode_input))
            if FLAGS.write_n_best:
                fout = [data_utils.fopen(("%s_%d" % (FLAGS.decode_output, k)), 'w')
                        for k in range(FLAGS.beam_width)]
            else:
                fout = [data_utils.fopen(FLAGS.decode_output, 'w')]

            for idx, source_seq in enumerate(test_set):
                source, source_len = prepare_batch(source_seq)
                # predicted_ids: GreedyDecoder;     [batch_size, max_time_step, 1]
                #                BeamSearchDecoder; [batch_size, max_time_step, beam_width]
                predicted_ids = model.predict(sess,
                                              encoder_inputs=source,
                                              encoder_inputs_length=source_len)

                # Write decoding results
                for k, f in reversed(list(enumerate(fout))):
                    for seq in predicted_ids:
                        f.write(str(data_utils.seq2words(seq[:, k],
                                                         target_inverse_dict)) + '\n')
                    if not FLAGS.write_n_best:
                        break
                print(' {}th line decoded'.format(idx * FLAGS.decode_batch_size))

            print('Decoding terminated')
        except IOError:
            pass
        finally:
            [f.close() for f in fout]
def test():
    # load dictionary
    config = {
        'use_gpu': True,
        'hidden_units': 400,
        'vocab': './train_data/enli.dict',
        'word_dim': 200,
        'gpu_id': 14,
        'dropout': 0.2,
        'n_word': 42394,
        'batch_size': 32
    }
    worddicts = joblib.load(config['vocab'])

    print('Loading data')
    prefix = './train_data/bs_new.utf8'
    test = TextIterator('{}.query'.format(prefix),
                        '{}.title'.format(prefix),
                        '{}.label'.format(prefix),
                        dict=worddicts,
                        batch_size=config['batch_size'])

    model_file_lst = [
        '/home/disk0/wangqi38/pytorch-final/save_files/enli_0.2_400_shuffle.pkl',
        '/home/disk0/wangqi38/pytorch-final/save_files/enli_0.3_400.pkl'
    ]
    model_lst = []

    print('load models')
    if config['use_gpu']:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(config['gpu_id'])
    for model_name in model_file_lst:
        model = ENLI_Model(config)
        if config['use_gpu']:
            model.load_state_dict(torch.load(model_name))
        else:
            model.load_state_dict(
                torch.load(model_name, map_location={'cuda:0': 'cpu'}))
        model.eval()
        model_lst.append(model)

    use_gpu = config['use_gpu']
    tres = pred_acc_ensemble(model_lst, prepare_data, test, use_gpu)
    print('multi test accuracy', tres[0])
    print('bi test accuracy', tres[1])
    print('test auc', tres[2])
    print('finish')
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, model in enumerate(models):
            f_log_probs = load_scorer(model, options[i],
                                      alignweights=alignweights)
            score, alignment = pred_probs(f_log_probs, prepare_data, options[i],
                                          pairs,
                                          normalization_alpha=normalization_alpha,
                                          alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(source_file.name, target_file.name,
                         options[0]['dictionaries'][:-1],
                         options[0]['dictionaries'][-1],
                         n_words_source=options[0]['n_words_src'],
                         n_words_target=options[0]['n_words'],
                         batch_size=b,
                         maxlen=float('inf'),
                         sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments returned by _score.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
def validate_helper(config, sess):
    model, saver = create_model(config, sess)
    valid_text_iterator = TextIterator(
        source=config.valid_source_dataset,
        target=config.valid_target_dataset,
        source_dicts=[config.source_vocab],
        target_dict=config.target_vocab,
        batch_size=config.valid_batch_size,
        maxlen=config.validation_maxlen,
        n_words_source=config.source_vocab_size,
        n_words_target=config.target_vocab_size,
        shuffle_each_epoch=False,
        sort_by_length=False,  # TODO
        maxibatch_size=config.maxibatch_size)
    costs = validate(sess, valid_text_iterator, model)
    lines = open(config.valid_target_dataset).readlines()
    for cost, line in zip(costs, lines):
        print cost, line.strip()
def get_error(model, test_src, test_target):
    profile = False

    # reload options
    f = open('%s.pkl' % model, 'rb')
    model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)

    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target, dict_src, dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)

    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
def validate_helper(config, sess):
    model, saver = create_model(config, sess)
    valid_text_iterator = TextIterator(
        source=config.valid_source_dataset,
        target=config.valid_target_dataset,
        source_dicts=config.source_dicts,
        target_dict=config.target_dict,
        batch_size=config.valid_batch_size,
        maxlen=config.maxlen,
        source_vocab_sizes=config.source_vocab_sizes,
        target_vocab_size=config.target_vocab_size,
        shuffle_each_epoch=False,
        sort_by_length=False,  # TODO
        use_factor=(config.factors > 1),
        maxibatch_size=config.maxibatch_size)
    costs = validate(config, sess, valid_text_iterator, model)
    lines = open(config.valid_target_dataset).readlines()
    for cost, line in zip(costs, lines):
        logging.info("{0} {1}".format(cost, line.strip()))
def pretrain(config):
    logging.info('Reading pretrain data...')
    text_iterator = TextIterator(
        source=config.pretrain_dictionary_src,
        target=config.pretrain_dictionary_trg,
        source_dicts=config.source_dicts,
        target_dict=config.target_dict,
        batch_size=config.batch_size,
        maxlen=config.maxlen,
        source_vocab_sizes=config.source_vocab_sizes,
        target_vocab_size=config.target_vocab_size,
        skip_empty=True,
        shuffle_each_epoch=config.shuffle_each_epoch,
        sort_by_length=config.sort_by_length,
        use_factor=(config.factors > 1),
        maxibatch_size=config.maxibatch_size,
        token_batch_size=config.token_batch_size,
        keep_data_in_memory=config.keep_train_set_in_memory)
    logging.info('Done')
    return text_iterator
def gen_force_train_iter(source_data, target_data, reshuffle, source_dict,
                         target_dict, batch_size, maxlen, n_words_src,
                         n_words_trg):
    iter = 0
    while True:
        if reshuffle:
            os.popen('python shuffle.py ' + source_data + ' ' + target_data)
            os.popen('mv ' + source_data + '.shuf ' + source_data)
            os.popen('mv ' + target_data + '.shuf ' + target_data)
        gen_force_train = TextIterator(source_data, target_data,
                                       source_dict, target_dict,
                                       batch_size, maxlen,
                                       n_words_src, n_words_trg)
        ExampleNum = 0
        EpochStart = time.time()
        for x, y in gen_force_train:
            if len(x) < batch_size and len(y) < batch_size:
                continue
            ExampleNum += len(x)
            yield x, y, iter
        TimeCost = time.time() - EpochStart
        iter += 1
        print('Seen', ExampleNum, 'generator samples. Time cost is ', TimeCost)
worddicts = pkl.load(f)
n_words = len(worddicts)

wv_dict, wv_arr, wv_size = load_word_vectors(embedding_path, 'glove.840B',
                                             dim_word)
pretrained_emb = norm_weight(n_words, dim_word)
for word in worddicts.keys():
    try:
        pretrained_emb[worddicts[word]] = wv_arr[wv_dict[word]].numpy()
    except:
        pretrained_emb[worddicts[word]] = torch.normal(torch.zeros(dim_word),
                                                       std=1).numpy()

print('load data...')
train = TextIterator(datasets[0], datasets[1], datasets[2], dictionary,
                     n_words=n_words, batch_size=batch_size, maxlen=maxlen,
                     shuffle=True)
test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2],
                    dictionary, n_words=n_words, batch_size=batch_size,
                    shuffle=False)

criterion = torch.nn.CrossEntropyLoss()
model = ESIM(dim_word, 2, n_words, dim_word, pretrained_emb)
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()
def multi_rescore_model(source_file, target_file, savetos, models, options, b, normalization_alpha, verbose, alignweights, extra_sources=[], per_word=False): trng = RandomStreams(1234) def _score(pairs, alignweights=False): # sample given an input sequence and obtain scores scores = [] #alignments = [] #aux_alignments = [] costs_per_word = [] for i, model in enumerate(models): f_log_probs = load_scorer(model, options[i], alignweights=alignweights) score, all_alignments, cost_per_word = multi_pred_probs( f_log_probs, prepare_multi_data, options[i], pairs, normalization_alpha=normalization_alpha, alignweights=alignweights) #print 'alignment lens' #print len(all_alignments) #print len(all_alignments[0]) scores.append(score) costs_per_word.append(cost_per_word) return scores, tuple(all_alignments), costs_per_word #print 'extra_sources', extra_sources # list of sources + target sentences (target sentences are the final list) # TODO: make TextIterator generic sents = TextIterator(source_file.name, target_file.name, options[0]['dictionaries'][:-1], options[0]['dictionaries'][-1], n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'], batch_size=b, maxlen=float('inf'), sort_by_length=False, extra_sources=[ss.name for ss in extra_sources]) # TODO: sorting by length could be more efficient, but we'd want to resort after scores, all_alignments, costs_per_word = _score(sents, alignweights) source_lines = [] source_file.seek(0) source_lines.append([source_file.readlines()]) extra_source_lines = [] for i, ss in enumerate(extra_sources): extra_sources[i].seek(0) extra_source_lines.append([extra_sources[i].readlines()]) target_file.seek(0) target_lines = target_file.readlines() # print out scores for each translation for i, line in enumerate(target_lines): if per_word: score_str = ' '.join( map(str, [s for s in costs_per_word[0][i] ][:len(line.split(" ")) + 1])) else: score_str = ' '.join(map(str, [s[i] for s in scores])) if verbose: savetos[0].write('{0} '.format(line.strip())) savetos[0].write('{0}\n'.format(score_str)) # optional save weights mode. if alignweights: #print 'num alignments', len(all_alignments) for i, alignments in enumerate(all_alignments): # write out the alignments. #print len(alignments) temp_name = savetos[i].name + str(i) + ".json" #print temp_name with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT: for line in alignments: #print len(line[0][0]) #raw_input() align_OUT.write(line + "\n") # combine the actual source and target words. #print 'savetos', len(savetos) #print 'source files', len(extra_sources) if i == 0: tmp_srcfile = source_file else: tmp_srcfile = extra_sources[i - 1] combine_source_target_text_1to1(tmp_srcfile, target_file, savetos[i].name, align_OUT, suffix=str(i))
def rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights): trng = RandomStreams(1234) fs_log_probs = [] for model, option in zip(models, options): # load model parameters and set theano shared variables param_list = numpy.load(model).files param_list = dict.fromkeys( [key for key in param_list if not key.startswith('adam_')], 0) params = load_params(model, param_list) tparams = init_theano_params(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, option) inps = [x, x_mask, y, y_mask] use_noise.set_value(0.) if alignweights: sys.stderr.write( "\t*** Save weight mode ON, alignment matrix will be saved.\n") outputs = [cost, opt_ret['dec_alphas']] f_log_probs = theano.function(inps, outputs) else: f_log_probs = theano.function(inps, cost) fs_log_probs.append(f_log_probs) def _score(pairs, alignweights=False): # sample given an input sequence and obtain scores scores = [] alignments = [] for i, f_log_probs in enumerate(fs_log_probs): score, alignment = pred_probs(f_log_probs, prepare_data, options[i], pairs, normalize=normalize, alignweights=alignweights) scores.append(score) alignments.append(alignment) return scores, alignments lines = source_file.readlines() nbest_lines = nbest_file.readlines() if alignweights: ### opening the temporary file. temp_name = saveto.name + ".json" align_OUT = tempfile.NamedTemporaryFile(prefix=temp_name) with tempfile.NamedTemporaryFile( prefix='rescore-tmpin') as tmp_in, tempfile.NamedTemporaryFile( prefix='rescore-tmpout') as tmp_out: for line in nbest_lines: linesplit = line.split(' ||| ') idx = int( linesplit[0]) ##index from the source file. Starting from 0. tmp_in.write(lines[idx]) tmp_out.write(linesplit[1] + '\n') tmp_in.seek(0) tmp_out.seek(0) pairs = TextIterator( tmp_in.name, tmp_out.name, options[0]['dictionaries'][:-1], options[0]['dictionaries'][1], n_words_source=options[0]['n_words_src'], n_words_target=options[0]['n_words'], batch_size=b, maxlen=float('inf'), sort_by_length=False ) #TODO: sorting by length could be more efficient, but we'd have to synchronize scores with n-best list after scores, alignments = _score(pairs, alignweights) for i, line in enumerate(nbest_lines): score_str = ' '.join(map(str, [s[i] for s in scores])) saveto.write('{0} {1}\n'.format(line.strip(), score_str)) ### optional save weights mode. if alignweights: for line in alignments: align_OUT.write(line + "\n") if alignweights: combine_source_target_text(source_file, nbest_file, saveto.name, align_OUT) align_OUT.close()
def train(dim_word=100, # word vector dimensionality dim=1000, # the number of GRU units encoder='gru', patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 weight decay penalty lrate=0.01, n_words=100000, # vocabulary size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz', valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok', dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl', use_dropout=False, reload_=False): # Model options model_options = locals().copy() # load dictionary with open(dictionary, 'rb') as f: worddicts = pkl.load(f) # invert dictionary worddicts_r = dict() for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print 'Loading data' train = TextIterator(dataset, dictionary, n_words_source=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_dataset, dictionary, n_words_source=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) # create shared variables for parameters tparams = init_tparams(params) # build the symbolic computational graph trng, use_noise, \ x, x_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask] print 'Buliding sampler' f_next = build_sampler(tparams, model_options, trng) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # after any regularizer - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_count = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size # Training loop uidx = 0 estop = False bad_counter = 0 for eidx in xrange(max_epochs): n_samples = 0 for x in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) 
# pad batch and create mask x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1. # verbose if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? for jj in xrange(5): sample, score = gen_sample(tparams, f_next, model_options, trng=trng, maxlen=30, argmax=False) print 'Sample ', jj, ': ', ss = sample for vv in ss: if vv == 0: break if vv in worddicts_r: print worddicts_r[vv], else: print 'UNK', print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() print 'Valid ', valid_err # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) return valid_err
def main(_): """Main procedure for training and test """ tf.logging.set_verbosity(tf.logging.INFO) # Load vocabulary tf.logging.info("***** Loading Vocabulary *****") token_to_idx = load_vocab(FLAGS.vocab_file) # Load text iterator tf.logging.info("***** Loading Text Iterator *****") test = TextIterator(FLAGS.test_file, token_to_idx, batch_size=FLAGS.test_batch_size, vocab_size=FLAGS.vocab_size, shuffle=False) # Initialize the word embedding tf.logging.info("***** Initialize Word Embedding *****") embedding = load_word_embedding(token_to_idx) # Build graph tf.logging.info("***** Build Computation Graph *****") probability_op, cost_op = create_model(embedding) loss_op = tf.reduce_mean(cost_op) init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=5) # training process with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: sess.run(init) # evaluation process tf.logging.info("***** Final Result ***** ") tf.logging.info("restore model at {}".format(FLAGS.model_file)) saver.restore(sess, os.path.join(FLAGS.output_dir, FLAGS.model_file)) if not FLAGS.return_score: test_metrics, test_scores = predict_metrics( sess, cost_op, probability_op, test) tf.logging.info( "test set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *test_metrics) else: test_scores = predict_metrics(sess, cost_op, probability_op, test) system_level_scores = {} for i in range(FLAGS.number_of_systems): system_level_scores[i] = [] tf.logging.info("Writing confidence score to file") with codecs.open("context_am_ranking_score.txt", mode='w', encoding='utf-8') as wf: wf.truncate() for i, score in enumerate(test_scores): system_level_scores[i % FLAGS.number_of_systems].append(score) with codecs.open("context_am_ranking_score.txt", mode='a', encoding='utf-8') as wf: wf.write(str(score) + '\n') if i % FLAGS.number_of_systems == FLAGS.number_of_systems - 1: with codecs.open("context_am_ranking_score.txt", mode='a', encoding='utf-8') as wf: wf.write('\n') with codecs.open("context_am_ranking_score_system_level.txt", mode='w', encoding='utf-8') as wf: wf.truncate() for k, v in system_level_scores.items(): avg_score = sum(v) / len(v) with codecs.open("context_am_ranking_score_system_level.txt", mode='a', encoding='utf-8') as wf: wf.write(str(avg_score) + '\n') tf.logging.info("Done writing confidence score to file")
def train( dim_word=100, # word vector dimensionality dim=1000, # the number of GRU units encoder='gru', decoder='gru_cond_simple', patience=10, # early stopping patience max_epochs=50, finish_after=100000, # finish after this many updates dispFreq=100, decay_c=0., # L2 regularization penalty alpha_c=0., # not used lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size=16, valid_batch_size=16, saveto='model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq datasets=[ '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.en.tok', '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '/home/ubuntu/codes/dl4mt-tutorial/data/newstest2011.en.tok', '/home/ubuntu/codes/dl4mt-tutorial/data/newstest2011.fr.tok' ], dictionaries=[ '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.en.tok.pkl', '/home/ubuntu/codes/dl4mt-tutorial/data/europarl-v7.fr-en.fr.tok.pkl' ], use_dropout=False, reload_=False, overwrite=False): # Model options model_options = locals().copy() # load dictionaries and invert them worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # reload options if reload_ and os.path.exists(saveto): print 'Reloading model options' with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) print 'Loading data' train = TextIterator(datasets[0], datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=batch_size, maxlen=maxlen) valid = TextIterator(valid_datasets[0], valid_datasets[1], dictionaries[0], dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, batch_size=valid_batch_size, maxlen=maxlen) print 'Building model' params = init_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): print 'Reloading model parameters' params = load_params(saveto, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # un used, attention weight regularization if alpha_c > 0. 
and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Optimization' best_p = None bad_counter = 0 uidx = 0 estop = False history_errs = [] # reload history if reload_ and os.path.exists(saveto): rmodel = numpy.load(saveto) history_errs = list(rmodel['history_errs']) if 'uidx' in rmodel: uidx = rmodel['uidx'] if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size if sampleFreq == -1: sampleFreq = len(train[0]) / batch_size for eidx in xrange(max_epochs): n_samples = 0 for x, y in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen, n_words_src=n_words_src, n_words=n_words) if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask, y, y_mask) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. # verbose if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud # save the best model so far, in addition, save the latest model # into a separate file with the iteration number for external eval if numpy.mod(uidx, saveFreq) == 0: print 'Saving the best model...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, uidx=uidx, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # save with uidx if not overwrite: print 'Saving the model at iteration {}...'.format(uidx), saveto_uidx = '{}.iter{}.npz'.format( os.path.splitext(saveto)[0], uidx) numpy.savez(saveto_uidx, history_errs=history_errs, uidx=uidx, **unzip(tparams)) print 'Done' # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? 
for jj in xrange(numpy.minimum(5, x.shape[1])): stochastic = True sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=False) print 'Source ', jj, ': ', for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: print worddicts_r[0][vv], else: print 'UNK', print print 'Truth ', jj, ' : ', for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: print worddicts_r[1][vv], else: print 'UNK', print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) bad_counter = 0 if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): ipdb.set_trace() print 'Valid ', valid_err # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, uidx=uidx, **params) return valid_err
def main(): model_name = os.path.basename(os.path.dirname(os.path.realpath(__file__))) model = '../../models/{}.npz'.format(model_name) valid_datasets = ['../../data/sequence_and_features/premise_snli_1.0_dev_token.txt', '../../data/sequence_and_features/hypothesis_snli_1.0_dev_token.txt', '../../data/sequence_and_features/premise_snli_1.0_dev_lemma.txt', '../../data/sequence_and_features/hypothesis_snli_1.0_dev_lemma.txt', '../../data/sequence_and_features/label_snli_1.0_dev.txt'] test_datasets = ['../../data/sequence_and_features/premise_snli_1.0_test_token.txt', '../../data/sequence_and_features/hypothesis_snli_1.0_test_token.txt', '../../data/sequence_and_features/premise_snli_1.0_test_lemma.txt', '../../data/sequence_and_features/hypothesis_snli_1.0_test_lemma.txt', '../../data/sequence_and_features/label_snli_1.0_test.txt'] dictionary = ['../../data/sequence_and_features/vocab_cased.pkl', '../../data/sequence_and_features/vocab_cased_lemma.pkl'] # load model model_options with open('%s.pkl' % model, 'rb') as f: options = pkl.load(f) print options # load dictionary and invert with open(dictionary[0], 'rb') as f: word_dict = pkl.load(f) print 'Loading knowledge base ...' kb_dicts = options['kb_dicts'] with open(kb_dicts[0], 'rb') as f: kb_dict = pkl.load(f) n_words = options['n_words'] valid_batch_size = options['valid_batch_size'] valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], valid_datasets[3], valid_datasets[4], dictionary[0], dictionary[1], n_words=n_words, batch_size=valid_batch_size, shuffle=False) test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2], test_datasets[3], test_datasets[4], dictionary[0], dictionary[1], n_words=n_words, batch_size=valid_batch_size, shuffle=False) # allocate model parameters params = init_params(options, word_dict) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) trng, use_noise, \ x1, x1_mask, x1_kb, x2, x2_mask, x2_kb, kb_att, y, \ opt_ret, \ cost, \ f_pred, \ f_probs = \ build_model(tparams, options) use_noise.set_value(0.) valid_acc = pred_acc(f_pred, prepare_data, options, valid, kb_dict) test_acc = pred_acc(f_pred, prepare_data, options, test, kb_dict) print 'valid accuracy', valid_acc print 'test accuracy', test_acc predict_labels_valid = pred_label(f_pred, prepare_data, options, valid, kb_dict) predict_labels_test = pred_label(f_pred, prepare_data, options, test, kb_dict) with open('predict_gold_samples_valid.txt', 'w') as fw: with open(valid_datasets[0], 'r') as f1: with open(valid_datasets[1], 'r') as f2: with open(valid_datasets[-1], 'r') as f3: for a, b, c, d in zip(predict_labels_valid, f3, f1, f2): fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n') with open('predict_gold_samples_test.txt', 'w') as fw: with open(test_datasets[0], 'r') as f1: with open(test_datasets[1], 'r') as f2: with open(test_datasets[-1], 'r') as f3: for a, b, c, d in zip(predict_labels_test, f3, f1, f2): fw.write(str(a) + '\t' + b.rstrip() + '\t' + c.rstrip() + '\t' + d.rstrip() + '\n') print 'Done'
def train( dim_word=100, dim_word_src=200, enc_dim=1000, dec_dim=1000, # the number of LSTM units patience=-1, # early stopping patience max_epochs=5000, finish_after=-1, # finish after this many updates decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=1000, # maximum length of the description maxlen_trg=1000, # maximum length of the description maxlen_sample=1000, optimizer='rmsprop', batch_size=[1, 2, 3, 4], valid_batch_size=16, sort_size=20, save_path=None, save_file_name='model', save_best_models=0, dispFreq=100, validFreq=100, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=-1, pbatchFreq=-1, verboseFreq=10000, datasets=[ 'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok' ], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl' ], source_word_level=0, target_word_level=0, use_dropout=False, re_load=False, re_load_old_setting=False, uidx=None, eidx=None, cidx=None, layers=None, save_every_saveFreq=0, save_burn_in=20000, use_bpe=0, init_params=None, build_model=None, build_sampler=None, gen_sample=None, **kwargs): # Model options model_options = locals().copy() del model_options['init_params'] del model_options['build_model'] del model_options['build_sampler'] del model_options['gen_sample'] # load dictionaries and invert them # dictionaries[0] : src # dictionaries[1] : trg worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) # ii, dd : 0 = source, 1 = target for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = cPickle.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk print 'Building model' if not os.path.exists(save_path): os.makedirs(save_path) file_name = '%s%s.npz' % (save_path, save_file_name) best_file_name = '%s%s.best.npz' % (save_path, save_file_name) opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads') best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name, '.grads') model_name = '%s%s.pkl' % (save_path, save_file_name) params = init_params(model_options) cPickle.dump(model_options, open(model_name, 'wb')) history_errs = [[], [], [], []] # reload options # reload : False if re_load and os.path.exists(file_name): print 'You are reloading your experiment.. do not panic dude..' 
if re_load_old_setting: with open(model_name, 'rb') as f: models_options = cPickle.load(f) params = load_params(file_name, params) # reload history model = numpy.load(file_name) history_errs = list(lst.tolist() for lst in model['history_errs']) if uidx is None: uidx = model['uidx'] if eidx is None: eidx = model['eidx'] if cidx is None: try: cidx = model['cidx'] except: cidx = 0 else: if uidx is None: uidx = 0 if eidx is None: eidx = 0 if cidx is None: cidx = 0 print 'Loading data' train = MultiTextIterator(source=datasets[0], target=datasets[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=batch_size, sort_size=sort_size) valid = [ TextIterator(source=valid_dataset[0], target=valid_dataset[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=valid_batch_size, sort_size=sort_size) for valid_dataset in valid_datasets ] # create shared variables for parameters tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) # NOTE : this is where we build the model inps = [x, x_mask, y, y_mask] print 'Building sampler...\n', f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) # print 'Done' # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) # NOTE : f_log_probs : [x, x_mask, y, y_mask], cost print 'Done' if re_load: # NOTE : this whole thing is False use_noise.set_value(0.) valid_scores = [] for ii, vv in enumerate(valid): valid_errs = pred_probs(f_log_probs, prepare_data, model_options, vv, verboseFreq=verboseFreq) valid_err = valid_errs.mean() if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Reload sanity check: Valid ', valid_err cost = cost.mean() # apply L2 regularization on weights # decay_c : 0 if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights # alpha_c : 0 if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) # NOTE : why is this not referenced somewhere later? 
print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' if clip_c > 0: grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c) else: not_finite = 0 clipped = 0 # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', if re_load and os.path.exists(file_name): if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped, file_name=opt_file_name) else: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, file_name=opt_file_name) else: # re_load = False, clip_c = 1 if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped) else: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost) # f_grad_shared = theano.function(inp, [cost, not_finite, clipped], updates=gsup, profile=profile) # f_update = theano.function([lr], [], updates=updates, # on_unused_input='ignore', profile=profile) # toptparams print 'Done' print 'Optimization' best_p = None bad_counter = 0 # will never be true if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size # Training loop ud_start = time.time() estop = False if re_load: # IndexError: index 14 is out of bounds for axis 1 with size 13 print "Checkpointed minibatch number: %d" % cidx for cc in xrange(cidx): if numpy.mod(cc, 1000) == 0: print "Jumping [%d / %d] examples" % (cc, cidx) train.next() for epoch in xrange(max_epochs): time0 = time.time() n_samples = 0 NaN_grad_cnt = 0 NaN_cost_cnt = 0 clipped_cnt = 0 update_idx = 0 if re_load: re_load = 0 else: cidx = 0 for x, y in train: # NOTE : x, y are [sen1, sen2, sen3 ...] where sen_i are of different length update_idx += 1 cidx += 1 uidx += 1 use_noise.set_value(1.) # NOTE : n_x <= batch_size x, x_mask, y, y_mask, n_x = prepare_data(x, y, maxlen=maxlen, maxlen_trg=maxlen_trg, n_words_src=n_words_src, n_words=n_words) n_samples += n_x if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 uidx = max(uidx, 0) continue # compute cost, grads and copy grads to shared variables if clip_c > 0: cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask) else: cost = f_grad_shared(x, x_mask, y, y_mask) if clipped: clipped_cnt += 1 # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): import ipdb ipdb.set_trace() NaN_cost_cnt += 1 if not_finite: import ipdb ipdb.set_trace() NaN_grad_cnt += 1 continue # do the update on parameters f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): continue if float(NaN_grad_cnt) > max_epochs * 0.5 or float( NaN_cost_cnt) > max_epochs * 0.5: print 'Too many NaNs, abort training' return 1., 1., 1. 
# verbose if numpy.mod(uidx, dispFreq) == 0: ud = time.time() - ud_start wps = n_samples / float(time.time() - time0) print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt, \ 'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud, "%.2f sentence/s" % wps ud_start = time.time() if numpy.mod(uidx, pbatchFreq) == 0 and pbatchFreq != -1: pbatch(x, worddicts_r[0]) # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1: gen_list = [ 0, batch_size[0], batch_size[0] + batch_size[1], batch_size[0] + batch_size[1] + batch_size[2] ] gen_list = [ii for ii in gen_list if ii < n_x] for jj in gen_list: # jj = min(5, n_samples) stochastic = True use_noise.set_value(0.) # x : maxlen X n_samples sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=maxlen_sample, stochastic=stochastic, argmax=False) print print 'Source ', jj, ': ', if source_word_level: for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: if use_bpe: print(worddicts_r[0][vv]).replace( '@@', ''), else: print worddicts_r[0][vv], else: print 'UNK', print else: source_ = [] for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: source_.append(worddicts_r[0][vv]) else: source_.append('UNK') print "".join(source_) print 'Truth ', jj, ' : ', if target_word_level: for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print(worddicts_r[1][vv]).replace( '@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: truth_ = [] for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: truth_.append(worddicts_r[1][vv]) else: truth_.append('UNK') print "".join(truth_) print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] if target_word_level: for vv in ss: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print(worddicts_r[1][vv]).replace( '@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: sample_ = [] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: sample_.append(worddicts_r[1][vv]) else: sample_.append('UNK') print "".join(sample_) print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: valid_scores = [] for ii, vv in enumerate(valid): use_noise.set_value(0.) # NOTE : when validation, don't pass maxlen, maxlen_trg # meaning, don't limit sentence lengths... # sort of makes sense i suppose? valid_errs = pred_probs( f_log_probs, prepare_data, model_options, vv, verboseFreq=verboseFreq, ) valid_err = valid_errs.mean() valid_scores.append(valid_err) history_errs[ii].append(valid_err) # patience == -1, never happens if len(history_errs[ii]) > patience and valid_err >= \ numpy.array(history_errs[ii])[:-patience].min() and patience != -1: bad_counter += 1 if bad_counter > patience: print 'Early Stop!' 
estop = True break if numpy.isnan(valid_err): import ipdb ipdb.set_trace() cnt = 0 for ii in xrange(4): if uidx == 0 or valid_scores[ii] <= numpy.array( history_errs[ii]).min(): cnt += 1 if len(history_errs[0]) > 1: if numpy.sum(valid_scores) <= numpy.sum( [aa[:-2] for aa in history_errs]): less_sum = True else: less_sum = False else: less_sum = True if cnt >= 2 and less_sum: best_p = unzip(tparams) best_optp = unzip(toptparams) bad_counter = 0 if saveFreq != validFreq and save_best_models: numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cdix, **best_p) numpy.savez(best_opt_file_name, **best_optp) print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format( valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3]) # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if not os.path.exists(save_path): os.mkdir(save_path) params = unzip(tparams) optparams = unzip(toptparams) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if save_every_saveFreq and (uidx >= save_burn_in): this_file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) this_opt_file_name = '%s%s%s.%d.npz' % ( save_path, save_file_name, '.grads', uidx) numpy.savez(this_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(this_opt_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) if best_p is not None and saveFreq != validFreq: this_best_file_name = '%s%s.%d.best.npz' % ( save_path, save_file_name, uidx) numpy.savez(this_best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) print 'Done...', print 'Saved to %s' % file_name # finish after this many updates if uidx >= finish_after and finish_after != -1: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples lang_nos = (4535523, 12122376, 1926115, 2326893) lang_done = [x * update_idx for x in batch_size] lang_rem = [x - y for x, y in zip(lang_nos, lang_done)] print "Remaining : DE({}), CS({}), FI({}), RU({})".format( lang_rem[0], lang_rem[1], lang_rem[2], lang_rem[3]) eidx += 1 if estop: break use_noise.set_value(0.) valid_scores = [] for ii, vv in enumerate(valid): valid_err = pred_probs(f_log_probs, prepare_data, model_options, vv).mean() valid_scores.append(valid_err) print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format( valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3]) params = unzip(tparams) optparams = unzip(toptparams) file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if best_p is not None and saveFreq != validFreq: best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx) best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) numpy.savez(best_opt_file_name, **best_optp) return valid_err
def rescore_model(source_file, target_file, saveto, models, options, b,
                  normalization_alpha, verbose, alignweights):

    trng = RandomStreams(1234)

    fs_log_probs = []

    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        trng, use_noise, \
            x, x_mask, y, y_mask, \
            opt_ret, \
            cost = \
            build_model(tparams, option)
        inps = [x, x_mask, y, y_mask]
        use_noise.set_value(0.)

        if alignweights:
            logging.debug("Save weight mode ON, alignment matrix will be saved.")
            outputs = [cost, opt_ret['dec_alphas']]
            f_log_probs = theano.function(inps, outputs)
        else:
            f_log_probs = theano.function(inps, cost)

        fs_log_probs.append(f_log_probs)

    def _score(pairs, alignweights=False):
        # sample given an input sequence and obtain scores
        scores = []
        alignments = []
        for i, f_log_probs in enumerate(fs_log_probs):
            score, alignment = pred_probs(
                f_log_probs, prepare_data, options[i], pairs,
                normalization_alpha=normalization_alpha,
                alignweights=alignweights)
            scores.append(score)
            alignments.append(alignment)

        return scores, alignments

    pairs = TextIterator(
        source_file.name, target_file.name,
        options[0]['dictionaries'][:-1],
        options[0]['dictionaries'][-1],
        n_words_source=options[0]['n_words_src'],
        n_words_target=options[0]['n_words'],
        batch_size=b,
        maxlen=float('inf'),
        sort_by_length=False)
    # TODO: sorting by length could be more efficient, but we'd want to resort after

    scores, alignments = _score(pairs, alignweights)

    source_file.seek(0)
    target_file.seek(0)
    source_lines = source_file.readlines()
    target_lines = target_file.readlines()

    for i, line in enumerate(target_lines):
        score_str = ' '.join(map(str, [s[i] for s in scores]))
        if verbose:
            saveto.write('{0} '.format(line.strip()))
        saveto.write('{0}\n'.format(score_str))

    ### optional save weights mode.
    if alignweights:
        ### writing out the alignments returned by _score.
        temp_name = saveto.name + ".json"
        with tempfile.NamedTemporaryFile(prefix=temp_name) as align_OUT:
            for line in alignments:
                align_OUT.write(line + "\n")
            ### combining the actual source and target words.
            combine_source_target_text_1to1(source_file, target_file,
                                            saveto.name, align_OUT)
def train( dim_word=100, dim_word_src=200, enc_dim=1000, dec_dim=1000, # the number of LSTM units patience=-1, # early stopping patience max_epochs=5000, finish_after=-1, # finish after this many updates decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=100, # maximum length of the description maxlen_trg=None, # maximum length of the description maxlen_sample=1000, optimizer='rmsprop', batch_size=16, valid_batch_size=16, sort_size=20, save_path=None, save_file_name='model', save_best_models=0, dispFreq=100, validFreq=100, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=-1, verboseFreq=10000, datasets=[ 'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'], valid_datasets=['../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok'], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], source_word_level=0, target_word_level=0, use_dropout=False, re_load=False, re_load_old_setting=False, uidx=None, eidx=None, cidx=None, layers=None, save_every_saveFreq=0, save_burn_in=20000, use_bpe=0, init_params=None, build_model=None, build_sampler=None, gen_sample=None, **kwargs ): if maxlen_trg is None: maxlen_trg = maxlen * 10 # Model options model_options = locals().copy() del model_options['init_params'] del model_options['build_model'] del model_options['build_sampler'] del model_options['gen_sample'] # load dictionaries and invert them worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = cPickle.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk print 'Building model' if not os.path.exists(save_path): os.makedirs(save_path) file_name = '%s%s.npz' % (save_path, save_file_name) best_file_name = '%s%s.best.npz' % (save_path, save_file_name) opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads') best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name, '.grads') model_name = '%s%s.pkl' % (save_path, save_file_name) params = init_params(model_options) cPickle.dump(model_options, open(model_name, 'wb')) history_errs = [] # reload options if re_load and os.path.exists(file_name): print 'You are reloading your experiment.. do not panic dude..' 
if re_load_old_setting: with open(model_name, 'rb') as f: models_options = cPickle.load(f) params = load_params(file_name, params) # reload history model = numpy.load(file_name) history_errs = list(model['history_errs']) if uidx is None: uidx = model['uidx'] if eidx is None: eidx = model['eidx'] if cidx is None: cidx = model['cidx'] else: if uidx is None: uidx = 0 if eidx is None: eidx = 0 if cidx is None: cidx = 0 print 'Loading data' train = TextIterator(source=datasets[0], target=datasets[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=batch_size, sort_size=sort_size) valid = TextIterator(source=valid_datasets[0], target=valid_datasets[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=valid_batch_size, sort_size=sort_size) # create shared variables for parameters tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] print 'Building sampler...\n', f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) #print 'Done' # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' if re_load: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid, verboseFreq=verboseFreq) valid_err = valid_errs.mean() if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Reload sanity check: Valid ', valid_err cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights if alpha_c > 0. 
and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' if clip_c > 0: grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c) else: not_finite = 0 clipped = 0 # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', if re_load and os.path.exists(file_name): if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped, file_name=opt_file_name) else: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost, file_name=opt_file_name) else: if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped) else: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost) print 'Done' print 'Optimization' best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size # Training loop ud_start = time.time() estop = False if re_load: print "Checkpointed minibatch number: %d" % cidx for cc in xrange(cidx): if numpy.mod(cc, 1000)==0: print "Jumping [%d / %d] examples" % (cc, cidx) train.next() for epoch in xrange(max_epochs): n_samples = 0 NaN_grad_cnt = 0 NaN_cost_cnt = 0 clipped_cnt = 0 if re_load: re_load = 0 else: cidx = 0 for x, y in train: cidx += 1 uidx += 1 use_noise.set_value(1.) x, x_mask, y, y_mask, n_x = prepare_data(x, y, maxlen=maxlen, maxlen_trg=maxlen_trg, n_words_src=n_words_src, n_words=n_words) n_samples += n_x if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 uidx = max(uidx, 0) continue # compute cost, grads and copy grads to shared variables if clip_c > 0: cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask) else: cost = f_grad_shared(x, x_mask, y, y_mask) if clipped: clipped_cnt += 1 # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): NaN_cost_cnt += 1 if not_finite: NaN_grad_cnt += 1 continue # do the update on parameters f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): continue if float(NaN_grad_cnt) > max_epochs * 0.5 or float(NaN_cost_cnt) > max_epochs * 0.5: print 'Too many NaNs, abort training' return 1., 1., 1. # verbose if numpy.mod(uidx, dispFreq) == 0: ud = time.time() - ud_start print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt,\ 'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud ud_start = time.time() # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1: # FIXME: random selection? for jj in xrange(numpy.minimum(5, x.shape[1])): stochastic = True use_noise.set_value(0.) 
sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=maxlen_sample, stochastic=stochastic, argmax=False) print print 'Source ', jj, ': ', if source_word_level: for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: if use_bpe: print (worddicts_r[0][vv]).replace('@@', ''), else: print worddicts_r[0][vv], else: print 'UNK', print else: source_ = [] for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: source_.append(worddicts_r[0][vv]) else: source_.append('UNK') print "".join(source_) print 'Truth ', jj, ' : ', if target_word_level: for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print (worddicts_r[1][vv]).replace('@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: truth_ = [] for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: truth_.append(worddicts_r[1][vv]) else: truth_.append('UNK') print "".join(truth_) print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] if target_word_level: for vv in ss: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print (worddicts_r[1][vv]).replace('@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: sample_ = [] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: sample_.append(worddicts_r[1][vv]) else: sample_.append('UNK') print "".join(sample_) print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid, verboseFreq=verboseFreq) valid_err = valid_errs.mean() history_errs.append(valid_err) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) best_optp = unzip(toptparams) bad_counter = 0 if saveFreq != validFreq and save_best_models: numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) numpy.savez(best_opt_file_name, **best_optp) if len(history_errs) > patience and valid_err >= \ numpy.array(history_errs)[:-patience].min() and patience != -1: bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Valid ', valid_err # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if not os.path.exists(save_path): os.mkdir(save_path) params = unzip(tparams) optparams = unzip(toptparams) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if save_every_saveFreq and (uidx >= save_burn_in): this_file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) this_opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(this_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(this_opt_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) if best_p is not None and saveFreq != validFreq: this_best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx) numpy.savez(this_best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) print 'Done...', print 'Saved to %s' % file_name # finish after this many updates if uidx >= finish_after and finish_after != -1: print 'Finishing after %d iterations!' 
% uidx estop = True break print 'Seen %d samples' % n_samples eidx += 1 if estop: break use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print 'Valid ', valid_err params = unzip(tparams) optparams = unzip(toptparams) file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if best_p is not None and saveFreq != validFreq: best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx) best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name, '.grads',uidx) numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) numpy.savez(best_opt_file_name, **best_optp) return valid_err
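# Hedged sketch (illustration only, not the gradient_clipping helper called
# above): clip_c-style clipping usually rescales all gradients by
# clip_c / global_norm whenever the global L2 norm exceeds clip_c, and flags
# non-finite gradients so that the parameter update can be skipped.
import numpy as np

def clip_by_global_norm(grads, clip_c):
    """Rescale a list of gradient arrays so their global L2 norm is <= clip_c."""
    global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    not_finite = not np.isfinite(global_norm)
    if not_finite or global_norm <= clip_c:
        return grads, not_finite, False
    scale = clip_c / global_norm
    return [g * scale for g in grads], not_finite, True

grads = [np.full((3, 3), 10.0), np.full(3, 10.0)]
clipped, bad, was_clipped = clip_by_global_norm(grads, clip_c=1.0)
print(was_clipped, np.sqrt(sum((g ** 2).sum() for g in clipped)))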
def main(_): """Main procedure for training and test """ ud_start_whole = time.time() tf.logging.set_verbosity(tf.logging.INFO) # Load vocabulary tf.logging.info("***** Loading Vocabulary *****") token_to_idx = load_vocab(FLAGS.vocab_file) tf.gfile.MakeDirs(FLAGS.output_dir) # Load text iterator tf.logging.info("***** Loading Text Iterator *****") train = TextIterator(FLAGS.train_file, token_to_idx, batch_size=FLAGS.train_batch_size, vocab_size=FLAGS.vocab_size, shuffle=True) print(type(train)) valid = TextIterator(FLAGS.valid_file, token_to_idx, batch_size=FLAGS.valid_batch_size, vocab_size=FLAGS.vocab_size, shuffle=False) test = TextIterator(FLAGS.test_file, token_to_idx, batch_size=FLAGS.test_batch_size, vocab_size=FLAGS.vocab_size, shuffle=False) # Text iterator of training set for evaluation train_eval = TextIterator(FLAGS.train_file, token_to_idx, vocab_size=FLAGS.vocab_size, batch_size=FLAGS.train_batch_size, shuffle=False) # Initialize the word embedding tf.logging.info("***** Initialize Word Embedding *****") embedding = load_word_embedding(token_to_idx) # Build graph tf.logging.info("***** Build Computation Graph *****") probability_op, cost_op = create_model(embedding) loss_op = tf.reduce_mean(cost_op) lr = tf.Variable(0.0, name="learning_rate", trainable=False) optimizer = tf.train.AdamOptimizer(learning_rate=lr) tf.logging.info("***** Trainable Variables *****") tvars = tf.trainable_variables() for var in tvars: tf.logging.info(" name = %s, shape = %s", var.name, var.shape) if FLAGS.clip_c > 0.: grads, _ = tf.clip_by_global_norm( tf.gradients(cost_op, tvars), FLAGS.clip_c) train_op = optimizer.apply_gradients(zip(grads, tvars)) init = tf.global_variables_initializer() saver = tf.train.Saver(max_to_keep=5) # training process with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: sess.run(init) uidx = 0 bad_counter = 0 history_errs = [] current_lr = FLAGS.learning_rate sess.run(tf.assign(lr, current_lr)) for eidx in range(FLAGS.max_train_epochs): tf.logging.info("***** Training at Epoch %s *****", eidx) n_samples = 0 for instance in train: n_samples += len(instance) uidx += 1 (batch_x1, batch_x1_mask, batch_x2, batch_x2_mask, batch_y) = prepare_data( instance) if batch_x1 is None: tf.logging.info("Minibatch with zero sample") uidx -= 1 continue ud_start = time.time() _, loss = sess.run([train_op, loss_op], feed_dict={ "x1:0": batch_x1, "x1_mask:0": batch_x1_mask, "x2:0": batch_x2, "x2_mask:0": batch_x2_mask, "y:0": batch_y, "keep_rate:0": 0.5}) ud = time.time() - ud_start if numpy.mod(uidx, FLAGS.disp_freq) == 0: tf.logging.info( "epoch %s update %s loss %s samples/sec %s", eidx, uidx, loss, 1. 
* batch_x1.shape[1] / ud) tf.logging.info("***** Evaluation at Epoch %s *****", eidx) tf.logging.info("seen samples %s each epoch", n_samples) tf.logging.info("current learning rate: %s", current_lr) # validate model on validation set and early stop if necessary valid_metrics, valid_scores = predict_metrics( sess, cost_op, probability_op, valid) # select best model based on recall@1 of validation set valid_err = 1.0 - valid_metrics[3] history_errs.append(valid_err) tf.logging.info( "valid set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *valid_metrics) test_metrics, test_scores = predict_metrics( sess, cost_op, probability_op, test) tf.logging.info( "test set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *test_metrics) if eidx == 0 or valid_err <= numpy.array(history_errs).min(): best_epoch_num = eidx tf.logging.info( "saving current best model at epoch %s based on metrics on valid set", best_epoch_num) saver.save(sess, os.path.join( FLAGS.output_dir, "model_epoch_{}.ckpt".format(best_epoch_num))) if valid_err > numpy.array(history_errs).min(): bad_counter += 1 tf.logging.info("bad_counter: %s", bad_counter) current_lr = current_lr * 0.5 sess.run(tf.assign(lr, current_lr)) tf.logging.info( "half the current learning rate to %s", current_lr) if bad_counter > FLAGS.patience: tf.logging.info("***** Early Stop *****") estop = True break # evaluation process tf.logging.info("***** Final Result ***** ") tf.logging.info( "restore best model at epoch %s ", best_epoch_num) saver.restore(sess, os.path.join( FLAGS.output_dir, "model_epoch_{}.ckpt".format(best_epoch_num))) valid_metrics, valid_scores = predict_metrics( sess, cost_op, probability_op, valid) tf.logging.info( "valid set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *valid_metrics) test_metrics, test_scores = predict_metrics( sess, cost_op, probability_op, test) tf.logging.info( "test set: MAP %s MRR %s Precision@1 %s Recall@1 %s Recall@2 %s Recall@5 %s", *test_metrics) train_acc, train_cost = predict_accuracy( sess, cost_op, probability_op, train_eval) tf.logging.info("train set: ACC %s Cost %s", train_acc, train_cost) ud_whole = (time.time() - ud_start_whole) / 3600 tf.logging.info("training epochs: %s", eidx + 1) tf.logging.info("training duration: %s hours", ud_whole)
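# Hedged sketch (illustration only): the validation block above keeps the
# best checkpoint by recall@1, halves the learning rate whenever the
# validation error fails to improve, and stops once that has happened more
# than `patience` times.  A minimal, framework-free version of that
# bookkeeping might look like this.
def update_early_stopping(history_errs, valid_err, state, patience=3):
    """Record a validation error; return True when training should stop."""
    history_errs.append(valid_err)
    if valid_err <= min(history_errs):
        state['bad_counter'] = 0      # new best model: reset the counter
    else:
        state['bad_counter'] += 1     # no improvement: halve the learning rate
        state['lr'] *= 0.5
    return state['bad_counter'] > patience

state = {'bad_counter': 0, 'lr': 1e-3}
errs = []
for err in [0.40, 0.35, 0.36, 0.37, 0.38, 0.39]:
    if update_early_stopping(errs, err, state, patience=3):
        print('early stop; final lr =', state['lr'])
        break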
def train( dim_word=100, # word vector dimensionality dim=100, # the number of GRU units encoder='lstm', # encoder model decoder='lstm', # decoder model patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates decay_c=0., # L2 regularization penalty clip_c=-1., # gradient clipping threshold lrate=0.0004, # learning rate n_words=100000, # vocabulary size n_words_lemma=100000, maxlen=100, # maximum length of the description optimizer='adam', batch_size=32, valid_batch_size=32, save_model='../../models/', saveto='model.npz', dispFreq=100, validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates use_dropout=False, reload_=False, verbose=False, # print verbose information for debug but slow speed delay1=3, delay2=7, delay_tech=5, types='title', cut_word=False, cut_news=False, last_layer="LSTM", CNN_filter=64, CNN_kernel=3, keep_prob = 0.8, datasets=[], valid_datasets=[], test_datasets=[], tech_data = [], dictionary=[], kb_dicts=[], embedding='', # pretrain embedding file, such as word2vec, GLOVE dim_kb=5, RUN_NAME="histogram_visualization", wait_N=10 ): logging.basicConfig(level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s", filename='./log_result.txt') # Model options model_options = locals().copy() #tf.reset_default_graph() #tf.set_random_seed(2345) with open(dictionary, 'rb') as f: worddicts = pkl.load(f) logger.info("Loading knowledge base ...") # reload options if reload_ and os.path.exists(saveto): logger.info("Reload options") with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) logger.debug(pprint.pformat(model_options)) logger.info("Loading data") train = TextIterator(datasets[0], datasets[1],tech_data, dict=dictionary, delay1=delay1, delay2=delay2, delay_tech=delay_tech, types=types, n_words=n_words, batch_size=batch_size, cut_word=cut_word, cut_news=cut_news, shuffle=True, shuffle_sentence=False) train_valid = TextIterator(datasets[0], datasets[1],tech_data, dict=dictionary, delay1=delay1, delay2=delay2, delay_tech=delay_tech, types=types, n_words=n_words, batch_size=valid_batch_size, cut_word=cut_word, cut_news=cut_news, shuffle=False, shuffle_sentence=False) valid = TextIterator(valid_datasets[0], valid_datasets[1],tech_data, dict=dictionary, delay1=delay1, delay2=delay2, delay_tech=delay_tech, types=types, n_words=n_words, batch_size=valid_batch_size, cut_word=cut_word, cut_news=cut_news, shuffle=False, shuffle_sentence=False) test = TextIterator(test_datasets[0], test_datasets[1],tech_data, dict=dictionary, delay1=delay1, delay2=delay2, delay_tech=delay_tech, types=types, n_words=n_words, batch_size=valid_batch_size, cut_word=cut_word, cut_news=cut_news, shuffle=False, shuffle_sentence=False) # Initialize (or reload) the parameters using 'model_options' # then build the tensorflow graph logger.info("init_word_embedding") params = init_params(model_options, worddicts) embedding = word_embedding(model_options, params) is_training, cost, x, x_mask, y, n_timesteps, pred, summary = build_model(embedding, model_options) with tf.variable_scope('train'): lr = tf.Variable(0.0, trainable=False) def assign_lr(session, lr_value): session.run(tf.assign(lr, lr_value)) logger.info('Building optimizers...') #optimizer = tf.train.AdamOptimizer(learning_rate=lr) optimizer = tf.train.AdadeltaOptimizer(learning_rate=lr,rho=0.95) logger.info('Done') # print all variables tvars = tf.trainable_variables() for var in tvars: print(var.name, var.shape) lossL = 
tf.add_n([tf.nn.l2_loss(v) for v in tvars if ('embeddings' not in v.name and 'bias' not in v.name)])# lossL2=lossL * 0.0005 print("don't do L2 variables:") print([v.name for v in tvars if ('embeddings' in v.name or 'bias' in v.name)]) print("\n do L2 variables:") print([v.name for v in tvars if ('embeddings' not in v.name and 'bias' not in v.name)]) cost = cost + lossL2 grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), model_options['clip_c']) extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(extra_update_ops): train_op = optimizer.apply_gradients(zip(grads, tvars)) # train_op = optimizer.minimize(cost) op_loss = tf.reduce_mean(cost) op_L2 = tf.reduce_mean(lossL) logger.info("correct_pred") correct_pred = tf.equal(tf.argmax(input=pred, axis=1), y) # make prediction logger.info("Done") temp_accuracy = tf.cast(correct_pred, tf.float32) # change to float32 logger.info("init variables") init = tf.global_variables_initializer() logger.info("Done") # saver saver = tf.train.Saver(max_to_keep=15) config = tf.ConfigProto() # config.gpu_options.per_process_gpu_memory_fraction = 0.4 config.gpu_options.allow_growth = True with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: #sess = tf_debug.LocalCLIDebugWrapperSession(sess) training_writer = tf.summary.FileWriter("./logs/{}/training".format(RUN_NAME), sess.graph) validate_writer = tf.summary.FileWriter("./logs/{}/validate".format(RUN_NAME), sess.graph) testing_writer = tf.summary.FileWriter("./logs/{}/testing".format(RUN_NAME), sess.graph) sess.run(init) history_errs = [] history_valid_result = [] history_test_result = [] # reload history if reload_ and os.path.exists(saveto): logger.info("Reload history error") history_errs = list(numpy.load(saveto)['history_errs']) bad_counter = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size loss_plot=defaultdict(list) uidx = 0 estop = False valid_acc_record = [] test_acc_record = [] best_num = -1 best_epoch_num = 0 lr_change_list = [] wait_counter = 0 wait_N = model_options['wait_N'] learning_rate = model_options['lrate'] assign_lr(sess, learning_rate) for eidx in range(max_epochs): n_samples = 0 training_cost = 0 training_acc = 0 for x, x_d1, x_d2, y, y_tech in train: n_samples += len(x) uidx += 1 keep_prob = model_options['keep_prob'] is_training = True data_x, data_x_mask, data_x_d1, data_x_d1_mask, data_x_d2, data_x_d2_mask, data_y, final_mask = prepare_data( x, x_d1, x_d2, y, model_options, maxlen=maxlen) print(data_x.shape, data_x_mask.shape, data_x_d1.shape, data_x_d1_mask.shape, data_x_d2.shape, data_x_d2_mask.shape, final_mask.shape, data_y.shape) assert data_y.shape[0] == data_x.shape[0], 'Size does not match' if x is None: logger.debug('Minibatch with zero sample under length {0}'.format(maxlen)) uidx -= 1 continue ud_start = time.time() _, loss,loss_no_mean,temp_acc,l2_check = sess.run([train_op, op_loss,cost,temp_accuracy,op_L2], feed_dict={'input/x:0': data_x, 'input/x_mask:0': data_x_mask, 'input/y:0': data_y, 'input/x_d1:0': data_x_d1, 'input/x_d1_mask:0': data_x_d1_mask, 'input/x_d2:0': data_x_d2, 'input/x_d2_mask:0': data_x_d2_mask, 'input/final_mask:0': final_mask, 'input/technical:0':y_tech, 'input/keep_prob:0': keep_prob, 'input/is_training:0': is_training}) ud = time.time() - ud_start training_cost += loss_no_mean.sum() training_acc += temp_acc.sum() loss_plot['training'].append(loss) '''train_summary = sess.run(summary, 
feed_dict={'input/x:0': data_x, 'input/x_mask:0': data_x_mask, 'input/y:0': data_y,'input/keep_prob:0':keep_prob,'input/is_training:0':is_training}) training_writer.add_summary(train_summary, eidx)''' if numpy.mod(uidx, dispFreq) == 0: logger.debug('Epoch {0} Update {1} Cost {2} L2 {3} TIME {4}'.format(eidx, uidx, loss,l2_check,ud)) # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: is_training = False valid_acc, valid_loss,valid_final_result = predict_pro_acc(sess, cost, prepare_data, model_options, valid, maxlen, correct_pred, pred, summary, eidx, is_training, train_op,loss_plot, validate_writer,validate=True) test_acc, test_loss,test_final_result = predict_pro_acc(sess, cost, prepare_data, model_options, test, maxlen, correct_pred, pred, summary, eidx, is_training, train_op,loss_plot, testing_writer) # valid_err = 1.0 - valid_acc valid_err = valid_loss history_errs.append(valid_err) history_valid_result.append(valid_final_result) history_test_result.append(test_final_result) loss_plot['validate_ep'].append(valid_loss) loss_plot['testing_ep'].append(test_loss) logger.debug('Epoch {0}'.format(eidx)) logger.debug('Valid cost {0}'.format(valid_loss)) logger.debug('Valid accuracy {0}'.format(valid_acc)) logger.debug('Test cost {0}'.format(test_loss)) logger.debug('Test accuracy {0}'.format(test_acc)) logger.debug('learning_rate: {0}'.format(learning_rate)) valid_acc_record.append(valid_acc) test_acc_record.append(test_acc) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_num = best_num + 1 best_epoch_num = eidx wait_counter = 0 logger.info("Saving...") saver.save(sess, _s(_s(_s(save_model, "epoch"), str(best_num)), "model.ckpt")) logger.info(_s(_s(_s(save_model, "epoch"), str(best_num)), "model.ckpt")) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('{}.pkl'.format(saveto), 'wb')) logger.info("Done") if valid_err > numpy.array(history_errs).min(): wait_counter += 1 # wait_counter +=1 if valid_err>numpy.array(history_errs).min() else 0 if wait_counter >= wait_N: logger.info("wait_counter max, need to half the lr") # print 'wait_counter max, need to half the lr' bad_counter += 1 wait_counter = 0 logger.debug('bad_counter: {0}'.format(bad_counter)) # TODO change the learining rate #learning_rate = learning_rate * 0.9 # learning_rate = learning_rate #assign_lr(sess, learning_rate) lr_change_list.append(eidx) logger.debug('lrate change to: {0}'.format(learning_rate)) # print 'lrate change to: ' + str(lrate) if bad_counter > patience: logger.info("Early Stop!") estop = True break if numpy.isnan(valid_err): pdb.set_trace() # finish after this many updates if uidx >= finish_after: logger.debug('Finishing after iterations! {0}'.format(uidx)) # print 'Finishing after %d iterations!' % uidx estop = True break logger.debug('Seen samples: {0}'.format(n_samples)) logger.debug('Training accuracy: {0}'.format(1.0 * training_acc/n_samples)) loss_plot['training_ep'].append(training_cost/n_samples) # print 'Seen %d samples' % n_samples logger.debug('Saved loss_plot pickle') with open("important_plot.pickle",'wb') as handle: pkl.dump(loss_plot, handle, protocol=pkl.HIGHEST_PROTOCOL) if estop: break with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: # Restore variables from disk. 
saver.restore(sess, _s(_s(_s(save_model, "epoch"), str(best_num)), "model.ckpt")) keep_prob = 1 is_training = False logger.info('=' * 80) logger.info('Final Result') logger.info('=' * 80) logger.debug('best epoch {0}'.format(best_epoch_num)) valid_acc, valid_cost,valid_final_result = predict_pro_acc(sess, cost, prepare_data, model_options, valid, maxlen, correct_pred, pred, summary, eidx,train_op, is_training, None) logger.debug('Valid cost {0}'.format(valid_cost)) logger.debug('Valid accuracy {0}'.format(valid_acc)) # print 'Valid cost', valid_cost # print 'Valid accuracy', valid_acc test_acc, test_cost,test_final_result = predict_pro_acc(sess, cost, prepare_data, model_options, test, maxlen, correct_pred, pred, summary, eidx,train_op, is_training, None) logger.debug('Test cost {0}'.format(test_cost)) logger.debug('Test accuracy {0}'.format(test_acc)) # print 'best epoch ', best_epoch_num train_acc, train_cost,_ = predict_pro_acc(sess, cost, prepare_data, model_options, train_valid, maxlen, correct_pred, pred, summary, eidx,train_op, is_training, None) logger.debug('Train cost {0}'.format(train_cost)) logger.debug('Train accuracy {0}'.format(train_acc)) valid_m=numpy.array(history_valid_result) test_m=numpy.array(history_test_result) valid_final_result = (numpy.array([valid_final_result])==False) test_final_result = (numpy.array([test_final_result])==False) #print(numpy.all(valid_m, axis = 0)) #print(numpy.all(test_m, axis=0)) print('validation: all prediction through every epoch that are the same:',numpy.where(numpy.all(valid_m, axis = 0))) print('testing: all prediction through every epoch that are the same:',numpy.where(numpy.all(test_m, axis=0))) print('validation: final prediction that is False:',numpy.where(valid_final_result)) print('testing: final prediction that is False:',numpy.where(test_final_result)) if os.path.exists('history_predict.npz'): logger.info("Load and save to history_predict.npz") valid_history = numpy.load('history_predict.npz')['valid_final_result'] test_history = numpy.load('history_predict.npz')['test_final_result'] vv=numpy.concatenate((valid_history,valid_final_result),axis=0) tt=numpy.concatenate((test_history,valid_final_result),axis=0) print('Concate shape valid:',vv.shape) print('Print all validate history outputs that return False',numpy.where(numpy.all(vv,axis=0))) print('Concate shape test:',tt.shape) print('Print all test history outputs that return False',numpy.where(numpy.all(tt,axis=0))) numpy.savez('history_predict.npz',valid_final_result=vv,test_final_result=tt,**params) else: numpy.savez('history_predict.npz',valid_final_result=valid_final_result,test_final_result=test_final_result,**params) # print 'Train cost', train_cost # print 'Train accuracy', train_acc # print 'Test cost ', test_cost # print 'Test accuracy ', test_acc return None
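# Hedged sketch (illustration only): the lossL2 term above applies weight
# decay to every trainable variable whose name contains neither 'embeddings'
# nor 'bias'.  The same selection rule written against plain name/array
# pairs, using the tf.nn.l2_loss convention of 0.5 * sum(v ** 2):
import numpy as np

def l2_penalty(named_params, weight=0.0005, skip=('embeddings', 'bias')):
    """Sum 0.5 * ||v||^2 over parameters whose name contains no skip token."""
    total = 0.0
    for name, value in named_params.items():
        if any(token in name for token in skip):
            continue
        total += 0.5 * np.sum(value ** 2)
    return weight * total

params = {'rnn/kernel': np.ones((4, 4)),
          'rnn/bias': np.ones(4),
          'embeddings/W': np.ones((10, 4))}
print(l2_penalty(params))  # only rnn/kernel contributes: 0.0005 * 8.0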
def main(model, src_dict, trg_dict, src, trg, multibleu, batch_size=60, pred_dir='', model_list=False): if pred_dir is not '' and not os.path.exists(pred_dir): os.makedirs(pred_dir) if model_list: model_list_file = model with open(model_list_file) as f: model = f.readline().strip() # load dictionaries and invert them worddicts = [None] * 2 worddicts_r = [None] * 2 for ii, dd in enumerate([src_dict, trg_dict]): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # load model options with open('%s.pkl' % model, 'rb') as f: options = pkl.load(f) trng = RandomStreams(options['trng']) use_noise = theano.shared(numpy.float32(0.)) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) f_init_2, f_next_2 = build_sampler_2(tparams, options, trng, use_noise) iterator = TextIterator(src, trg, src_dict, trg_dict, n_words_source=options['n_words_src'], n_words_target=options['n_words'], batch_size=batch_size, maxlen=2000, shuffle=False, replace=False) if not model_list: try: valid_out, valid_bleu = greedy_decoding( options, trg, iterator, worddicts_r, tparams, prepare_data, gen_sample_2, f_init_2, f_next_2, trng, multibleu, fname=os.path.join(pred_dir, os.path.basename(model)[:-3] + 'out'), maxlen=100, verbose=False) except: valid_out = '' valid_bleu = 0.0 print valid_out, valid_bleu else: best_score = 0. best_model = '' with open(model_list_file) as f: for line in f: start = time.time() model = line.strip() if model == '': continue params = load_params(model, params) for kk, pp in params.iteritems(): tparams[kk].set_value(params[kk]) print model, try: valid_out, valid_bleu = greedy_decoding( options, trg, iterator, worddicts_r, tparams, prepare_data, gen_sample_2, f_init_2, f_next_2, trng, multibleu, fname=os.path.join( pred_dir, os.path.basename(model)[:-3] + 'out'), maxlen=100, verbose=False) except: valid_out = '' valid_bleu = 0.0 print valid_out, valid_bleu, if valid_bleu > best_score: best_score = valid_bleu best_model = model end = time.time() print "Time: ", end - start print 'Best model: ', best_model print 'Best BLEU: ', best_score
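# Hedged sketch (not taken from the code above): the `multibleu` argument is
# presumably a path to Moses' multi-bleu.perl.  A typical way to score a
# hypothesis file against a reference with that script is to pipe the
# hypotheses through it and parse the leading "BLEU = ..." figure; the exact
# invocation and output format are assumptions here.
import re
import subprocess

def multi_bleu_score(multibleu_path, reference_path, hypothesis_path):
    """Run multi-bleu.perl and return the corpus BLEU as a float."""
    with open(hypothesis_path, 'rb') as hyp:
        out = subprocess.check_output(
            ['perl', multibleu_path, reference_path], stdin=hyp)
    match = re.search(r'BLEU = ([\d.]+)', out.decode('utf-8'))
    return float(match.group(1)) if match else 0.0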
def main(model, src_dict, target_dict, source_file, target_file, saveto, source_word_level=1, target_word_level=0, valid_batch_size=128, n_words_src=302, n_words=302): from char_base import (init_params, build_model, build_sampler) from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams from nmt import (pred_probs, prepare_data) # load model model_options pkl_file = model.split('.')[0] + '.pkl' with open(pkl_file, 'rb') as f: options = pkl.load(f) trng = RandomStreams(1234) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) # create shared variables for parameters tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, options) inps = [x, x_mask, y, y_mask] print 'Building sampler...\n', f_init, f_next = build_sampler(tparams, options, trng, use_noise) print 'Done' # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost) print 'Done' print('Preparing dataset...') dataset = TextIterator(source=source_file, target=target_file, source_dict=src_dict, target_dict=target_dict, n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=valid_batch_size, sort_size=sort_size) print('Predicting probs...') log_probs = pred_probs(f_log_probs, prepare_data, options, dataset, verboseFreq=10000) print('Done...') output_file = open(saveto, 'w') pwd_cnt = 0 for line in open(target_file): output_file.writelines(line.rstrip() + '\t' + str(1.0 / (math.e**log_probs[pwd_cnt])) + '\n') pwd_cnt += 1 """ for prob in log_probs: output_file.writelines(str(prob) + '\n') """ output_file.flush() output_file.close() print('Evaluation finished...')
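# Hedged sketch (illustration only): pred_probs above appears to return one
# cost (a summed negative log-likelihood) per target line, so the value
# written out, 1.0 / e ** cost, is simply exp(-cost), the model probability
# of the whole sequence.  Assumed semantics, shown numerically:
import math

def cost_to_probability(cost):
    """Probability of a sequence from its summed negative log-likelihood."""
    return math.exp(-cost)            # identical to 1.0 / math.e ** cost

def cost_to_perplexity(cost, n_tokens):
    """Per-token perplexity from a summed negative log-likelihood."""
    return math.exp(cost / n_tokens)

print(cost_to_probability(13.8), cost_to_perplexity(13.8, 6))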
for kk, vv in worddicts[ii].iteritems():
    worddicts_r[ii][vv] = kk

funcs, tp = build_networks(model_options)
if model_options['see_pretrain']:
    tparams, tparams_xy0 = tp
else:
    tparams = tp

# print 'save the compiled functions/tparams for temporal usage'
print 'Loading data'
train = TextIterator(model_options['datasets'],
                     model_options['dictionaries'],
                     [0 for _ in range(model_options['n_inputs'])],
                     batch_size=model_options['batch_size'],
                     maxlen=model_options['maxlen'])
valid = TextIterator(model_options['valid_datasets'],
                     model_options['dictionaries'],
                     [0 for _ in range(model_options['n_inputs'])],
                     batch_size=model_options['batch_size'],
                     maxlen=500)

print clr('-------------------------------------------- Main-Loop -------------------------------------------------', 'yellow')

# ------------------ initialization --------------- #
best_p = None
bad_counter = 0
uidx = 0
estop = False
history_errs = []
max_epochs = 100
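# Hedged sketch (illustration only): the worddicts_r structures built above
# are inverted vocabularies, mapping integer ids back to tokens so that
# decoded or sampled id sequences can be printed as text.
def invert_vocab(token_to_idx):
    """Map index -> token; later duplicates of an index overwrite earlier ones."""
    return {idx: tok for tok, idx in token_to_idx.items()}

vocab = {'<eos>': 0, '<unk>': 1, 'the': 2, 'cat': 3}
idx_to_token = invert_vocab(vocab)
print(' '.join(idx_to_token[i] for i in [2, 3, 0]))   # -> "the cat <eos>"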
def train( dim_word=100, # word vector dimensionality dim=100, # the number of GRU units encoder='lstm', # encoder model decoder='lstm', # decoder model patience=10, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates decay_c=0., # L2 regularization penalty clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words=100000, # vocabulary size maxlen=100, # maximum length of the description optimizer='adadelta', batch_size=16, valid_batch_size=16, saveto='model.npz', dispFreq=100, validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates use_dropout=False, reload_=False, verbose=False, # print verbose information for debug but slow speed datasets=[], valid_datasets=[], test_datasets=[], dictionary='', embedding='', # pretrain embedding file, such as word2vec, GLOVE ): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") # Model options model_options = locals().copy() model_options[ 'alphabet'] = " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}" model_options['l_alphabet'] = len(model_options['alphabet']) model_options['dim_char_emb'] = 15 model_options['char_nout'] = 100 model_options['char_k_rows'] = 5 model_options['char_k_cols'] = model_options['dim_char_emb'] # load dictionary and invert them with open(dictionary, 'rb') as f: worddicts = pkl.load(f) worddicts_r = dict() for kk, vv in worddicts.iteritems(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): print 'Reload options' with open('%s.pkl' % saveto, 'rb') as f: model_options = pkl.load(f) logger.debug(pprint.pformat(model_options)) print 'Loading data' train = TextIterator(datasets[0], datasets[1], datasets[2], dictionary, n_words=n_words, batch_size=batch_size) train_valid = TextIterator(datasets[0], datasets[1], datasets[2], dictionary, n_words=n_words, batch_size=valid_batch_size, shuffle=False) valid = TextIterator(valid_datasets[0], valid_datasets[1], valid_datasets[2], dictionary, n_words=n_words, batch_size=valid_batch_size, shuffle=False) test = TextIterator(test_datasets[0], test_datasets[1], test_datasets[2], dictionary, n_words=n_words, batch_size=valid_batch_size, shuffle=False) # Initialize (or reload) the parameters using 'model_options' # then build the Theano graph print 'Building model' params = init_params(model_options, worddicts) # reload parameters if reload_ and os.path.exists(saveto): print 'Reload parameters' params = load_params(saveto, params) # numpy arrays -> theano shared variables tparams = init_tparams(params) trng, use_noise, \ x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y, \ opt_ret, \ cost, \ f_pred, f_prods = \ build_model(tparams, model_options) inps = [ x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y ] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. 
for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' updated_params = OrderedDict([(key, value) for (key, value) in tparams.iteritems() if not key.startswith('Wemb')]) print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(updated_params)) print 'Done' # apply gradient clipping here if clip_c > 0.: g2 = 0. for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append( tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads if verbose: print 'Building function of gradient\'s norm' f_norm_g = theano.function(inps, tensor.sqrt(g2)) # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', f_grad_shared, f_update = eval(optimizer)(lr, updated_params, grads, inps, cost) print 'Done' print 'Optimization' history_errs = [] # reload history if reload_ and os.path.exists(saveto): print 'Reload history error' history_errs = list(numpy.load(saveto)['history_errs']) best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size uidx = 0 estop = False valid_acc_record = [] test_acc_record = [] best_epoch_num = 0 lr_change_list = [] wait_counter = 0 wait_N = 1 for eidx in xrange(max_epochs): n_samples = 0 for x1, x2, y in train: n_samples += len(x1) uidx += 1 use_noise.set_value(1.) x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y = prepare_data( x1, x2, y, worddicts_r, maxlen=maxlen) if x1 is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 continue ud_start = time.time() # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) if verbose: if clip_c > 0.: norm_g = f_norm_g(x1, x1_mask, char_x1, char_x1_mask, x2, x2_mask, char_x2, char_x2_mask, y) # do the update on parameters f_update(lrate) ud = time.time() - ud_start # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return None # verbose if numpy.mod(uidx, dispFreq) == 0: logger.debug('Epoch {0} Update {1} Cost {2} UD {3}'.format( eidx, uidx, cost, ud)) if verbose: if clip_c > 0.: logger.debug('Grad {0}'.format(norm_g)) # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = best_p else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print 'Done' # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) 
valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid, worddicts_r).mean() valid_acc = pred_acc(f_pred, prepare_data, model_options, valid, worddicts_r) valid_err = 1.0 - valid_acc history_errs.append(valid_err) test_cost = pred_probs(f_log_probs, prepare_data, model_options, test, worddicts_r).mean() test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r) print 'Valid cost', valid_cost print 'Valid accuracy', valid_acc print 'Test cost', test_cost print 'Test accuracy', test_acc print 'lrate:', lrate valid_acc_record.append(valid_acc) test_acc_record.append(test_acc) if uidx == 0 or valid_err <= numpy.array(history_errs).min(): best_p = unzip(tparams) best_epoch_num = eidx wait_counter = 0 if valid_err > numpy.array(history_errs).min(): wait_counter += 1 if wait_counter >= wait_N: print 'wait_counter max, need to half the lr' bad_counter += 1 wait_counter = 0 print 'bad_counter: ' + str(bad_counter) lrate = lrate * 0.5 lr_change_list.append(eidx) print 'lrate change to: ' + str(lrate) zipp(best_p, tparams) if bad_counter > patience: print 'Early Stop!' estop = True break if numpy.isnan(valid_err): pdb.set_trace() # finish after this many updates if uidx >= finish_after: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples if estop: break if best_p is not None: zipp(best_p, tparams) with open('record.csv', 'w') as f: f.write(str(best_epoch_num) + '\n') f.write(','.join(map(str, lr_change_list)) + '\n') f.write(','.join(map(str, valid_acc_record)) + '\n') f.write(','.join(map(str, test_acc_record)) + '\n') use_noise.set_value(0.) print '=' * 80 print 'Final Result' print '=' * 80 train_cost = pred_probs(f_log_probs, prepare_data, model_options, train_valid, worddicts_r).mean() train_acc = pred_acc(f_pred, prepare_data, model_options, train_valid, worddicts_r) print 'Train cost', train_cost print 'Train accuracy', train_acc valid_cost = pred_probs(f_log_probs, prepare_data, model_options, valid, worddicts_r).mean() valid_acc = pred_acc(f_pred, prepare_data, model_options, valid, worddicts_r) print 'Valid cost', valid_cost print 'Valid accuracy', valid_acc test_cost = pred_probs(f_log_probs, prepare_data, model_options, test, worddicts_r).mean() test_acc = pred_acc(f_pred, prepare_data, model_options, test, worddicts_r) print 'Test cost', test_cost print 'Test accuracy', test_acc params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) logger.debug('Done') return None
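# Hedged sketch (illustration only): unzip/zipp above appear to snapshot the
# shared Theano parameters into plain numpy arrays (best_p) and copy them
# back when validation stops improving.  The same pattern with ordinary
# dicts of numpy arrays:
import numpy as np

def snapshot(params):
    """Deep-copy current parameter values (the role unzip plays above)."""
    return {k: np.array(v, copy=True) for k, v in params.items()}

def restore(best, params):
    """Copy a snapshot back into the live parameters (the role zipp plays)."""
    for k in params:
        params[k][...] = best[k]

params = {'W': np.zeros((2, 2))}
best_p = snapshot(params)
params['W'] += 1.0        # a bad update we want to roll back
restore(best_p, params)
print(params['W'])        # back to zeros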
def ptb_iterator(source, source_dict, batch_size, maxlen, char_level=False,
                 n_words_source=-1, rng=None):
    data = []
    if char_level:
        # Character level PTB
        if source.endswith('.gz'):
            source_file = gzip.open(source, 'r')
        else:
            source_file = open(source, 'r')

        # Make a dictionary mapping known characters to integers
        # 0 is 'unk'
        # 1 is 'end of sentence'
        # (48 entries)
        chars = ['<unk>', '\n', '#', '$', '&', "'", '*', '-', '.', '/', '\\',
                 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'N', ' ',
                 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                 'y', 'z']
        char_dict = dict(zip(chars, np.arange(len(chars))))

        # Make a list of all lines in integer encoded format
        for line in source_file:
            if len(line) > maxlen:
                continue
            encoded_line = []
            i = 0
            while i < len(line):
                ch = line[i]
                try:
                    encoded_line.append(char_dict[ch])
                except KeyError:
                    # Unknown characters are 0, including '<unk>'
                    encoded_line.append(0)
                    if line[i:i+5] == '<unk>':
                        i = i + 4
                i += 1
            data.append(encoded_line)
    else:
        # Word level PTB
        text_iter = TextIterator(source=source,
                                 source_dict=source_dict,
                                 batch_size=batch_size,
                                 maxlen=maxlen,
                                 n_words_source=n_words_source)
        data = []
        for batch in text_iter:
            data.extend(batch)

    # Prepare data to sample batches from
    x_arr = np.zeros((len(data), maxlen), dtype=np.int32)
    m_arr = np.zeros((len(data), maxlen), dtype=np.uint8)
    y_arr = np.zeros((len(data), maxlen), dtype=np.int32)
    for i, line in enumerate(data):
        x_arr[i, 0:len(line)] = line
        m_arr[i, 0:len(line)+1] = 1
    y_arr[:, :-1] = x_arr[:, 1:]

    if rng is None:
        rng = np.random.RandomState()

    num_batches = len(data) // batch_size
    if len(data) % batch_size:
        num_batches += 1

    def gen():
        indices = rng.permutation(len(data))
        for i in range(num_batches):
            x = x_arr[indices[i*batch_size:(i+1)*batch_size]]
            m = m_arr[indices[i*batch_size:(i+1)*batch_size]]
            y = y_arr[indices[i*batch_size:(i+1)*batch_size]]
            yield x, m, y
    return gen
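# Hedged usage sketch (file contents and settings below are made up):
# ptb_iterator returns a generator *function*; calling it yields
# (inputs, mask, targets) batches where the targets are the inputs shifted
# left by one position, as in next-token language modelling.  It assumes
# numpy (as np), gzip and TextIterator are importable in this module.
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
    tmp.write('the cat sat\n')
    tmp.write('no it did not\n')

batches = ptb_iterator(source=tmp.name, source_dict=None,
                       batch_size=2, maxlen=20, char_level=True)
for x, m, y in batches():
    print(x.shape, m.shape, y.shape)   # (2, 20) int32 / uint8 arrays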