def find_sent_embedding(whole, n_words=21102, img_w=300, img_h=48, feature_maps=200,
                        filter_hs=[3, 4, 5], n_x=300, n_h=600):
    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    data = np.load('./bookcorpus_result.npz')
    for kk, pp in params.iteritems():
        params[kk] = data[kk]
    for kk, pp in params.iteritems():
        tparams[kk].set_value(params[kk])

    x = tensor.matrix('x', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape, pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input = tensor.concatenate(layer1_inputs, 1)

    f_embed = theano.function([x], layer1_input, name='f_embed')

    kf = get_minibatches_idx(len(whole), 100)
    sent_emb = np.zeros((len(whole), 600))

    for i, train_index in kf:
        sents = [whole[t] for t in train_index]
        x = prepare_data_for_cnn(sents)
        sent_emb[train_index[0]:train_index[-1] + 1] = f_embed(x)
        if i % 500 == 0:
            print i,

    np.savez('./bookcorpus_embedding.npz', sent_emb=sent_emb)

    return sent_emb

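
# find_sent_embedding() and the training loops below rely on a
# get_minibatches_idx() helper that is not defined in this file.  The version
# below follows the standard Theano LSTM tutorial and is included for
# reference; the actual helper used by this code base may differ.  Note that
# find_sent_embedding() writes into sent_emb with the slice
# train_index[0]:train_index[-1] + 1, which is only valid because the indices
# stay contiguous when shuffle=False.
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Return a list of (minibatch_index, array_of_example_indices) pairs."""
    idx_list = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:minibatch_start + minibatch_size])
        minibatch_start += minibatch_size
    if minibatch_start != n:
        # put the remaining examples into one last, smaller minibatch
        minibatches.append(idx_list[minibatch_start:])

    return zip(range(len(minibatches)), minibatches)
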
def train_model(train, valid, test, img_feats, W, n_words=7414, n_x=300, n_h=512,
                max_epochs=20, lrate=0.001, batch_size=64, valid_batch_size=64,
                dropout_val=0.5, dispFreq=10, validFreq=500, saveFreq=1000,
                saveto='flickr30k_result_psgld_dropout.npz'):
    """ n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dropout_val : dropout probability
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save results after this number of updates
        saveto : where to save the results
    """
    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    options['n_z'] = img_feats.shape[0]

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, mask, z, f_pred_prob, cost) = build_model(tparams, options)

    f_cost = theano.function([x, mask, z], cost, name='f_cost')

    lr_theano = tensor.scalar(name='lr')
    ntrain_theano = tensor.scalar(name='ntrain')
    f_grad_shared, f_update = pSGLD(tparams, cost, [x, mask, z], ntrain_theano, lr_theano)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_negll = []
    best_p = None
    best_valid_negll, best_test_negll = 0., 0.
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    # statistics of the data
    train_num_words, valid_num_words, test_num_words = 0, 0, 0
    for sent in train[0]:
        train_num_words = train_num_words + len(sent)
    for sent in valid[0]:
        valid_num_words = valid_num_words + len(sent)
    for sent in test[0]:
        test_num_words = test_num_words + len(sent)

    n_average = 0
    valid_probs = np.zeros((valid_num_words,))
    test_probs = np.zeros((test_num_words,))

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                x = [train[0][t] for t in train_index]
                z = np.array([img_feats[:, train[1][t]] for t in train_index])
                x, mask = prepare_data(x)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, z)
                f_update(lrate, len(train[0]))

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_negll=history_negll, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    if eidx < 3:
                        valid_negll = calu_negll(f_cost, prepare_data, valid, img_feats, kf_valid)
                        test_negll = calu_negll(f_cost, prepare_data, test, img_feats, kf_test)
                        history_negll.append([valid_negll, test_negll])
                    else:
                        valid_probs_curr = calu_pred_prob(f_pred_prob, prepare_data, valid, img_feats, kf_valid)
                        test_probs_curr = calu_pred_prob(f_pred_prob, prepare_data, test, img_feats, kf_test)

                        valid_probs = (n_average * valid_probs + valid_probs_curr) / (n_average + 1)
                        test_probs = (n_average * test_probs + test_probs_curr) / (n_average + 1)
                        n_average += 1

                        valid_negll = -np.log(valid_probs + 1e-6).sum() / valid_num_words
                        test_negll = -np.log(test_probs + 1e-6).sum() / test_num_words
                        history_negll.append([valid_negll, test_negll])

                        logger.info('Saving {}th Sample...'.format(n_average))
                        params = unzip(tparams)
                        np.savez('flickr30k_result_psgld_{}.npz'.format(n_average),
                                 valid_probs_curr=valid_probs_curr,
                                 test_probs_curr=test_probs_curr, **params)
                        logger.info('Done ...')

                    if (uidx == 0 or
                            valid_negll <= np.array(history_negll)[:, 0].min()):
                        best_p = unzip(tparams)
                        best_valid_negll = valid_negll
                        best_test_negll = test_negll
                        bad_counter = 0

                    logger.info('Perp: Valid {} Test {}'.format(np.exp(valid_negll), np.exp(test_negll)))

                    if (len(history_negll) > 10 and
                            valid_negll >= np.array(history_negll)[:-10, 0].min()):
                        bad_counter += 1
                        if bad_counter > 10:
                            logger.info('Early Stop!')
                            estop = True
                            break

            logger.info('Seen {} samples'.format(n_samples))

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    logger.info('Perp: Valid {} Test {}'.format(np.exp(best_valid_negll), np.exp(best_test_negll)))
    np.savez(saveto, history_negll=history_negll, **best_p)

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return best_valid_negll, best_test_negll

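
# The pSGLD optimizer used above is not defined in this file.  The sketch
# below shows one way to write preconditioned SGLD (an RMSprop-style
# preconditioner plus Gaussian noise) with the same two-function interface as
# the other optimizers here: f_grad_shared() computes the minibatch cost and
# stores the gradients, and f_update(lrate, ntrain) applies one update, where
# ntrain rescales the mean minibatch gradient to the full data set.  Treat it
# as an illustration of the update rule, not the exact implementation.
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

def pSGLD_sketch(tparams, cost, inps, ntrain, lr, rho=0.99, epsilon=1e-6):
    trng = RandomStreams(123)

    grads = tensor.grad(cost, tparams.values())
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    f_grad_shared = theano.function(inps, cost, updates=zip(gshared, grads))

    updates = []
    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)       # running average of g^2
        acc_new = rho * acc + (1. - rho) * g ** 2
        G = 1. / (tensor.sqrt(acc_new) + epsilon)     # diagonal preconditioner
        noise = tensor.sqrt(lr * G) * trng.normal(p.get_value().shape)
        updates.append((acc, acc_new))
        updates.append((p, p - 0.5 * lr * G * g * ntrain + noise))
    f_update = theano.function([lr, ntrain], [], updates=updates)

    return f_grad_shared, f_update
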
def train_classifier(train, valid, test, W, n_words=10000, n_x=300, n_h=200,
                     patience=10, max_epochs=50, lrate=0.001, n_train=10000,
                     optimizer='RMSprop', batch_size=50, valid_batch_size=50,
                     dispFreq=10, validFreq=100, saveFreq=500,
                     saveto='mr_pSGLD_dropout.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        n_train : training set size passed to the pSGLD update
        optimizer : optimization method
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save results after this number of updates
        saveto : where to save the results
    """
    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['optimizer'] = optimizer
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr_theano = tensor.scalar(name='lr')
    ntrain_theano = tensor.scalar(name='ntrain')
    f_grad_shared, f_update = pSGLD(tparams, cost, [x, mask, y], ntrain_theano, lr_theano)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_train_err, best_valid_err, best_test_err = 0., 0., 0.
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    n_average = 0
    train_probs = np.zeros((len(train[0]), n_y))
    valid_probs = np.zeros((len(valid[0]), n_y))
    test_probs = np.zeros((len(test[0]), n_y))

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.5)

                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate, n_train)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    np.savez(saveto, history_errs=history_errs)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    if eidx < 1:
                        train_err = pred_error(f_pred, prepare_data, train, kf)
                        valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                        test_err = pred_error(f_pred, prepare_data, test, kf_test)
                        history_errs.append([valid_err, test_err, train_err])
                    else:
                        train_probs_curr = pred_probs(f_pred_prob, prepare_data, train, kf, options)
                        valid_probs_curr = pred_probs(f_pred_prob, prepare_data, valid, kf_valid, options)
                        test_probs_curr = pred_probs(f_pred_prob, prepare_data, test, kf_test, options)

                        train_probs = (n_average * train_probs + train_probs_curr) / (n_average + 1)
                        valid_probs = (n_average * valid_probs + valid_probs_curr) / (n_average + 1)
                        test_probs = (n_average * test_probs + test_probs_curr) / (n_average + 1)
                        n_average += 1

                        train_pred = train_probs.argmax(axis=1)
                        valid_pred = valid_probs.argmax(axis=1)
                        test_pred = test_probs.argmax(axis=1)

                        train_err = (train_pred == np.array(train[1])).sum()
                        train_err = 1. - numpy_floatX(train_err) / len(train[0])
                        valid_err = (valid_pred == np.array(valid[1])).sum()
                        valid_err = 1. - numpy_floatX(valid_err) / len(valid[0])
                        test_err = (test_pred == np.array(test[1])).sum()
                        test_err = 1. - numpy_floatX(test_err) / len(test[0])

                        history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_train_err = train_err
                        best_valid_err = valid_err
                        best_test_err = test_err
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            logger.info('Seen {} samples'.format(n_samples))

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    logger.info('Train {} Valid {} Test {}'.format(best_train_err, best_valid_err, best_test_err))
    np.savez(saveto, train_err=best_train_err, valid_err=best_valid_err,
             test_err=best_test_err, history_errs=history_errs)

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return best_train_err, best_valid_err, best_test_err

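
# The validation branch above approximates the Bayesian posterior predictive
# distribution by averaging the class probabilities produced by successive
# pSGLD parameter samples, instead of storing every set of weights.  The
# incremental update probs = (n*probs + probs_curr)/(n+1) is just a running
# mean, as this small self-contained check illustrates (the shapes and the
# Dirichlet draws are made up for the example):
def _running_mean_demo():
    samples = [np.random.dirichlet(np.ones(3), size=5) for _ in range(4)]
    probs, n_average = np.zeros((5, 3)), 0
    for probs_curr in samples:
        probs = (n_average * probs + probs_curr) / (n_average + 1)
        n_average += 1
    assert np.allclose(probs, np.mean(samples, axis=0))
    return probs.argmax(axis=1)   # ensemble prediction used for the error rate
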
def train_classifier(train, valid, test, W, n_words=10000, n_x=300, n_h=200,
                     dropout_val=0.5, patience=10, max_epochs=20, lrate=0.0002,
                     batch_size=50, valid_batch_size=50, dispFreq=10,
                     validFreq=100, saveFreq=200, saveto='trec_gru_result.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        dropout_val : dropout probability
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """
    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, mask, y], lr)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    try:
        for eidx in xrange(max_epochs):
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                x, mask, y = prepare_data(x, y)

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
                    history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err,
             test_err=test_err, history_errs=history_errs, **best_p)

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return train_err, valid_err, test_err

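
# Adam() above follows the same f_grad_shared / f_update interface as the
# other optimizers in this code base.  A compact Theano sketch of Adam
# (Kingma & Ba, 2015) with that interface is given below; the hyper-parameters
# and bias-correction details of the actual implementation may differ.
def Adam_sketch(tparams, cost, inps, lr, b1=0.9, b2=0.999, eps=1e-8):
    grads = tensor.grad(cost, tparams.values())
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    f_grad_shared = theano.function(inps, cost, updates=zip(gshared, grads))

    updates = []
    t_prev = theano.shared(np.asarray(0., dtype=theano.config.floatX))
    t = t_prev + 1.
    a_t = lr * tensor.sqrt(1. - b2 ** t) / (1. - b1 ** t)   # bias correction
    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = b1 * m + (1. - b1) * g
        v_t = b2 * v + (1. - b2) * g ** 2
        p_t = p - a_t * m_t / (tensor.sqrt(v_t) + eps)
        updates += [(m, m_t), (v, v_t), (p, p_t)]
    updates.append((t_prev, t))
    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
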
def train_model(train_x, train_y, valid_x, valid_y, test_x, test_y,
                n_words=10000, n_x=300, n_h=1500, max_epochs=55, collect_epoch=4,
                lrate=1, anneal_lr_epoch=15, anneal_lr_factor=1.15,
                dropout_val=0.65, batch_size=32, valid_batch_size=64,
                dispFreq=10, validFreq=400, saveFreq=1000,
                saveto='ptb_result_large_sgld_with_dropout.npz'):
    """ n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units
        max_epochs : the maximum number of epochs to run
        collect_epoch : the epoch from which posterior samples are collected and averaged
        lrate : learning rate
        anneal_lr_epoch : start annealing the learning rate after this epoch
        anneal_lr_factor : divide the learning rate by this factor every epoch once annealing starts
        dropout_val : dropout probability
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save results after this number of updates
        saveto : where to save the results
    """
    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq

    logger.info('Model options {}'.format(options))

    logger.info('Building model...')

    params = init_params(options)
    tparams = init_tparams(params)

    use_noise, x, y, f_pred_prob, cost = build_model(tparams, options)

    f_cost = theano.function([x, y], cost, name='f_cost')

    lr_theano = tensor.scalar(name='lr')
    ntrain_theano = tensor.scalar(name='ntrain')
    f_grad_shared, f_update = SGLD(tparams, cost, [x, y], ntrain_theano, lr_theano)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(valid_x.shape[0], valid_batch_size)
    kf_test = get_minibatches_idx(test_x.shape[0], valid_batch_size)

    estop = False  # early stop
    history_negll = []
    best_p = None
    best_valid_negll, best_test_negll = 0., 0.
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    # statistics of the data
    train_num_words = train_x.shape[0] * train_x.shape[1]
    valid_num_words = valid_x.shape[0] * valid_x.shape[1]
    test_num_words = test_x.shape[0] * test_x.shape[1]

    n_average = 0
    valid_probs = np.zeros((valid_num_words,))
    test_probs = np.zeros((test_num_words,))

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(train_x.shape[0], batch_size, shuffle=True)

            if eidx >= anneal_lr_epoch:
                # anneal the learning rate
                lrate = lrate / anneal_lr_factor

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                x = train_x[train_index].T
                y = train_y[train_index].T
                n_samples += x.shape[1]

                cost = f_grad_shared(x, y)
                f_update(lrate, train_num_words)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, np.exp(cost)))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_negll=history_negll, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    if eidx < collect_epoch:
                        valid_negll = calu_negll(f_cost, valid_x, valid_y, kf_valid)
                        test_negll = calu_negll(f_cost, test_x, test_y, kf_test)
                        history_negll.append([valid_negll, test_negll])
                    else:
                        valid_probs_curr = calu_pred_prob(f_pred_prob, valid_x, valid_y, kf_valid)
                        test_probs_curr = calu_pred_prob(f_pred_prob, test_x, test_y, kf_test)

                        valid_probs = (n_average * valid_probs + valid_probs_curr) / (n_average + 1)
                        test_probs = (n_average * test_probs + test_probs_curr) / (n_average + 1)
                        n_average += 1

                        valid_negll = -np.log(valid_probs + 1e-6).sum() / valid_num_words
                        test_negll = -np.log(test_probs + 1e-6).sum() / test_num_words
                        history_negll.append([valid_negll, test_negll])

                        logger.info('Saving {}th Sample...'.format(n_average))
                        params = unzip(tparams)
                        np.savez('ptb_result_sgld_large_{}.npz'.format(n_average),
                                 valid_probs_curr=valid_probs_curr,
                                 test_probs_curr=test_probs_curr, **params)
                        logger.info('Done ...')

                    if (uidx == 0 or
                            valid_negll <= np.array(history_negll)[:, 0].min()):
                        best_p = unzip(tparams)
                        best_valid_negll = valid_negll
                        best_test_negll = test_negll
                        bad_counter = 0

                    logger.info('Valid {} Test {}'.format(np.exp(valid_negll), np.exp(test_negll)))

                    if (len(history_negll) > 10 and
                            valid_negll >= np.array(history_negll)[:-10, 0].min()):
                        bad_counter += 1
                        if bad_counter > 10:
                            logger.info('Early Stop!')
                            estop = True
                            break

            logger.info('Seen {} samples'.format(n_samples))

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    logger.info('Valid {} Test {}'.format(np.exp(best_valid_negll), np.exp(best_test_negll)))
    np.savez(saveto, history_negll=history_negll, **best_p)

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return best_valid_negll, best_test_negll

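
# SGLD() above is the unpreconditioned counterpart of the pSGLD sketch given
# earlier: plain SGD on the rescaled minibatch gradient plus Gaussian noise
# whose variance equals the step size (Welling & Teh, 2011).  A minimal
# sketch with the same f_grad_shared / f_update(lrate, ntrain) interface,
# again only an illustration of the update rule:
def SGLD_sketch(tparams, cost, inps, ntrain, lr):
    trng = RandomStreams(456)   # theano.sandbox.rng_mrg.MRG_RandomStreams

    grads = tensor.grad(cost, tparams.values())
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    f_grad_shared = theano.function(inps, cost, updates=zip(gshared, grads))

    updates = []
    for p, g in zip(tparams.values(), gshared):
        noise = tensor.sqrt(lr) * trng.normal(p.get_value().shape)
        updates.append((p, p - 0.5 * lr * g * ntrain + noise))
    f_update = theano.function([lr, ntrain], [], updates=updates)

    return f_grad_shared, f_update
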
def train_model(train, val, test, n_words=21103, img_w=300, max_len=40,
                feature_maps=200, filter_hs=[3, 4, 5], n_x=300, n_h=600,
                max_epochs=8, lrate=0.0002, batch_size=64, valid_batch_size=64,
                dispFreq=10, validFreq=500, saveFreq=1000,
                saveto='bookcorpus_result.npz'):
    """ train, val, test : datasets
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we use
        filter_hs : the filter window sizes we use
        n_x : word embedding dimension
        n_h : the number of hidden units in the LSTM
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """
    img_h = max_len + 2 * (filter_hs[-1] - 1)

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq

    logger.info('Model options {}'.format(options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    use_noise, x, y, y_mask, cost = build_model(tparams, options)

    f_cost = theano.function([x, y, y_mask], cost, name='f_cost')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y, y_mask], lr)

    logger.info('Training model...')

    history_cost = []
    uidx = 0  # the number of updates done
    start_time = time.time()

    kf_valid = get_minibatches_idx(len(val), valid_batch_size)

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][21102, :], zero_vec_tensor))])

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.)

                sents = [train[t] for t in train_index]
                x = prepare_data_for_cnn(sents)
                y, y_mask = prepare_data_for_rnn(sents)
                n_samples += y.shape[1]

                cost = f_grad_shared(x, y, y_mask)
                f_update(lrate)
                # the special <pad_zero> token does not need to be updated
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, np.exp(cost)))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    params = unzip(tparams)
                    np.savez(saveto, history_cost=history_cost, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
                    history_cost.append([valid_cost])

                    logger.info('Valid {}'.format(np.exp(valid_cost)))

            logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    use_noise.set_value(0.)
    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
    logger.info('Valid {}'.format(np.exp(valid_cost)))

    params = unzip(tparams)
    np.savez(saveto, history_cost=history_cost, **params)

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return valid_cost

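
# prepare_data_for_cnn() is assumed to turn a list of word-index lists into
# the fixed-size int32 matrix the convolutional encoder expects: sentences
# are truncated to max_len and surrounded by filter_hs[-1] - 1 positions of
# the special padding token (index n_words - 1 = 21102, the same embedding
# row that set_zero() keeps at zero), so that every row has length
# img_h = max_len + 2 * (filter_hs[-1] - 1).  A sketch under those
# assumptions; the real helper may pad or truncate differently.
def prepare_data_for_cnn_sketch(sents, max_len=40, n_words=21103, filter_h=5):
    pad = filter_h - 1
    img_h = max_len + 2 * pad
    x = np.full((len(sents), img_h), n_words - 1, dtype='int32')   # all padding
    for i, sent in enumerate(sents):
        sent = sent[:max_len]                 # truncate long sentences
        x[i, pad:pad + len(sent)] = sent      # place the words between the pads
    return x
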
tmp = cPickle.load(f)
for keys in params:
    params[str(keys)] = tmp[str(keys)]
del tmp

tparams = init_tparams(params)

(use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

lr = tensor.scalar(name='lr')
f_grad_shared, f_update = Adam(tparams, cost, [x, mask, y], lr)

kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

use_noise.set_value(0.)
kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
test_err = pred_error(f_pred, prepare_data, test, kf_test)

print('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))
print("train_err %.2f, valid_err %.2f, test_err %.2f" % (train_err, valid_err, test_err))

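
# The snippet above, and the training loops in this file, move parameters
# between plain numpy arrays and Theano shared variables with three small
# helpers (init_tparams, zipp, unzip) that are not defined here.  The
# versions below follow the standard Theano LSTM tutorial and are included
# for reference; the actual helpers in this code base may differ slightly.
from collections import OrderedDict

def init_tparams(params):
    """Wrap every numpy parameter in a Theano shared variable."""
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams

def zipp(params, tparams):
    """Push numpy values back into the shared variables (e.g. restore best_p)."""
    for kk, vv in params.iteritems():
        tparams[kk].set_value(vv)

def unzip(zipped):
    """Pull the current values out of the shared variables as numpy arrays."""
    new_params = OrderedDict()
    for kk, vv in zipped.iteritems():
        new_params[kk] = vv.get_value()
    return new_params
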
def train_classifier(train, valid, test, W, n_words=10000, img_w=300, max_len=40,
                     feature_maps=100, filter_hs=[3, 4, 5], dropout_val=0.5,
                     patience=10, max_epochs=20, lrate=0.0002, batch_size=50,
                     valid_batch_size=50, dispFreq=10, validFreq=100, saveFreq=200,
                     saveto='trec_cnn_result.npz'):
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps we use
        filter_hs : the filter window sizes we use
        dropout_val : dropout probability
        patience : number of epochs to wait before early stopping if no progress
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """
    img_h = max_len + 2 * (filter_hs[-1] - 1)

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq

    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')

    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y

    # Train a simple conv net:
    #   img_h = sentence length (padded where necessary)
    #   img_w = word vector length (300 for word2vec)
    #   filter_hs = filter window sizes
    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options, W)
    tparams = init_tparams(params)

    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams, options)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y], lr)

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0
    uidx = 0  # the number of updates done
    start_time = time.time()

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][n_words - 1, :], zero_vec_tensor))])

    try:
        for eidx in xrange(max_epochs):
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = np.array([train[1][t] for t in train_index]).astype('int32')
                x = [train[0][t] for t in train_index]
                x = prepare_data(x, max_len, n_words, filter_hs[-1])

                cost = f_grad_shared(x, y)
                f_update(lrate)
                # the special padding token does not need to be updated
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)

                    train_err = pred_error(f_pred, prepare_data, train, kf, max_len, n_words, filter_hs[-1])
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid, max_len, n_words, filter_hs[-1])
                    test_err = pred_error(f_pred, prepare_data, test, kf_test, max_len, n_words, filter_hs[-1])
                    history_errs.append([valid_err, test_err, train_err])

                    if (uidx == 0 or
                            valid_err <= np.array(history_errs)[:, 0].min()):
                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                            valid_err >= np.array(history_errs)[:-patience, 0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted, max_len, n_words, filter_hs[-1])
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid, max_len, n_words, filter_hs[-1])
    test_err = pred_error(f_pred, prepare_data, test, kf_test, max_len, n_words, filter_hs[-1])

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

    np.savez(saveto, train_err=train_err, valid_err=valid_err,
             test_err=test_err, history_errs=history_errs, **best_p)

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return train_err, valid_err, test_err

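
# pred_error() above is assumed to follow the Theano LSTM tutorial: run
# f_pred over the given minibatches and return the fraction of examples whose
# predicted label differs from the target.  A sketch of the CNN-classifier
# variant, whose prepare_data takes (max_len, n_words, filter_h) as extra
# arguments; the exact signature of the real helper may differ.
def pred_error_sketch(f_pred, prepare_data, data, iterator, max_len, n_words, filter_h):
    n_correct = 0
    for _, valid_index in iterator:
        x = prepare_data([data[0][t] for t in valid_index], max_len, n_words, filter_h)
        y = np.array([data[1][t] for t in valid_index]).astype('int32')
        preds = f_pred(x)
        n_correct += (preds == y).sum()
    return 1. - float(n_correct) / len(data[0])   # error rate over the whole set
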
def train_model(train, val, test, train_lab, val_lab, test_lab, ixtoword,
                n_words=22153, period=887, img_w=300, img_h=148, feature_maps=300,
                filter_hs=[3, 4, 5], n_x=300, n_h=500, n_h2_d=200, n_h2=900,
                p_lambda_q=0, p_lambda_fm=0.001, p_lambda_recon=0.001, n_codes=2,
                max_epochs=16, lr_d=0.0001, lr_g=0.00005, kde_sigma=1.,
                batch_size=256, valid_batch_size=256, dim_mmd=32, dispFreq=10,
                dg_ratio=1, Large=1e3, validFreq=500, saveFreq=500,
                saveto='disent_result'):
    """ n_words : word vocabulary size
        feature_maps : CNN embedding dimension for each filter width
        filter_hs : CNN filter widths
        n_h : LSTM/GRU number of hidden units
        n_h2 : discriminative network number of hidden units
        n_gan : number of hidden units in the GAN
        n_codes : number of latent codes
        max_epochs : the maximum number of epochs to run
        lr_d, lr_g : learning rates for the discriminator and the generator
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test sets
        dispFreq : display the training progress every N updates
        validFreq : compute the validation error after this number of updates
    """
    n_gan = len(filter_hs) * feature_maps  # 900

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['n_h2'] = n_h2
    options['n_h2_d'] = n_h2_d
    options['n_codes'] = n_codes
    options['lambda_q'] = p_lambda_q
    options['lambda_fm'] = p_lambda_fm  # weight for feature matching
    options['lambda_recon'] = p_lambda_recon
    options['L'] = Large
    options['max_epochs'] = max_epochs
    options['lr_d'] = lr_d
    options['lr_g'] = lr_g
    options['kde_sigma'] = kde_sigma
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    options['dg_ratio'] = dg_ratio
    options['n_gan'] = n_gan
    options['debug'] = False
    options['feature_match'] = 'mmd'  # None, 'moment', 'mmd', 'mmd_h', 'mmd_ld', 'JSD_acc'
    options['shareLSTM'] = True
    options['delta'] = 0.00
    options['sharedEmb'] = False
    options['cnn_activation'] = 'tanh'
    options['sigma_range'] = [20]  # range of sigma for MMD
    options['diag'] = 0.1  # diagonal term added to the covariance for JSD_acc
    options['label_smoothing'] = 0.01
    options['dim_mmd'] = dim_mmd
    options['force_cut'] = 'None'
    options['batch_norm'] = False
    options['wgan'] = False
    options['cutoff'] = 0.01
    options['max_step'] = 60
    options['period'] = period

    logger.info('Model options {}'.format(options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    # generative model for the GAN
    n_label = len(set(train_lab))
    options['label_sizes'] = n_label

    n_feature = len(options['filter_hs']) * options['feature_maps']
    options['input_shape'] = (n_h2_d, n_feature)
    options['pred_shape'] = (1, n_h2_d)
    if options['feature_match'] == 'mmd_ld':
        options['mmd_shape'] = (dim_mmd, n_h2_d)
    options['propose_shape'] = (n_codes, n_h2_d)

    options['input_recon_shape'] = (n_h2, n_feature)
    options['recon_shape'] = (n_gan, n_h2) if options['shareLSTM'] else (n_gan + 1, n_h2)

    d_params_s, g_params_s, s_params_s = init_params(options)
    d_params, g_params, s_params = init_tparams(d_params_s, g_params_s, s_params_s, options)

    lr_d_t = tensor.scalar(name='lr_d')
    lr_g_t = tensor.scalar(name='lr_g')

    (use_noise, use_noise2, x, z, d_cost, g_cost, r_cost, fake_recon,
     acc_fake_xx, acc_real_xx, acc_fake_mean, acc_real_mean,
     wtf1, wtf2, wtf3, wtf4, wtf5, wtf6, KDE, KDE_input) = build_model(
        d_params, g_params, s_params, options)

    f_cost = theano.function([x, z], [d_cost, g_cost, KDE, KDE_input], name='f_cost')
    f_print = theano.function([x, z], [wtf1, wtf2, wtf3, wtf4, wtf5, wtf6], name='f_print')
    f_recon = theano.function([x, z], [r_cost, fake_recon, d_cost],
                              name='f_recon', on_unused_input='ignore')

    if options['feature_match']:
        ss_updates = [(s_params['acc_fake_xx'], acc_fake_xx),
                      (s_params['acc_real_xx'], acc_real_xx),
                      (s_params['acc_fake_mean'], acc_fake_mean),
                      (s_params['acc_real_mean'], acc_real_mean),
                      (s_params['seen_size'], s_params['seen_size'] + options['batch_size'])]
        f_update_ss = theano.function([x, z], s_params, updates=ss_updates)

    f_cost_d, _train_d = Adam(d_params, d_cost, [x, z], lr_d_t)
    if options['feature_match']:
        f_cost_g, _train_g = Adam(g_params, g_cost, [x, z], lr_g_t)
    else:
        f_cost_g, _train_g = Adam(g_params, g_cost, [z], lr_g_t)

    logger.info('Training model...')

    history_cost = []
    uidx = 0  # the number of updates done
    kdes = np.zeros(10)
    kde_std = 0.  # standard deviation of every 10 KDE inputs
    kde_mean = 0.
    start_time = time.time()

    kf_valid = get_minibatches_idx(len(val), valid_batch_size)

    y_min = min(train_lab)
    train_lab = [t - y_min for t in train_lab]
    val_lab = [t - y_min for t in val_lab]
    test_lab = [t - y_min for t in test_lab]

    testset = [prepare_for_bleu(s) for s in test[:1000]]

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.0)
                use_noise2.set_value(0.0)

                sents = [train[t] for t in train_index]
                x = prepare_data_for_cnn(sents)
                n_samples += x.shape[0]

                if options['shareLSTM']:
                    z = np.random.uniform(-1, 1, (batch_size, n_gan)).astype('float32')
                else:
                    z = np.random.uniform(-1, 1, (batch_size, n_gan + 1)).astype('float32')
                    z[:, 0] = np.random.randint(n_codes, size=batch_size).astype('float32')

                # update the generator gradient
                if options['feature_match']:
                    cost_g = f_cost_g(x, z)
                else:
                    cost_g = f_cost_g(z)

                if np.isnan(cost_g):
                    logger.info('NaN detected')
                    temp_out = f_print(x, z)
                    print 'real ' + str(temp_out[0]) + ' fake ' + str(temp_out[1])
                    return 1., 1., 1.
                if np.isinf(cost_g):
                    logger.info('Inf detected')
                    temp_out = f_print(x, z)
                    print 'real ' + str(temp_out[0]) + ' fake ' + str(temp_out[1])
                    return 1., 1., 1.

                # update G
                _train_g(lr_g)

                if np.mod(uidx, dispFreq) == 0:
                    temp_out = f_print(x, z)
                    _, _, cost_d = f_recon(x, z)
                    np.set_printoptions(precision=3)
                    np.set_printoptions(threshold=np.inf)
                    print ('real ' + str(round(temp_out[0], 2)) +
                           ' fake ' + str(round(temp_out[1], 2)) +
                           ' Covariance loss ' + str(round(temp_out[3], 2)) +
                           ' mean loss ' + str(round(temp_out[5], 2)))
                    print 'cost_g ' + str(cost_g) + ' cost_d ' + str(cost_d)
                    print("Generated: " + " ".join([ixtoword[x] for x in temp_out[2][0] if x != 0]))
                    logger.info('Epoch {} Update {} Cost G {} Real {} Fake {} loss_cov {} meanMSE {}'
                                .format(eidx, uidx, cost_g, round(temp_out[0], 2),
                                        round(temp_out[1], 2), temp_out[3], temp_out[5]))
                    logger.info('Generated: {}'.format(" ".join([ixtoword[x] for x in temp_out[2][0] if x != 0])))

                if np.mod(uidx, dg_ratio) == 0:
                    x = prepare_data_for_cnn(sents)
                    cost_d = f_cost_d(x, z)
                    _train_d(lr_d)

                    if np.mod(uidx, dispFreq) == 0:
                        logger.info('Cost D {}'.format(cost_d))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    d_params_s = unzip(d_params)
                    g_params_s = unzip(g_params)
                    params_d = OrderedDict()
                    params_g = OrderedDict()
                    for kk, pp in d_params_s.iteritems():
                        params_d[kk] = np.asarray(d_params_s[kk])
                    for kk, pp in g_params_s.iteritems():
                        params_g[kk] = np.asarray(g_params_s[kk])
                    np.savez(saveto + '_d.npz', history_cost=history_cost, options=options, **params_d)
                    np.savez(saveto + '_g.npz', history_cost=history_cost, options=options, **params_g)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    use_noise2.set_value(0.)

                    if options['shareLSTM']:
                        val_z = np.random.uniform(-1, 1, (batch_size, n_gan)).astype('float32')
                    else:
                        val_z = np.random.uniform(-1, 1, (batch_size, n_gan + 1)).astype('float32')

                    temp_out = f_print(x, val_z)
                    predset = temp_out[2]
                    [bleu2s, bleu3s, bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in predset], {0: testset})
                    logger.info('Valid BLEU2 = {}, BLEU3 = {}, BLEU4 = {}'.format(bleu2s, bleu3s, bleu4s))
                    print 'Valid BLEU (2, 3, 4): ' + ' '.join([str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)])

                if options['feature_match']:
                    f_update_ss(x, z)

            logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    logger.info('The code run for {} epochs, with {} sec/epochs'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return history_cost

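
# The logging above turns a matrix of generated word indices (temp_out[2],
# one row per generated sentence) into text by looking each index up in
# ixtoword and dropping index 0, which is treated as padding / end of
# sentence.  A small helper capturing that pattern; the name and the
# "stop at 0" convention are taken from the usage above, not from the
# original code.
def indices_to_sentences(pred_idx, ixtoword):
    sentences = []
    for row in pred_idx:
        words = [ixtoword[ix] for ix in row if ix != 0]
        sentences.append(" ".join(words))
    return sentences

# e.g.: logger.info('Generated: {}'.format(indices_to_sentences(temp_out[2], ixtoword)[0]))
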