def test():
    from args import conf

    data = SNLI(conf)
    setattr(conf, 'char_vocab_size', len(data.char_vocab))
    setattr(conf, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(conf, 'class_size', len(data.LABEL.vocab))
    setattr(conf, 'max_word_len', data.max_word_len)

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load('results/baseline.pt'))
    model = model.to(conf.device)

    _, acc = evaluate(model, conf, data)
    print(f'test acc: {acc:.3f}')
def main():
    from args import conf

    print('loading SNLI data...')
    data = SNLI(conf)
    setattr(conf, 'char_vocab_size', len(data.char_vocab))
    setattr(conf, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(conf, 'class_size', len(data.LABEL.vocab))
    setattr(conf, 'max_word_len', data.max_word_len)
    setattr(conf, 'model_time', strftime('%H:%M:%S', gmtime()))

    print('training start!')
    best_model = train(conf, data)

    if not os.path.exists('results'):
        os.makedirs('results')
    torch.save(best_model.state_dict(), 'results/baseline.pt')
    print('training finished!')
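Both entry points above call an evaluate helper that is defined elsewhere in the repository. A minimal sketch of what it plausibly does, assuming the (model, conf, data) signature seen above and standard torchtext-style batches; the helper name is real, everything inside it is an assumption:

import torch
import torch.nn as nn

def evaluate(model, conf, data):
    # assumption: average cross-entropy loss and accuracy over data.dev_iter
    model.eval()
    criterion = nn.CrossEntropyLoss(reduction='sum')
    loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for batch in data.dev_iter:
            output = model(batch.premise, batch.hypothesis)
            loss += criterion(output, batch.label).item()
            correct += (output.argmax(1) == batch.label).sum().item()
            total += batch.label.shape[0]
    return loss / total, correct / total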
def test():
    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load('results/baseline.pt'))
    model.word_emb.weight.requires_grad = True
    model = model.to(conf.device).eval()

    batch = next(iter(data.dev_iter))
    output = F.softmax(model(batch.premise, batch.hypothesis), 1)
    original_scores, original_predictions = torch.max(output, 1)
    original_scores = original_scores.detach().cpu().numpy()
    original_predictions = original_predictions.detach().cpu().numpy()

    reduced, removed_indices = get_rawr(
        model, batch,
        max_beam_size=rawr_conf.max_beam_size,
        conf_threshold=rawr_conf.conf_threshold,
        p_not_h=False,
    )
    reduced_hypothesis = padding_tensor(
        [torch.LongTensor(r[0]) for r in reduced])
    reduced_hypothesis = reduced_hypothesis.to(conf.device)

    # score the reduced hypotheses; the original code re-ran the model on
    # batch.hypothesis here, which made the final check trivially True
    output = F.softmax(model(batch.premise, reduced_hypothesis), 1)
    reduced_scores, reduced_predictions = torch.max(output, 1)
    reduced_scores = reduced_scores.detach().cpu().numpy()
    reduced_predictions = reduced_predictions.detach().cpu().numpy()
    print(all(reduced_predictions == original_predictions))
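padding_tensor is not defined in this snippet. A plausible stand-in that right-pads a list of 1-D index tensors with zeros to a common length; the pad index is an assumption:

import torch

def padding_tensor(sequences, pad_idx=0):
    # sequences: list of 1-D LongTensors of varying length (pad_idx assumed 0)
    max_len = max(s.shape[0] for s in sequences)
    out = torch.full((len(sequences), max_len), pad_idx, dtype=torch.long)
    for i, s in enumerate(sequences):
        out[i, :s.shape[0]] = s
    return out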
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")

    ########### sentence embedding encoder ###########
    """
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20), 'int32'),
         numpy.zeros((BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 20), dtype='int32'),
         numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    """

    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
         numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 18), dtype='int32'),
         numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
         numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 16), dtype='int32'),
         numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # ATTEND
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(
        l_hypo_embed, p=DPOUT, rescale=True)
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout, num_units=EMBDHIDA,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(
        l_hypo_embed_hid1, p=DPOUT, rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout, num_units=EMBDHIDB,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_embed_dpout = lasagne.layers.DropoutLayer(
        l_prem_embed, p=DPOUT, rescale=True)
    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout, num_units=EMBDHIDA,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(
        l_prem_embed_hid1, p=DPOUT, rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout, num_units=EMBDHIDB,
        nonlinearity=lasagne.nonlinearities.rectify)

    # output dim: (BSIZE, NROWx, NROWy)
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid2, l_prem_embed_hid2])

    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 4*LSTMHID)
    l_hypo_premwtd = lasagne.layers.ConcatLayer(
        [l_hypo_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer(
        [l_prem_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(
        l_hypo_premwtd, p=DPOUT, rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout, num_units=COMPHIDA,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(
        l_hypo_comphid1, p=DPOUT, rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout, num_units=COMPHIDB,
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(
        l_prem_hypowtd, p=DPOUT, rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout, num_units=COMPHIDA,
        W=l_hypo_comphid1.W, b=l_hypo_comphid1.b,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(
        l_prem_comphid1, p=DPOUT, rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout, num_units=COMPHIDB,
        W=l_hypo_comphid2.W, b=l_hypo_comphid2.b,
        nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, 4*LSTMHID)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)
    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)

    l_outhid = lasagne.layers.DenseLayer(
        l_v1v2_dpout, num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(
        l_outhid, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout, num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output, deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    cost = T.mean(T.nnet.categorical_crossentropy(
        network_output, target_values))
    cost_clean = T.mean(T.nnet.categorical_crossentropy(
        network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, error_rate], updates=updates)
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, error_rate_clean])
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    print("Done. Evaluating scratch model ...")
    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(
                    train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    print("***dev cost %f, error %f" %
                          (dev_set_cost, dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
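The cost/error bookkeeping above relies on the incremental-mean identity m_k = (1 - 1/k) * m_{k-1} + x_k / k, so train_set_cost is always the exact mean over the batches seen so far. A quick standalone check of the identity:

import numpy

def running_mean(xs):
    # incremental mean, same update rule as the training loops above
    m = 0.0
    for k, x in enumerate(xs, 1):
        m = (1.0 - 1.0 / k) * m + (1.0 / k) * x
    return m

xs = numpy.random.rand(1000)
assert abs(running_mean(xs) - xs.mean()) < 1e-9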
def main():
    from args import conf, tune_conf
    parser = argparse.ArgumentParser()
    parser.add_argument('--baseline', default='results/baseline.pt')
    parser.add_argument(
        '--ent-train',
        default='/scratch0/shifeng/rawr/new_snli/rawr.train.pkl')
    parser.add_argument(
        '--ent-dev',
        default='/scratch0/shifeng/rawr/new_snli/rawr.dev.pkl')
    args = parser.parse_args()
    out_dir = prepare_output_dir(args, args.root_dir)

    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(out_dir, 'output.log'))
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(message)s',
                                  datefmt='%m/%d/%Y %I:%M:%S')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    log.addHandler(fh)
    log.addHandler(ch)
    log.info('===== {} ====='.format(out_dir))

    ''' load regular data '''
    log.info('loading regular training data')
    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len

    log.info('loading entropy dev data {}'.format(tune_conf.ent_dev))
    with open(tune_conf.ent_dev, 'rb') as f:
        ent_dev = pickle.load(f)
    if isinstance(ent_dev[0], list):
        ent_dev = list(itertools.chain(*ent_dev))
    log.info('{} entropy dev examples'.format(len(ent_dev)))
    ent_dev = [[x['data']['premise'],
                x['data']['hypothesis'],
                x['data']['label']] for x in ent_dev]

    log.info('loading entropy training data {}'.format(tune_conf.ent_train))
    with open(tune_conf.ent_train, 'rb') as f:
        ent_train = pickle.load(f)
    if isinstance(ent_train[0], list):
        ent_train = list(itertools.chain(*ent_train))
    log.info('{} entropy training examples'.format(len(ent_train)))
    ent_train = [[x['data']['premise'],
                  x['data']['hypothesis'],
                  x['data']['label']] for x in ent_train]

    train_ent_batches = batchify(ent_train, tune_conf.batch_size)
    log.info('{} entropy training batches'.format(len(train_ent_batches)))

    log.info('loading model from {}'.format(args.baseline))
    model = BIMPM(conf, data)
    model.load_state_dict(torch.load(args.baseline))
    # model.word_emb.weight.requires_grad = True
    model.cuda(conf.gpu)

    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adam(parameters, lr=tune_conf.lr)
    ent_optimizer = optim.Adam(parameters, lr=tune_conf.ent_lr)
    criterion = nn.CrossEntropyLoss()

    init_loss, init_acc = evaluate(model, data.dev_iter)
    log.info("initial loss {:.4f} accuracy {:.4f}".format(init_loss, init_acc))
    best_acc = init_acc

    dev_ent_batches = batchify(ent_dev, tune_conf.batch_size)
    init_ent, init_ent_acc = evaluate_ent(model, dev_ent_batches)
    log.info("initial entropy {:.4f} ent_acc {:.4f}".format(
        init_ent, init_ent_acc))

    epoch = 0
    i_ent, i_mle = 0, 0  # number of examples
    train_loss, train_ent = 0, 0
    train_mle_iter = iter(data.train_iter)
    train_ent_iter = iter(train_ent_batches)
    while True:
        model.train()
        # maximize prediction entropy on reduced examples
        for i in range(tune_conf.n_ent):
            try:
                prem, hypo, label = next(train_ent_iter)
            except StopIteration:
                random.shuffle(train_ent_batches)
                train_ent_iter = iter(train_ent_batches)
                i_ent = 0
                train_ent = 0
                break
            output = forward(model, prem, hypo, conf.max_sent_len)
            output = F.softmax(output, 1)
            ent = entropy(output).sum()
            train_ent += ent.data.cpu().numpy()[0]
            loss = -tune_conf.gamma * ent
            ent_optimizer.zero_grad()
            loss.backward()
            ent_optimizer.step()
            i_ent += prem.shape[0]

        # regular maximum-likelihood updates on the original training data
        end_of_epoch = False
        for i in range(tune_conf.n_mle):
            if i_mle >= len(data.train_iter):
                epoch += 1
                end_of_epoch = True
                data.train_iter.init_epoch()
                train_mle_iter = iter(data.train_iter)
                i_mle = 0
                train_loss = 0
                break
            batch = next(train_mle_iter)
            output = forward(model, batch.premise, batch.hypothesis,
                             conf.max_sent_len)
            loss = criterion(output, batch.label)
            train_loss += loss.data.cpu().numpy()[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            i_mle += batch.premise.shape[0]

        if i_mle % 1000 == 0:
            _loss = train_loss / i_mle if i_mle != 0 else 0
            _ent = train_ent / i_ent if i_ent != 0 else 0
            log.info('epoch [{:2}] [{} / {}] loss[{:.5f}] entropy[{:.5f}]'.format(
                epoch, i_mle, len(data.train_iter), _loss, _ent))

        if end_of_epoch or i_mle % 1e5 == 0:
            dev_loss, dev_acc = evaluate(model, data.dev_iter)
            dev_ent, dev_ent_acc = evaluate_ent(model, dev_ent_batches)
            log.info("dev acc: {:.4f} ent: {:.4f} ent_acc: {:.4f}".format(
                dev_acc, dev_ent, dev_ent_acc))
            model_path = os.path.join(out_dir,
                                      'checkpoint_epoch_{}.pt'.format(epoch))
            torch.save(model.state_dict(), model_path)
            if dev_acc > best_acc:
                best_acc = dev_acc
                model_path = os.path.join(out_dir, 'best_model.pt')
                torch.save(model.state_dict(), model_path)
                log.info("best model saved {}".format(dev_acc))

        if epoch > 40:
            break
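The entropy helper that the fine-tuning loop maximizes is not shown. A minimal sketch consistent with how it is called (softmax probabilities in, per-example Shannon entropy out); the epsilon guard is an assumption:

import torch

def entropy(p, eps=1e-12):
    # p: (batch, classes) softmax outputs; returns per-example entropy.
    # eps guards against log(0); the actual implementation may differ.
    return -(p * (p + eps).log()).sum(1)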
def main(num_epochs=NUM_EPOCHS):
    print("Loading data ...")
    snli = SNLI(batch_size=BATCH_SIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")

    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    # (the hard-coded 50s in the test values assume BATCH_SIZE == 50)
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 20), 'int32'),
         numpy.zeros((50, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                     input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack(
        (numpy.ones((50, 20), dtype='int32'),
         numpy.zeros((50, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_mask)

    # output shape (BATCH_SIZE, None, WE_DIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)  # how to set it to be non-trainable?

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False, grad_clipping=GRAD_CLIP)
    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False, grad_clipping=GRAD_CLIP, backwards=True)

    # output dim: (BATCH_SIZE, None, 2*LSTM_HIDDEN)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # Attention mechanism to get sentence embedding
    # output dim: (BATCH_SIZE, None, ATTENTION_HIDDEN)
    l_ws1 = DenseLayer3DInput(l_concat, num_units=ATTENTION_HIDDEN)
    # output dim: (BATCH_SIZE, None, N_ROWS)
    l_ws2 = DenseLayer3DInput(l_ws1, num_units=N_ROWS, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 18), 'int32'),
         numpy.zeros((50, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((50, 18), dtype='int32'),
         numpy.zeros((50, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 16), 'int32'),
         numpy.zeros((50, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((50, 16), dtype='int32'),
         numpy.zeros((50, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_h.input_var, l_mask: l_mask_h.input_var})
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_p.input_var, l_mask: l_mask_p.input_var})

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(
        shape=(BATCH_SIZE, N_ROWS, 2 * LSTM_HIDDEN),
        input_var=hypothesis_embedding)
    l_pre_embed = lasagne.layers.InputLayer(
        shape=(BATCH_SIZE, N_ROWS, 2 * LSTM_HIDDEN),
        input_var=premise_embedding)

    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_factors = GatedEncoder3D([l_hypo_embed, l_pre_embed],
                               num_hfactors=2 * LSTM_HIDDEN)

    # Dropout:
    l_factors_noise = lasagne.layers.DropoutLayer(l_factors, p=GAEREG,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_noise, num_units=OUT_HIDDEN,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Dropout:
    l_outhid_noise = lasagne.layers.DropoutLayer(l_outhid, p=GAEREG,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_noise, num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * 50, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)

    # penalty term and cost
    attention_penalty = T.mean(
        (T.batched_dot(
            hypothesis_annotation,
            # pay attention to this line:
            # T.extra_ops.cpu_contiguous(hypothesis_annotation.dimshuffle(0, 2, 1))
            hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1)) ** 2,
        axis=(0, 1, 2)
    ) + T.mean(
        (T.batched_dot(
            premise_annotation,
            # T.extra_ops.cpu_contiguous(premise_annotation.dimshuffle(0, 2, 1))  # ditto.
            premise_annotation.dimshuffle(0, 2, 1)) -
         T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1)) ** 2,
        axis=(0, 1, 2))

    cost = T.mean(T.nnet.categorical_crossentropy(network_output,
                                                  target_values) +
                  ATTENTION_PENALTY * attention_penalty)
    cost_clean = T.mean(T.nnet.categorical_crossentropy(network_output_clean,
                                                        target_values) +
                        ATTENTION_PENALTY * attention_penalty)

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
        lasagne.layers.get_all_params(l_sentence_embedding)
    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}".format(numparams))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # withoutwe_params = all_params + [l_word_embed.W]

    # Compute updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, error_rate], updates=updates)
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, error_rate_clean])

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(
                    train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if batches_seen % 100 == 0:
                    end = time.time()
                    # the original passed LEARNING_RATE and end - start in the
                    # wrong order for this format string; fixed here
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BATCH_SIZE, end - start,
                           LEARNING_RATE, train_set_cost, train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    test_set_cost, test_set_error = evaluate('test')
                    print("***dev cost %f, error %f" %
                          (dev_set_cost, dev_set_error))
                    print("***test cost %f, error %f" %
                          (test_set_cost, test_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            # load params
            # all_param_values = cPickle.load(open('params' + os.sep + 'params_' + filename, 'rb'))
            # for p, v in zip(all_params, all_param_values):
            #     p.set_value(v)

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
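The attention_penalty term above is the ||A A^T - I||^2 regularizer used with self-attentive sentence embeddings, averaged over the batch and all matrix entries; it pushes the attention rows toward being orthogonal. An equivalent numpy expression for a single annotation matrix, for reference:

import numpy

def attention_penalty_np(A):
    # A: (N_ROWS, seq_len) row-stochastic annotation matrix for one sentence
    AAT = A @ A.T
    return ((AAT - numpy.eye(A.shape[0])) ** 2).mean()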
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    # normalize each word vector to (approximately) unit L2 norm
    W_word_embedding = snli.weight / \
        (numpy.linalg.norm(snli.weight, axis=1).reshape(
            snli.weight.shape[0], 1) + 0.00001)
    del snli

    print("Building network ...")

    ########### input layers ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
         numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 18), dtype='int32'),
         numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
         numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 16), dtype='int32'),
         numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)
    ###################################

    # output shape (BSIZE, None, WEDIM)
    l_hypo_embed = lasagne.layers.EmbeddingLayer(
        l_in_h,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)
    l_prem_embed = lasagne.layers.EmbeddingLayer(
        l_in_p,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=l_hypo_embed.W)

    # EMBEDDING MAPPING: output shape (BSIZE, None, WEMAP)
    l_hypo_reduced_embed = DenseLayer3DInput(
        l_hypo_embed, num_units=WEMAP,
        W=init.Normal(), b=init.Constant(0.), nonlinearity=None)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(
        l_hypo_reduced_embed, p=DPOUT, rescale=True)
    l_prem_reduced_embed = DenseLayer3DInput(
        l_prem_embed, num_units=WEMAP,
        W=init.Normal(), b=init.Constant(0.), nonlinearity=None)
    l_prem_embed_dpout = lasagne.layers.DropoutLayer(
        l_prem_reduced_embed, p=DPOUT, rescale=True)

    # ATTEND
    l_hypo_embed_hid1 = DenseLayer3DInput(
        l_hypo_embed_dpout, num_units=EMBDHIDA,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_embed_hid1_dpout = lasagne.layers.DropoutLayer(
        l_hypo_embed_hid1, p=DPOUT, rescale=True)
    l_hypo_embed_hid2 = DenseLayer3DInput(
        l_hypo_embed_hid1_dpout, num_units=EMBDHIDB,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1 = DenseLayer3DInput(
        l_prem_embed_dpout, num_units=EMBDHIDA,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_embed_hid1_dpout = lasagne.layers.DropoutLayer(
        l_prem_embed_hid1, p=DPOUT, rescale=True)
    l_prem_embed_hid2 = DenseLayer3DInput(
        l_prem_embed_hid1_dpout, num_units=EMBDHIDB,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    # output dim: (BSIZE, NROWx, NROWy)
    # note: the original passed l_hypo_embed_hid1 here; using the second
    # hidden layer mirrors the premise branch and is almost certainly intended
    l_e = ComputeEmbeddingPool([l_hypo_embed_hid2, l_prem_embed_hid2])

    # output dim: (BSIZE, NROWy, DIM)
    l_hypo_weighted = AttendOnEmbedding([l_hypo_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='col')
    # output dim: (BSIZE, NROWx, DIM)
    l_prem_weighted = AttendOnEmbedding([l_prem_reduced_embed, l_e],
                                        masks=[l_mask_h, l_mask_p],
                                        direction='row')

    # COMPARE
    # output dim: (BSIZE, NROW, 4*LSTMHID)
    l_hypo_premwtd = lasagne.layers.ConcatLayer(
        [l_hypo_reduced_embed, l_prem_weighted], axis=2)
    l_prem_hypowtd = lasagne.layers.ConcatLayer(
        [l_prem_reduced_embed, l_hypo_weighted], axis=2)

    l_hypo_premwtd_dpout = lasagne.layers.DropoutLayer(
        l_hypo_premwtd, p=DPOUT, rescale=True)
    l_hypo_comphid1 = DenseLayer3DInput(
        l_hypo_premwtd_dpout, num_units=COMPHIDA,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_hypo_comphid1_dpout = lasagne.layers.DropoutLayer(
        l_hypo_comphid1, p=DPOUT, rescale=True)
    l_hypo_comphid2 = DenseLayer3DInput(
        l_hypo_comphid1_dpout, num_units=COMPHIDB,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    l_prem_hypowtd_dpout = lasagne.layers.DropoutLayer(
        l_prem_hypowtd, p=DPOUT, rescale=True)
    l_prem_comphid1 = DenseLayer3DInput(
        l_prem_hypowtd_dpout, num_units=COMPHIDA,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_prem_comphid1_dpout = lasagne.layers.DropoutLayer(
        l_prem_comphid1, p=DPOUT, rescale=True)
    l_prem_comphid2 = DenseLayer3DInput(
        l_prem_comphid1_dpout, num_units=COMPHIDB,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)

    # AGGREGATE
    # output dim: (BSIZE, 4*LSTMHID)
    l_hypo_mean = MeanOverDim(l_hypo_comphid2, mask=l_mask_h, dim=1)
    l_prem_mean = MeanOverDim(l_prem_comphid2, mask=l_mask_p, dim=1)

    l_v1v2 = lasagne.layers.ConcatLayer([l_hypo_mean, l_prem_mean], axis=1)
    l_v1v2_dpout = lasagne.layers.DropoutLayer(l_v1v2, p=DPOUT, rescale=True)

    l_outhid1 = lasagne.layers.DenseLayer(
        l_v1v2_dpout, num_units=OUTHID,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid1_dpout = lasagne.layers.DropoutLayer(
        l_outhid1, p=DPOUT, rescale=True)
    l_outhid2 = lasagne.layers.DenseLayer(
        l_outhid1_dpout, num_units=OUTHID,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.rectify)
    # l_outhid2_dpout = lasagne.layers.DropoutLayer(l_outhid2, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid2, num_units=3,
        W=init.Normal(), b=init.Constant(0.),
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output, deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean, target_values))

    cost = T.mean(T.nnet.categorical_crossentropy(
        network_output, target_values))
    cost_clean = T.mean(T.nnet.categorical_crossentropy(
        network_output_clean, target_values))

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_hypo_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, error_rate], updates=updates)
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, error_rate_clean])
        # mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))

    def evaluate(mode):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                             1.0 / batches_seen * _error

        return set_cost, set_error_rate

    print("Done. Evaluating scratch model ...")
    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f" %
          (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(
                    train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                                  1.0 / batches_seen * _error
                if (batches_seen * BSIZE) % 5000 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_error))
                    start = end

                if (batches_seen * BSIZE) % 100000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    print("***dev cost %f, error %f" %
                          (dev_set_cost, dev_set_error))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
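MeanOverDim is a custom layer; given the mask argument it presumably computes a masked mean over the time dimension. A numpy sketch of the assumed computation:

import numpy

def mean_over_dim(x, mask):
    # x: (batch, seq_len, dim); mask: (batch, seq_len) of 0/1.
    # assumption: average only over unmasked positions
    return (x * mask[:, :, None]).sum(axis=1) / \
        mask.sum(axis=1, keepdims=True)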
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fold', required=True)
    parser.add_argument('--baseline', default='results/baseline.pt')
    parser.add_argument('--pnoth', default=False, action='store_true',
                        help='reduce premise instead of hypothesis')
    parser.add_argument('--truth', default=False, action='store_true',
                        help='use label instead of prediction as target')
    args = parser.parse_args()

    data = SNLI(conf)
    conf.char_vocab_size = len(data.char_vocab)
    conf.word_vocab_size = len(data.TEXT.vocab)
    conf.class_size = len(data.LABEL.vocab)
    conf.max_word_len = data.max_word_len
    q_vocab = data.TEXT.vocab.itos
    a_vocab = data.LABEL.vocab.itos

    out_dir = prepare_output_dir(conf, 'results', 'rawr')
    print('Generating [{}] rawr data from [{}].'.format(
        args.fold, args.baseline))
    print(out_dir)

    model = BIMPM(conf, data)
    model.load_state_dict(torch.load(args.baseline))
    model.word_emb.weight.requires_grad = True
    model.to(conf.device)

    datasets = {'train': data.train_iter, 'dev': data.dev_iter}

    if args.pnoth:
        fname = 'rawr.{}.premise.pkl'.format(args.fold)
    else:
        fname = 'rawr.{}.hypothesis.pkl'.format(args.fold)

    checkpoint = []
    for batch_i, batch in enumerate(tqdm(datasets[args.fold])):
        if batch_i >= len(datasets[args.fold]):
            # otherwise train iter will loop forever!
            # (>= rather than the original > avoids one duplicated batch)
            break
        batch_size = batch.hypothesis.shape[0]
        model.eval()
        output = F.softmax(model(batch.premise, batch.hypothesis), 1)
        original_scores, original_predictions = torch.max(output, 1)
        original_scores = original_scores.detach().cpu().numpy()
        original_predictions = original_predictions.detach().cpu().numpy()
        batch_cpu = Batch(batch.premise.data.cpu(),
                          batch.hypothesis.data.cpu(),
                          batch.label.data.cpu())

        reduced, removed_indices = get_rawr(
            model, batch,
            max_beam_size=rawr_conf.max_beam_size,
            conf_threshold=rawr_conf.conf_threshold,
            p_not_h=args.pnoth)

        for i in range(batch_size):
            og = {
                'premise': batch_cpu.premise[i],
                'hypothesis': batch_cpu.hypothesis[i],
                'premise_readable': to_text(batch_cpu.premise[i], q_vocab),
                'hypothesis_readable': to_text(batch_cpu.hypothesis[i], q_vocab),
                'prediction': original_predictions[i],
                'prediction_readable': a_vocab[original_predictions[i]],
                'score': original_scores[i],
                'label': batch_cpu.label[i],
                'label_readable': a_vocab[batch_cpu.label[i]],
            }
            checkpoint.append({'original': og, 'reduced': []})

            s1 = batch.hypothesis[i] if args.pnoth else batch.premise[i]
            s1 = s1.to(conf.device)
            for j, s2 in enumerate(reduced[i]):
                s2 = torch.LongTensor(s2).to(conf.device)
                model.eval()
                if args.pnoth:
                    output = model(s2.unsqueeze(0), s1.unsqueeze(0))
                else:
                    output = model(s1.unsqueeze(0), s2.unsqueeze(0))
                output = F.softmax(output, 1)
                pred_scores, pred = torch.max(output, 1)
                pred = pred.detach().cpu().numpy()[0]
                pred_scores = pred_scores.detach().cpu().numpy()[0]
                if args.pnoth:
                    hypo, prem = s1.cpu(), s2.cpu()
                else:
                    prem, hypo = s1.cpu(), s2.cpu()
                checkpoint[-1]['reduced'].append({
                    'premise': prem,
                    'hypothesis': hypo,
                    'premise_readable': to_text(prem, q_vocab),
                    'hypothesis_readable': to_text(hypo, q_vocab),
                    'prediction': pred,
                    'prediction_readable': a_vocab[pred],
                    'score': pred_scores,
                    'label': batch_cpu.label[i],
                    'label_readable': a_vocab[batch_cpu.label[i]],
                    'removed_indices': removed_indices[i][j],
                    'which_reduced': 'premise' if args.pnoth else 'hypothesis',
                })

        if batch_i % 1000 == 0 and batch_i > 0:
            out_path = os.path.join(out_dir, '{}.{}'.format(fname, batch_i))
            with open(out_path, 'wb') as f:
                pickle.dump(checkpoint, f)
            checkpoint = []

    if len(checkpoint) > 0:
        out_path = os.path.join(out_dir, '{}.{}'.format(fname, batch_i))
        with open(out_path, 'wb') as f:
            pickle.dump(checkpoint, f)
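to_text is assumed to detokenize a tensor of word indices through the vocabulary's itos list; a minimal stand-in (the name is real, the body is a guess, and real code might also skip pad tokens):

def to_text(indices, itos):
    # indices: 1-D tensor/sequence of vocabulary ids; itos: index-to-string list
    return ' '.join(itos[int(i)] for i in indices)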
def main():
    from args import conf
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', default='results/rawr.train.hypothesis.pkl')
    parser.add_argument('--dev', default='results/rawr.dev.hypothesis.pkl')
    parser.add_argument('--truth', default=False, action='store_true',
                        help='use label instead of prediction as target')
    parser.add_argument('--ogdev', default=False, action='store_true',
                        help='use original dev set instead of reduced')
    parser.add_argument('--full', default=0, type=float,
                        help='amount of full examples to include')
    args = parser.parse_args()
    conf.train_data = args.train
    conf.dev_data = args.dev

    print('loading regular data...')
    regular_data = SNLI(conf)
    conf.char_vocab_size = len(regular_data.char_vocab)
    conf.word_vocab_size = len(regular_data.TEXT.vocab)
    conf.class_size = len(regular_data.LABEL.vocab)
    conf.max_word_len = regular_data.max_word_len
    conf.out_dir = prepare_output_dir(conf, 'results', 'reduced')

    print('loading reduced data from [{}]'.format(conf.train_data))
    with open(conf.train_data, 'rb') as f:
        train = pickle.load(f)
    print('loading reduced data from [{}]'.format(conf.dev_data))
    with open(conf.dev_data, 'rb') as f:
        dev = pickle.load(f)

    train_label = 'label' if args.truth else 'prediction'
    train = [(x['premise'], x['hypothesis'], ex['original'][train_label])
             for ex in train for x in ex['reduced']]
    # dev = [(x['premise'], x['hypothesis'], x['label'])
    #        for ex in dev for x in ex['reduced']]
    dev = [(x['premise'], x['hypothesis'], x['label'])
           for ex in dev for x in ex['reduced'][:1]]

    train_batches = batchify(train, conf.batch_size)
    if args.full > 0:
        n_examples = int(len(regular_data.train_iter) * args.full)
        print('use {} ({}) full training data'.format(
            n_examples * conf.batch_size, args.full))
        full_batches = []
        for j, x in enumerate(regular_data.train_iter):
            if j > n_examples:
                break
            full_batches.append((x.premise, x.hypothesis, x.label))
        # train_batches += full_batches
        train_batches = full_batches
    print(len(train_batches))

    if args.ogdev:
        dev_batches = list(regular_data.dev_iter)
        dev_batches = [(x.premise, x.hypothesis, x.label)
                       for x in dev_batches]
    else:
        # the original batchified `train` here; `dev` matches the
        # surrounding logic and the --ogdev help text
        dev_batches = batchify(dev, conf.batch_size)

    model = BIMPM(conf, regular_data)
    if conf.gpu > -1:
        model.cuda(conf.gpu)

    print('begin training')
    best_model = train_reduced(model, train_batches, dev_batches, conf)

    torch.save(best_model.state_dict(),
               os.path.join(conf.out_dir, 'best.pt'))
    print('training finished!')
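batchify is defined elsewhere in the repository; a plausible sketch that chunks (premise, hypothesis, label) triples into padded batches using torch's pad_sequence. The zero-padding convention is an assumption:

import torch
from torch.nn.utils.rnn import pad_sequence

def batchify(examples, batch_size):
    # examples: list of (premise, hypothesis, label) with index tensors
    batches = []
    for i in range(0, len(examples), batch_size):
        prem, hypo, label = zip(*examples[i:i + batch_size])
        batches.append((
            pad_sequence([torch.as_tensor(p) for p in prem], batch_first=True),
            pad_sequence([torch.as_tensor(h) for h in hypo], batch_first=True),
            torch.as_tensor([int(l) for l in label])))
    return batches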
def main(num_epochs=NEPOCH):
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")

    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20), 'int32'),
         numpy.zeros((BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 20), dtype='int32'),
         numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False, grad_clipping=GCLIP)
    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False, grad_clipping=GCLIP, backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
    l_concat_dpout = lasagne.layers.DropoutLayer(
        l_concat, p=DPOUT, rescale=True)  # might not need this line

    # Attention mechanism to get sentence embedding
    # output dim: (BSIZE, None, ATTHID)
    l_ws1 = DenseLayer3DInput(l_concat_dpout, num_units=ATTHID)
    l_ws1_dpout = lasagne.layers.DropoutLayer(l_ws1, p=DPOUT, rescale=True)
    # output dim: (BSIZE, None, NROW)
    l_ws2 = DenseLayer3DInput(l_ws1_dpout, num_units=NROW, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BSIZE, 2*LSTMHID, NROW)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
         numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 18), dtype='int32'),
         numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
         numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 16), dtype='int32'),
         numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)

    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_h.input_var, l_mask: l_mask_h.input_var})
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_p.input_var, l_mask: l_mask_p.input_var})

    hypothesis_embedding_clean, hypothesis_annotation_clean = \
        lasagne.layers.get_output(
            [l_sentence_embedding, l_annotations],
            {l_in: l_in_h.input_var, l_mask: l_mask_h.input_var},
            deterministic=True)
    premise_embedding_clean, premise_annotation_clean = \
        lasagne.layers.get_output(
            [l_sentence_embedding, l_annotations],
            {l_in: l_in_p.input_var, l_mask: l_mask_p.input_var},
            deterministic=True)

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(
        shape=(BSIZE, NROW, 2 * LSTMHID),
        input_var=hypothesis_embedding)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(
        l_hypo_embed, p=DPOUT, rescale=True)
    l_pre_embed = lasagne.layers.InputLayer(
        shape=(BSIZE, NROW, 2 * LSTMHID),
        input_var=premise_embedding)
    l_pre_embed_dpout = lasagne.layers.DropoutLayer(
        l_pre_embed, p=DPOUT, rescale=True)

    # output dim: (BSIZE, NROW, 2*LSTMHID)
    l_factors = GatedEncoder3D([l_hypo_embed_dpout, l_pre_embed_dpout],
                               num_hfactors=2 * LSTMHID)
    l_factors_dpout = lasagne.layers.DropoutLayer(
        l_factors, p=DPOUT, rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_dpout, num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(
        l_outhid, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout, num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output,
        {l_hypo_embed: hypothesis_embedding_clean,
         l_pre_embed: premise_embedding_clean},
        deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    # penalty term and cost
    attention_penalty = T.mean(
        (T.batched_dot(hypothesis_annotation,
                       hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1)) ** 2,
        axis=(0, 1, 2)
    ) + T.mean(
        (T.batched_dot(premise_annotation,
                       premise_annotation.dimshuffle(0, 2, 1)) -
         T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1)) ** 2,
        axis=(0, 1, 2))

    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() +
               (l_forward.W_hid_to_ingate ** 2).sum() +
               (l_forward.W_in_to_forgetgate ** 2).sum() +
               (l_forward.W_hid_to_forgetgate ** 2).sum() +
               (l_forward.W_in_to_cell ** 2).sum() +
               (l_forward.W_hid_to_cell ** 2).sum() +
               (l_forward.W_in_to_outgate ** 2).sum() +
               (l_forward.W_hid_to_outgate ** 2).sum() +
               (l_backward.W_in_to_ingate ** 2).sum() +
               (l_backward.W_hid_to_ingate ** 2).sum() +
               (l_backward.W_in_to_forgetgate ** 2).sum() +
               (l_backward.W_hid_to_forgetgate ** 2).sum() +
               (l_backward.W_in_to_cell ** 2).sum() +
               (l_backward.W_hid_to_cell ** 2).sum() +
               (l_backward.W_in_to_outgate ** 2).sum() +
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_attention = (l_ws1.W ** 2).sum() + (l_ws2.W ** 2).sum()
    L2_gae = (l_factors.Wxf ** 2).sum() + (l_factors.Wyf ** 2).sum()
    L2_outputhid = (l_outhid.W ** 2).sum()
    L2_softmax = (l_output.W ** 2).sum()
    L2 = L2_lstm + L2_attention + L2_gae + L2_outputhid + L2_softmax

    cost = T.mean(T.nnet.categorical_crossentropy(
        network_output, target_values)) + L2REG * L2
    cost_clean = T.mean(T.nnet.categorical_crossentropy(
        network_output_clean, target_values)) + L2REG * L2
    if ATTPENALTY != 0.:
        cost = cost + ATTPENALTY * attention_penalty
        cost_clean = cost_clean + ATTPENALTY * attention_penalty

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
        lasagne.layers.get_all_params(l_sentence_embedding)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    numparams = sum(
        [numpy.prod(i) for i in [i.shape.eval() for i in all_params]])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        print("{0:24}{1:24}{2}".format(item, item.shape.eval(),
                                       numpy.prod(item.shape.eval())))

    # if exist param file then load params
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        all_param_values = cPickle.load(open(look_for, 'rb'))
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, accuracy], updates=updates)
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, accuracy_clean])
    predict = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var],
        network_prediction_clean)

    def evaluate(mode, verbose=False):
        if mode == 'dev':
            data = dev_batches
        if mode == 'test':
            data = test_batches

        set_cost = 0.
        set_accuracy = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                       1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                           1.0 / batches_seen * _accuracy

        if verbose:
            predicted = []
            truth = []
            for batches_seen, (hypo, hm, premise, pm, th) in enumerate(data, 1):
                predicted.append(predict(hypo, hm, premise, pm))
                truth.append(th)
            truth = numpy.concatenate(truth)
            predicted = numpy.concatenate(predicted)
            cm = confusion_matrix(truth, predicted)
            pr_a = cm.trace() * 1.0 / truth.size
            pr_e = ((cm.sum(axis=0) * 1.0 / truth.size) *
                    (cm.sum(axis=1) * 1.0 / truth.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    # the original print said "dev" here even though the test set is evaluated
    print("BEFORE TRAINING: test cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(
                    train_batches, 1):
                _cost, _accuracy = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                                 1.0 / batches_seen * _cost
                train_set_accuracy = (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                                     1.0 / batches_seen * _accuracy
                if batches_seen % 100 == 0:
                    end = time.time()
                    print("Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f" %
                          (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    print("***dev cost %f, accuracy %f" %
                          (dev_set_cost, dev_set_accuracy))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            cPickle.dump(
                all_param_values,
                open('params' + os.sep + 'params_' + filename + '.pkl', 'wb'))

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        pdb.set_trace()
        pass
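The verbose branch of evaluate() computes Cohen's kappa from the confusion matrix: pr_a = trace(cm)/N is the observed agreement, pr_e the chance agreement from the row/column marginals, and kappa = (pr_a - pr_e) / (1 - pr_e). The same computation as a standalone helper, for reference:

import numpy

def cohens_kappa(cm):
    # cm: square confusion matrix of counts
    n = cm.sum()
    pr_a = numpy.trace(cm) / n                      # observed agreement
    pr_e = ((cm.sum(axis=0) / n) *
            (cm.sum(axis=1) / n)).sum()             # chance agreement
    return (pr_a - pr_e) / (1 - pr_e)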