def __init__(self, input_size, hidden_size, num_layers, valid_num_children=None,
             attention=True, one_hot=False, embedding_size=256, dropout=False,
             binary_tree_lstm_cell=False):
    super(TreeEncoder, self).__init__()
    self.lstm_list = nn.ModuleList()
    self.one_hot = one_hot
    self.dropout = False
    if dropout:
        self.dropout = nn.Dropout(p=dropout)

    # First layer: its input is either the raw one-hot vectors or the embeddings.
    if one_hot:
        if binary_tree_lstm_cell:
            self.lstm_list.append(BinaryTreeLSTM(input_size, hidden_size))
        else:
            self.lstm_list.append(
                TreeLSTM(input_size, hidden_size, valid_num_children))
    else:
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm_list.append(
            TreeLSTM(embedding_size, hidden_size, valid_num_children))

    # All TreeLSTMs have input of hidden_size except the first.
    for i in range(num_layers - 1):
        if binary_tree_lstm_cell:
            self.lstm_list.append(BinaryTreeLSTM(hidden_size, hidden_size))
        else:
            self.lstm_list.append(
                TreeLSTM(hidden_size, hidden_size, valid_num_children))
    self.attention = attention
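# --- Hedged usage sketch (assumption, not from the original source): constructing an
# encoder with the __init__ above. The class name TreeEncoder comes from the super()
# call; all values below are made-up examples, and valid_num_children is left at its
# default because its expected format is defined by the TreeLSTM cell.
encoder = TreeEncoder(input_size=5000,    # vocabulary size fed to nn.Embedding
                      hidden_size=256,
                      num_layers=2,
                      attention=True,
                      one_hot=False,
                      embedding_size=256,
                      dropout=0.2)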
def __init__(self, config, weight_matrix, word_idx):
    self.config = config
    self.tree_lstm = tree_lstm = TreeLSTM(weight_matrix, word_idx, config)
    self.compiler = tree_lstm.compiler  # graph compiler for the tree LSTM
    self.encoder = None

    # Reloader for the tree LSTM.
    # NOTICE: MUST reload before any new Variable is defined.
    self.reloader = tf.train.Saver(tf.global_variables())

    # Placeholders.
    self.keep_prob_ph = tree_lstm.keep_prob_ph
    self.emb_dropout = tree_lstm.emb_dropout
    self.is_train = tf.placeholder_with_default(False, [], name='is_train')
    self.labels = tf.placeholder(tf.int32, [None, None], name='sentiment_label')
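# --- Hedged usage sketch (assumption, not from the original source). It only
# illustrates how the TF1 placeholders defined above could be fed during a training
# step. `model` (an instance of the class whose __init__ is shown above), `train_op`,
# `loss_op`, and `label_batch` are hypothetical names.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed_dict = {
        model.keep_prob_ph: 0.5,    # dropout keep probability during training
        model.emb_dropout: 0.9,     # embedding-level keep probability (assumed meaning)
        model.is_train: True,
        model.labels: label_batch,  # int32 array shaped [batch, nodes_per_tree]
    }
    _, loss_value = sess.run([train_op, loss_op], feed_dict=feed_dict)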
import os
import pickle

import numpy as np
import torch
import torch.optim as optim
import torch.utils.data as Data

# TreeLSTM and BatchedTree are assumed to be importable from the project's tree-LSTM module.

# Pickle files must be opened in binary mode.
train_input_file = open(os.path.join(os.getcwd(), "data", "tree_training.pkl"), "rb")
train_label = np.load(os.path.join(os.getcwd(), "data", "tree_training_label.npy"))
test_input_file = open(os.path.join(os.getcwd(), "data", "tree_testing.pkl"), "rb")
test_label = np.load(os.path.join(os.getcwd(), "data", "tree_testing_label.npy"))
train_data = pickle.load(train_input_file)
test_data = pickle.load(test_input_file)

train_data = BatchedTree(train_data)
test_data = BatchedTree(test_data)

batch_size = 8
model = TreeLSTM(x_size=100, h_size=25, dropout=0.3, cell_type='n_ary', n_ary=4)

# NOTE: this assumes the batched trees can be converted to tensors; otherwise a
# custom Dataset/collate_fn is needed.
train_set = Data.TensorDataset(torch.tensor(train_data), torch.tensor(train_label))
train_loader = Data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_set = Data.TensorDataset(torch.tensor(test_data), torch.tensor(test_label))
test_loader = Data.DataLoader(test_set, batch_size=batch_size, shuffle=True)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
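    # --- Hedged completion (assumption, not from the original source): the snippet
    # above ends at the `for epoch ...:` header, so this minimal body is only a sketch.
    # It assumes the model maps a batch of inputs to one logit row per tree; adapt to
    # the actual TreeLSTM/BatchedTree API if it differs.
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        logits = model(inputs)            # assumed forward signature
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
    print('epoch {:03d} | last training loss {:.4f}'.format(epoch, loss.item()))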
def main(args):
    np.random.seed(args.seed)
    th.manual_seed(args.seed)
    th.cuda.manual_seed(args.seed)
    best_epoch = -1
    best_dev_acc = 0

    cuda = args.gpu >= 0
    device = th.device('cuda:{}'.format(args.gpu)) if cuda else th.device('cpu')
    if cuda:
        th.cuda.set_device(args.gpu)

    trainset = data.SST()
    train_loader = DataLoader(dataset=trainset,
                              batch_size=args.batch_size,
                              collate_fn=data.SST.batcher(device),
                              shuffle=True,
                              num_workers=0)
    devset = data.SST(mode='dev')
    dev_loader = DataLoader(dataset=devset,
                            batch_size=100,
                            collate_fn=data.SST.batcher(device),
                            shuffle=False,
                            num_workers=0)
    testset = data.SST(mode='test')
    test_loader = DataLoader(dataset=testset,
                             batch_size=100,
                             collate_fn=data.SST.batcher(device),
                             shuffle=False,
                             num_workers=0)

    model = TreeLSTM(trainset.num_vocabs,
                     args.x_size,
                     args.h_size,
                     trainset.num_classes,
                     args.dropout,
                     cell_type='childsum' if args.child_sum else 'nary',
                     pretrained_emb=trainset.pretrained_emb).to(device)
    print(model)

    params_ex_emb = [
        x for x in list(model.parameters())
        if x.requires_grad and x.size(0) != trainset.num_vocabs
    ]
    params_emb = list(model.embedding.parameters())

    for p in params_ex_emb:
        if p.dim() > 1:
            INIT.xavier_uniform_(p)

    optimizer = optim.Adagrad([{
        'params': params_ex_emb,
        'lr': args.lr,
        'weight_decay': args.weight_decay
    }, {
        'params': params_emb,
        'lr': 0.1 * args.lr
    }])

    dur = []
    for epoch in range(args.epochs):
        t_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_loader):
            g = batch.graph
            n = g.number_of_nodes()
            h = th.zeros((n, args.h_size)).to(device)
            c = th.zeros((n, args.h_size)).to(device)
            if step >= 3:
                t0 = time.time()  # tik

            logits = model(batch, h, c)
            logp = F.log_softmax(logits, 1)
            loss = F.nll_loss(logp, batch.label, reduction='sum')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step >= 3:
                dur.append(time.time() - t0)  # tok

            if step > 0 and step % args.log_every == 0:
                pred = th.argmax(logits, 1)
                acc = th.sum(th.eq(batch.label, pred))
                root_ids = [
                    i for i in range(batch.graph.number_of_nodes())
                    if batch.graph.out_degree(i) == 0
                ]
                root_acc = np.sum(batch.label.cpu().data.numpy()[root_ids] ==
                                  pred.cpu().data.numpy()[root_ids])

                print(
                    "Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} | Root Acc {:.4f} | Time(s) {:.4f}"
                    .format(epoch, step, loss.item(),
                            1.0 * acc.item() / len(batch.label),
                            1.0 * root_acc / len(root_ids), np.mean(dur)))
        print('Epoch {:05d} training time {:.4f}s'.format(epoch, time.time() - t_epoch))

        # eval on dev set
        accs = []
        root_accs = []
        model.eval()
        for step, batch in enumerate(dev_loader):
            g = batch.graph
            n = g.number_of_nodes()
            with th.no_grad():
                h = th.zeros((n, args.h_size)).to(device)
                c = th.zeros((n, args.h_size)).to(device)
                logits = model(batch, h, c)

            pred = th.argmax(logits, 1)
            acc = th.sum(th.eq(batch.label, pred)).item()
            accs.append([acc, len(batch.label)])
            root_ids = [
                i for i in range(batch.graph.number_of_nodes())
                if batch.graph.out_degree(i) == 0
            ]
            root_acc = np.sum(batch.label.cpu().data.numpy()[root_ids] ==
                              pred.cpu().data.numpy()[root_ids])
            root_accs.append([root_acc, len(root_ids)])

        dev_acc = 1.0 * np.sum([x[0] for x in accs]) / np.sum([x[1] for x in accs])
        dev_root_acc = 1.0 * np.sum([x[0] for x in root_accs]) / np.sum(
            [x[1] for x in root_accs])
        print("Epoch {:05d} | Dev Acc {:.4f} | Root Acc {:.4f}".format(
            epoch, dev_acc, dev_root_acc))

        if dev_root_acc > best_dev_acc:
            best_dev_acc = dev_root_acc
            best_epoch = epoch
            th.save(model.state_dict(), 'best_{}.pkl'.format(args.seed))
        else:
            if best_epoch <= epoch - 10:
                break

        # lr decay
        for param_group in optimizer.param_groups:
            param_group['lr'] = max(1e-5, param_group['lr'] * 0.99)  # 10
            print(param_group['lr'])

    # test
    model.load_state_dict(th.load('best_{}.pkl'.format(args.seed)))
    accs = []
    root_accs = []
    model.eval()
    for step, batch in enumerate(test_loader):
        g = batch.graph
        n = g.number_of_nodes()
        with th.no_grad():
            h = th.zeros((n, args.h_size)).to(device)
            c = th.zeros((n, args.h_size)).to(device)
            logits = model(batch, h, c)

        pred = th.argmax(logits, 1)
        acc = th.sum(th.eq(batch.label, pred)).item()
        accs.append([acc, len(batch.label)])
        root_ids = [
            i for i in range(batch.graph.number_of_nodes())
            if batch.graph.out_degree(i) == 0
        ]
        root_acc = np.sum(batch.label.cpu().data.numpy()[root_ids] ==
                          pred.cpu().data.numpy()[root_ids])
        root_accs.append([root_acc, len(root_ids)])

    test_acc = 1.0 * np.sum([x[0] for x in accs]) / np.sum([x[1] for x in accs])
    test_root_acc = 1.0 * np.sum([x[0] for x in root_accs]) / np.sum(
        [x[1] for x in root_accs])
    print('------------------------------------------------------------------------------------')
    print("Epoch {:05d} | Test Acc {:.4f} | Root Acc {:.4f}".format(
        best_epoch, test_acc, test_root_acc))
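# --- Hedged entry-point sketch (assumption, not from the original source). The flag
# names mirror the attributes main(args) reads above (seed, gpu, batch_size, x_size,
# h_size, dropout, child_sum, lr, weight_decay, epochs, log_every); defaults are made up.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Tree-LSTM on SST')
    parser.add_argument('--seed', type=int, default=41)
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--batch-size', type=int, default=256)
    parser.add_argument('--x-size', type=int, default=300)
    parser.add_argument('--h-size', type=int, default=150)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--child-sum', action='store_true')
    parser.add_argument('--lr', type=float, default=0.05)
    parser.add_argument('--weight-decay', type=float, default=1e-4)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--log-every', type=int, default=5)
    main(parser.parse_args())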
from random import choice, randint

import torch

# Assumed example values; X_SIZE and H_SIZE are defined elsewhere in the original project,
# as are Tree, BatchedTree, and TreeLSTM.
X_SIZE = 100
H_SIZE = 25


def make_random_tree():  # header reconstructed from the call below; the snippet begins mid-function
    tree_ = Tree()  # assumed construction; the actual Tree() signature may differ
    max_depth = randint(1, 4)

    # root
    root_id = tree_.add_node(parent_id=None, tensor=torch.rand(X_SIZE))
    parents = [root_id]
    for _ in range(max_depth):
        child_num = randint(1, 4)
        new_parents = []
        for _ in range(child_num):
            parent_id = choice(parents)
            child_id = tree_.add_node(parent_id=parent_id, tensor=torch.rand(X_SIZE))
            new_parents.append(child_id)
        parents = new_parents
    return tree_


if __name__ == "__main__":
    tree_list = []
    for _ in range(5):
        tree = make_random_tree()
        tree_list.append(tree)
    batched_tree = BatchedTree(tree_list)
    model = TreeLSTM(x_size=X_SIZE, h_size=H_SIZE, dropout=0.3, cell_type='n_ary', n_ary=4)
    out_batch = model(batched_tree)
    print(out_batch.get_hidden_state())
def main(args):
    np.random.seed(args.seed)
    mx.random.seed(args.seed)
    best_epoch = -1
    best_dev_acc = 0

    cuda = args.gpu >= 0
    if cuda:
        if args.gpu in mx.test_utils.list_gpus():
            ctx = mx.gpu(args.gpu)
        else:
            print('Requested GPU id {} was not found. Defaulting to CPU implementation'
                  .format(args.gpu))
            ctx = mx.cpu()
    else:
        ctx = mx.cpu()

    if args.use_glove:
        prepare_glove()

    trainset = data.SSTDataset()
    train_loader = gluon.data.DataLoader(dataset=trainset,
                                         batch_size=args.batch_size,
                                         batchify_fn=batcher(ctx),
                                         shuffle=True,
                                         num_workers=0)
    devset = data.SSTDataset(mode='dev')
    dev_loader = gluon.data.DataLoader(dataset=devset,
                                       batch_size=100,
                                       batchify_fn=batcher(ctx),
                                       shuffle=True,
                                       num_workers=0)
    testset = data.SSTDataset(mode='test')
    test_loader = gluon.data.DataLoader(dataset=testset,
                                        batch_size=100,
                                        batchify_fn=batcher(ctx),
                                        shuffle=False,
                                        num_workers=0)

    model = TreeLSTM(trainset.vocab_size,
                     args.x_size,
                     args.h_size,
                     trainset.num_classes,
                     args.dropout,
                     cell_type='childsum' if args.child_sum else 'nary',
                     pretrained_emb=trainset.pretrained_emb,
                     ctx=ctx)
    print(model)

    params_ex_emb = [
        x for x in model.collect_params().values()
        if x.grad_req != 'null' and x.shape[0] != trainset.vocab_size
    ]
    params_emb = list(model.embedding.collect_params().values())
    for p in params_emb:
        p.lr_mult = 0.1

    model.initialize(mx.init.Xavier(magnitude=1), ctx=ctx)
    model.hybridize()
    trainer = gluon.Trainer(model.collect_params('^(?!embedding).*$'), 'adagrad', {
        'learning_rate': args.lr,
        'wd': args.weight_decay
    })
    trainer_emb = gluon.Trainer(model.collect_params('^embedding.*$'), 'adagrad',
                                {'learning_rate': args.lr})

    dur = []
    L = gluon.loss.SoftmaxCrossEntropyLoss(axis=1)
    for epoch in range(args.epochs):
        t_epoch = time.time()
        for step, batch in enumerate(train_loader):
            g = batch.graph
            n = g.number_of_nodes()
            # TODO begin_states function? (see the hedged sketch after this script)
            h = mx.nd.zeros((n, args.h_size), ctx=ctx)
            c = mx.nd.zeros((n, args.h_size), ctx=ctx)
            if step >= 3:
                t0 = time.time()  # tik

            with mx.autograd.record():
                pred = model(batch, h, c)
                loss = L(pred, batch.label)
            loss.backward()
            trainer.step(args.batch_size)
            trainer_emb.step(args.batch_size)

            if step >= 3:
                dur.append(time.time() - t0)  # tok

            if step > 0 and step % args.log_every == 0:
                pred = pred.argmax(axis=1).astype(batch.label.dtype)
                acc = (batch.label == pred).sum()
                root_ids = [
                    i for i in range(batch.graph.number_of_nodes())
                    if batch.graph.out_degree(i) == 0
                ]
                root_acc = np.sum(batch.label.asnumpy()[root_ids] ==
                                  pred.asnumpy()[root_ids])

                print(
                    "Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} | Root Acc {:.4f} | Time(s) {:.4f}"
                    .format(epoch, step, loss.sum().asscalar(),
                            1.0 * acc.asscalar() / len(batch.label),
                            1.0 * root_acc / len(root_ids), np.mean(dur)))
        print('Epoch {:05d} training time {:.4f}s'.format(epoch, time.time() - t_epoch))

        # eval on dev set
        accs = []
        root_accs = []
        for step, batch in enumerate(dev_loader):
            g = batch.graph
            n = g.number_of_nodes()
            h = mx.nd.zeros((n, args.h_size), ctx=ctx)
            c = mx.nd.zeros((n, args.h_size), ctx=ctx)
            pred = model(batch, h, c).argmax(1).astype(batch.label.dtype)

            acc = (batch.label == pred).sum().asscalar()
            accs.append([acc, len(batch.label)])
            root_ids = [
                i for i in range(batch.graph.number_of_nodes())
                if batch.graph.out_degree(i) == 0
            ]
            root_acc = np.sum(batch.label.asnumpy()[root_ids] == pred.asnumpy()[root_ids])
            root_accs.append([root_acc, len(root_ids)])

        dev_acc = 1.0 * np.sum([x[0] for x in accs]) / np.sum([x[1] for x in accs])
        dev_root_acc = 1.0 * np.sum([x[0] for x in root_accs]) / np.sum(
            [x[1] for x in root_accs])
        print("Epoch {:05d} | Dev Acc {:.4f} | Root Acc {:.4f}".format(
            epoch, dev_acc, dev_root_acc))

        if dev_root_acc > best_dev_acc:
            best_dev_acc = dev_root_acc
            best_epoch = epoch
            model.save_parameters('best_{}.params'.format(args.seed))
        else:
            if best_epoch <= epoch - 10:
                break

        # lr decay
        trainer.set_learning_rate(max(1e-5, trainer.learning_rate * 0.99))
        print(trainer.learning_rate)
        trainer_emb.set_learning_rate(max(1e-5, trainer_emb.learning_rate * 0.99))
        print(trainer_emb.learning_rate)

    # test
    model.load_parameters('best_{}.params'.format(args.seed))
    accs = []
    root_accs = []
    for step, batch in enumerate(test_loader):
        g = batch.graph
        n = g.number_of_nodes()
        h = mx.nd.zeros((n, args.h_size), ctx=ctx)
        c = mx.nd.zeros((n, args.h_size), ctx=ctx)
        pred = model(batch, h, c).argmax(axis=1).astype(batch.label.dtype)

        acc = (batch.label == pred).sum().asscalar()
        accs.append([acc, len(batch.label)])
        root_ids = [
            i for i in range(batch.graph.number_of_nodes())
            if batch.graph.out_degree(i) == 0
        ]
        root_acc = np.sum(batch.label.asnumpy()[root_ids] == pred.asnumpy()[root_ids])
        root_accs.append([root_acc, len(root_ids)])

    test_acc = 1.0 * np.sum([x[0] for x in accs]) / np.sum([x[1] for x in accs])
    test_root_acc = 1.0 * np.sum([x[0] for x in root_accs]) / np.sum(
        [x[1] for x in root_accs])
    print('------------------------------------------------------------------------------------')
    print("Epoch {:05d} | Test Acc {:.4f} | Root Acc {:.4f}".format(
        best_epoch, test_acc, test_root_acc))
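# --- Hedged sketch (assumption, not from the original source) answering the
# "TODO begin_states function?" note above: factor the zero initial states out of the
# training/eval/test loops. The name and placement are assumptions; the calls are the
# same mx.nd.zeros calls used inline above.
def begin_states(graph, h_size, ctx):
    """Return zero-initialized (h, c) states, one row per node in the batched graph."""
    n = graph.number_of_nodes()
    h = mx.nd.zeros((n, h_size), ctx=ctx)
    c = mx.nd.zeros((n, h_size), ctx=ctx)
    return h, c

# Example use inside the loops above:
#     h, c = begin_states(batch.graph, args.h_size, ctx)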
def main(args):
    mini_str = '/mini' if args.mini else ''  # path to the mini dataset
    version_suffix = '_v2.0' if args.squad_version == 2.0 else ''  # selects the dataset version (1.1 or 2.0)

    # Prepare the output directory under ./weights/ to store model-specific data, including weights
    out_dir = 'weights/%s' % args.experiment
    if os.path.exists(out_dir):
        print('Warning - you are overwriting previous experiment %s. Hit Ctrl Z to abort.\n'
              % args.experiment)
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)

    logger = open(os.path.join(out_dir, 'log.txt'), 'w')
    print_and_log(
        'Timestamp = %s for %s\n' %
        (datetime.strftime(datetime.now(), '%m/%d/%Y %H:%M'), args.experiment), logger)

    # Load dev data and save it to this model's weights dir
    print_and_log('Loading v%s Dev Data...' % args.squad_version, logger)
    dev_data = load_pk('preprocess/data%s/squad_dev_trees%s.npy' % (mini_str, version_suffix))
    dev_batcher = Batcher(dev_data, is_train=False, target_batch_size=args.batch_size)
    save_as_pk(dev_batcher, os.path.join(out_dir, 'dev_batcher.npy'))
    print_and_log('Loaded Dev Data...', logger)

    # Load train data and save it to this model's weights dir
    print_and_log('Loading v%s Train Data...' % args.squad_version, logger)
    train_data = load_pk('preprocess/data%s/squad_train_trees%s.npy' % (mini_str, version_suffix))
    train_batcher = Batcher(train_data, is_train=True, target_batch_size=args.batch_size)
    print_and_log('Loaded Train Data...', logger)

    # Create models and optimizers
    span_extractor = TreeLSTM(use_cuda=args.cuda)
    answer_verifier = AnswerVerifier(use_cuda=args.cuda)
    if args.cuda:
        span_extractor.cuda()
        answer_verifier.cuda()

    span_extractor_grad_params = filter(lambda p: p.requires_grad, span_extractor.parameters())
    span_extractor_optimizer = optim.Adam(span_extractor_grad_params, args.span_extractor_lr)

    answer_verifier_grad_params = filter(lambda p: p.requires_grad, answer_verifier.parameters())
    answer_verifier_optimizer = optim.Adam(answer_verifier_grad_params, args.answer_verifier_lr)

    # Determines whether a question is answerable or not
    answer_verifier_logistic_loss = BCEWithLogitsLoss(
        pos_weight=span_extractor.cudify(torch.FloatTensor([0.5])))

    best_span_f1 = -1  # Track which epoch achieves the highest span-level F1 on the dev set
    best_answer_verifier_accuracy = -1
    best_span_epoch = -1
    best_answer_verifier_epoch = -1

    for epoch_idx in range(args.epochs):
        print_and_log('Starting Epoch %d...' % (epoch_idx + 1), logger)

        # Stores predictions and returns an evaluation string at the end of the epoch
        train_evaluator = Evaluator('train')
        dev_evaluator = Evaluator('dev')

        start_time = time()
        span_extractor.train()
        answer_verifier.train()
        while train_batcher.has_next():
            # Clear gradients and get the next batch
            span_extractor_optimizer.zero_grad()
            answer_verifier_optimizer.zero_grad()

            joint_loss = _run_batch(
                batch=train_batcher.next(),
                span_extractor=span_extractor,
                span_extractor_optimizer=span_extractor_optimizer,
                answer_verifier=answer_verifier,
                answer_verifier_optimizer=answer_verifier_optimizer,
                answer_verifier_logistic_loss=answer_verifier_logistic_loss,
                evaluator=train_evaluator)
            joint_loss.backward()

            # Make a gradient step
            span_extractor_optimizer.step()
            answer_verifier_optimizer.step()

        print_and_log('Took %s.' % format_seconds(time() - start_time), logger)
        print_and_log('\t' + train_evaluator.eval_string(), logger)

        span_extractor.eval()
        answer_verifier.eval()
        while dev_batcher.has_next():
            _run_batch(
                batch=dev_batcher.next(),
                span_extractor=span_extractor,
                span_extractor_optimizer=span_extractor_optimizer,
                answer_verifier=answer_verifier,
                answer_verifier_optimizer=answer_verifier_optimizer,
                answer_verifier_logistic_loss=answer_verifier_logistic_loss,
                evaluator=dev_evaluator)
        print_and_log('\t' + dev_evaluator.eval_string(), logger)

        dev_f1 = dev_evaluator.span_f1()
        if dev_f1 > best_span_f1:
            best_span_f1 = dev_f1
            best_span_epoch = epoch_idx + 1
            torch.save(span_extractor, os.path.join(out_dir, 'best_span_extractor.tar'))

        dev_answer_verifier_accuracy = dev_evaluator.avg_answer_accuracy()
        if dev_answer_verifier_accuracy > best_answer_verifier_accuracy:
            best_answer_verifier_accuracy = dev_answer_verifier_accuracy
            best_answer_verifier_epoch = epoch_idx + 1
            torch.save(answer_verifier, os.path.join(out_dir, 'best_answer_verifier.tar'))

    print_and_log('\nBest span = %.4f F1 at %d epoch' % (best_span_f1, best_span_epoch), logger)
    print_and_log(
        '\nBest answer verifier = %.4f accuracy at %d epoch' %
        (best_answer_verifier_accuracy, best_answer_verifier_epoch), logger)
def main(args):
    np.random.seed(args.seed)
    th.manual_seed(args.seed)
    th.cuda.manual_seed(args.seed)

    cuda = args.gpu >= 0
    device = th.device('cuda:{}'.format(args.gpu)) if cuda else th.device('cpu')
    if cuda:
        th.cuda.set_device(args.gpu)

    trainset = data.SST()
    train_loader = DataLoader(dataset=trainset,
                              batch_size=args.batch_size,
                              collate_fn=batcher(device),
                              shuffle=True,
                              num_workers=0)

    model = TreeLSTM(trainset.num_vocabs,
                     args.x_size,
                     args.h_size,
                     trainset.num_classes,
                     args.dropout,
                     cell_type='childsum' if args.child_sum else 'nary',
                     pretrained_emb=trainset.pretrained_emb).to(device)
    print(model)

    params_ex_emb = [
        x for x in list(model.parameters())
        if x.requires_grad and x.size(0) != trainset.num_vocabs
    ]
    params_emb = list(model.embedding.parameters())

    optimizer = optim.Adagrad([{
        'params': params_ex_emb,
        'lr': args.lr,
        'weight_decay': args.weight_decay
    }, {
        'params': params_emb,
        'lr': 0.1 * args.lr
    }])

    for epoch in range(args.epochs):
        model.train()
        count = 0
        t_epoch = time.time()
        for step, batch in enumerate(train_loader):
            g = batch.graph
            n = g.number_of_nodes()
            h = th.zeros((n, args.h_size)).to(device)
            c = th.zeros((n, args.h_size)).to(device)

            logits = model(batch, h, c)
            logp = F.log_softmax(logits, 1)
            loss = F.nll_loss(logp, batch.label, reduction='mean')  # 'elementwise_mean' in older PyTorch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            count += 1

        if cuda:
            th.cuda.synchronize()
        t_epoch_end = time.time()
        print('Epoch {:05d} batch {} training time {:.4f}s'.format(
            epoch, count, t_epoch_end - t_epoch))
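# --- Hedged sketch (assumption, not from the original source) of the `batcher`
# collate function used above. It only illustrates the shape the training loop relies
# on: an object exposing `.graph` and `.label`. The use of dgl.batch and the 'y' node
# field name are assumptions; the real SST batcher may differ.
from collections import namedtuple

import dgl
import torch as th

SSTBatch = namedtuple('SSTBatch', ['graph', 'label'])

def batcher(device):
    def batcher_dev(batch):
        batched_graph = dgl.batch(batch)  # merge the per-sentence trees into one graph
        return SSTBatch(graph=batched_graph,
                        label=batched_graph.ndata['y'].to(device))  # 'y' field name is assumed
    return batcher_dev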