def main(args):
    print("main")

    """
    SEED
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    """
    DATA
    """
    train, valid, test = get_nli(args.nlipath)
    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
        test['s1'] + test['s2'], GLOVE_PATH)

    for split in ['s1', 's2']:
        for data_type in ['train', 'valid', 'test']:
            eval(data_type)[split] = np.array(
                [['<s>'] + [word for word in sent.split() if word in word_vec] +
                 ['</s>'] for sent in eval(data_type)[split]])

    args.word_emb_dim = 300

    nli_net = torch.load(args.model)
    print(nli_net)

    # loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn = nn.CrossEntropyLoss(weight=weight)
    loss_fn.size_average = False

    if args.gpu_id > -1:
        nli_net.cuda()
        loss_fn.cuda()

    """
    Evaluate model on the Natural Language Inference task
    """
    for pair in [(train, 'train'), (valid, 'dev'), (test, 'test')]:
        # args.batch_size = len(pair[0]['lbls'])
        eval_acc = evaluate_preds(
            0, pair[0], args, word_vec, nli_net, pair[1],
            "%s/%s_%s" % (args.outputdir, pair[1], args.pred_file))
        print("Accuracy on " + pair[1] + ": " + str(eval_acc))
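# The eval(data_type)[split] idiom above works only because the split names
# happen to match local variable names. A minimal, eval-free sketch of the
# same preprocessing step; `demo_datasets`, `demo_word_vec`, and the toy
# sentences are ours, for illustration only.
import numpy as np

demo_word_vec = {'a': None, 'dog': None, 'runs': None}
demo_datasets = {'train': {'s1': ['a dog runs', 'a dog'],
                           's2': ['a dog', 'a']}}

for name, data in demo_datasets.items():
    for split in ['s1', 's2']:
        data[split] = np.array(
            [['<s>'] + [w for w in sent.split() if w in demo_word_vec] + ['</s>']
             for sent in data[split]],
            dtype=object)  # object dtype: tokenized sentences have ragged lengths

print(demo_datasets['train']['s1'][0])  # ['<s>', 'a', 'dog', 'runs', '</s>']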
def get_vocab(args):
    # Build a vocabulary from the train/valid/test splits of the actual SNLI
    # plus the splits of all the transfer tasks.
    train, valid, test = {}, {}, {}
    for split in ['test', 'valid', 'train']:
        for s in ['s1', 's2']:
            eval(split)[s] = []

    for datapath, n_classes in [
            (args.test_path, args.data_to_n_classes[args.test_data]),
            (args.train_path, args.data_to_n_classes[args.train_data])]:
        transfer_train, transfer_valid, transfer_test = get_nli(
            datapath, n_classes)
        for split in ['test', 'valid', 'train']:
            for s in ['s1', 's2']:
                eval(split)[s].extend(eval("transfer_" + split)[s])

    word_vec = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
        test['s1'] + test['s2'], args.embdfile)
    return word_vec
# set gpu device
torch.cuda.set_device(params.gpu_id)

# print parameters passed, and all parameters
print('\ntogrep : {0}\n'.format(sys.argv[1:]))
print(params)

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.nlipath)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], GLOVE_PATH)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300
# params.word_emb_dim = 512

"""
MODEL
"""
# model config
SKIPTHOUGHT_PATH = '/Users/Jonas/Documents/Repositories/skipthought/models/toronto_n5/'
INFERSENT_PATH = '/Users/Jonas/Documents/Repositories/InferSent/code/'
SICK_PATH = '/Users/Jonas/Documents/Repositories/skipthought/eval/SICK/'
SNLI_PATH = '/Users/Jonas/Documents/Repositories/InferSent/dataset/SNLI/'
TORONTO_PATH = '/Users/Jonas/Documents/Repositories/skipthought/corpus/'
SAVE_PATH = '..'

sys.path.append(SKIPTHOUGHT_PATH)
sys.path.append(INFERSENT_PATH)
from data import get_nli

MODELS = ['skipthought', 'infersent']
MODEL = MODELS[0]

print('Loading corpus')
train, dev, test = get_nli(SNLI_PATH)
train = np.array(train['s2'])
dev = np.array(dev['s2'])
test = np.array(test['s2'])

print('Loading saved model')
tf.reset_default_graph()
embeddings = None  # in case of 'cbow' or 'infersent' model
n_iter = 0
with open(MODEL_PATH + 'vocab.pkl', 'rb') as f:
    vocab = pkl.load(f)

if MODEL == 'skipthought':
    from skipthought import Skipthought_para
    from skipthought import Skipthought_model
# set gpu device
torch.cuda.set_device(params.gpu_id)

# print parameters passed, and all parameters
print('\ntogrep : {0}\n'.format(sys.argv[1:]))
print(params)

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], params.word_emb_path)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

"""
MODEL
"""
# model config
config_nli_model = {
    'n_words': len(word_vec),
def main():
    global dtype
    dtype = torch.FloatTensor

    # print flags
    for key, value in vars(FLAGS).items():
        print(key + ' : ' + str(value))

main()

nli_path = nli_DEFAULT
glove_path = glove_DEFAULT
train, dev, test = get_nli(nli_path)
vocab, embeddings = build_vocab(
    train['s1'] + train['s2'] + test['s1'] + test['s2'] +
    dev['s1'] + dev['s2'], glove_path)

config = {
    'n_words': len(embeddings),
    'emb_dim': FLAGS.emb_dim,
    'lstm_dim': FLAGS.lstm_dim,
    'dpout': FLAGS.dpout,
    'fc_dim': FLAGS.fc_dim,
    'b_size': FLAGS.bsize,
    'n_classes': FLAGS.n_classes,
    'model_name': FLAGS.model_name,
}
def clear_gradients(model, name):
    for param in eval('model.' + name).parameters():
        if param.grad is not None:
            param.grad *= 0.0

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.nlipath, params.n_classes)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], params.word_emb_path)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

"""
MODEL
"""
# model config
config_nli_model = {
    'n_words': len(word_vec),
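# A minimal usage sketch for clear_gradients, assuming a model that exposes
# an `encoder` submodule; ToyModel here is ours, for illustration only.
import torch
import torch.nn as nn

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.encoder = nn.Linear(4, 2)

    def forward(self, x):
        return self.encoder(x)

model = ToyModel()
model(torch.randn(3, 4)).sum().backward()  # populate encoder gradients
clear_gradients(model, 'encoder')          # zero them in place
assert all(float(p.grad.abs().sum()) == 0
           for p in model.encoder.parameters())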
def main(args):
    """
    SEED
    """
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu_id > -1:
        torch.cuda.manual_seed(args.seed)

    """
    DATA
    """
    train, valid, test = get_nli(args.nlipath, args.n_classes)
    word_vecs = build_vocab(
        train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
        test['s1'] + test['s2'], args.embdfile)

    for split in ['s1', 's2']:
        for data_type in ['train', 'valid', 'test']:
            eval(data_type)[split] = np.array(
                [['<s>'] + [word for word in sent.split() if word in word_vecs] +
                 ['</s>'] for sent in eval(data_type)[split]])

    # infer the embedding dimension from any vector in the vocabulary
    args.word_emb_dim = len(word_vecs[list(word_vecs.keys())[0]])

    nli_model_configs = get_model_configs(args, len(word_vecs))
    nli_model_configs["n_classes"] = args.n_classes

    # define premise and hypothesis encoders
    premise_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs)
    hypoth_encoder = eval(nli_model_configs['encoder_type'])(nli_model_configs)
    shared_nli_net = SharedNLINet(nli_model_configs, premise_encoder,
                                  hypoth_encoder)
    shared_hypoth_net = SharedHypothNet(nli_model_configs, hypoth_encoder)
    print(shared_nli_net)
    print(shared_hypoth_net)

    if args.pre_trained_model:
        print("Pre_trained_model: " + args.pre_trained_model)
        pre_trained_model = torch.load(args.pre_trained_model)
        shared_nli_net_params = shared_nli_net.state_dict()
        pre_trained_params = pre_trained_model.state_dict()
        assert shared_nli_net_params.keys() == pre_trained_params.keys(), \
            "loaded model has different parameter state names than shared_nli_net"
        # copy only parameters whose shapes match
        for key, parameters in shared_nli_net_params.items():
            if parameters.size() == pre_trained_params[key].size():
                shared_nli_net_params[key] = pre_trained_params[key]
        shared_nli_net.load_state_dict(shared_nli_net_params)
        print(shared_nli_net)

    if args.pre_trained_adv_model:
        print("Pre_trained_adv_model: " + args.pre_trained_adv_model)
        pre_trained_model = torch.load(args.pre_trained_adv_model)
        shared_hypoth_net_params = shared_hypoth_net.state_dict()
        pre_trained_params = pre_trained_model.state_dict()
        assert shared_hypoth_net_params.keys() == pre_trained_params.keys(), \
            "loaded model has different parameter state names than shared_hypoth_net"
        for key, parameters in shared_hypoth_net_params.items():
            if parameters.size() == pre_trained_params[key].size():
                shared_hypoth_net_params[key] = pre_trained_params[key]
        shared_hypoth_net.load_state_dict(shared_hypoth_net_params)
        print(shared_hypoth_net)

    # nli loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn_nli = nn.CrossEntropyLoss(weight=weight)
    loss_fn_nli.size_average = False

    # hypoth (adversarial) loss
    weight = torch.FloatTensor(args.n_classes).fill_(1)
    loss_fn_hypoth = nn.CrossEntropyLoss(weight=weight)
    loss_fn_hypoth.size_average = False

    # optimizer
    optim_fn, optim_params = get_optimizer(args.optimizer)
    optimizer_nli = optim_fn(shared_nli_net.parameters(), **optim_params)
    # optimizer_hypoth = optim_fn(shared_hypoth_net.parameters(), **optim_params)
    # only pass hypoth classifier params to avoid updating shared encoder params twice
    optimizer_hypoth = optim_fn(shared_hypoth_net.classifier.parameters(),
                                **optim_params)

    if args.gpu_id > -1:
        shared_nli_net.cuda()
        shared_hypoth_net.cuda()
        loss_fn_nli.cuda()
        loss_fn_hypoth.cuda()

    """
    TRAIN
    """
    global val_acc_best, lr, stop_training, adam_stop
    val_acc_best = -1e10
    adam_stop = False
    stop_training = False
    lr = optim_params['lr'] if 'sgd' in args.optimizer else None

    """
    Train model on Natural Language Inference task
    """
    epoch = 1
    while not stop_training and epoch <= args.n_epochs:
        train_acc_nli, train_acc_hypoth, shared_nli_net, shared_hypoth_net = \
            trainepoch(epoch, train, optimizer_nli, optimizer_hypoth, args,
                       word_vecs, shared_nli_net, shared_hypoth_net,
                       loss_fn_nli, loss_fn_hypoth, args.adv_lambda,
                       args.adv_hyp_encoder_lambda)
        eval_acc_nli, eval_acc_hypoth = evaluate(
            epoch, valid, optimizer_nli, optimizer_hypoth, args, word_vecs,
            shared_nli_net, shared_hypoth_net, 'valid',
            adv_lambda=args.adv_lambda)
        epoch += 1
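# The size-matched parameter copying above appears twice; it can be factored
# into a helper. A minimal sketch under the same assumptions (the name
# load_matching_params is ours, not from the source):
def load_matching_params(target_net, source_net):
    """Copy parameters from source_net into target_net wherever the state
    names match and the tensor shapes agree; leave the rest untouched."""
    target_state = target_net.state_dict()
    source_state = source_net.state_dict()
    assert target_state.keys() == source_state.keys(), \
        "loaded model has different parameter state names"
    for key, tensor in target_state.items():
        if tensor.size() == source_state[key].size():
            target_state[key] = source_state[key]
    target_net.load_state_dict(target_state)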
# print parameters passed, and all parameters
print('\ntogrep : {0}\n'.format(sys.argv[1:]))
print(params)

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.nlipath)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], W2V_PATH)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300

"""
MODEL
"""
    (params.outputdir + "/" + params.outputmodelname))

with open(
        params.outputdir + "/" + params.outputmodelname + "/" +
        'commandline_args.txt', 'w') as f:
    args = parser.parse_args()
    json.dump(args.__dict__, f, indent=2)

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.dataset_path)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], params.vector_rep)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

params.word_emb_dim = 300

"""
MODEL
"""
if USE_CUDA:
    torch.cuda.set_device(params.gpu_id)
print('using cuda:\n', USE_CUDA, '\n')

# print parameters
print('parameters (passed):\n', '{0}'.format(sys.argv[1:]), '\n')
print('parameters (all):\n', params, '\n')

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, dev, test = get_nli(params.nlipath,
                           use_frac={'train': params.train_frac})
word_vec = build_vocab(
    train['s1'] + train['s2'] + dev['s1'] + dev['s2'] +
    test['s1'] + test['s2'], params.word_emb_path)

for split in ['s1', 's2']:
    for data_type in ['train', 'dev', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

"""
MODEL
"""
# model config
config_nli_model = {
    'n_words': len(word_vec),
    max_len = np.max(lengths)
    embed = np.zeros((max_len, len(batch), 300))
    # batch[:] = [b.remove(' ') for b in batch]
    for i in range(len(batch)):
        for j in range(len(batch[i])):
            # print(batch[i][j])
            embed[j, i, :] = word_vec[batch[i][j]]
    return torch.from_numpy(embed).float(), lengths
'''

GLOVE_PATH = '<glove>/<path>'

wenda_infersent = torch.load('./glove_modeldir/GloVe.pickle')
# compact the loaded LSTM weights into contiguous memory
wenda_infersent.encoder.enc_lstm.flatten_parameters()

train, valid, test = get_nli('./<corpus>/<path>')
train['s1'] = list(set(train['s1']))
train['s2'] = list(set(train['s2']))
print(len(train['s1']))

word_vec = build_vocab(train['s1'], GLOVE_PATH)

for split in ['s1', 's2']:
    for data_type in ['train']:
        eval(data_type)[split] = np.array(
            [[word for word in list(sent) if word in word_vec]
             for sent in eval(data_type)[split]])

permutation = np.random.permutation(len(train['s1']))
# set gpu device
# torch.cuda.set_device(params.gpu_id)

# print parameters passed, and all parameters
print('\ntogrep : {0}\n'.format(sys.argv[1:]))
print(params)

"""
SEED
"""
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.cuda.manual_seed(params.seed)

"""
DATA
"""
train, valid, test = get_nli(params.nlipath, params.discmark)
word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], params.word_emb_path)

for split in ['s1', 's2']:
    for data_type in ['train', 'valid', 'test']:
        eval(data_type)[split] = np.array(
            [['<s>'] + [word for word in sent.split() if word in word_vec] +
             ['</s>'] for sent in eval(data_type)[split]])

"""
MODEL
"""
# model config
config_nli_model = {
    'n_words': len(word_vec),
import numpy as np
from data import get_nli, get_batch, build_vocab

train, valid, test = get_nli('dataset/SNLI/')
print(train['label'][:3])
print(train['s1'][:3])

import pickle
tokenizer = pickle.load(open('nli_tokenizer.pkl', 'rb'))
vocab = {w: i for (w, i) in tokenizer.word_index.items()}
inv_vocab = {i: w for (w, i) in tokenizer.word_index.items()}

word_vec = build_vocab(
    train['s1'] + train['s2'] + valid['s1'] + valid['s2'] +
    test['s1'] + test['s2'], 'glove.840B.300d.txt')

new_word_vec = {}
new_word_vec['<s>'] = word_vec['<s>']
new_word_vec['</s>'] = word_vec['</s>']
new_word_vec['<p>'] = word_vec['<p>']
new_word_vec[0] = word_vec['<p>']

with open('glove.840B.300d.txt') as f:
    for line in f:
        word, vec = line.split(' ', 1)
        if word in vocab:
            new_word_vec[vocab[word]] = np.array(list(map(float, vec.split())))
        if word == 'UNK':
            glove_unk = np.array(list(map(float, vec.split())))
            with open('glove_unk.pkl', 'wb') as fw:
                pickle.dump(glove_unk, fw)

print('Found {0}(/{1}) words with glove vectors'.format(
        save_every=1000000,
        epochs=100)
    with open(path + 'paras.pkl', 'wb') as f:
        pkl.dump(paras, f)
    return paras


if __name__ == '__main__':
    path = '../dataset/SNLI/'
    GLOVE_PATH = "../dataset/GloVe/glove.840B.300d.txt"
    SKIPTHOUGHT_PATH = '/cluster/project2/mr/vetterle/skipthought/toronto_n5/'
    # SKIPTHOUGHT_PATH = "/Users/Jonas/Documents/Repositories/skipthought/models/toronto_n5/"
    output_path = '../training_data/'
    model_path = '../models/m9/'

    train, dev, test = get_nli(path)
    # word_vec = build_vocab(train['s1'] + train['s2'] + dev['s1'] + dev['s2'] + test['s1'] + test['s2'], GLOVE_PATH)
    word_vec = build_vocab(
        train['s1'] + train['s2'] + dev['s1'] + dev['s2'] +
        test['s1'] + test['s2'],
        SKIPTHOUGHT_PATH, skipthought=True)
    print(word_vec['<s>'].shape)

    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with open(model_path + 'vocab.pkl', 'wb') as f:
        pkl.dump(word_vec, f)

    for split in ['s1', 's2']:
        for data_type in ['train', 'dev', 'test']: