import argparse
import functools
import pickle as pkl
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn import preprocessing
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm
from torch.utils.data import DataLoader, TensorDataset

# Project-local helpers (TacoText, pipeline, clean_and_tokenize, feature_gen,
# get_glove_embeddings, split_text, evaluate) and the model classes
# (LSTMModelMLPFeat, ConvRNNLSTMFeat) are imported from elsewhere in this repo.


def main():
    parser = argparse.ArgumentParser(
        description='PyTorch question-pair duplicate classifier (LSTM + engineered features)')
    parser.add_argument('--data', type=str, default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/glove.6B',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30,
                        help='input sequence length (tokens fed to the LSTM)')
    parser.add_argument('--demb', type=int, default=100,
                        help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=100,
                        help='number of hidden units per layer')
    parser.add_argument('--dout', type=int, default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=20, metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3,
                        help='random seed')
    parser.add_argument('--vocabsize', type=int, default=200000,
                        help='maximum vocabulary size')
    parser.add_argument('--optimizer', action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--pipeline', action='store_true',
                        help='use pipeline file')
    parser.add_argument('--psw', type=int, default=1,
                        help='remove stop words')
    parser.add_argument('--ppunc', action='store_true',
                        help='remove punctuation')
    parser.add_argument('--pntok', action='store_true',
                        help='use number tokens')
    parser.add_argument('--pkq', action='store_true',
                        help='keep question words')
    parser.add_argument('--stem', action='store_true',
                        help='use stemmer')
    parser.add_argument('--lemma', action='store_true',
                        help='use lemmatizer')
    parser.add_argument('--freezeemb', action='store_false',
                        help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str, default='',
                        help='path to save the final model')
    args = parser.parse_args()

    if not args.presaved:
        # Build an optional text-preprocessing pipeline (stop-word removal,
        # punctuation stripping, number tokens, stemming/lemmatization).
        pipe = None
        if args.pipeline:
            stemmer, lemmatizer = None, None
            if args.stem:
                stemmer = SnowballStemmer('english')
            elif args.lemma:
                lemmatizer = WordNetLemmatizer()
            pipe = functools.partial(pipeline,
                                     rm_stop_words=args.psw,
                                     rm_punc=args.ppunc,
                                     number_token=args.pntok,
                                     keep_questions=args.pkq,
                                     stemmer=stemmer,
                                     lemmatizer=lemmatizer)
        corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)

        print('Loading Data')
        # train_data = pd.read_csv(args.data)
        # Shuffle order of training data
        # train_data = train_data.reindex(np.random.permutation(train_data.index))
        # val_data = train_data.iloc[int(len(train_data) * 0.9):]
        # train_data = train_data.iloc[:int(len(train_data) * 0.9)]
        train_data = pd.read_csv('../data/train_data_shuffle.csv')
        val_data = pd.read_csv('../data/val_data_shuffle.csv')
        print('Cleaning and Tokenizing')
        q1, q2, y = clean_and_tokenize(train_data, corpus)
        q1_val, q2_val, y_val = clean_and_tokenize(val_data, corpus)

        # Hand-engineered pair features, standardized with statistics fit on
        # the training split only.
        train_feat = list(map(feature_gen, zip(q1, q2)))
        val_feat = list(map(feature_gen, zip(q1_val, q2_val)))
        scaler = preprocessing.StandardScaler()
        train_feat = scaler.fit_transform(train_feat)
        val_feat = scaler.transform(val_feat)

        print('Piping Data')
        q1 = corpus.pipe_data(q1)
        q2 = corpus.pipe_data(q2)
        q1_val = corpus.pipe_data(q1_val)
        q2_val = corpus.pipe_data(q2_val)
        corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

        # Pack each example into a (1, 3, feat_max) tensor:
        # row 0 = padded token ids for question 1,
        # row 1 = padded token ids for question 2,
        # row 2 = the n_feat engineered features (zero-padded to feat_max).
        n_feat = train_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))
        X = torch.Tensor(len(train_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
        y = torch.from_numpy(np.array(y)).long()

        X_val = torch.Tensor(len(val_data), 1, 3, feat_max)
        X_val[:, 0, 0, :] = torch.from_numpy(
            corpus.pad_numericalize(q1_val, feat_max)).long()
        X_val[:, 0, 1, :] = torch.from_numpy(
            corpus.pad_numericalize(q2_val, feat_max)).long()
        X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
        y_val = torch.from_numpy(np.array(y_val)).long()

        torch.save(X, '../data/X_feat.t')
        torch.save(y, '../data/y_feat.t')
        torch.save(X_val, '../data/X_val_feat.t')
        torch.save(y_val, '../data/y_val_feat.t')
        with open(args.save + '_corpus_feat.pkl', 'wb') as corp_f:
            pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
    else:
        n_feat = 22
        d_in = args.din
        print('Loading Presaved Data')
        X = torch.load(args.data + 'X_feat.t')
        y = torch.load(args.data + 'y_feat.t')
        X_val = torch.load(args.data + 'X_val_feat.t')
        y_val = torch.load(args.data + 'y_val_feat.t')
        with open('../data/corpus_feat.pkl', 'rb') as f:
            corpus = pkl.load(f)

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    # X.size: (len(train_data), 1, 3, feat_max)
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)

    model = LSTMModelMLPFeat(args.din, args.dhid, args.nlayers, args.dout,
                             args.demb, n_feat, args.vocabsize, args.dropout,
                             args.embinit, args.hidinit, args.decinit,
                             glove_embeddings, args.cuda)
    if args.cuda:
        model.cuda()

    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    model_config = '\t'.join([
        str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din,
                         args.demb, args.dhid, args.embinit, args.decinit,
                         args.hidinit, args.dropout, args.optimizer, args.lr,
                         args.vocabsize, args.pipeline, args.psw, args.ppunc,
                         args.pntok, args.pkq, args.stem, args.lemma)
    ])

    print('Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | '
          'EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | '
          'LR | VocabSize | pipeline | stop | punc | ntoken | keep_ques | '
          'stem | lemma')
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.5
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(),
                         qs[:, 0, 1, :d_in].long(),
                         qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)
            if args.optimizer:
                optimizer.step()
            else:
                # Plain SGD update when --optimizer (Adam) is not requested.
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)
            total_cost += loss.data[0]
            cur_loss += loss.data[0]
            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = cur_loss / args.loginterval
                elapsed = time.time() - start_time
                print('| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                      'Loss {:.6f}'.format(epoch, ind,
                                           len(X) // args.batchsize,
                                           elapsed * 1000.0 / args.loginterval,
                                           cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat)

        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + '.state_dict')
            with open(args.save + '.state_dict.config', 'w') as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()

        print('Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | '
              'Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'.format(
                  epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
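
# The `evaluate` helper called above is defined elsewhere in this repo and is
# not shown in this excerpt. The sketch below (hypothetical name
# `evaluate_sketch`) illustrates one plausible implementation, assuming the
# model returns log-probabilities and the loader yields the packed
# (batch, 1, 3, feat_max) tensors built above; it follows the same pre-0.4
# PyTorch API (Variable, .data[0]) as the rest of this script. The repo's
# actual helper may differ.
def evaluate_sketch(model, loader, cuda, d_in, n_feat):
    """Return (accuracy, mean negative log-likelihood) over a DataLoader."""
    criterion = nn.NLLLoss(size_average=False)  # sum NLL, normalize at the end
    correct, total, total_nll = 0, 0, 0.0
    for qs, duplicate in loader:
        if cuda:
            qs, duplicate = qs.cuda(), duplicate.cuda()
        out = model(qs[:, 0, 0, :d_in].long(),   # question 1 token ids
                    qs[:, 0, 1, :d_in].long(),   # question 2 token ids
                    qs[:, 0, 2, :n_feat])        # engineered features
        total_nll += criterion(out, Variable(duplicate)).data[0]
        pred = out.data.max(1)[1].view(-1)       # argmax over the 2 classes
        correct += (pred == duplicate).sum()
        total += duplicate.size(0)
    return correct / total, total_nll / total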

# Second training entry point: a convolutional RNN (LSTM/GRU) variant that
# also consumes the precomputed pair features and, after training, writes
# validation/test predictions from the saved best model.
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch question-pair duplicate classifier (ConvRNN + engineered features)')
    parser.add_argument('--data', type=str, default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30,
                        help='input sequence length (tokens fed to the RNN)')
    parser.add_argument('--demb', type=int, default=300,
                        help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=300,
                        help='number of hidden units per layer')
    parser.add_argument('--dlin', type=int, default=500,
                        help='number of linear transformation nodes')
    parser.add_argument('--dout', type=int, default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--wd', type=float, default=0.0,
                        help='adam l2 weight decay')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--rnn', type=str, default='lstm',
                        help='lstm or gru')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=2000, metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3,
                        help='random seed')
    parser.add_argument('--vocabsize', type=int, default=200000,
                        help='maximum vocabulary size')
    parser.add_argument('--optimizer', action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--reweight', action='store_true',
                        help='reweight loss function')
    parser.add_argument('--clean', action='store_true',
                        help='clean text')
    parser.add_argument('--rm_stops', action='store_true',
                        help='remove stop words')
    parser.add_argument('--bidir', action='store_false',
                        help='bidirectional')
    parser.add_argument('--freezeemb', action='store_false',
                        help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str, default='',
                        help='path to save the final model')
    args = parser.parse_args()

    pipe = None
    corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)

    train_data = pd.read_csv('../data/train_data_shuffle.csv')
    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    train_data = train_data.fillna(' ')
    valid_data = valid_data.fillna(' ')

    if args.reweight:
        print('Downsampling')
        # Downsample positive pairs in the validation split to a target
        # positive rate p.
        pos_valid = valid_data[valid_data['is_duplicate'] == 1]
        neg_valid = valid_data[valid_data['is_duplicate'] == 0]
        p = 0.19
        pl = len(pos_valid)
        tl = len(pos_valid) + len(neg_valid)
        val = int(pl - (pl - p * tl) / (1 - p))
        pos_valid = pos_valid.iloc[:val]
        valid_data = pd.concat([pos_valid, neg_valid])

    print('Splitting Train')
    q1 = list(train_data['question1'].map(str))
    q2 = list(train_data['question2'].map(str))
    y = list(train_data['is_duplicate'])

    print('Splitting Valid')
    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])

    # Precomputed, normalized pair features, indexed by question-pair id.
    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values
    train_feat = train_feat.iloc[train_data['id']].values

    print('Splitting Data')
    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            stops = set(stopwords.words('english'))
        q1 = [split_text(x, stops) for x in q1]
        q2 = [split_text(x, stops) for x in q2]
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]

    print('Downsample Weight: ', np.mean(y_val))

    corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

    # Pack each example into a (1, 3, feat_max) tensor (token ids for q1,
    # token ids for q2, engineered features), as in the model above.
    n_feat = train_feat.shape[1]
    d_in = args.din
    feat_max = int(np.max([n_feat, d_in]))
    X = torch.Tensor(len(train_data), 1, 3, feat_max)
    X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
    X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
    X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
    y = torch.from_numpy(np.array(y)).long()

    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
    X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    # X.size: (len(train_data), 1, 3, feat_max)
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)
    num_train = len(X)
    del X, y, X_val, y_val, train_feat, val_feat, q1, q2, q1_val, q2_val

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)

    model = ConvRNNLSTMFeat(args.din, args.dhid, args.dout, args.demb,
                            args.dlin, args.vocabsize, args.dropout,
                            args.embinit, args.hidinit, args.decinit,
                            glove_embeddings, args.cuda, args.rnn,
                            args.bidir, n_feat)
    if args.cuda:
        model.cuda()

    if args.reweight:
        # Per-class weights for the reweighted loss.
        w_tensor = torch.Tensor([1.309028344, 0.472001959])
        if args.cuda:
            w_tensor = w_tensor.cuda()
        criterion = nn.NLLLoss(weight=w_tensor)
    else:
        criterion = nn.NLLLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.wd)

    model_config = '\t'.join([
        str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din,
                         args.demb, args.dhid, args.embinit, args.decinit,
                         args.hidinit, args.dropout, args.optimizer,
                         args.reweight, args.lr, args.vocabsize,
                         args.batchsize, args.clean, args.rm_stops)
    ])

    print('Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | '
          'EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | '
          'Reweight | LR | VocabSize | batchsize | Clean | Stops')
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.3
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(),
                         qs[:, 0, 1, :d_in].long(),
                         qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)
            if args.optimizer:
                optimizer.step()
            else:
                # Plain SGD update when --optimizer (Adam) is not requested.
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)
            total_cost += loss.data[0]
            cur_loss += loss.data[0]
            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = cur_loss / args.loginterval
                elapsed = time.time() - start_time
                print('| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                      'Loss {:.6f}'.format(epoch, ind,
                                           num_train // args.batchsize,
                                           elapsed * 1000.0 / args.loginterval,
                                           cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat)

        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + '.state_dict')
            with open(args.save + '.state_dict.config', 'w') as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()

        print('Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | '
              'Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'.format(
                  epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)

    del train_loader

    print('Reloading Best Model')
    model = torch.load(args.save)
    if args.cuda:
        model.cuda()
    model.eval()

    print('RELOADING VALID')
    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    valid_data = valid_data.fillna(' ')
    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])
    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values
    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            stops = set(stopwords.words('english'))
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]

    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
    X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()
    if args.cuda:
        X_val, y_val = X_val.cuda(), y_val.cuda()
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)
    del X_val, y_val, train_feat, val_feat, q1_val, q2_val, valid_data

    print('PREDICTING VALID')
    pred_list = []
    for ind, (qs, _) in enumerate(valid_loader):
        out = model(qs[:, 0, 0, :d_in].long(),
                    qs[:, 0, 1, :d_in].long(),
                    qs[:, 0, 2, :n_feat])
        # The model outputs log-probabilities; exp() recovers P(is_duplicate).
        pred_list += list(out.exp()[:, 1].data.cpu().numpy())
    with open('../predictions/' + args.save + '_val.pkl', 'wb') as f:
        pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)

    if args.reweight:
        print('LOADING TEST DATA')
        test_data = pd.read_csv('../data/test.csv')
        test_data = test_data.fillna(' ')
        q1 = list(test_data['question1'].map(str))
        q2 = list(test_data['question2'].map(str))
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]

        print('LOADING TEST FEATURES')
        test_feat = pd.read_csv('../data/test_features_all_norm.csv').values
        n_feat = test_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))
        X = torch.Tensor(len(test_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(test_feat))
        y = torch.LongTensor(len(test_data)).zero_()
        if args.cuda:
            X = X.cuda()
            y = y.cuda()
        test_loader = DataLoader(TensorDataset(X, y), batch_size=500, shuffle=False)

        print('PREDICTING')
        pred_list = []
        for ind, (qs, _) in enumerate(test_loader):
            out = model(qs[:, 0, 0, :d_in].long(),
                        qs[:, 0, 1, :d_in].long(),
                        qs[:, 0, 2, :n_feat])
            pred_list += list(out.exp()[:, 1].data.cpu().numpy())
        with open('../predictions/' + args.save + '.pkl', 'wb') as f:
            pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)
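
# The excerpt above does not show how main() is invoked; a conventional
# entry-point guard such as the following is assumed. The script name and the
# example flags below are illustrative only, not taken from the repo, e.g.:
#   python train_conv_feat.py --cuda --embinit glove --reweight --save conv_model
if __name__ == '__main__':
    main()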