def create_word_index(model, glove_path, embedding_size, min_samples, pdtb_category=''):
    if os.path.exists(glove_path):
        v_builder = GloveVocabBuilder(path_glove=glove_path)
        d_word_index, embed = v_builder.get_word_index()
        ed_size = embed.size(1)
        is_glove = True
    else:
        v_builder = VocabBuilder(path_file=PROCESSED_DATA_PATH + '/train.tsv')
        d_word_index, embed = v_builder.get_word_index(min_sample=min_samples)
        ed_size = embedding_size
        is_glove = False
    results_path = get_results_path(model, is_glove, ed_size, pdtb_category)
    joblib.dump(d_word_index, results_path + '/d_word_index.pkl', compress=3)
    return (v_builder, d_word_index, embed, ed_size, results_path)
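# Hedged usage sketch, not part of the original module: one way create_word_index might be
# wired into a training script. The literal argument values ('GRU', the GloVe path, 300, 5)
# are assumptions for illustration only.
v_builder, d_word_index, embed, ed_size, results_path = create_word_index(
    model='GRU', glove_path='glove/glove.6B.300d.txt',
    embedding_size=300, min_samples=5)
print('vocab size: {}, embedding size: {}'.format(len(d_word_index), ed_size))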
parser.add_argument('--cuda', default=False, action='store_true', help='use cuda')
parser.add_argument('--fasttext-tensor', default='data/fasttext.pt', help='path to fasttext embeddings tensor')
parser.add_argument('--fasttext-voc', default='data/fasttext_voc.pkl', help='path to fasttext vocabulary pickle')
parser.add_argument('--train-path', default="data/en-ud-train.csv", help='path to train data csv')
parser.add_argument('--dev-path', default="data/en-ud-dev.csv", help='path to dev data csv')
parser.add_argument('--clip', type=float, default=5, help='gradient clipping')
args = parser.parse_args()
print()

# create vocab
print("===> creating word, tag, char, dep_rel vocabs and loading pre-trained embeddings ...")
start = time.time()
fasttext_embed = torch.load(args.fasttext_tensor)
with open(args.fasttext_voc, 'rb') as f:
    fasttext_word_to_index = pickle.load(f)
w_builder = VocabBuilder(path_file=args.train_path)
word_to_index, words = w_builder.get_word_index(min_sample=args.min_samples)
char_builder = CharBuilder(path_file=args.train_path)
char_to_index, chars = char_builder.get_char_index()
pos_builder = TagBuilder(args.train_path, "POS")
pos_to_index, pos_tags = pos_builder.get_tag_index_padded()
xpos_builder = TagBuilder(args.train_path, "XPOS")
xpos_to_index, xpos_tags = xpos_builder.get_tag_index_padded()
rel_builder = TagBuilder(args.train_path, "Drel")
rel_to_index, rel_tags = rel_builder.get_tag_index()
if not os.path.exists('gen'):
    os.mkdir('gen')
with open("gen/parser_model.params", 'wb') as paramsfp:
    pickle.dump((word_to_index, char_to_index, pos_to_index, xpos_to_index, rel_to_index), paramsfp)
print('===> vocab creation in: {t:.3f}s'.format(t=time.time() - start))
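# Hedged sketch, not from the original script: reloading the vocab tuple saved above, e.g.
# at inference time. The unpacking order simply mirrors the pickle.dump() call in the excerpt.
with open("gen/parser_model.params", 'rb') as paramsfp:
    word_to_index, char_to_index, pos_to_index, xpos_to_index, rel_to_index = pickle.load(paramsfp)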
parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
args = parser.parse_args()

# create vocab
print("===> creating vocabs ...")
end = time.time()
v_builder, d_word_index, embed = None, None, None
if os.path.exists(args.glove):
    v_builder = GloveVocabBuilder(path_glove=args.glove)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file='data/train1.csv')
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
if not os.path.exists('gen'):
    os.mkdir('gen')
joblib.dump(d_word_index, 'gen/d_word_index.pkl', compress=3)
print('===> vocab creation: {t:.3f}s'.format(t=time.time() - end))
print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader('data/train1.csv', d_word_index, batch_size=args.batch_size)
def run_model(domain):
    # create vocab
    print("===> creating vocabs for domain ... " + domain)
    end = time.time()
    domain_d = 'reviews/leave_out_' + domain
    lda_model = models.LdaModel.load(domain_d + '/lda_model/lda_' + domain)
    lda_dict = gensim.corpora.Dictionary.load(domain_d + '/lda_model/dict_' + domain)
    print(domain_d)
    v_builder = VocabBuilder(path_file=domain_d + '/train.csv', min_sample=args.min_samples)
    d_word_index = v_builder.get_word_index()
    vocab_size = len(d_word_index)
    word2id = {v: k for k, v in d_word_index.items()}  # .iteritems() is Python 2 only
    # print(word2id)
    embeddings = load_glove_embeddings(
        '/home/DebanjanChaudhuri/topic_lstm_torch/word_vecs/glove.6B.50d.txt', d_word_index)
    if not os.path.exists('gen_' + domain):
        os.mkdir('gen_' + domain)
    joblib.dump(d_word_index, 'gen_' + domain + '/d_word_index.pkl', compress=3)
    print('===> vocab creation: {t:.3f}s'.format(t=time.time() - end))

    # create trainer
    print("===> creating dataloaders ...")
    end = time.time()
    train_loader = TextClassDataLoader(domain_d + '/train.csv', d_word_index, batch_size=args.batch_size)
    val_loader = TextClassDataLoader(domain_d + '/val.csv', d_word_index, batch_size=args.batch_size)
    test_loader = TextClassDataLoader(domain_d + '/test.csv', d_word_index, batch_size=args.batch_size)
    print('===> Dataloader creation: {t:.3f}s'.format(t=time.time() - end))

    # create model
    print("===> creating rnn model ...")
    if args.mit_topic:
        print("with topic vectors.")
        model = RNNTopic(vocab_size=vocab_size, embed_size=args.embedding_size, num_output=args.classes,
                         topic_size=50, hidden_size=args.hidden_size, num_layers=args.layers,
                         batch_first=True, use_gpu=args.cuda, embeddings=embeddings,
                         emb_drop=args.emb_drop, fc_size=args.fc_layer)
    else:
        model = RNN(vocab_size=vocab_size, embed_size=args.embedding_size, num_output=args.classes,
                    hidden_size=args.hidden_size, num_layers=args.layers, batch_first=True,
                    use_gpu=args.cuda, embeddings=embeddings, emb_drop=args.emb_drop,
                    fc_size=args.fc_layer)
    print(model)

    # optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()
    print(optimizer)
    print(criterion)

    if args.cuda:
        torch.backends.cudnn.enabled = True
        cudnn.benchmark = True
        model.cuda()
        criterion = criterion.cuda()

    # list for checking early stopping
    val_acc = []
    for epoch in range(1, args.epochs + 1):
        adjust_learning_rate(args.lr, optimizer, epoch)
        train(train_loader, model, criterion, optimizer, epoch, lda_model, lda_dict, word2id)
        print("getting performance on validation set!")
        v_acc = validate(val_loader, model, criterion, lda_model, lda_dict, word2id)
        print(len(val_acc), args.early_stopping)
        # if len(val_acc) > args.early_stopping:
        print("checking early stopping.")
        if earlystop(val_acc, v_acc):
            print("Early stopping!")
            break
        val_acc.append(v_acc)
        # save current model
        if epoch % args.save_freq == 0:
            name_model = 'rnn_{}.pkl'.format(epoch)
            path_save_model = os.path.join('gen_' + domain + '/', name_model)
            joblib.dump(model.float(), path_save_model, compress=2)

    print("Results on test set for leave-out-domain! " + domain)
    test_acc = test(test_loader, model, criterion, lda_model, lda_dict, word2id)
    return test_acc
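# Hedged sketch, not from the original repo: the earlystop() helper used above is not shown
# in this excerpt. One plausible implementation stops once the new validation accuracy fails
# to beat the best of the recent history; the patience window and comparison are assumptions
# for illustration only.
def earlystop_sketch(val_acc, new_acc, patience=3):
    """Return True if new_acc does not improve on the best of the last `patience` scores."""
    if len(val_acc) < patience:
        return False
    return new_acc <= max(val_acc[-patience:])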
# create vocab
print("===> creating vocabs ...")
end = datetime.datetime.now()
v_builder, d_word_index, embed = None, None, None
train_path = args.train_data
test_path = args.test_data
dic_name = os.path.join('gen', args.weight_name + '.pkl')
weight_save_model = os.path.join('gen', args.weight_name)
try:
    d_word_index = joblib.load(dic_name)
    embed = torch.load(weight_save_model)
    print('loaded existing embedding vectors, name is', args.weight_name)
except Exception:  # fall back to building the vocab from scratch
    v_builder = VocabBuilder(path_file=train_path)
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
    print('created new embedding vectors')
if not os.path.exists('gen'):
    os.mkdir('gen')
joblib.dump(d_word_index, dic_name, compress=3)
end = datetime.datetime.now()

train_loader = Word2vecLoader(train_path, d_word_index, batch_size=args.batch_size)
val_loader = Word2vecLoader(test_path, d_word_index, batch_size=args.batch_size)
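# Hedged sketch, not shown in this excerpt: persisting the freshly built embedding tensor so
# the torch.load() in the try-block above can find it on the next run. The guard condition
# and the torch.save() call are assumptions for illustration.
if v_builder is not None and embed is not None:
    torch.save(embed, weight_save_model)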
parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
args = parser.parse_args()

# create vocab
print("===> creating vocabs ...")
end = time.time()
v_builder, d_word_index, embed = None, None, None
if os.path.exists(args.glove):
    v_builder = GloveVocabBuilder(path_glove=args.glove)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file=args.train)
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
if not os.path.exists('gen'):
    os.mkdir('gen')
try:
    os.makedirs('models/' + args.name)
except FileExistsError:
    pass
with codecs.open('models/' + args.name + '/classify_stat.pkl', 'wb') as fout:
    pickle.dump(d_word_index, fout)
# joblib.dump(d_word_index, 'models/' + args.name + '/d_word_index.pkl', compress=3)
print('===> vocab creation: {t:.3f}s'.format(t=time.time() - end))
print('args: ', args)

# create trainer
#!/usr/bin/env python
# encoding: utf-8

from vocab import VocabBuilder
from dataloader import DataLoader
from model import RNN

filepath = "./dataset/dataset.csv"
vocab_obj = VocabBuilder(filepath=filepath)
word_to_index = vocab_obj.word_to_index
label_to_index = vocab_obj.label_to_index
index_to_label = {}
for label, index in label_to_index.items():
    index_to_label[index] = label

loader = DataLoader(filepath=filepath, word_to_index=word_to_index,
                    label_to_index=label_to_index, batch_size=128)

vocab_size = len(word_to_index)
embedding_size = 128
num_output = len(label_to_index)
model = RNN(vocab_size=vocab_size, embed_size=embedding_size,
            num_output=num_output, rnn_model="LSTM", use_last=True,
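# Hedged sketch, for illustration only: mapping model outputs back to label strings with the
# index_to_label dict built above. The (batch_size, num_output) logits tensor is a stand-in;
# nothing in this excerpt actually produces it.
import torch
logits = torch.randn(4, num_output)      # stand-in for model(...) outputs on a batch of 4
predicted = torch.argmax(logits, dim=1)  # most likely class index per example
decoded = [index_to_label[i.item()] for i in predicted]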
    val_file = os.path.join(args.data, 'val.csv')
    test_file = os.path.join(args.data, 'test.csv')
else:
    train_file = os.path.join(args.data, 'trainval.tsv')
    val_file = os.path.join(args.data, 'val.tsv')
    test_file = os.path.join(args.data, 'test.tsv')

v_builder, d_word_index, embed = None, None, None
# if os.path.exists(args.glove):
if args.use_glove:
    glove_file = 'glove/glove.6B.{}d.txt'.format(args.glove)
    v_builder = GloveVocabBuilder(path_glove=glove_file)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file=train_file)
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
    # d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)

model_dir = os.path.join('checkpoints', args.model)
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(d_word_index, os.path.join(model_dir, 'd_word_index.pkl'), compress=3)
print('===> vocab creation: {t:.3f}s'.format(t=time.time() - end))
print('args: ', args)

# create trainer
parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
args = parser.parse_args()

# create vocab
print("===> creating vocabs ...")
end = time.time()
v_builder, d_word_index, embed = None, None, None
if os.path.exists(args.glove):
    v_builder = GloveVocabBuilder(path_glove=args.glove)
    d_word_index, embed = v_builder.get_word_index()
    args.embedding_size = embed.size(1)
else:
    v_builder = VocabBuilder(path_file='data/train_pdtb.tsv')
    d_word_index, embed = v_builder.get_word_index(min_sample=args.min_samples)
gen = args.gen + str(args.embedding_size) + 'v'
if not os.path.exists(gen):
    os.makedirs(gen)
joblib.dump(d_word_index, gen + '/d_word_index.pkl', compress=3)
print('===> vocab creation: {t:.3f}s'.format(t=time.time() - end))
print('args: ', args)

# create trainer
print("===> creating dataloaders ...")
end = time.time()
train_loader = TextClassDataLoader('data/train_pdtb.tsv', d_word_index,