def build_data(args):
    print("Building dataset...")
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)
    vocab = Vocab(wl_th=args.wl_th, wcutoff=args.wcutoff)
    vocab.build(fname=args.train_file, idf_file=args.idf_file, firstline=False, limit=args.sent_limit)
    args.vocab = vocab
    if args.word_emb_file is not None:
        scale = np.sqrt(3.0 / args.word_dim)
        args.word_pretrained = Embeddings.get_W(args.word_emb_file, args.word_dim, vocab.w2i, scale)
    else:
        args.word_pretrained = None
    if os.path.exists(args.idf_file):
        print("Load idf file ...")
        args.idf_embs = Embeddings.get_W(args.idf_file, 1, vocab.w2i, 0)
    else:
        args.idf_embs = None
    SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
    return args
def main():
    input_file = "data/train.txt"
    vocab_file = "data/vocab"
    embedding_file = "data/glove.npz"
    glove_file = "data/glove.840B.300d.txt"
    dict_file = "data/dict.p"
    max_vocab_size = 5e4
    Vocab.build_vocab(input_file, vocab_file, dict_file, glove_file, embedding_file, max_vocab_size)
def encode_sentence(data_dir):
    vocab = Vocab(os.path.join(data_dir, 'dict_cleaned.txt'))
    split_paths = {}
    for split in ['train', 'test']:
        split_paths[split] = os.path.join(data_dir, split)
        encodes = []
        with open(os.path.join(split_paths[split], 'sents.txt'), 'r') as sf:
            for line in sf.readlines():
                sentence = line.strip().split()
                index = [str(vocab.encode(word)) for word in sentence]
                encode = " ".join(index)
                encodes.append(encode)
        with open(os.path.join(split_paths[split], 'index.txt'), 'w') as wf:
            wf.writelines('\n'.join(encodes))
def load_mr(data_dir):
    voc = Vocab(os.path.join(data_dir, 'dict_cleaned.txt'))
    split_paths = {}
    for split in ["train", "test"]:
        split_paths[split] = os.path.join(data_dir, split)
    data = {}
    max_sentence_length = 0
    count = 0
    sumlen = 0
    for split, path in split_paths.items():  # .items() (iteritems() is Python 2 only)
        sentencepath = os.path.join(path, "index.txt")
        labelpath = os.path.join(path, "labels.txt")
        splitdata = []
        with open(sentencepath, 'r') as sf, open(labelpath, 'r') as lf:
            for line, label in zip(sf.readlines(), lf.readlines()):
                sentence = line.strip()
                pair = {}
                pair['sentence'] = sentence
                pair['label'] = int(label.strip())
                splitdata.append(pair)
                if len(sentence) > max_sentence_length:
                    max_sentence_length = len(sentence)
                sumlen += len(sentence)
                count += 1
        data[split] = splitdata
    average_len = int(sumlen / count)
    return data, voc, max_sentence_length, average_len
def main():
    config = Config()
    vocab = Vocab(config.dict_file)
    dev_q, dev_c, dev_s, dev_spans, dev_s_idx, dev_answerable = load_data(config.dev_file, vocab, config.debug)
    dev_data = list(zip(dev_q, dev_c, dev_s, dev_s_idx, dev_answerable, dev_spans))
    ssnet = SSQANet(config)
    ssnet.build_model()
    ssnet.restore_session(config.dir_model)
    batches = batch_loader(dev_data, config.batch_size, shuffle=False)
    acc_history = []
    em_history = []
    for batch in batches:
        batch_q, batch_c, batch_s, batch_s_idx, batch_ans, batch_spans = zip(*batch)
        question_lengths, padded_q = zero_padding(batch_q, level=1)
        context_lengths, padded_c = zero_padding(batch_c, level=1)
        sequence_lengths, sentence_lengths, padded_s = zero_padding(batch_s, level=2)
        batch_acc, batch_em, batch_loss = ssnet.eval(
            padded_q, question_lengths, padded_c, context_lengths, padded_s,
            sequence_lengths, sentence_lengths, batch_s_idx, batch_ans, batch_spans)
        acc_history.append(batch_acc)
        em_history.append(batch_em)
    dev_acc = np.mean(acc_history)
    dev_em = np.mean(em_history)
    print("classification acc: {}".format(dev_acc))
    print("EM: {}".format(dev_em))
def __init__(self, model, optimizer, train_dataset, test_dataset,
             num_folds=config.num_folds, loss_function=None):
    self.num_folds = num_folds
    assert num_folds >= 1
    self.use_crf = config.use_crf
    vocab = Vocab.from_files([config.dataset_path, config.test_dataset_path], store=config.mapping_file)
    # self.train_dataset = ReviewDataset(config.dataset_path, preprocessed=False, vocab=vocab)
    # self.test_dataset = ReviewDataset(config.test_dataset_path, preprocessed=False, vocab=vocab)
    # self.model = model(vocab, embedding_path=config.word_embedding_path, use_crf=config.use_crf).to(config.device)
    self.train_dataset = train_dataset
    self.test_dataset = test_dataset
    self.model = model
    self.optimizer = optimizer(self.model.parameters())
    if not self.use_crf and loss_function is None:
        raise Exception('Loss function must be specified when crf is not being used')
    self.device = torch.device(config.device if torch.cuda.is_available() else 'cpu')
    self.model.to(self.device)
    print('using device: ', self.device)
optimizers = {  # dict name implied by optimizers[config.optimizer] below
    'adam': torch.optim.Adam,        # default lr=0.001
    'adamax': torch.optim.Adamax,    # default lr=0.002
    'asgd': torch.optim.ASGD,        # default lr=0.01
    'rmsprop': torch.optim.RMSprop,  # default lr=0.01
    'sgd': torch.optim.SGD,
}
models = {
    'lstm': LSTM,
    'attention_lstm': AttentionAspectExtraction,
    'global_attention_lstm': GlobalAttentionAspectExtraction,
    'hsan': HSAN,
    'decnn': DECNN,
}
vocab = Vocab.from_files([config.dataset_path, config.test_dataset_path], store=config.mapping_file)
train_dataset = ReviewDataset(config.dataset_path, preprocessed=False, vocab=vocab)
test_dataset = ReviewDataset(config.test_dataset_path, preprocessed=False, vocab=vocab)
network = models[config.model](vocab, embedding_path=config.word_embedding_path,
                               lambda1=config.lambda1, use_crf=config.use_crf).to(config.device)
trainer = Trainer(network, optimizers[config.optimizer], train_dataset, test_dataset,
        return label_prob, label_pred

    def forward(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover):
        # (batch_size, sequence_len, hidden_dim)
        rnn_out = self.lstm.get_all_atthiddens(word_inputs, word_seq_lengths, char_inputs,
                                               char_seq_lengths, char_seq_recover)
        # (batch_size, sequence_len, num_labels + 2)
        label_score = self.hidden2tag(rnn_out)
        label_score = self.dropfinal(label_score)
        return label_score

if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset
    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
def main(opts):
    if len(opts) == 0:
        raise ValueError("Usage: build_data.py <dataset>")
    dataset = opts[0]
    if dataset not in ['cateringServices', 'automotiveEngineering', 'bbn']:
        raise ValueError("Dataset must be either cateringServices, automotiveEngineering, or bbn.")
    cf.load_config(dataset)
    global MAX_SENT_LEN
    MAX_SENT_LEN = cf.MAX_SENT_LEN
    dataset_filenames = {
        "train": cf.TRAIN_FILENAME,
        "dev": cf.DEV_FILENAME,
    }

    # 1. Construct the Hierarchy by looking through each dataset for unique labels.
    hierarchy = build_hierarchy(dataset_filenames)

    # 2. Construct two empty Vocab objects (one for words, another for wordpieces), which will be populated in step 3.
    word_vocab = Vocab()
    wordpiece_vocab = Vocab()
    logger.info("Hierarchy contains %d categories unique to the test set." %
                len(hierarchy.get_categories_unique_to_test_dataset()))

    # 3. Build a data loader for each dataset (train, dev).
    data_loaders = {}
    for ds_name, filepath in dataset_filenames.items():
        logger.info("Loading %s dataset from %s." % (ds_name, filepath))
        dataset, sentences, total_wordpieces = build_dataset(filepath, hierarchy, word_vocab, wordpiece_vocab, ds_name)
        if ds_name == "dev":
            batch_size = 1
        else:
            batch_size = cf.BATCH_SIZE
        data_loader = DataLoader(dataset, batch_size=batch_size, pin_memory=True)
        data_loaders[ds_name] = data_loader
        logger.info("The %s dataset was built successfully." % ds_name)
        logger.info("Dataset contains %i wordpieces (including overly long sentences)." % total_wordpieces)
        if ds_name == "train":
            total_wordpieces_train = total_wordpieces

    BYPASS_SAVING = False
    if BYPASS_SAVING:
        logger.info("Bypassing file saving - training model directly")
        train_without_loading(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces_train)
        return

    logger.info("Saving data loaders to file...")
    dutils.save_obj_to_pkl_file(data_loaders, 'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
    logger.info("Saving vocabs and hierarchy to file...")
    dutils.save_obj_to_pkl_file(word_vocab, 'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    dutils.save_obj_to_pkl_file(wordpiece_vocab, 'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    dutils.save_obj_to_pkl_file(hierarchy, 'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
    dutils.save_obj_to_pkl_file(total_wordpieces_train, 'total_wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')
    dutils.save_list_to_file(word_vocab.ix_to_token, 'word vocab', cf.DEBUG_FOLDER + '/word_vocab.txt')
    dutils.save_list_to_file(wordpiece_vocab.ix_to_token, 'wordpiece vocab', cf.DEBUG_FOLDER + '/wordpiece_vocab.txt')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help="dataset path", type=str, default=None)
    parser.add_argument('--name', help="name of dataset", type=str, default=None)
    parser.add_argument('--data_indice', help="indices of dataset", type=str, default=None)
    parser.add_argument('--adjacency', help="use adjacency matrix", type=bool, default=False)
    parser.add_argument('--batch', help="batch size", type=int, default=128)
    parser.add_argument('--embed_size', help="embedding vector size", type=int, default=1024)
    parser.add_argument('--seq', help="sequence length", type=int, default=256)
    parser.add_argument('--layers', help="number of layers", type=int, default=6)
    parser.add_argument('--nhead', help="number of heads", type=int, default=4)
    parser.add_argument('--saved_model', help="dir of fine-tuned model", type=str)
    parser.add_argument('--matrix_position', help="position of adjacency matrix", type=str, default='atom')
    parser.add_argument('--num_workers', help="number of workers", type=int, default=0)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument('--type', type=str)
    # parser.add_argument('--type', help="type of dataset", type=str)
    arg = parser.parse_args()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print("device:", device)

    Smiles_vocab = Vocab()
    if arg.type == 'zinc':
        testdataset = SmilesDataset(arg.path, Smiles_vocab, seq_len=arg.seq, mat_position=arg.matrix_position)
    else:
        testdataset = ADMETDataset(arg.path, arg.name, Smiles_vocab, seq_len=arg.seq,
                                   trainType='Training', mat_position=arg.matrix_position)
    test_dataloader = DataLoader(testdataset, batch_size=arg.batch, num_workers=arg.num_workers)

    model = Smiles_BERT(len(Smiles_vocab), max_len=arg.seq, nhead=arg.nhead,
                        model_dim=arg.embed_size, nlayers=arg.layers, adj=arg.adjacency)
    value_layer = nn.Linear(arg.embed_size, 1)
    mask_layer = Masked_prediction(arg.embed_size, len(Smiles_vocab))
    model = BERT_double_tasks(model, value_layer, mask_layer)
    model.load_state_dict(torch.load(arg.saved_model))
    model.to(device)
    # if torch.cuda.device_count() > 1:
    #     model = nn.DataParallel(model)

    correct = 0
    total = 0
    predicted_list = np.array([])
    target_list = np.array([])
    total_loss = 0
    criterion = nn.L1Loss()
    model.eval()
    test_iter = tqdm.tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
    with torch.no_grad():
        for i, data in test_iter:
            data = {key: value.to(device) for key, value in data.items()}
            if data["smiles_bert_input"].size(0) != arg.batch:
                position_num = torch.arange(arg.seq).repeat(data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                qed_output, output = model(data["smiles_bert_input"], position_num,
                                           adj_mask=data["smiles_bert_adj_mask"],
                                           adj_mat=data["smiles_bert_adjmat"])
            else:
                qed_output, output = model(data["smiles_bert_input"], position_num)
            # output = output[:, 0]
            loss = criterion(qed_output, data["smiles_bert_value"].view(-1, 1))
            total_loss += loss.item()
            predicted = output.argmax(dim=-1)
            # print(predicted, data["smiles_bert_label"].shape)
            for k in range(predicted.size(0)):
                for j in range(predicted.size(1)):
                    if data["smiles_bert_label"][k][j].item() != 0:
                        correct += predicted[k][j].eq(data["smiles_bert_label"][k][j].item()).sum().item()
                        total += 1
            # predicted_list = np.append(predicted_list, predicted.cpu().detach().numpy())
            # target_list = np.append(target_list, data["smiles_bert_label"].cpu().detach().numpy())
            # _, predicted = torch.max(output.data, 1)
            # total += data["smiles_bert_label"].size(0)
            # correct += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()
    # predicted_list = np.reshape(predicted_list, (-1))
    # target_list = np.reshape(target_list, (-1))
    # print(predicted_list, target_list)
    print("Accuracy on testset: ", 100 * correct / total, "MAE on QED:", total_loss / len(test_iter))
if __name__ == '__main__':
    vocab_num = 100000
    pubmed_w2v_path = 'pubmed_w2v.txt'
    emb_path = 'emb_cnn.pt'
    opt = Options(config_vocab=False)
    pubmedreader = PubMedReader(opt)
    print('loading text data')
    train_sents, train_labels, test_sents, test_labels, valid_sents, valid_labels = pubmedreader.get_data()
    print('read vocab')
    fixed_vocab_set = read_vocab(pubmed_w2v_path)
    print('fixed vocab set size {}'.format(len(fixed_vocab_set)))
    print('build vocab')
    vocab = Vocab.build_vocab(train_sents, fixed_vocab_set=fixed_vocab_set)
    # vocab.append_sents(valid_sents, fixed_vocab_set=fixed_vocab_set)
    vocab.append_sents(test_sents, fixed_vocab_set=fixed_vocab_set)
    # print('vocab size {} before shrink'.format(vocab.vocab_len))
    vocab.shrink_vocab(2)
    print('vocab size {} after shrink'.format(vocab.vocab_len))
    print('read vec')
    word_list = [vocab.idx2word[i] for i in range(len(vocab.idx2word))]
    vec = read_vec(pubmed_w2v_path, word_list)
    assert vec.shape[0] == vocab.vocab_len
    print('build emb layer')
    emb = Embedding(vocab.vocab_len,
# word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
# idx_word = {}
max_story_size = max(map(len, (s for s, _, _ in data)))
mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
query_size = max(map(len, (q for _, q, _ in data)))
answer_size = max(map(len, (a for _, _, a in data)))
del data
sentence_size = max(query_size, sentence_size, answer_size)  # for the position
sentence_size += 1  # +1 for time words, +1 for go, +1 for eos
memory_size = min(FLAGS.memory_size, max_story_size)  # + FLAGS.additional_info_memory_size

vocab = Vocab()
vocab.add_vocab(words)
# for i in range(memory_size):
#     vocab.word_to_index('time{}'.format(i + 1))
S, Q, A, A_fact, A_weight = vectorize_data(train, vocab, sentence_size, memory_size, fact=FLAGS.model_type)

# Add time words/indexes
additional_vocab_size = 50  # for additional info from knowledge base
vocab_size = vocab.vocab_size  # + additional_vocab_size  # +1 for nil word
# sentence_size = max(sentence_size, 20)  # set the same certain length for decoder
        return batch_loss

    def inference(self, label_score, k=1):
        if self.num_labels > 2:
            label_prob = F.softmax(label_score, dim=-1)
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred

if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile
    filename = "../data/train.txt"
    vocab = Vocab(wl_th=None, cutoff=2)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Txtfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=4)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
def main():
    config = Config()
    vocab = Vocab(config.dict_file)
    q, c, s, spans, s_idx, answerable = load_data(config.train_file, vocab, config.debug)
    dev_q, dev_c, dev_s, dev_spans, dev_s_idx, dev_answerable = load_data(config.dev_file, vocab, config.debug)
    train_data = list(zip(q, c, s, s_idx, answerable, spans))
    dev_data = list(zip(dev_q, dev_c, dev_s, dev_s_idx, dev_answerable, dev_spans))
    ssnet = SSQANet(config)
    ssnet.build_model()
    best_score = 0
    for i in range(config.num_epochs):
        epoch = i + 1
        batches = batch_loader(train_data, config.batch_size, shuffle=False)
        for batch in batches:
            batch_q, batch_c, batch_s, batch_s_idx, batch_ans, batch_spans = zip(*batch)
            question_lengths, padded_q = zero_padding(batch_q, level=1)
            context_lengths, padded_c = zero_padding(batch_c, level=1)
            sequence_lengths, sentence_lengths, padded_s = zero_padding(batch_s, level=2)
            loss, acc, pred, step = ssnet.train(padded_q, question_lengths, padded_c, context_lengths,
                                                padded_s, sequence_lengths, sentence_lengths,
                                                batch_s_idx, batch_ans, batch_spans, config.dropout)
            train_batch_acc, train_batch_em, train_batch_loss = ssnet.eval(
                padded_q, question_lengths, padded_c, context_lengths, padded_s,
                sequence_lengths, sentence_lengths, batch_s_idx, batch_ans, batch_spans)
            if step % 100 == 0:
                print("epoch: %d, step:%d, loss:%.4f, acc:%.2f, em:%.2f" %
                      (epoch, step, loss, train_batch_acc, train_batch_em))
            if step % 1000 == 0:
                dev_batches = batch_loader(dev_data, config.batch_size, shuffle=False)
                total_em = []
                total_acc = []
                total_loss = []
                for dev_batch in dev_batches:
                    dev_batch_q, dev_batch_c, dev_batch_s, \
                        dev_batch_s_idx, dev_batch_ans, dev_batch_spans = zip(*dev_batch)
                    question_lengths, padded_q = zero_padding(dev_batch_q, level=1)
                    context_lengths, padded_c = zero_padding(dev_batch_c, level=1)
                    sequence_lengths, sentence_lengths, padded_s = zero_padding(dev_batch_s, level=2)
                    dev_batch_acc, dev_batch_em, dev_batch_loss = ssnet.eval(
                        padded_q, question_lengths, padded_c, context_lengths, padded_s,
                        sequence_lengths, sentence_lengths, dev_batch_s_idx, dev_batch_ans, dev_batch_spans)
                    total_loss.append(dev_batch_loss)
                    total_em.append(dev_batch_em)
                    total_acc.append(dev_batch_acc)
                dev_em = np.mean(total_em)
                dev_acc = np.mean(total_acc)
                dev_loss = np.mean(total_loss)
                ssnet.write_summary(dev_acc, dev_em, dev_loss, mode="dev")
                ssnet.write_summary(train_batch_acc, train_batch_em, train_batch_loss, mode="train")
                print("after %d step, dev_em:%.2f" % (step, dev_em))
                if dev_em > best_score:
                    best_score = dev_em
                    print("new score! em: %.2f, acc:%.2f" % (dev_em, dev_acc))
                    ssnet.save_session(config.dir_model)
        distance = 1 + pred_score - y_score.view(-1, 1)
        abs_distance = torch.max(distance, torch.zeros_like(distance))
        ranking = abs_distance.sum(-1)
        reg = self.regularized()
        return ranking.mean() + reg

if __name__ == "__main__":
    import random
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings
    Data2tensor.set_randseed(1234)
    use_cuda = torch.cuda.is_available()
    filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
    idf_file = "./idf.txt"
    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, unk_words=True, se_words=False)
    train_data = Txtfile(filename, firstline=False, word2idx=word2idx, limit=100000)
    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
import numpy as np
import json

from data_utils import Vocab

vocab = {}
vectors = []
index = 0

train_dataset = './datasets/Restaurants_Train.xml'
test_dataset = './datasets/Restaurants_Test.xml'
mapping_file = './embeddings/restaurant_mapping.json'
vocab = Vocab.from_files([train_dataset, test_dataset], store=mapping_file).get_vocab()

embedding = np.zeros((len(vocab), 200))
with open('embeddings/glove/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in vocab:
            vector = np.asarray(values[1:], dtype='float32')  # parse the vector entries as floats
            embedding[vocab[word], :100] = vector
print('glove done')

with open('embeddings/domain_embedding/restaurant_emb.vec', 'r', encoding='utf-8') as f:
    for line in f:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help="dataset path", type=str, default=None)
    parser.add_argument('--dataset', help="name of dataset", type=str)
    # parser.add_argument('--data_indice', help="indices of dataset", type=str)
    parser.add_argument('--adjacency', help="use adjacency matrix", type=bool, default=False)
    parser.add_argument('--batch', help="batch size", type=int, default=128)
    parser.add_argument('--epoch', help="epoch", type=int, default=100)
    parser.add_argument('--seq', help="sequence length", type=int, default=256)
    parser.add_argument('--lr', help="learning rate", type=float, default=0.0001)
    parser.add_argument('--embed_size', help="embedding vector size", type=int, default=1024)
    parser.add_argument('--model_dim', help="dim of transformer", type=int, default=1024)
    parser.add_argument('--layers', help="number of layers", type=int, default=6)
    parser.add_argument('--nhead', help="number of heads", type=int, default=4)
    parser.add_argument('--drop_rate', help="ratio of dropout", type=float, default=0)
    parser.add_argument('--matrix_position', help="position of adjacency matrix", type=str, default='atom')
    parser.add_argument('--warmup_step', help="warmup step for scheduled learning rate", type=int, default=10000)
    parser.add_argument('--num_workers', help="number of workers", type=int, default=0)
    parser.add_argument('--split', help="type of dataset split", type=str, default='scaffold')
    parser.add_argument('--saved_model', help="dir of pre-trained model", type=str)
    parser.add_argument("--seed", type=int, default=7)
    arg = parser.parse_args()

    _init_seed_fix(arg.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    if arg.dataset == "tox21":
        num_tasks = 12
    elif arg.dataset == "bace":
        num_tasks = 1
    elif arg.dataset == 'bbbp':
        num_tasks = 1
    elif arg.dataset == 'clintox':
        num_tasks = 2
    elif arg.dataset == 'sider':
        num_tasks = 27
    elif arg.dataset == 'toxcast':
        num_tasks = 617
    elif arg.dataset == 'muv':
        num_tasks = 17
    elif arg.dataset == 'hiv':
        num_tasks = 1

    Smiles_vocab = Vocab()
    # read data
    dataset = FinetuningDataset(arg.path, arg.dataset, Smiles_vocab, seq_len=arg.seq,
                                trainType='Training', mat_position=arg.matrix_position)
    print("Dataset loaded")

    if arg.split == 'scaffold':
        smiles_csv = pd.read_csv(arg.path + "/" + arg.dataset + ".csv", sep=',')
        smiles_list = smiles_csv['smiles'].tolist()
        train_idx, valid_idx, test_idx = scaffold_split(smiles_list)
    elif arg.split == 'random_scaffold':
        # read the csv here as well; previously smiles_csv was undefined in this branch
        smiles_csv = pd.read_csv(arg.path + "/" + arg.dataset + ".csv", sep=',')
        smiles_list = smiles_csv['smiles'].tolist()
        train_idx, valid_idx, test_idx = random_scaffold(smiles_list, arg.seed)
    else:
        indices = list(range(len(dataset)))
        split1, split2 = int(np.floor(0.1 * len(dataset))), int(np.floor(0.2 * len(dataset)))
        # np.random.seed(arg.seed)
        np.random.shuffle(indices)
        train_idx, valid_idx, test_idx = indices[split2:], indices[split1:split2], indices[:split1]

    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    test_sampler = SubsetRandomSampler(test_idx)

    # preprocessing - dataloader(train, valid, test)
    train_dataloader = DataLoader(dataset, batch_size=arg.batch, sampler=train_sampler,
                                  num_workers=arg.num_workers, pin_memory=True)
    valid_dataloader = DataLoader(dataset, batch_size=arg.batch, sampler=valid_sampler, num_workers=arg.num_workers)
    test_dataloader = DataLoader(dataset, batch_size=arg.batch, sampler=test_sampler, num_workers=arg.num_workers)

    model = Smiles_BERT(len(Smiles_vocab), max_len=arg.seq, nhead=arg.nhead, feature_dim=arg.embed_size,
                        feedforward_dim=arg.model_dim, nlayers=arg.layers, adj=arg.adjacency,
                        dropout_rate=arg.drop_rate)
    model.load_state_dict(torch.load(arg.saved_model))
    output_layer = nn.Linear(arg.embed_size, num_tasks)
    model = BERT_base(model, output_layer)
    # model = BERT_base_dropout(model, output_layer)
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        # model.to(device)

    optim = Adam(model.parameters(), lr=arg.lr, weight_decay=0)
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    # load model
    print("Start fine-tuning with seed", arg.seed)
    min_valid_loss = 100000
    counter = 0
    for epoch in range(arg.epoch):
        avg_loss = 0
        valid_avg_loss = 0
        total_hit = 0
        total = 0
        data_iter = tqdm.tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        # position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
        model.train()
        for i, data in data_iter:
            data = {key: value.to(device) for key, value in data.items()}
            position_num = torch.arange(arg.seq).repeat(data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                output = model.forward(data["smiles_bert_input"], position_num,
                                       adj_mask=data["smiles_bert_adj_mask"],
                                       adj_mat=data["smiles_bert_adjmat"])
            else:
                output = model.forward(data["smiles_bert_input"], position_num)
            output = output[:, 0]
            data["smiles_bert_label"] = data["smiles_bert_label"].view(output.shape).to(torch.float64)
            is_valid = data["smiles_bert_label"] ** 2 > 0
            loss = criterion(output.double(), (data["smiles_bert_label"] + 1) / 2)
            loss = torch.where(is_valid, loss, torch.zeros(loss.shape).to(loss.device).to(loss.dtype))
            optim.zero_grad()
            loss = torch.sum(loss) / torch.sum(is_valid)
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optim.step()
            avg_loss += loss.item()
            status = {"epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "loss": loss.item()}
            if i % 100 == 0:
                print(i)
                # data_iter.write(str(status))
        print("Epoch: ", epoch, "average loss: ", avg_loss / len(data_iter))

        model.eval()
        valid_iter = tqdm.tqdm(enumerate(valid_dataloader), total=len(valid_dataloader))
        # position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
        predicted_list = []
        target_list = []
        with torch.no_grad():
            for i, data in valid_iter:
                data = {key: value.to(device) for key, value in data.items()}
                position_num = torch.arange(arg.seq).repeat(data["smiles_bert_input"].size(0), 1).to(device)
                if arg.adjacency is True:
                    output = model.forward(data["smiles_bert_input"], position_num,
                                           adj_mask=data["smiles_bert_adj_mask"],
                                           adj_mat=data["smiles_bert_adjmat"])
                else:
                    output = model.forward(data["smiles_bert_input"], position_num)
                output = output[:, 0]
                data["smiles_bert_label"] = data["smiles_bert_label"].view(output.shape).to(torch.float64)
                is_valid = data["smiles_bert_label"] ** 2 > 0
                valid_loss = criterion(output.double(), (data["smiles_bert_label"] + 1) / 2)
                valid_loss = torch.where(is_valid, valid_loss,
                                         torch.zeros(valid_loss.shape).to(valid_loss.device).to(valid_loss.dtype))
                valid_loss = torch.sum(valid_loss) / torch.sum(is_valid)
                valid_avg_loss += valid_loss.item()
                predicted = torch.sigmoid(output)
                predicted_list.append(predicted)
                target_list.append(data["smiles_bert_label"])
                # _, predicted = torch.max(output.data, 1)
                # total += data["smiles_bert_label"].size(0)
                # total_hit += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()

        predicted_list = torch.cat(predicted_list, dim=0).cpu().numpy()
        target_list = torch.cat(target_list, dim=0).cpu().numpy()
        # predicted_list = np.reshape(predicted_list, -1)
        # target_list = np.reshape(target_list, -1)
        roc_list = []
        for i in range(target_list.shape[1]):
            if np.sum(target_list[:, i] == 1) > 0 and np.sum(target_list[:, i] == -1) > 0:
                is_valid = target_list[:, i] ** 2 > 0
                roc_list.append(roc_auc_score((target_list[is_valid, i] + 1) / 2, predicted_list[is_valid, i]))
        print("AUCROC: ", sum(roc_list) / len(roc_list))

        if valid_avg_loss < min_valid_loss:
            save_path = "../finetuned_model/" + str(arg.dataset) + "_epoch_" + str(epoch) + "_val_loss_" + \
                        str(round(valid_avg_loss / len(valid_dataloader), 5))
            torch.save(model.state_dict(), save_path + '.pt')
            model.to(device)
            min_valid_loss = valid_avg_loss
            counter = 0
        counter += 1
        if counter > 5:
            break

    # eval
    print("Finished. Start evaluation.")
    correct = 0
    total = 0
    predicted_list = []
    target_list = []
    model.eval()
    # test_iter = tqdm.tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    # position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            data = {key: value.to(device) for key, value in data.items()}
            position_num = torch.arange(arg.seq).repeat(data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                output = model(data["smiles_bert_input"], position_num,
                               adj_mask=data["smiles_bert_adj_mask"],
                               adj_mat=data["smiles_bert_adjmat"])
            else:
                output = model(data["smiles_bert_input"], position_num)
            output = output[:, 0]
            data["smiles_bert_label"] = data["smiles_bert_label"].view(output.shape).to(torch.float64)
            predicted = torch.sigmoid(output)
            predicted_list.append(predicted)
            target_list.append(data["smiles_bert_label"])
            # _, predicted = torch.max(output.data, 1)
            # total += data["smiles_bert_label"].size(0)
            # correct += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()
    predicted_list = torch.cat(predicted_list, dim=0).cpu().numpy()
    target_list = torch.cat(target_list, dim=0).cpu().numpy()
    # predicted_list = np.reshape(predicted_list, -1)
    # target_list = np.reshape(target_list, -1)
    roc_list = []
    for i in range(target_list.shape[1]):
        if np.sum(target_list[:, i] == 1) > 0 and np.sum(target_list[:, i] == -1) > 0:
            is_valid = target_list[:, i] ** 2 > 0
            roc_list.append(roc_auc_score((target_list[is_valid, i] + 1) / 2, predicted_list[is_valid, i]))
    print("AUCROC: ", sum(roc_list) / len(roc_list))

    print("Evaluate on min valid loss model")
    correct = 0
    total = 0
    predicted_list = []
    target_list = []
    model.load_state_dict(torch.load(save_path + '.pt'))
    model.eval()
    # test_iter = tqdm.tqdm(enumerate(test_dataloader), total=len(test_dataloader))
    # position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            data = {key: value.to(device) for key, value in data.items()}
            position_num = torch.arange(arg.seq).repeat(data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                output = model(data["smiles_bert_input"], position_num,
                               adj_mask=data["smiles_bert_adj_mask"],
                               adj_mat=data["smiles_bert_adjmat"])
            else:
                output = model(data["smiles_bert_input"], position_num)
            output = output[:, 0]
            data["smiles_bert_label"] = data["smiles_bert_label"].view(output.shape).to(torch.float64)
            predicted = torch.sigmoid(output)
            predicted_list.append(predicted)
            target_list.append(data["smiles_bert_label"])
            # _, predicted = torch.max(output.data, 1)
            # total += data["smiles_bert_label"].size(0)
            # correct += (torch.round(predicted) == data["smiles_bert_label"]).sum().item()
            # predicted_list = np.reshape(predicted_list, -1)
            # target_list = np.reshape(target_list, -1)
    predicted_list = torch.cat(predicted_list, dim=0).cpu().numpy()
    target_list = torch.cat(target_list, dim=0).cpu().numpy()
    roc_list = []
    for i in range(target_list.shape[1]):
        if np.sum(target_list[:, i] == 1) > 0 and np.sum(target_list[:, i] == -1) > 0:
            is_valid = target_list[:, i] ** 2 > 0
            roc_list.append(roc_auc_score((target_list[is_valid, i] + 1) / 2, predicted_list[is_valid, i]))
    print("AUCROC: ", sum(roc_list) / len(roc_list))
        # Calculate un-normalized scores
        decoded_scores = self.scorer_layer(h_n_drop)
        # YOUR CODE ENDS HERE
        #######################
        return decoded_scores, rec_hidden, rec_output

if __name__ == '__main__':
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD
    cutoff = 5
    wl_th = -1
    batch_size = 16
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.small.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)
    rec_type = "LSTM"
    ntoken = len(vocab.w2i)
    nlabels = len(vocab.l2i)
    emb_size = 50
    hidden_size = 64
    nlayers = 2
    dropout = 0.5
    bidirect = False
    # embedding_matrix = create_embedding_matrix(vocab, ntoken, emb_size)
    # print(embedding_matrix[5])
def main():
    # The dataset filenames are stored as a dictionary, e.g.
    # "train": "data/bbn/train.json",
    # "dev": "data/bbn/dev.json"... etc
    dataset_filenames = {
        "train": cf.TRAIN_FILENAME,
        "dev": cf.DEV_FILENAME,
        "test": cf.TEST_FILENAME,
    }

    # 1. Construct the Hierarchy by looking through each dataset for unique labels.
    hierarchy = build_hierarchy(dataset_filenames)

    # 2. Construct two empty Vocab objects (one for words, another for wordpieces), which will be populated in step 3.
    word_vocab = Vocab()
    wordpiece_vocab = Vocab()
    logger.info("Hierarchy contains %d categories unique to the test set." %
                len(hierarchy.get_categories_unique_to_test_dataset()))

    # 3. Build a data loader for each dataset (train, dev, test).
    # A 'data loader' is a PyTorch object that stores a dataset in a numeric format.
    data_loaders = {}
    # Iterate over each of the train, dev and test datasets.
    for ds_name, filepath in dataset_filenames.items():
        logger.info("Loading %s dataset from %s." % (ds_name, filepath))
        dataset, total_wordpieces = build_dataset(filepath, hierarchy, word_vocab, wordpiece_vocab, ds_name)
        data_loader = DataLoader(dataset, batch_size=cf.BATCH_SIZE, pin_memory=True)
        data_loaders[ds_name] = data_loader
        logger.info("The %s dataset was built successfully." % ds_name)
        logger.info("Dataset contains %i wordpieces (including overly long sentences)." % total_wordpieces)
        if ds_name == "train":
            total_wordpieces_train = total_wordpieces

    print(hierarchy.category_counts['train'])

    # This part is not necessary (it was added so that I didn't have to save the huge Wiki dataset to disk).
    # If BYPASS_SAVING is set to True, the model will start training and the data loaders will not be saved to disk.
    BYPASS_SAVING = False
    if BYPASS_SAVING:
        logger.info("Bypassing file saving - training model directly")
        train_without_loading(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces_train)
        # return
        logger.info("Evaluating directly")
        evaluate_without_loading(data_loaders, word_vocab, wordpiece_vocab, hierarchy, total_wordpieces_train)
        return

    # This part saves every data loader into the asset directory, so that they can be read during training.
    logger.info("Saving data loaders to file...")
    dutils.save_obj_to_pkl_file(data_loaders, 'data loaders', cf.ASSET_FOLDER + '/data_loaders.pkl')
    logger.info("Saving vocabs and hierarchy to file...")
    dutils.save_obj_to_pkl_file(word_vocab, 'word vocab', cf.ASSET_FOLDER + '/word_vocab.pkl')
    dutils.save_obj_to_pkl_file(wordpiece_vocab, 'wordpiece vocab', cf.ASSET_FOLDER + '/wordpiece_vocab.pkl')
    dutils.save_obj_to_pkl_file(hierarchy, 'hierarchy', cf.ASSET_FOLDER + '/hierarchy.pkl')
    dutils.save_obj_to_pkl_file(total_wordpieces_train, 'total_wordpieces', cf.ASSET_FOLDER + '/total_wordpieces.pkl')
    dutils.save_list_to_file(word_vocab.ix_to_token, 'word vocab', cf.DEBUG_FOLDER + '/word_vocab.txt')
    dutils.save_list_to_file(wordpiece_vocab.ix_to_token, 'wordpiece vocab', cf.DEBUG_FOLDER + '/wordpiece_vocab.txt')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help="dataset path", type=str, default=None)
    parser.add_argument('--save_path', help="trained model path", type=str, default=None)
    parser.add_argument('--adjacency', help="use adjacency matrix", type=bool, default=False)
    parser.add_argument('--batch', help="batch size", type=int, default=128)
    parser.add_argument('--epoch', help="epoch", type=int, default=50)
    parser.add_argument('--seq', help="sequence length", type=int, default=256)
    parser.add_argument('--lr', help="learning rate", type=float, default=0.0001)
    parser.add_argument('--embed_size', help="embedding vector size", type=int, default=1024)
    parser.add_argument('--model_dim', help="dim of transformer", type=int, default=1024)
    parser.add_argument('--layers', help="number of layers", type=int, default=6)
    parser.add_argument('--nhead', help="number of heads", type=int, default=4)
    parser.add_argument('--drop_rate', help="ratio of dropout", type=float, default=0)
    parser.add_argument('--matrix_position', help="position of adjacency matrix", type=str, default='atom')
    parser.add_argument('--warmup_step', help="warmup step for scheduled learning rate", type=int, default=10000)
    parser.add_argument('--num_workers', help="number of workers", type=int, default=0)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=7)
    # parser.add_argument('--savepath', help="saved model dir", type=str)
    arg = parser.parse_args()

    torch.manual_seed(arg.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)

    Smiles_vocab = Vocab()
    dataset = SmilesDataset(arg.path, Smiles_vocab, seq_len=arg.seq, mat_position=arg.matrix_position)
    print("Dataset loaded")
    train_dataloader = DataLoader(dataset, shuffle=True, batch_size=arg.batch,
                                  num_workers=arg.num_workers, pin_memory=True)

    model = Smiles_BERT(len(Smiles_vocab), max_len=arg.seq, nhead=arg.nhead, feature_dim=arg.embed_size,
                        feedforward_dim=arg.model_dim, nlayers=arg.layers, adj=arg.adjacency,
                        dropout_rate=arg.drop_rate)
    value_layer = nn.Linear(arg.embed_size, 1)
    mask_layer = Masked_prediction(arg.embed_size, len(Smiles_vocab))
    model = BERT_double_tasks(model, value_layer, mask_layer)
    model.to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    optim = Adam(model.parameters(), lr=arg.lr, weight_decay=0)
    scheduled_optim = ScheduledOptim(optim, arg.embed_size, n_warmup_steps=arg.warmup_step)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    criterion2 = nn.L1Loss()

    print("Start pre-training")
    for epoch in range(arg.epoch):
        avg_loss = 0
        # hit = 0
        # total = 0
        data_iter = tqdm.tqdm(enumerate(train_dataloader), total=len(train_dataloader))
        position_num = torch.arange(arg.seq).repeat(arg.batch, 1).to(device)
        model.train()
        for i, data in data_iter:
            data = {key: value.to(device) for key, value in data.items()}
            if data["smiles_bert_input"].size(0) != arg.batch:
                position_num = torch.arange(arg.seq).repeat(data["smiles_bert_input"].size(0), 1).to(device)
            if arg.adjacency is True:
                value_out, mask_out = model.forward(data["smiles_bert_input"], position_num,
                                                    adj_mask=data["smiles_bert_adj_mask"],
                                                    adj_mat=data["smiles_bert_adjmat"])
            else:
                value_out, mask_out = model.forward(data["smiles_bert_input"], position_num)
            # print(output.shape, data["smiles_bert_label"].shape)
            # print(output, data["smiles_bert_label"])
            loss = criterion(mask_out.transpose(1, 2), data["smiles_bert_label"]) + \
                   criterion2(value_out, data["smiles_bert_value"].view(-1, 1))
            scheduled_optim.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            scheduled_optim.step_and_update_lr()
            avg_loss += loss.item()
            status = {"epoch": epoch, "iter": i, "avg_loss": avg_loss / (i + 1), "loss": loss.item()}
            if i % 1000 == 0:
                data_iter.write(str(status))
            if i % 5000 == 0:
                # print()
                torch.save(model.module.state_dict(),
                           str(arg.save_path) + "/temp_model_" + "epoch_" + str(epoch) + "_" + str(i) + "_" +
                           str(round(avg_loss / (i + 1), 5)))
            # hit = output.argmax(dim=-1).eq(data["smiles_bert_label"])
        print("Epoch: ", epoch, "average loss: ", avg_loss / len(data_iter))
        save_path = str(arg.save_path) + "/nlayers_" + str(arg.layers) + "_nhead_" + str(arg.nhead) + \
                    "_adj_" + str(arg.adjacency) + "_epoch_" + str(epoch) + "_loss_" + \
                    str(round(avg_loss / len(data_iter), 5))
        torch.save(model.module.bert.state_dict(), save_path + '.pt')
        model.to(device)
        print("model saved")

    correct = 0
    total = 0
    predicted_list = np.array([])
    target_list = np.array([])
    total_loss = 0
    '''
    def inference(self, label_score, k=1):
        label_prob = F.softmax(label_score, dim=-1)
        label_prob, label_pred = label_prob.data.topk(k)
        return label_prob, label_pred

if __name__ == '__main__':
    from data_utils import Vocab, Txtfile, Data2tensor, seqPAD, PAD
    cutoff = 5
    wl_th = -1
    batch_size = 16
    bptt = 10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)
    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches(train_data, batch_size=batch_size)
    inpdata = []
    outdata = []
    for sent in train_batch:
        word_pad_ids, seq_lens = seqPAD.pad_sequences(sent, pad_tok=vocab.w2i[PAD])
        data_tensor = Data2tensor.idx2tensor(word_pad_ids)
        for i in range(0, data_tensor.size(1) - 1, bptt):
            data, target = vocab.bptt_batch(data_tensor, i, bptt)
            inpdata.append(data)
        return batch_loss

    def inference(self, label_score, k=1):
        if self.num_labels > 2:
            label_prob = F.softmax(label_score, dim=-1)
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())  # F.sigmoid is deprecated
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred

if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
class BaseModel():
    def load_data(self, datafile):
        dataset = pd.read_csv(datafile)
        if self.debug:
            dataset = dataset.iloc[:3000]
        text = 'comment_text'
        self.X = dataset[text].values
        labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        # labels = ['severe_toxic']
        assert len(labels) == self.config.label_size
        self.y = dataset[labels].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=124)

        ## Build the vocabulary using the train data.
        self.vocab = Vocab()
        train_sents = [get_words(line) for line in self.X_train]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)),
                             threshold=self.config.min_word_freq)
        print('Training on {} samples and validating on {} samples'.format(
            len(self.X_train), len(self.X_val)))
        print()

        self.embedding_matrix = np.random.uniform(
            -0.005, 0.005, size=[len(self.vocab), self.config.embed_size]).astype('float32')
        with tf.variable_scope("Embeddings") as scope:
            embedding = tf.get_variable("Embeds", initializer=self.embedding_matrix, dtype=tf.float32)
        if self.debug:
            return

        ## Populate embedding matrix from pre-trained word embeddings
        pretrained_index = {}
        with open('./WordVectors/crawl-300d-2M.vec') as fh:
            for line in fh:
                word_vec = line.strip().split()
                word = word_vec[0]
                vector = np.asarray(word_vec[1:], dtype='float32')
                pretrained_index[word] = vector
        pw = 0.0
        for word, idx in self.vocab.word_to_idx.items():
            pretrained_vector = pretrained_index.get(word)
            if pretrained_vector is not None:
                self.embedding_matrix[idx] = pretrained_vector
                pw += 1
        print("Found pretrained vectors for {:.2f}% of data".format(pw / len(self.vocab) * 100))
        del pretrained_index  ## Done only for memory constraint. Don't do this!!

    def input_embeddings(self):
        with tf.variable_scope("Embeddings", reuse=True):
            embedding = tf.get_variable("Embeds")
        # Look up the trainable variable rather than the numpy initializer,
        # so gradients reach the embedding table.
        input_vectors = tf.nn.embedding_lookup(embedding, self.input_placeholder)
        return input_vectors

    def core_module(self):
        return

    def calculate_loss(self, output):
        labels = self.label_placeholder
        log_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=labels))
        l2_loss = 0
        for weights in tf.trainable_variables():
            if ("Bias" not in weights.name) and ("Embeddings" not in weights.name):
                l2_loss += (self.config.l2 * tf.nn.l2_loss(weights))
        loss = log_loss + l2_loss
        return loss

    def training_operation(self, loss):
        return tf.train.AdamOptimizer(learning_rate=self.config.lr).minimize(loss)

    def build_feeddict(self):
        return