def main(opts):
    line_pairs, vocab_size, idx_dict = load_data()
    print_data_stats(line_pairs, vocab_size, idx_dict)

    # Train Test Split
    num_lines = len(line_pairs)
    num_train = int(0.8 * num_lines)
    train_pairs, test_pairs = line_pairs[:num_train], line_pairs[num_train:]

    line_pairs = train_pairs

    # Split the line pairs into an 80% train and 20% val split
    num_lines = len(line_pairs)
    num_train = int(0.8 * num_lines)
    train_pairs, val_pairs = line_pairs[:num_train], line_pairs[num_train:]

    # Group the data by the lengths of the source and target words, to form batches
    train_dict = create_dict(train_pairs)
    val_dict = create_dict(val_pairs)
    test_dict = create_dict(test_pairs)

    ##########################################################################
    ### Setup: Create Encoder, Decoder, Learning Criterion, and Optimizers ###
    ##########################################################################
    encoder = models.GRUEncoder(vocab_size=vocab_size, hidden_size=opts.hidden_size, opts=opts)

    if opts.no_attention:
        decoder = models.NoAttentionDecoder(vocab_size=vocab_size, hidden_size=opts.hidden_size)
    else:
        decoder = models.AttentionDecoder(vocab_size=vocab_size, hidden_size=opts.hidden_size)

    if opts.cuda:
        encoder.cuda()
        decoder.cuda()
        print("Moved models to GPU!")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=opts.learning_rate)

    try:
        training_loop(train_dict, val_dict, idx_dict, encoder, decoder, criterion, optimizer, opts)

        # Evaluation on 20% Test Data
        print("\nEvaluation on 20% Test Data")
        test_loss = evaluate(test_dict, encoder, decoder, idx_dict, criterion, opts)
        print(f"Test loss {test_loss}")
    except KeyboardInterrupt:
        print('Exiting early from training.')
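# A hypothetical way to construct the `opts` object that main() above reads.
# The attribute names below are only the ones accessed inside main()
# (hidden_size, no_attention, cuda, learning_rate); training_loop() and
# evaluate() may require additional fields that are not shown here.
from argparse import Namespace

opts = Namespace(
    hidden_size=20,       # GRU hidden-state size passed to encoder and decoder
    no_attention=False,   # False -> use the AttentionDecoder
    cuda=False,           # True -> move encoder and decoder to the GPU
    learning_rate=0.01,   # Adam learning rate
)
# main(opts)  # assumes load_data, create_dict, training_loop, evaluate and models are importable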
def eval(self, train_pos, train_neg, test_pos, test_neg, embeddings_file=None, checkpoint_file=None):
    phrase_dic = clean_dictionary(pickle.load(open(config.phrase_dic, 'rb')))
    if self.model_type == 'rnn':
        if use_cuda:
            print('GPU available!!')
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        modelcheckpoint = torch.load(checkpoint_file, map_location=device)
        vocabulary_size = len(modelcheckpoint['word2idx'])
        model = models.GRUEncoder(vocabulary_size,
                                  self.embedding_dim,
                                  self.rnn_size,
                                  self.neg_sample_num,
                                  self.batch_size,
                                  self.window_size)
        print_params(model)
        #
        if use_cuda:
            print('GPU available!!')
            model.cuda()
        #
        model.eval()
        model.load_state_dict(modelcheckpoint['state_dict'])
        #
        print('Number of positive training samples: ', len(train_pos))
        print('Number of negative training samples: ', len(train_neg))
        print('Number of positive testing samples: ', len(test_pos))
        print('Number of negative testing samples: ', len(test_neg))
        word2idx = modelcheckpoint['word2idx']
        node_embeddings = self.create_node_embeddings(model, phrase_dic, word2idx)
    else:
        node_embeddings = load_embeddings(embeddings_file)

    if config.evaluate_cosine:
        # first calculate the cosine similarity for every edge in test_pos and in test_neg
        cosine_test_pos = get_cos_embedding(test_pos, node_embeddings, phrase_dic)
        cosine_test_neg = get_cos_embedding(test_neg, node_embeddings, phrase_dic)

        # turn negative values to zeros
        cosine_test_pos[cosine_test_pos < 0] = 0
        cosine_test_neg[cosine_test_neg < 0] = 0

        # the predictions are the cosine similarities and we also create the labels.
        test_preds = np.concatenate([cosine_test_pos, cosine_test_neg])
        test_labels = np.zeros(test_preds.shape[0])
        test_labels[:cosine_test_pos.shape[0]] = 1
        test_auc = roc_auc_score(test_labels, test_preds)
        print('node2vec Test AUC score: ', str(test_auc))

    if config.evaluate_lr:
        test_neg = pickle.load(open(config.test_neg, 'rb'))
        train_pos_edge_embs = get_edge_embeddings(train_pos, node_embeddings, self.model_type, phrase_dic)
        train_neg_edge_embs = get_edge_embeddings(train_neg, node_embeddings, self.model_type, phrase_dic)
        train_set = np.concatenate([train_pos_edge_embs, train_neg_edge_embs])

        # labels: 1-> link exists, 0-> false edge
        train_labels = np.zeros(len(train_set))
        train_labels[:len(train_pos_edge_embs)] = 1

        # for testing
        test_pos_edge_embs = get_edge_embeddings(test_pos, node_embeddings, self.model_type, phrase_dic)
        test_neg_edge_embs = get_edge_embeddings(test_neg, node_embeddings, self.model_type, phrase_dic)
        test_set = np.concatenate([test_pos_edge_embs, test_neg_edge_embs])

        # labels: 1-> link exists, 0-> false edge
        test_labels = np.zeros(len(test_set))
        test_labels[:len(test_pos_edge_embs)] = 1

        # train the classifier and evaluate in the test set
        # shuffle train set
        idx_list = [i for i in range(len(train_labels))]
        shuffle(idx_list)
        train_set = train_set[idx_list]
        train_labels = train_labels[idx_list]

        # shuffle test set
        idx_list = [i for i in range(len(test_labels))]
        shuffle(idx_list)
        test_set = test_set[idx_list]
        test_labels = test_labels[idx_list]

        classifier = LogisticRegression()
        classifier.fit(train_set, train_labels)

        # evaluate
        test_preds = classifier.predict_proba(test_set)
        false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, test_preds[:, 1])
        average_precision = average_precision_score(test_labels, test_preds[:, 1])
        test_auc = auc(false_positive_rate, true_positive_rate)
        test_roc = roc_auc_score(test_labels, test_preds[:, 1])
        print('node2vec Test ROC score: ', str(test_roc))
        print('node2vec Test AUC score: ', str(test_auc))
        print('node2vec Test AP score: ', str(average_precision))
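# A minimal, self-contained sketch of the logistic-regression link-prediction
# evaluation performed in eval() above, on synthetic data. It assumes edge
# features are built with a Hadamard (element-wise) product of the endpoint
# embeddings, which is one common node2vec choice; the real
# get_edge_embeddings() may combine the embeddings differently.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def hadamard_edge_embeddings(edges, node_embeddings):
    # Element-wise product of the two endpoint embeddings of each edge.
    return np.array([node_embeddings[u] * node_embeddings[v] for u, v in edges])

rng = np.random.default_rng(0)
node_embeddings = {i: rng.normal(size=16) for i in range(100)}

def random_edges(n):
    return [(int(rng.integers(100)), int(rng.integers(100))) for _ in range(n)]

train_pos, train_neg = random_edges(200), random_edges(200)
test_pos, test_neg = random_edges(50), random_edges(50)

# Labels: 1 -> link exists, 0 -> false edge (as in eval() above).
train_set = np.concatenate([hadamard_edge_embeddings(train_pos, node_embeddings),
                            hadamard_edge_embeddings(train_neg, node_embeddings)])
train_labels = np.zeros(len(train_set))
train_labels[:len(train_pos)] = 1

test_set = np.concatenate([hadamard_edge_embeddings(test_pos, node_embeddings),
                           hadamard_edge_embeddings(test_neg, node_embeddings)])
test_labels = np.zeros(len(test_set))
test_labels[:len(test_pos)] = 1

classifier = LogisticRegression().fit(train_set, train_labels)
test_preds = classifier.predict_proba(test_set)[:, 1]
print('Test AUC score: ', roc_auc_score(test_labels, test_preds))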
def train(self):
    # initialize the model
    if self.model_type == 'rnn':
        model = models.GRUEncoder(self.vocabulary_size,
                                  self.embedding_dim,
                                  self.rnn_size,
                                  self.neg_sample_num,
                                  self.batch_size,
                                  self.window_size)
    else:
        model = models.AverageNode2Vec(self.vocabulary_size,
                                       self.embedding_dim,
                                       self.neg_sample_num,
                                       self.batch_size,
                                       self.window_size)
    print_params(model)
    params = model.parameters()
    if use_cuda:
        print('GPU available!!')
        model.cuda()

    if self.model_type == 'rnn':
        optimizer = optim.Adam(params, lr=config.lr)
    else:
        optimizer = optim.SparseAdam(params, lr=config.lr)

    dataset = Node2VecDataset(self.utils, self.neg_sample_num)
    dataloader = DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=False)
    model.train()

    for epoch in range(self.epochs):
        batch_num = 0
        batch_costs = []
        last_batch_num = -1

        # if we resume training load the last checkpoint
        if config.resume_training:
            if use_cuda:
                print('GPU available..will resume training!!')
                device = torch.device('cuda')
            else:
                device = torch.device('cpu')
            modelcheckpoint = torch.load(os.path.join(config.checkpoint_dir, config.checkpoint_to_load),
                                         map_location=device)
            model.load_state_dict(modelcheckpoint['state_dict'])
            optimizer.load_state_dict(modelcheckpoint['optimizer'])
            last_batch_num = modelcheckpoint['batch_num']
            self.word2idx = modelcheckpoint['word2idx']
            # last_loss = modelcheckpoint['loss']
            print("We stopped in {} batch".format(last_batch_num))
        #
        iterator = tqdm(dataloader)
        for sample in iterator:
            # if we resume training--continue from the last batch we stopped
            if batch_num <= last_batch_num:
                batch_num += 1
                continue
            ###-----------
            phr = sample['center']
            pos_context = sample['context']
            neg_v = np.random.choice(self.utils.sample_table, size=(len(phr) * self.neg_sample_num)).tolist()
            ###-----------
            # -----------
            phr = [phr2idx(self.utils.phrase_dic[phr_id.item()], self.word2idx) for phr_id in phr]
            pos_context = [phr2idx(self.utils.phrase_dic[item.item()], self.word2idx) for item in pos_context]
            neg_v = [phr2idx(self.utils.phrase_dic[item], self.word2idx) for item in neg_v]
            # -----------
            # --------------
            optimizer.zero_grad()
            loss = model(phr, pos_context, neg_v)
            loss.backward()
            optimizer.step()
            batch_costs.append(loss.cpu().item())
            # --------------

            # print the average cost every 5000 batches
            if batch_num % 5000 == 0:
                print('Batches Average Loss: {}, Batches: {} '.format(
                    sum(batch_costs) / float(len(batch_costs)), batch_num))
                batch_costs = []

            # save the model every 300000 batches
            if batch_num % 300000 == 0:
                print("Saving at {} batches".format(batch_num))
                state = {'epoch': epoch + 1,
                         'state_dict': model.state_dict(),
                         'optimizer': optimizer.state_dict(),
                         'word2idx': self.word2idx,
                         'idx2word': self.utils.idx2word,
                         'batch_num': batch_num,
                         'loss': loss.cpu().item()}
                save_checkpoint(state,
                                filename=self.odir_checkpoint + '{}_checkpoint_batch_{}.pth.tar'.format(
                                    config.dataset_name, batch_num))
            ###
            batch_num += 1

        # reset the yielder on the dataset class
        if epoch + 1 != self.epochs:
            dataset.reset_generator()

        # save the model on each epoch
        state = {'epoch': epoch + 1,
                 'state_dict': model.state_dict(),
                 'optimizer': optimizer.state_dict(),
                 'word2idx': self.word2idx,
                 'idx2word': self.utils.idx2word}
        save_checkpoint(state, filename=self.odir_checkpoint + config.checkpoint_name.format(epoch + 1))
        # TODO do something better here
        config.checkpoint_name = config.checkpoint_name.format(epoch + 1)

    # training has finished..save the word embeddings
    print("Optimization Finished!")
    self.wv = model.save_embeddings(file_name=self.odir_embeddings + self.output_file,
                                    idx2word=self.utils.idx2word,
                                    use_cuda=True)
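# The loss returned by model(phr, pos_context, neg_v) is not shown in this
# snippet. Below is a minimal sketch of the skip-gram negative-sampling
# objective that node2vec-style models typically optimize, assuming fixed
# per-phrase embedding vectors rather than the GRU phrase encodings used by
# models.GRUEncoder above.
import torch
import torch.nn.functional as F

def negative_sampling_loss(center, context, negatives):
    """center: (batch, dim), context: (batch, dim), negatives: (batch, k, dim)."""
    pos_score = torch.sum(center * context, dim=1)                    # (batch,)
    neg_score = torch.bmm(negatives, center.unsqueeze(2)).squeeze(2)  # (batch, k)
    pos_loss = F.logsigmoid(pos_score)                # reward observed (center, context) pairs
    neg_loss = F.logsigmoid(-neg_score).sum(dim=1)    # push sampled negatives away
    return -(pos_loss + neg_loss).mean()

# Toy batch: 8 center phrases, 5 negatives each, 16-dimensional embeddings.
center = torch.randn(8, 16, requires_grad=True)
context = torch.randn(8, 16)
negatives = torch.randn(8, 5, 16)
loss = negative_sampling_loss(center, context, negatives)
loss.backward()
print(loss.item())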