def validate(self, val_kg):
    losses = []
    # use_cuda='all' asserts that a GPU is available; fall back to a CPU loader.
    try:
        dataloader = DataLoader(val_kg, batch_size=self.b_size, use_cuda='all')
    except AssertionError:
        dataloader = DataLoader(val_kg, batch_size=self.b_size)
    with torch.no_grad():  # no gradients needed during validation
        for batch in dataloader:
            h, t, r = batch
            n_h, n_t = self.sampler.corrupt_batch(h, t, r)
            pos, neg = self.model(h, t, n_h, n_t, r)
            loss = self.loss_fn(pos, neg)
            losses.append(loss.item())
    return np.mean(losses)
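# Usage sketch (illustrative, not from the source): how validate() might be called.
# The wrapper class name `CustomBilinearModel` is an assumption; the split uses
# torchkge's KnowledgeGraph.split_kg(), which returns train/val/test KGs.
#
#     kg_train, kg_val, kg_test = kg.split_kg(share=0.8, validation=True)
#     wrapper = CustomBilinearModel(kg_train, 'DistMult', ts=0.8)
#     print('mean validation loss:', wrapper.validate(kg_val))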
def __init__(self, kg, model_type, ts, **kwargs):
    self.kg = kg
    self.truth_share = ts
    self.emb_dim = kwargs.pop('emb_dim', 250)
    self.model_type = model_type
    self.model = getattr(torchkge.models.bilinear, self.model_type + 'Model')(
        emb_dim=self.emb_dim, n_entities=self.kg.n_ent, n_relations=self.kg.n_rel)

    # Pick the smallest unused run index for this model type.
    all_is = [int(d.split('_')[1]) for d in os.listdir(wot.models_path)
              if os.path.isdir(join(wot.models_path, d)) and f'{self.model_type}_' in d]
    i = [x for x in range(1, len(all_is) + 2) if x not in all_is][0]
    self.model_path = join(wot.models_path, f'{self.model_type}_{str(i).zfill(2)}')
    os.makedirs(self.model_path, exist_ok=True)
    self.logfile = join(self.model_path, 'log.txt')

    ## Hyperparameters
    self.lr = kwargs.pop('lr', 0.0004)
    self.n_epochs = kwargs.pop('n_epochs', 100)
    self.b_size = kwargs.pop('b_size', 32)
    self.logline(tabulate([(k, v) for k, v in vars(self).items()],
                          headers=['variable', 'value']))

    try:
        self.dataloader = DataLoader(self.kg, batch_size=self.b_size, use_cuda='all')
    except AssertionError:
        self.dataloader = DataLoader(self.kg, batch_size=self.b_size)

    ## Logger
    self.epochs = 0
    self.tr_losses = []
    self.best_epoch = -1
    self.val_losses = []
    self.val_epochs = []
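# Usage sketch (illustrative, not from the source): instantiating the wrapper.
# `CustomBilinearModel` is an assumed class name; model_type must name a class in
# torchkge.models.bilinear (e.g. 'DistMult', 'RESCAL', 'ComplEx', 'HolE', 'Analogy').
#
#     wrapper = CustomBilinearModel(kg_train, 'DistMult', ts=0.8,
#                                   emb_dim=200, lr=1e-3, n_epochs=50, b_size=512)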
def main():
    # Define some hyper-parameters for training
    global optimizer
    benchmarks = 'GeoDBpedia21'
    model_name = 'TransR_GDR'
    opt_method = 'Adam'  # one of "Adagrad", "Adadelta", "Adam", "SGD"
    GDR = True  # whether to use geographic coordinate information
    emb_dim = 100  # TransE model
    ent_dim = emb_dim
    rel_dim = emb_dim
    lr = 0.001
    margin = 0.5
    n_epochs = 20000
    train_b_size = 256  # batch size for training
    eval_b_size = 64  # batch size for valid/test evaluation
    validation_freq = 10  # evaluate on the validation set (and save the best model) every N epochs
    require_improvement = validation_freq * 3  # stop if validation Hit@k has not improved for this many epochs
    model_save_path = './checkpoint/' + benchmarks + '_' + model_name + '_' + opt_method + '.ckpt'  # best Hit@k (entity) checkpoint
    device = 'cuda:0' if cuda.is_available() else 'cpu'

    # Load dataset
    module = getattr(import_module('torchkge.models'), model_name + 'Model')
    load_data = getattr(import_module('torchkge.utils.datasets'), 'load_' + benchmarks)
    print('Loading data...')
    kg_train, kg_val, kg_test = load_data(GDR=GDR)
    print(f'Train set: {kg_train.n_ent} entities, {kg_train.n_rel} relations, {kg_train.n_facts} triplets.')
    print(f'Valid set: {kg_val.n_facts} triplets, Test set: {kg_test.n_facts} triplets.')

    # Define the model and criterion
    print('Loading model...')
    if 'TransE' in model_name:
        model = module(emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
    else:
        model = module(ent_dim, rel_dim, kg_train.n_ent, kg_train.n_rel)
    criterion = MarginLoss(margin)

    # Move everything to CUDA if available
    if device == 'cuda:0':
        cuda.empty_cache()
        model.to(device)
        criterion.to(device)
        dataloader = DataLoader(kg_train, batch_size=train_b_size, use_cuda='all')
    else:
        dataloader = DataLoader(kg_train, batch_size=train_b_size, use_cuda=None)

    # `optimizer` is the project-level factory that builds the torch optimizer
    # from opt_method (hence the `global optimizer` declaration above).
    optimizer = optimizer(model, opt_method=opt_method, lr=lr)
    sampler = BernoulliNegativeSampler(kg_train)

    start_epoch = 1
    best_score = float('-inf')
    if os.path.exists(model_save_path):  # resume training from an existing checkpoint
        start_epoch, best_score = load_ckpt(model_save_path, model, optimizer)
        print(f'loading ckpt successful, start on epoch {start_epoch}...')

    print(model)
    print('lr: {}, margin: {}, dim: {}, total epochs: {}, device: {}, batch size: {}, optim: {}, GDR: {}'
          .format(lr, margin, emb_dim, n_epochs, device, train_b_size, opt_method, GDR))
    print('Training...')
    last_improve = start_epoch  # epoch of the last validation-set improvement
    start = time.time()

    for epoch in range(start_epoch, n_epochs + 1):
        running_loss = 0.0
        model.train()
        for i, batch in enumerate(dataloader):
            if GDR:
                h, t, r, point = batch[0], batch[1], batch[2], batch[3]
                n_h, n_t = sampler.corrupt_batch(h, t, r)  # 1:1 negative sampling
                n_point = id2point(n_h, n_t, kg_train.id2point)
                optimizer.zero_grad()
                # forward + backward + optimize
                pos, neg = model(h, t, n_h, n_t, r)
                loss = criterion(pos, neg, point, n_point)
            else:
                h, t, r = batch[0], batch[1], batch[2]
                n_h, n_t = sampler.corrupt_batch(h, t, r)
                optimizer.zero_grad()
                pos, neg = model(h, t, n_h, n_t, r)
                loss = criterion(pos, neg)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        model.normalize_parameters()

        # Periodic validation: keep the checkpoint with the best filtered Hit@10.
        if epoch % validation_freq == 0:
            create_dir_not_exists('./checkpoint')
            model.eval()
            evaluator = LinkPredictionEvaluator(model, kg_val)
            evaluator.evaluate(b_size=eval_b_size, verbose=False)
            _, hit_at_k = evaluator.hit_at_k(10)  # filtered validation Hit@10
            print('Epoch [{:>5}/{:>5}] '.format(epoch, n_epochs), end='')
            if hit_at_k > best_score:
                save_ckpt(model, optimizer, epoch, best_score, model_save_path)
                best_score = hit_at_k
                improve = '*'  # mark improved results with '*'
                last_improve = epoch  # a higher validation Hit@10 counts as an improvement
            else:
                improve = ''
            msg = '| Train loss: {:>8.3f}, Val Hit@10: {:>5.2%}, Time {} {}'
            print(msg.format(running_loss / len(dataloader), hit_at_k, time_since(start), improve))
            if epoch - last_improve > require_improvement:
                # Early stopping: validation Hit@10 has not improved for too long.
                print("\nNo optimization for a long time, auto-stopping...")
                break

    print('\nTraining done, start evaluating on test data...')
    print('model name: {}, lr: {}, dim: {}, device: {}, eval batch size: {}, optim: {}, GDR: {}'
          .format(model_name, lr, emb_dim, device, eval_b_size, opt_method, GDR))

    # Test the best checkpoint on the test dataset
    load_ckpt(model_save_path, model, optimizer)
    model.eval()
    lp_evaluator = LinkPredictionEvaluator(model, kg_test)
    lp_evaluator.evaluate(eval_b_size, verbose=False)
    lp_evaluator.print_results()
    rp_evaluator = RelationPredictionEvaluator(model, kg_test)
    rp_evaluator.evaluate(eval_b_size, verbose=False)
    rp_evaluator.print_results()
    print(f'Total time cost: {time_since(start)}')
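# Hedged sketch (assumption, not the project's actual helpers): save_ckpt and
# load_ckpt are called in main() but not defined in this file. A minimal
# implementation consistent with how they are called could look like this:

import torch

def save_ckpt(model, optimizer, epoch, best_score, path):
    # Persist everything needed to resume training after `epoch`.
    torch.save({'epoch': epoch,
                'best_score': best_score,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()}, path)

def load_ckpt(path, model, optimizer):
    # Restore model weights and optimizer state in place;
    # return the epoch to resume from and the best score so far.
    ckpt = torch.load(path)
    model.load_state_dict(ckpt['model_state_dict'])
    optimizer.load_state_dict(ckpt['optimizer_state_dict'])
    return ckpt['epoch'] + 1, ckpt['best_score']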
# In[43]:

# Move everything to CUDA if available
if cuda.is_available():
    cuda.empty_cache()
    model.cuda()
    criterion.cuda()

# In[20]:

# Define the torch optimizer to be used
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)
sampler = BernoulliNegativeSampler(kg_train)
dataloader = DataLoader(kg_train, batch_size=b_size, use_cuda='all')

iterator = tqdm(range(n_epochs), unit='epoch')
for epoch in iterator:
    running_loss = 0.0
    for i, batch in enumerate(dataloader):
        h, t, r = batch[0], batch[1], batch[2]
        n_h, n_t = sampler.corrupt_batch(h, t, r)
        optimizer.zero_grad()
        # forward + backward + optimize
        pos, neg = model(h, t, n_h, n_t, r)
        loss = criterion(pos, neg)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    iterator.set_description('Epoch {} | mean loss: {:.5f}'.format(
        epoch + 1, running_loss / len(dataloader)))
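# In[ ]:

# Follow-up sketch (not in the original notebook): after training, torchkge's
# LinkPredictionEvaluator can score the embeddings on a held-out split. `kg_test`
# is assumed to come from the same split that produced `kg_train`.
from torchkge.evaluation import LinkPredictionEvaluator

model.normalize_parameters()  # normalize entity embeddings before evaluation
evaluator = LinkPredictionEvaluator(model, kg_test)
evaluator.evaluate(b_size=32)
evaluator.print_results()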
print('multiple gpus are available')
if args.gpu is not None:
    model = DataParallel(model, device_ids=args.gpu)
else:
    model = DataParallel(model)

# Restore the best checkpoint saved during training.
checkpoint_manager = CheckpointManager(restore_dir)
ckpt = checkpoint_manager.load_checkpoint(f'best_{args.model}.tar')
model.load_state_dict(ckpt['model_state_dict'])

criterion = MarginLoss(margin)
model.to(device)
criterion.to(device)

sampler = BernoulliNegativeSampler(kg_test)
test_dl = DataLoader(kg_test, batch_size=args.batch_size)

# Mean margin loss over the test set.
model.eval()
test_loss = 0
for step, batch in tqdm(enumerate(test_dl), desc='steps', total=len(test_dl)):
    h, t, r = map(lambda elm: elm.to(device), batch)
    n_h, n_t = sampler.corrupt_batch(h, t, r)
    with torch.no_grad():
        pos, neg = model(h, t, n_h, n_t, r)
        loss = criterion(pos, neg)
    test_loss += loss.item()
test_loss /= (step + 1)

training_summary = previous_summary['Training Summary']
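# Hedged continuation sketch (assumption): the computed test loss would typically
# be stored next to the restored training summary. SummaryManager mirrors the
# CheckpointManager pattern above, but its exact API is project-specific.
#
#     summary_manager = SummaryManager(restore_dir)
#     summary_manager.update({'Training Summary': training_summary,
#                             'Test Summary': {'test_loss': test_loss}})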
    model = DataParallel(model, device_ids=device_ids)
else:
    model = DataParallel(model)
model.to(device)
criterion.to(device)

writer = SummaryWriter(save_dir / f'runs_{args.model}')
checkpoint_manager = CheckpointManager(save_dir)
summary_manager = SummaryManager(save_dir)
summary_manager.update(experiment_summary)

optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=1e-5)
sampler = BernoulliNegativeSampler(kg_train)
tr_dl = DataLoader(kg_train, batch_size=args.batch_size)
val_dl = DataLoader(kg_valid, batch_size=args.batch_size)

best_val_loss = 1e+10
for epoch in tqdm(range(args.epochs), desc='epochs'):
    tr_loss = 0
    model.train()
    for step, batch in enumerate(tr_dl):
        h, t, r = map(lambda elm: elm.to(device), batch)
        n_h, n_t = sampler.corrupt_batch(h, t, r)
        optimizer.zero_grad()
        pos, neg = model(h, t, n_h, n_t, r)
        loss = criterion(pos, neg)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    model.cuda()
    criterion.cuda()

writer = SummaryWriter(save_dir / f'runs_{args.model}')
checkpoint_manager = CheckpointManager(save_dir)
summary_manager = SummaryManager(save_dir)
summary_manager.update(experiment_summary)

optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=1e-5)
sampler = BernoulliNegativeSampler(kg_train)
tr_dl = DataLoader(kg_train, batch_size=args.batch_size, use_cuda='all')
val_dl = DataLoader(kg_valid, batch_size=args.batch_size, use_cuda='all')  # validation loader built on the validation KG

best_val_loss = 1e+10
for epoch in tqdm(range(args.epochs), desc='epochs'):
    tr_loss = 0
    model.train()
    for step, batch in enumerate(tr_dl):
        h, t, r = batch[0], batch[1], batch[2]
        n_h, n_t = sampler.corrupt_batch(h, t, r)
        optimizer.zero_grad()
        pos, neg = model(h, t, n_h, n_t, r)
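# Hedged sketch (not in the excerpt): both loops above end before the backward
# pass, and best_val_loss is set but never used in what is shown. A continuation
# consistent with the surrounding code might be:
#
#     loss = criterion(pos, neg)
#     loss.backward()
#     optimizer.step()
#     tr_loss += loss.item()
#     ...
#     # after each epoch, evaluate on val_dl and keep the best model;
#     # save_checkpoint is assumed as the counterpart of load_checkpoint above
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         checkpoint_manager.save_checkpoint({'model_state_dict': model.state_dict()},
#                                            f'best_{args.model}.tar')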
def __init__(self, kg, model_type, ts, **kwargs):
    self.kg = kg
    self.truth_share = ts
    self.model_type = model_type
    self.diss_type = kwargs.pop('diss_type', 'L2')
    if self.model_type in ['TransR', 'TransD', 'TorusE']:
        # These models take separate entity and relation embedding dimensions.
        self.ent_emb_dim = kwargs.pop('ent_emb_dim', args.emb_dim)
        self.rel_emb_dim = kwargs.pop('rel_emb_dim', args.emb_dim)
        self.model = getattr(torchkge.models.translation, model_type + 'Model')(
            ent_emb_dim=self.ent_emb_dim, rel_emb_dim=self.rel_emb_dim,
            n_entities=self.kg.n_ent, n_relations=self.kg.n_rel)
    else:
        self.emb_dim = kwargs.pop('emb_dim', args.emb_dim)
        if self.model_type == 'TransE':
            self.model = getattr(torchkge.models, f'{model_type}Model')(
                emb_dim=self.emb_dim, n_entities=kg.n_ent, n_relations=kg.n_rel,
                dissimilarity_type=self.diss_type)
        else:
            self.model = getattr(torchkge.models, f'{model_type}Model')(
                emb_dim=self.emb_dim, n_entities=kg.n_ent, n_relations=kg.n_rel)
    self.n_entities = kg.n_ent
    self.n_relations = kg.n_rel

    # Pick the smallest unused run index for this model type.
    all_is = [int(d.split('_')[1]) for d in os.listdir(wot.models_path)
              if os.path.isdir(join(wot.models_path, d)) and f'{self.model_type}_' in d]
    i = [x for x in range(1, len(all_is) + 2) if x not in all_is][0]
    self.model_path = join(wot.models_path, f'{self.model_type}_{str(i).zfill(2)}')
    os.makedirs(self.model_path, exist_ok=True)
    self.logfile = join(self.model_path, 'log.txt')

    ## Hyperparameters
    self.lr = kwargs.pop('lr', args.lr)
    self.n_epochs = kwargs.pop('n_epochs', 100)
    self.b_size = kwargs.pop('b_size', 32)
    self.logline(tabulate([(k, v) for k, v in vars(self).items()],
                          headers=['variable', 'value']))

    # Legacy code
    # super(CustomTransModel, self).__init__(self.emb_dim, kg.n_ent, kg.n_rel,
    #                                        dissimilarity_type=self.diss_type)

    try:
        self.dataloader = DataLoader(self.kg, batch_size=self.b_size, use_cuda='all')
    except AssertionError:
        self.dataloader = DataLoader(self.kg, batch_size=self.b_size)

    ## Logger
    self.epochs = 0
    self.tr_losses = []
    self.best_epoch = -1
    self.val_losses = []
    self.val_epochs = []
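# Usage sketch (illustrative, not from the source): instantiating the translational
# wrapper. The class name CustomTransModel is taken from the legacy comment above;
# the keyword arguments mirror the ones consumed in __init__.
#
#     wrapper = CustomTransModel(kg_train, 'TransR', ts=0.8,
#                                ent_emb_dim=100, rel_emb_dim=50,
#                                lr=1e-3, n_epochs=200, b_size=1024)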