def train(self):
  # create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, split='train')
  dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)

  # fixed subsets of train/dev used for cheaper periodic evaluation
  subset_indices = range(self.subsample_size)
  train_loader_sub = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=False,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False,
      sampler=SubsetRandomSampler(subset_indices))
  dev_loader_sub = torch.utils.data.DataLoader(
      dev_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=False,
      num_workers=self.train_conf.num_workers,
      collate_fn=dev_dataset.collate_fn,
      drop_last=False,
      sampler=SubsetRandomSampler(subset_indices))

  # create models
  model = eval(self.model_conf.name)(self.model_conf)
  if self.use_gpu:
    model = nn.DataParallel(model, device_ids=self.gpus).cuda()

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(
        params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)
  lr_scheduler = get_constant_schedule_with_warmup(
      optimizer, num_warmup_steps=self.warmup_setps)

  # reset gradient
  optimizer.zero_grad()

  # resume training or use pretrained model
  if self.train_conf.is_resume:
    model_snapshot = torch.load(
        self.train_conf.resume_model, map_location=self.device)
    # a pretrained snapshot may only partially match the current model,
    # so only enforce strict loading when resuming a full training run
    model.load_state_dict(
        model_snapshot["model"], strict=not self.train_conf.pretrain)
    model.to(self.device)

  # Training Loop
  num_train = len(train_dataset)
  iter_count = 0
  best_val_loss = np.inf
  best_val_loss_test = np.inf
  best_win_pct_val = 0
  best_win_pct_val_test = 0
  results = defaultdict(list)

  for epoch in range(self.train_conf.max_epoch):
    # -------------------------------- validation --------------------------------
    if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
      # calculate validation loss
      model.eval()
      with torch.no_grad():
        result_dataset_val = self.cal_dataset_loss(model, dev_loader_sub)

      if self.is_val:
        logger.info(
            "-----------------Avg. Validation Loss = {:.4f}, "
            "NMLL = {:.4f}, NMLL_opt = {:.4f}, Win_pct = {:.2f}%, "
            "NMLL_test = {:.4f}, NMLL_test_opt = {:.4f}, "
            "Win_pct_test = {:.2f}%--------------------".format(
                result_dataset_val['loss'], result_dataset_val['nmll'],
                result_dataset_val['nmll_opt_sm'],
                result_dataset_val['win_pct_ai'] * 100,
                result_dataset_val['nmll_test'],
                result_dataset_val['nmll_opt_sm_test'],
                result_dataset_val['win_pct_ai_test'] * 100))
        self.writer.add_scalar('nmll_opt_val',
                               result_dataset_val['nmll_opt_sm'], iter_count)
        self.writer.add_scalar('nmll_opt_test_val',
                               result_dataset_val['nmll_opt_sm_test'],
                               iter_count)
        self.writer.add_scalar('win_pct_ai_val',
                               result_dataset_val['win_pct_ai'], iter_count)
        self.writer.add_scalar('win_pct_ai_test_val',
                               result_dataset_val['win_pct_ai_test'],
                               iter_count)
      else:
        logger.info(
            "-----------------Avg. Validation Loss = {:.4f}, "
            "NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, "
            "NMLL_test = {:.4f}, NMLL_test_orig = {:.4f}, "
            "Win_pct_test = {:.2f}%--------------------".format(
                result_dataset_val['loss'], result_dataset_val['nmll'],
                result_dataset_val['nmll_orig'],
                result_dataset_val['win_pct'] * 100,
                result_dataset_val['nmll_test'],
                result_dataset_val['nmll_test_orig'],
                result_dataset_val['win_pct_test'] * 100))
        self.writer.add_scalar('val_loss', result_dataset_val['loss'],
                               iter_count)
        self.writer.add_scalar('nmll_loss_val', result_dataset_val['nmll'],
                               iter_count)
        self.writer.add_scalar('nmll_loss_orig_val',
                               result_dataset_val['nmll_orig'], iter_count)
        self.writer.add_scalar('nmll_loss_test_val',
                               result_dataset_val['nmll_test'], iter_count)
        self.writer.add_scalar('nmll_loss_test_orig_val',
                               result_dataset_val['nmll_test_orig'],
                               iter_count)
        self.writer.add_scalar('win_pct_val', result_dataset_val['win_pct'],
                               iter_count)
        self.writer.add_scalar('win_pct_val_test',
                               result_dataset_val['win_pct_test'], iter_count)

      results['val_loss'] += [result_dataset_val['loss']]
      results['nmll_loss_val'] += [result_dataset_val['nmll']]
      results['nmll_loss_orig_val'] += [result_dataset_val['nmll_orig']]
      results['nmll_loss_test_val'] += [result_dataset_val['nmll_test']]
      results['nmll_loss_test_orig_val'] += [result_dataset_val['nmll_test_orig']]
      results['win_pct_val'] += [result_dataset_val['win_pct']]
      results['win_pct_val_test'] += [result_dataset_val['win_pct_test']]

      # save best model
      if result_dataset_val['loss'] < best_val_loss:
        best_val_loss = result_dataset_val['loss']
        best_val_loss_test = result_dataset_val['nmll_test']
        if self.is_val:
          best_win_pct_val = result_dataset_val['win_pct_ai']
          best_win_pct_val_test = result_dataset_val['win_pct_ai_test']
        else:
          best_win_pct_val = result_dataset_val['win_pct']
          best_win_pct_val_test = result_dataset_val['win_pct_test']
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='best')

      logger.info("Current Best Validation Loss = {:.4f}".format(best_val_loss))

      # check early stop
      if early_stop.tick([result_dataset_val['loss']]):
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='last')
        self.writer.close()
        break

    # -------------------------------- training ----------------------------------
    model.train()
    for data in train_loader:
      optimizer.zero_grad()

      if self.use_gpu:
        (data['max_node_size'], data['X_data_tr'], data['X_data_val'],
         data['X_data_test'], data['F_tr'], data['F_val'], data['F_test'],
         data['N_val'], data['kernel_mask_val'], data['diagonal_mask_val'],
         data['N_test'], data['kernel_mask_test'], data['diagonal_mask_test'],
         data['node_mask_tr'], data['dim_mask'], data['nmll'],
         data['dim_size']) = data_to_gpu(
             data['max_node_size'], data['X_data_tr'], data['X_data_val'],
             data['X_data_test'], data['F_tr'], data['F_val'], data['F_test'],
             data['N_val'], data['kernel_mask_val'], data['diagonal_mask_val'],
             data['N_test'], data['kernel_mask_test'],
             data['diagonal_mask_test'], data['node_mask_tr'],
             data['dim_mask'], data['nmll'], data['dim_size'])

      if self.model_conf.name == 'GpSMDoubleAtt':
        mu, var, weights, nmll, nmll_test = model(
            data['X_data_tr'],
            data['X_data_val'],
            data['F_tr'],
            data['F_val'],
            data['node_mask_tr'],
            data['dim_mask'],
            data['kernel_mask_val'],
            data['diagonal_mask_val'],
            data['N_val'],
            device=self.device,
            eval_mode=True,
            X_data_test=data['X_data_test'],
            F_data_test=data['F_test'],
            kernel_mask_test=data['kernel_mask_test'],
            diagonal_mask_test=data['diagonal_mask_test'],
            N_data_test=data['N_test'])
      elif self.model_conf.name == 'GpSMDoubleAttNoMu':
        var, weights, nmll, nmll_test = model(
            data['X_data_tr'],
            data['X_data_val'],
            data['F_tr'],
            data['F_val'],
            data['node_mask_tr'],
            data['dim_mask'],
            data['kernel_mask_val'],
            data['diagonal_mask_val'],
            data['N_val'],
            device=self.device,
            eval_mode=True,
            X_data_test=data['X_data_test'],
            F_data_test=data['F_test'],
            kernel_mask_test=data['kernel_mask_test'],
            diagonal_mask_test=data['diagonal_mask_test'],
            N_data_test=data['N_test'])
      else:
        raise ValueError("No model of given name!")

      # print("Outside: input size", data['X_data'].shape, "output_size", nmll.shape)
      nmll_orig = data['nmll']
      # fraction of GPs in the batch whose predicted hyperparameters reach a
      # marginal likelihood at least as good as the original ones
      win_pct_train = torch.sum(nmll < nmll_orig + 0.01).float() / nmll.shape[0]
      data_dim_vec = data['X_data_tr'].shape[-1]
      nmll_loss_train = torch.mean(nmll)
      train_loss = nmll_loss_train

      # calculate gradient
      train_loss.backward()
      nmll_loss_orig = torch.mean(nmll_orig)

      # calculate gradient norm
      grad_norm = 0
      for p in model.parameters():
        if p.requires_grad:
          param_norm = p.grad.data.norm()
          grad_norm += param_norm.item() ** 2
      grad_norm = grad_norm ** (1. / 2)

      nn.utils.clip_grad_norm_(model.parameters(), 1)
      optimizer.step()

      train_loss = float(train_loss.data.cpu().numpy())
      nmll_loss_train = float(nmll_loss_train.data.cpu().numpy())
      nmll_loss_train_orig = float(nmll_loss_orig.data.cpu().numpy())
      win_pct_train = float(win_pct_train.data.cpu().numpy())

      self.writer.add_scalar('train_loss', train_loss, iter_count)
      self.writer.add_scalar('nmll_loss_train', nmll_loss_train, iter_count)
      self.writer.add_scalar('nmll_loss_train_orig', nmll_loss_train_orig,
                             iter_count)
      self.writer.add_scalar('win_pct_train', win_pct_train, iter_count)
      self.writer.add_scalar('grad_norm', grad_norm, iter_count)
      results['nmll_loss_train'] += [nmll_loss_train]
      results['nmll_loss_train_orig'] += [nmll_loss_train_orig]
      results['train_loss'] += [train_loss]
      results['win_pct_train'] += [win_pct_train]
      results['train_step'] += [iter_count]
      results['grad_norm'] += [grad_norm]

      # display loss
      if (iter_count + 1) % self.train_conf.display_iter == 0:
        logger.info(
            "Loss @ epoch {:04d} iteration {:08d} = {:.4f}, NMLL = {:.4f}, "
            "NMLL_orig = {:.4f}, Win_pct = {:.2f}%, Grad_norm = {:.4f}, "
            "LR = {:.2e}".format(epoch + 1, iter_count + 1, train_loss,
                                 nmll_loss_train, nmll_loss_train_orig,
                                 win_pct_train * 100, grad_norm,
                                 get_lr(optimizer)))

      iter_count += 1

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(model.module if self.use_gpu else model, optimizer, self.config,
               epoch + 1)

    lr_scheduler.step()

  # look at predictions, for debug purpose
  model.eval()
  with torch.no_grad():
    results_sample_tr = self.cal_sample_result(model, train_loader_sub)
    results_sample_dev = self.cal_sample_result(model, dev_loader_sub)
    result_dataset_tr = self.cal_dataset_loss(model, train_loader_sub)
    result_dataset_dev = self.cal_dataset_loss(model, dev_loader_sub)
  train_loss = result_dataset_tr['loss']

  results['best_val_loss'] = best_val_loss
  results['win_count_tr'] = results_sample_tr['win_pct']
  results['win_count_dev'] = results_sample_dev['win_pct']
  results['nmll_loss_sample_tr'] = results_sample_tr['nmll_loss_sample']
  results['nmll_loss_sample_dev'] = results_sample_dev['nmll_loss_sample']
  pickle.dump(results,
              open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()

  logger.info("Best Validation Loss = {:.4f}, "
              "Best Win_pct_val = {:.2f}%, "
              "Best Val Loss on Test = {:.4f}, "
              "Best Win_pct_val_test = {:.2f}%, "
              "Final Training NMLL = {:.4f}, "
              "Training NMLL original = {:.4f}, "
              "Win_pct_train = {:.2f}%, "
              "Final Dev NMLL = {:.4f}, "
              "Dev NMLL original = {:.4f}, "
              "Win_pct_dev = {:.2f}%, "
              "Final Dev Test NMLL = {:.4f}, "
              "Dev Test NMLL original = {:.4f}, "
              "Win_pct_test_dev = {:.2f}%.".format(
                  best_val_loss, best_win_pct_val * 100, best_val_loss_test,
                  best_win_pct_val_test * 100, result_dataset_tr['nmll'],
                  result_dataset_tr['nmll_orig'],
                  result_dataset_tr['win_pct'] * 100,
                  result_dataset_dev['nmll'], result_dataset_dev['nmll_orig'],
                  result_dataset_dev['win_pct'] * 100,
                  result_dataset_dev['nmll_test'],
                  result_dataset_dev['nmll_test_orig'],
                  result_dataset_dev['win_pct_test'] * 100))

  avg_nmll_tr = np.mean(results_sample_tr['nmll_sample_compare'], 0)
  logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
      results_sample_tr['win_pct'] * 100))
  logger.info('Average NMLL on training samples: true = {}, learned = {}'.format(
      avg_nmll_tr[1], avg_nmll_tr[0]))
  avg_nmll_dev = np.mean(results_sample_dev['nmll_sample_compare'], 0)
  logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(
      results_sample_dev['win_pct'] * 100))
  logger.info('Average NMLL on testing samples: true = {}, learned = {}'.format(
      avg_nmll_dev[1], avg_nmll_dev[0]))

  snapshot(
      model.module if self.use_gpu else model,
      optimizer,
      self.config,
      self.train_conf.max_epoch + 1,
      tag='final')

  return None
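# --- Hedged sketch (not part of the original file) ---------------------------
# The training loop above calls a few small repository helpers whose
# definitions are not shown in this section. The versions below are minimal
# sketches inferred only from the call sites (get_lr(optimizer) inside the
# display log, and data_to_gpu(*tensors) for moving a batch to the GPU); the
# repository's actual implementations may differ.

def get_lr(optimizer):
  # Learning rate of the first parameter group, used only for logging.
  return optimizer.param_groups[0]['lr']


def data_to_gpu(*tensors):
  # Move every tensor in the batch to the default CUDA device, preserving
  # order, and always return a tuple (the callers unpack or index into it).
  return tuple(t.cuda() for t in tensors)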
def train(self):
  # create data loader
  train_dataset = eval(self.dataset_conf.loader_name)(self.config, split='train')
  dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=self.train_conf.shuffle,
      num_workers=self.train_conf.num_workers,
      collate_fn=train_dataset.collate_fn,
      drop_last=False)
  dev_loader = torch.utils.data.DataLoader(
      dev_dataset,
      batch_size=self.train_conf.batch_size,
      shuffle=False,
      num_workers=self.train_conf.num_workers,
      collate_fn=dev_dataset.collate_fn,
      drop_last=False)

  # create models
  model = eval(self.model_conf.name)(self.config)
  if self.use_gpu:
    model = nn.DataParallel(model, device_ids=self.gpus).cuda()

  # create optimizer
  params = filter(lambda p: p.requires_grad, model.parameters())
  if self.train_conf.optimizer == 'SGD':
    optimizer = optim.SGD(
        params,
        lr=self.train_conf.lr,
        momentum=self.train_conf.momentum,
        weight_decay=self.train_conf.wd)
  elif self.train_conf.optimizer == 'Adam':
    optimizer = optim.Adam(
        params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
  else:
    raise ValueError("Non-supported optimizer!")

  early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)
  lr_scheduler = optim.lr_scheduler.MultiStepLR(
      optimizer,
      milestones=self.train_conf.lr_decay_steps,
      gamma=self.train_conf.lr_decay)

  # reset gradient
  optimizer.zero_grad()

  # resume training
  if self.train_conf.is_resume:
    load_model(model, self.train_conf.resume_model, optimizer=optimizer)

  # Training Loop
  iter_count = 0
  best_val_loss = np.inf
  results = defaultdict(list)

  for epoch in range(self.train_conf.max_epoch):
    # validation
    if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
      model.eval()
      val_loss = []

      for data in tqdm(dev_loader):
        if self.use_gpu:
          data['node_feat'], data['node_mask'], data['label'] = data_to_gpu(
              data['node_feat'], data['node_mask'], data['label'])

          if self.model_conf.name == 'LanczosNet':
            data['L'], data['D'], data['V'] = data_to_gpu(
                data['L'], data['D'], data['V'])
          elif self.model_conf.name == 'GraphSAGE':
            data['nn_idx'], data['nonempty_mask'] = data_to_gpu(
                data['nn_idx'], data['nonempty_mask'])
          elif self.model_conf.name == 'GPNN':
            data['L'], data['L_cluster'], data['L_cut'] = data_to_gpu(
                data['L'], data['L_cluster'], data['L_cut'])
          else:
            data['L'] = data_to_gpu(data['L'])[0]

        with torch.no_grad():
          if self.model_conf.name == 'AdaLanczosNet':
            pred, _ = model(
                data['node_feat'],
                data['L'],
                label=data['label'],
                mask=data['node_mask'])
          elif self.model_conf.name == 'LanczosNet':
            pred, _ = model(
                data['node_feat'],
                data['L'],
                data['D'],
                data['V'],
                label=data['label'],
                mask=data['node_mask'])
          elif self.model_conf.name == 'GraphSAGE':
            pred, _ = model(
                data['node_feat'],
                data['nn_idx'],
                data['nonempty_mask'],
                label=data['label'],
                mask=data['node_mask'])
          elif self.model_conf.name == 'GPNN':
            pred, _ = model(
                data['node_feat'],
                data['L'],
                data['L_cluster'],
                data['L_cut'],
                label=data['label'],
                mask=data['node_mask'])
          else:
            pred, _ = model(
                data['node_feat'],
                data['L'],
                label=data['label'],
                mask=data['node_mask'])

          curr_loss = (pred - data['label']).abs().cpu().numpy() * self.const_factor
          val_loss += [curr_loss]

      val_loss = float(np.mean(np.concatenate(val_loss)))
      logger.info("Avg. Validation MAE = {}".format(val_loss))
      self.writer.add_scalar('val_loss', val_loss, iter_count)
      results['val_loss'] += [val_loss]

      # save best model
      if val_loss < best_val_loss:
        best_val_loss = val_loss
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='best')

      logger.info("Current Best Validation MAE = {}".format(best_val_loss))

      # check early stop
      if early_stop.tick([val_loss]):
        snapshot(
            model.module if self.use_gpu else model,
            optimizer,
            self.config,
            epoch + 1,
            tag='last')
        self.writer.close()
        break

    # training
    model.train()
    lr_scheduler.step()
    for data in train_loader:
      optimizer.zero_grad()

      if self.use_gpu:
        data['node_feat'], data['node_mask'], data['label'] = data_to_gpu(
            data['node_feat'], data['node_mask'], data['label'])

        if self.model_conf.name == 'LanczosNet':
          data['L'], data['D'], data['V'] = data_to_gpu(
              data['L'], data['D'], data['V'])
        elif self.model_conf.name == 'GraphSAGE':
          data['nn_idx'], data['nonempty_mask'] = data_to_gpu(
              data['nn_idx'], data['nonempty_mask'])
        elif self.model_conf.name == 'GPNN':
          data['L'], data['L_cluster'], data['L_cut'] = data_to_gpu(
              data['L'], data['L_cluster'], data['L_cut'])
        else:
          data['L'] = data_to_gpu(data['L'])[0]

      if self.model_conf.name == 'AdaLanczosNet':
        _, train_loss = model(
            data['node_feat'],
            data['L'],
            label=data['label'],
            mask=data['node_mask'])
      elif self.model_conf.name == 'LanczosNet':
        _, train_loss = model(
            data['node_feat'],
            data['L'],
            data['D'],
            data['V'],
            label=data['label'],
            mask=data['node_mask'])
      elif self.model_conf.name == 'GraphSAGE':
        _, train_loss = model(
            data['node_feat'],
            data['nn_idx'],
            data['nonempty_mask'],
            label=data['label'],
            mask=data['node_mask'])
      elif self.model_conf.name == 'GPNN':
        _, train_loss = model(
            data['node_feat'],
            data['L'],
            data['L_cluster'],
            data['L_cut'],
            label=data['label'],
            mask=data['node_mask'])
      else:
        _, train_loss = model(
            data['node_feat'],
            data['L'],
            label=data['label'],
            mask=data['node_mask'])

      # assign gradient
      train_loss.backward()
      optimizer.step()

      train_loss = float(train_loss.data.cpu().numpy())
      self.writer.add_scalar('train_loss', train_loss, iter_count)
      results['train_loss'] += [train_loss]
      results['train_step'] += [iter_count]

      # display loss
      if (iter_count + 1) % self.train_conf.display_iter == 0:
        logger.info("Loss @ epoch {:04d} iteration {:08d} = {}".format(
            epoch + 1, iter_count + 1, train_loss))

      iter_count += 1

    # snapshot model
    if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
      logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
      snapshot(model.module if self.use_gpu else model, optimizer, self.config,
               epoch + 1)

  results['best_val_loss'] += [best_val_loss]
  pickle.dump(results,
              open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
  self.writer.close()
  logger.info("Best Validation MAE = {}".format(best_val_loss))

  return best_val_loss