def __setup_model_hparams(self):
    # 1. define losses
    self.loss = nn.MSELoss()
    self.loss_adv = nn.BCELoss()

    # 2. define model metric
    self.metric = Kappa()

    # 3. define optimizer
    self.optimizer = eval(f"torch.optim.{self.hparams['optimizer_name']}")(
        params=self.model.parameters(), **self.hparams['optimizer_hparams'])

    # 4. define scheduler
    self.scheduler = eval(
        f"torch.optim.lr_scheduler.{self.hparams['scheduler_name']}")(
            optimizer=self.optimizer, **self.hparams['scheduler_hparams'])

    # 5. define early stopping
    self.early_stopping = EarlyStopping(
        checkpoint_path=self.hparams['checkpoint_path'] +
        f'/checkpoint_{self.start_training}' + '.pt',
        patience=self.hparams['patience'],
        delta=self.hparams['min_delta'],
        is_maximize=True,
    )

    # 6. set gradient clipping
    self.apply_clipping = self.hparams['clipping']  # clipping of gradients

    # 7. set scaler for optimizer
    self.scaler = torch.cuda.amp.GradScaler()

    return True
def __setup_model_hparams(self):
    # 1. define losses
    self.loss = SimCLR_2(
        temperature=100
    )  # SimclrCriterion(batch_size=self.hparams['batch_size'], device=self.device)

    # 2. define optimizer
    self.optimizer = eval(f"torch.optim.{self.hparams['optimizer_name']}")(
        params=self.model.parameters(), **self.hparams['optimizer_hparams'])

    # 3. define scheduler
    self.scheduler = eval(
        f"torch.optim.lr_scheduler.{self.hparams['scheduler_name']}")(
            optimizer=self.optimizer, **self.hparams['scheduler_hparams'])

    # 4. define early stopping
    self.early_stopping = EarlyStopping(
        checkpoint_path=self.hparams['checkpoint_path'] +
        f'/checkpoint_{self.start_training}' + '.pt',
        patience=self.hparams['patience'],
        delta=self.hparams['min_delta'],
        is_maximize=False,
    )

    # 5. set gradient clipping
    self.apply_clipping = self.hparams['clipping']  # clipping of gradients

    # 6. set scaler for optimizer
    self.scaler = torch.cuda.amp.GradScaler()

    return True
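# All of the snippets in this collection assume an external EarlyStopping
# helper whose interface varies from repo to repo (some take a score plus a
# model, some also take a save directory). The class below is a minimal sketch
# of the variant used by the two setups above (checkpoint_path / patience /
# delta / is_maximize); it is an illustrative assumption, not the authors'
# actual implementation.
import torch


class EarlyStopping:
    """Stop training when the monitored score stops improving."""

    def __init__(self, checkpoint_path, patience=7, delta=0.0, is_maximize=True):
        self.checkpoint_path = checkpoint_path
        self.patience = patience
        self.delta = delta
        self.is_maximize = is_maximize
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score, model):
        # normalize so that "higher is better" internally
        current = score if self.is_maximize else -score
        if self.best_score is None or current > self.best_score + self.delta:
            # improvement: checkpoint the model and reset the patience counter
            self.best_score = current
            torch.save(model.state_dict(), self.checkpoint_path)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True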
def main(opt): """ train_dataset = BADataset(opt.dataroot, opt.L, True, False, False) train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize, \ shuffle=True, num_workers=opt.workers, drop_last=True) valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False) valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize, \ shuffle=True, num_workers=opt.workers, drop_last=True) test_dataset = BADataset(opt.dataroot, opt.L, False, False, True) test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize, \ shuffle=True, num_workers=opt.workers, drop_last=True) """ all_dataset = BADataset(opt.dataroot, opt.L, False, False, False) all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize, \ shuffle=False, num_workers=opt.workers, drop_last=False) net = COSSIMMLP(opt) net.double() print(net) criterion = nn.BCELoss() if opt.cuda: net.cuda() criterion.cuda() #optimizer = optim.Adam(net.parameters(), lr=opt.lr) optimizer = "" early_stopping = EarlyStopping(patience=opt.patience, verbose=True) os.makedirs(OutputDir, exist_ok=True) train_loss_ls = [] valid_loss_ls = [] test_loss_ls = [] for epoch in range(0, opt.niter): # train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt) # valid_loss = valid(valid_dataloader, net, criterion, opt) # test_loss = test(test_dataloader, net, criterion, opt) train_loss = 0 valid_loss = 0 test_loss = 0 train_loss_ls.append(train_loss) valid_loss_ls.append(valid_loss) test_loss_ls.append(test_loss) early_stopping(valid_loss, net, OutputDir) if early_stopping.early_stop: print("Early stopping") break df = pd.DataFrame({'epoch':[i for i in range(1, len(train_loss_ls)+1)], 'train_loss': train_loss_ls, 'valid_loss': valid_loss_ls, 'test_loss': test_loss_ls}) df.to_csv(OutputDir + '/loss.csv', index=False) net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt')) inference(all_dataloader, net, criterion, opt, OutputDir)
def main(opt):
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize,
                                    shuffle=True, num_workers=opt.workers, drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize,
                                    shuffle=True, num_workers=opt.workers, drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize,
                                   shuffle=True, num_workers=opt.workers, drop_last=True)

    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize,
                                  shuffle=False, num_workers=opt.workers, drop_last=False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = STGGNN(opt, kernel_size=2, n_blocks=1,
                 state_dim_bottleneck=opt.state_dim,
                 annotation_dim_bottleneck=opt.annotation_dim)
    net.double()
    print(net)

    criterion = nn.BCELoss()

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []
    # net.load_state_dict(torch.load(OutputDir + '/checkpoint_5083.pt'))

    for epoch in range(0, opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        valid_loss = valid(valid_dataloader, net, criterion, opt)
        test_loss = test(test_dataloader, net, criterion, opt)

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch': [i for i in range(1, len(train_loss_ls) + 1)],
                       'train_loss': train_loss_ls,
                       'valid_loss': valid_loss_ls,
                       'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    inference(all_dataloader, net, criterion, opt, OutputDir)
def main(opt):
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize,
                                    shuffle=True, num_workers=opt.workers, drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize,
                                    shuffle=True, num_workers=opt.workers, drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize,
                                   shuffle=True, num_workers=opt.workers, drop_last=True)

    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize,
                                  shuffle=False, num_workers=opt.workers, drop_last=False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = EGCN(gcn_args, activation=torch.nn.RReLU(), device=opt.device)
    print(net)

    criterion = nn.MSELoss()
    # criterion = nn.CosineSimilarity(dim=-1, eps=1e-6)

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []

    for epoch in range(0, opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        valid_loss = valid(valid_dataloader, net, criterion, opt)
        test_loss = test(test_dataloader, net, criterion, opt)

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch': [i for i in range(1, len(train_loss_ls) + 1)],
                       'train_loss': train_loss_ls,
                       'valid_loss': valid_loss_ls,
                       'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    # net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    net = torch.load(OutputDir + '/checkpoint.pt')  # the checkpoint stores the whole model here, not a state_dict
    inference(all_dataloader, net, criterion, opt, OutputDir)
def __init__(self):
    self.patience = 7
    self.warm_up = 6
    self.patience_decay = {1: 0.8, 2: 0.5, 3: 0}
    self.early_stopping = EarlyStopping(patience=config.patience, verbose=True)
    self.decay = 0
    self.stop = False
    self.lr_decay_dict = [0.7, 0.9]
def __init__(self, input_size, n_channels, hparams):
    self.hparams = hparams
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # define the models
    self.model = WaveNet(n_channels=n_channels).to(self.device)
    summary(self.model, (input_size, n_channels))
    # self.model.half()

    if torch.cuda.device_count() > 1:
        print("Number of GPUs will be used: ", torch.cuda.device_count() - 3)
        self.model = DP(self.model, device_ids=list(
            range(torch.cuda.device_count() - 3)))
    else:
        print('Only one GPU is available')

    self.metric = Metric()
    self.num_workers = 1

    ########################## compile the model ###############################

    # define optimizer
    self.optimizer = torch.optim.Adam(params=self.model.parameters(),
                                      lr=self.hparams['lr'],
                                      weight_decay=1e-5)

    # weights = torch.Tensor([0.025,0.033,0.039,0.046,0.069,0.107,0.189,0.134,0.145,0.262,1]).cuda()
    self.loss = nn.BCELoss()  # CompLoss(self.device)

    # define early stopping
    self.early_stopping = EarlyStopping(
        checkpoint_path=self.hparams['checkpoint_path'] + '/checkpoint.pt',
        patience=self.hparams['patience'],
        delta=self.hparams['min_delta'],
    )

    # lr scheduler
    self.scheduler = ReduceLROnPlateau(
        optimizer=self.optimizer,
        mode='max',
        factor=0.2,
        patience=3,
        verbose=True,
        threshold=self.hparams['min_delta'],
        threshold_mode='abs',
        cooldown=0,
        eps=0,
    )

    self.seed_everything(42)
    self.threshold = 0.75
    self.scaler = torch.cuda.amp.GradScaler()
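# Several of these setups create torch.cuda.amp.GradScaler() but never show the
# step that consumes it. Below is a minimal standalone sketch of the usual
# mixed-precision update (the function name and the model/optimizer/loss_fn/
# batch/labels parameters are placeholder assumptions, not the authors' code):
import torch


def amp_train_step(model, optimizer, scaler, loss_fn, batch, labels):
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():  # run the forward pass in mixed precision
        loss = loss_fn(model(batch), labels)
    scaler.scale(loss).backward()    # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)           # unscales gradients, then optimizer.step()
    scaler.update()                  # adjust the scale factor for the next step
    return loss.item()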
def main():
    # using kfold
    # train_dataset = GraphemeDataSet(TRAIN_DATA_DIR, TRAIN_DATA_CSV, TRAINING_FOLDS)
    # valid_dataset = GraphemeDataSet(TRAIN_DATA_DIR, TRAIN_DATA_CSV, VALIDATION_FOLDS)

    # using one split: train and validation
    train_dataset = GraphemeDataSet(TRAIN_DATA_DIR, TRAIN_DATA_CSV, is_train=True)
    valid_dataset = GraphemeDataSet(TRAIN_DATA_DIR, TRAIN_DATA_CSV, is_train=False)

    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              num_workers=4)
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              num_workers=4)

    model = MODEL_DISPATCHER[BASE_MODEL](pretrained=True)
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.3,
                                                           patience=5,
                                                           verbose=True)
    early_stopping = EarlyStopping(patience=7, verbose=True)

    for e in range(EPOCHS):
        print("Epoch {} : ".format(e))
        train(train_loader, model, optimizer, e)
        val_score = evaluate(valid_loader, model, e)

        scheduler.step(val_score)
        early_stopping(val_score, model)
        if early_stopping.early_stop:
            print("Early stopping!")
            break
def train_model(model_num, model, dataloaders, num_tr_samples, criterion,
                optimizer, hist_dir, early_stop_patience, writer,
                num_epochs=25, with_cuda=True):
    since = time.time()
    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = math.inf
    best_val_acc = 0
    best_epoch = 0
    all_f_ = torch.tensor([])
    all_l_ = torch.tensor([])

    hist_dir = os.path.join(hist_dir, 'model_' + str(model_num))
    if not os.path.exists(hist_dir):
        os.mkdir(hist_dir)

    csv = open(os.path.join(hist_dir, 'eval_history.csv'), 'w')
    csv.write("epoch,train_loss,train_acc,val_loss,val_acc\n")

    monitor = 'val_acc'
    early_stopping = EarlyStopping(patience=early_stop_patience,
                                   monitor=monitor,
                                   verbose=False)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 12)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            print(phase, '|', end='')
            print(" Dataloader len: ", len(dataloaders[phase].dataset))

            # Iterate over data.
            for i, (inputs, labels) in enumerate(dataloaders[phase]):
                # print(' ', i + 1, ') vs', inputs.size(1), end='')
                if torch.cuda.is_available() and with_cuda:
                    inputs = inputs.to('cuda')
                    labels = labels.to('cuda')

                # zero the parameter gradients
                optimizer.zero_grad()
                model._init_hidden_state(last_batch_size=inputs.size(0))

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss.
                    # Special case for inception: in training it has an auxiliary
                    # output, and the loss is the sum of the final and auxiliary
                    # outputs; in testing only the final output is considered.
                    outputs, features, _ = model(inputs)
                    loss = criterion(outputs, labels)
                    # loss = torch.sum(loss)
                    loss = li_regularizer(model, loss)
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    else:
                        if not i:
                            all_f = features
                            all_l = labels
                        else:
                            all_f = torch.cat((all_f, features), dim=0)
                            all_l = torch.cat((all_l, labels), dim=0)

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            if phase == 'train':
                epoch_loss = running_loss / num_tr_samples
                epoch_acc = running_corrects.double() / num_tr_samples
            else:
                epoch_loss = running_loss / len(dataloaders[phase].dataset)
                epoch_acc = running_corrects.double() / len(
                    dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'train':
                tr_loss = epoch_loss
                tr_acc = epoch_acc.cpu().data.numpy()
                writer.add_scalar('loss/train', epoch_loss, epoch)
                writer.add_scalar('acc/train', epoch_acc, epoch)
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc.cpu().data.numpy()
                writer.add_scalar('loss/test', epoch_loss, epoch)
                writer.add_scalar('acc/test', epoch_acc, epoch)

            # deep copy the model
            if phase == 'val':
                if monitor == 'val_acc' and val_acc > best_val_acc:
                    best_loss = epoch_loss
                    best_epoch = epoch
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_val_acc = val_acc
                    all_f_ = all_f
                    all_l_ = all_l
                elif monitor == 'val_loss' and epoch_loss < best_loss:
                    best_loss = epoch_loss
                    best_epoch = epoch
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_val_acc = val_acc
                    all_f_ = all_f
                    all_l_ = all_l

        print()
        csv.write(str(epoch) + ',' + str(tr_loss) + ',' + str(tr_acc) + ',' +
                  str(val_loss) + ',' + str(val_acc) + '\n')

        # early_stopping tracks the monitored validation metric and
        # checkpoints the current model whenever it improves
        if monitor == 'val_acc':
            early_stopping(val_acc, model, hist_dir)
        else:
            early_stopping(val_loss, model, hist_dir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    time_elapsed = time.time() - since
    logging.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # print('Best val Acc: {:4f}'.format(best_acc))

    # save best predictions and respective labels
    # np.savez(os.path.join(hist_dir, 'model_' + str(model_num) + '.npz'),
    #          pred=best_pred_out.cpu().data.numpy(), gt=best_gt.cpu().data.numpy())

    # load best model weights
    logging.info(
        "best model epoch: {}, val_loss: {:.4f}, val_acc:{:.4f}".format(
            best_epoch, best_loss, best_val_acc))
    check_pt = "wt_best_ep{}_loss_{:.3f}_acc{:.3f}.pth".format(
        best_epoch, best_loss, best_val_acc)

    # save model
    model.load_state_dict(best_model_wts)
    # Best model predictions on the validation data
    # writer.add_figure('predictions_vs._actuals', plot_classes_preds(model, inputs, labels, classes), global_step=model_num)
    torch.save(model.state_dict(), os.path.join(hist_dir, check_pt))
    csv.close()
    return model, best_val_acc, best_loss, best_epoch, all_f_, all_l_
def train_model(model_num, model, data, criterion, optimizer, hist_dir,
                early_stop_patience, num_epochs=25, with_cuda=True):
    since = time.time()
    val_acc_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = math.inf
    best_val_acc = 0
    best_epoch = 0

    hist_dir = os.path.join(hist_dir, 'model_' + str(model_num))
    if not os.path.exists(hist_dir):
        os.mkdir(hist_dir)

    csv = open(os.path.join(hist_dir, 'eval_history.csv'), 'w')
    csv.write("epoch,train_loss,train_acc,val_loss,val_acc\n")

    early_stopping = EarlyStopping(patience=early_stop_patience,
                                   monitor='val_acc',
                                   verbose=False)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 12)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            print(phase, '|', end='')
            inputs, labels, patch_count, _ = data[phase]

            # Iterate over data.
            for i in range(len(patch_count)):
                # print(' ', i + 1, ') vs', inputs.size(1), end='')
                input_im = inputs[i, :patch_count[i]]
                assert not torch.isnan(input_im).any()
                label_im = [labels[i].item()]
                label_im = torch.tensor(label_im)
                if torch.cuda.is_available() and with_cuda:
                    input_im = input_im.to('cuda')
                    label_im = label_im.to('cuda')

                # zero the parameter gradients
                optimizer.zero_grad()
                model._init_hidden_state(last_batch_size=1)

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    # Get model outputs and calculate loss
                    outputs, _, _ = model(input_im)
                    # assert not torch.isnan(outputs).any()
                    loss = criterion(outputs, label_im)
                    # loss = torch.sum(loss)
                    loss = li_regularizer(model, loss)
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward(retain_graph=True)
                        # `clip_grad_norm` helps prevent the exploding gradient
                        # problem in RNNs / LSTMs.
                        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.50)
                        optimizer.step()

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == label_im.data)

            # identical normalization for both phases: one sample per patch list entry
            epoch_loss = running_loss / len(patch_count)
            epoch_acc = running_corrects.double() / len(patch_count)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'train':
                tr_loss = epoch_loss
                tr_acc = epoch_acc.cpu().data.numpy()
            else:
                val_loss = epoch_loss
                val_acc = epoch_acc.cpu().data.numpy()

            # deep copy the model
            if phase == 'val' and val_acc > best_val_acc:
                best_loss = epoch_loss
                best_epoch = epoch
                best_model_wts = copy.deepcopy(model.state_dict())
                best_val_acc = val_acc
                # save model
                # check_pt = "wt_ep{}.pth".format(epoch)
                # torch.save(model.state_dict(), os.path.join(hist_dir, check_pt))

        print()
        csv.write(str(epoch) + ',' + str(tr_loss) + ',' + str(tr_acc) + ',' +
                  str(val_loss) + ',' + str(val_acc) + '\n')

        # early_stopping checks whether the monitored validation accuracy has
        # improved, and if it has, it makes a checkpoint of the current model
        early_stopping(val_acc, model, hist_dir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print("best model epoch: {}, val_loss: {:.4f}, val_acc:{:.4f}".format(
        best_epoch, best_loss, best_val_acc))
    check_pt = "wt_best_ep{}_acc_{:.3f}.pth".format(best_epoch, best_val_acc)

    # save model
    model.load_state_dict(best_model_wts)
    # Best model predictions on the validation data
    # writer.add_figure('predictions_vs._actuals', plot_classes_preds(model, inputs, labels, classes), global_step=model_num)
    torch.save(model.state_dict(), os.path.join(hist_dir, check_pt))
    csv.close()
    return model, best_val_acc, best_loss, best_epoch
def run_model_LastFM(feats_type, hidden_dim, num_heads, attn_vec_dim,
                     rnn_type, num_epochs, patience, batch_size,
                     neighbor_samples, repeat, save_postfix):
    adjlists_ua, edge_metapath_indices_list_ua, _, type_mask, \
        train_val_test_pos_user_artist, train_val_test_neg_user_artist = load_LastFM_data()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    features_list = []
    in_dims = []
    if feats_type == 0:
        for i in range(num_ntype):
            dim = (type_mask == i).sum()
            in_dims.append(dim)
            indices = np.vstack((np.arange(dim), np.arange(dim)))
            indices = torch.LongTensor(indices)
            values = torch.FloatTensor(np.ones(dim))
            features_list.append(
                torch.sparse.FloatTensor(indices, values,
                                         torch.Size([dim, dim])).to(device))
    elif feats_type == 1:
        for i in range(num_ntype):
            dim = 10
            num_nodes = (type_mask == i).sum()
            in_dims.append(dim)
            features_list.append(torch.zeros((num_nodes, 10)).to(device))

    train_pos_user_artist = train_val_test_pos_user_artist['train_pos_user_artist']
    val_pos_user_artist = train_val_test_pos_user_artist['val_pos_user_artist']
    test_pos_user_artist = train_val_test_pos_user_artist['test_pos_user_artist']
    train_neg_user_artist = train_val_test_neg_user_artist['train_neg_user_artist']
    val_neg_user_artist = train_val_test_neg_user_artist['val_neg_user_artist']
    test_neg_user_artist = train_val_test_neg_user_artist['test_neg_user_artist']
    y_true_test = np.array([1] * len(test_pos_user_artist) +
                           [0] * len(test_neg_user_artist))

    auc_list = []
    ap_list = []
    for _ in range(repeat):
        net = MAGNN_lp([3, 3], 4, etypes_lists, in_dims, hidden_dim,
                       hidden_dim, num_heads, attn_vec_dim, rnn_type,
                       dropout_rate)
        net.to(device)
        optimizer = torch.optim.Adam(net.parameters(), lr=lr,
                                     weight_decay=weight_decay)

        # training loop
        net.train()
        early_stopping = EarlyStopping(
            patience=patience,
            verbose=True,
            save_path='checkpoint/checkpoint_{}.pt'.format(save_postfix))
        dur1 = []
        dur2 = []
        dur3 = []
        train_pos_idx_generator = index_generator(
            batch_size=batch_size, num_data=len(train_pos_user_artist))
        val_idx_generator = index_generator(batch_size=batch_size,
                                            num_data=len(val_pos_user_artist),
                                            shuffle=False)
        for epoch in range(num_epochs):
            t_start = time.time()

            # training
            net.train()
            for iteration in range(train_pos_idx_generator.num_iterations()):
                # forward
                t0 = time.time()

                train_pos_idx_batch = train_pos_idx_generator.next()
                train_pos_idx_batch.sort()
                train_pos_user_artist_batch = train_pos_user_artist[
                    train_pos_idx_batch].tolist()
                train_neg_idx_batch = np.random.choice(
                    len(train_neg_user_artist), len(train_pos_idx_batch))
                train_neg_idx_batch.sort()
                train_neg_user_artist_batch = train_neg_user_artist[
                    train_neg_idx_batch].tolist()

                train_pos_g_lists, train_pos_indices_lists, train_pos_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    train_pos_user_artist_batch, device, neighbor_samples,
                    use_masks, num_user)
                train_neg_g_lists, train_neg_indices_lists, train_neg_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    train_neg_user_artist_batch, device, neighbor_samples,
                    no_masks, num_user)

                t1 = time.time()
                dur1.append(t1 - t0)

                [pos_embedding_user, pos_embedding_artist], _ = net(
                    (train_pos_g_lists, features_list, type_mask,
                     train_pos_indices_lists, train_pos_idx_batch_mapped_lists))
                [neg_embedding_user, neg_embedding_artist], _ = net(
                    (train_neg_g_lists, features_list, type_mask,
                     train_neg_indices_lists, train_neg_idx_batch_mapped_lists))

                pos_embedding_user = pos_embedding_user.view(
                    -1, 1, pos_embedding_user.shape[1])
                pos_embedding_artist = pos_embedding_artist.view(
                    -1, pos_embedding_artist.shape[1], 1)
                neg_embedding_user = neg_embedding_user.view(
                    -1, 1, neg_embedding_user.shape[1])
                neg_embedding_artist = neg_embedding_artist.view(
                    -1, neg_embedding_artist.shape[1], 1)
                pos_out = torch.bmm(pos_embedding_user, pos_embedding_artist)
                neg_out = -torch.bmm(neg_embedding_user, neg_embedding_artist)
                train_loss = -torch.mean(
                    F.logsigmoid(pos_out) + F.logsigmoid(neg_out))

                t2 = time.time()
                dur2.append(t2 - t1)

                # autograd
                optimizer.zero_grad()
                train_loss.backward()
                optimizer.step()

                t3 = time.time()
                dur3.append(t3 - t2)

                # print training info
                if iteration % 100 == 0:
                    print(
                        'Epoch {:05d} | Iteration {:05d} | Train_Loss {:.4f} | Time1(s) {:.4f} | Time2(s) {:.4f} | Time3(s) {:.4f}'
                        .format(epoch, iteration, train_loss.item(),
                                np.mean(dur1), np.mean(dur2), np.mean(dur3)))

            # validation
            net.eval()
            val_loss = []
            with torch.no_grad():
                for iteration in range(val_idx_generator.num_iterations()):
                    # forward
                    val_idx_batch = val_idx_generator.next()
                    val_pos_user_artist_batch = val_pos_user_artist[
                        val_idx_batch].tolist()
                    val_neg_user_artist_batch = val_neg_user_artist[
                        val_idx_batch].tolist()
                    val_pos_g_lists, val_pos_indices_lists, val_pos_idx_batch_mapped_lists = parse_minibatch_LastFM(
                        adjlists_ua, edge_metapath_indices_list_ua,
                        val_pos_user_artist_batch, device, neighbor_samples,
                        no_masks, num_user)
                    val_neg_g_lists, val_neg_indices_lists, val_neg_idx_batch_mapped_lists = parse_minibatch_LastFM(
                        adjlists_ua, edge_metapath_indices_list_ua,
                        val_neg_user_artist_batch, device, neighbor_samples,
                        no_masks, num_user)

                    [pos_embedding_user, pos_embedding_artist], _ = net(
                        (val_pos_g_lists, features_list, type_mask,
                         val_pos_indices_lists, val_pos_idx_batch_mapped_lists))
                    [neg_embedding_user, neg_embedding_artist], _ = net(
                        (val_neg_g_lists, features_list, type_mask,
                         val_neg_indices_lists, val_neg_idx_batch_mapped_lists))

                    pos_embedding_user = pos_embedding_user.view(
                        -1, 1, pos_embedding_user.shape[1])
                    pos_embedding_artist = pos_embedding_artist.view(
                        -1, pos_embedding_artist.shape[1], 1)
                    neg_embedding_user = neg_embedding_user.view(
                        -1, 1, neg_embedding_user.shape[1])
                    neg_embedding_artist = neg_embedding_artist.view(
                        -1, neg_embedding_artist.shape[1], 1)
                    pos_out = torch.bmm(pos_embedding_user, pos_embedding_artist)
                    neg_out = -torch.bmm(neg_embedding_user, neg_embedding_artist)
                    val_loss.append(-torch.mean(
                        F.logsigmoid(pos_out) + F.logsigmoid(neg_out)))
                val_loss = torch.mean(torch.tensor(val_loss))

            t_end = time.time()
            # print validation info
            print('Epoch {:05d} | Val_Loss {:.4f} | Time(s) {:.4f}'.format(
                epoch, val_loss.item(), t_end - t_start))

            # early stopping
            early_stopping(val_loss, net)
            if early_stopping.early_stop:
                print('Early stopping!')
                break

        test_idx_generator = index_generator(batch_size=batch_size,
                                             num_data=len(test_pos_user_artist),
                                             shuffle=False)
        net.load_state_dict(
            torch.load('checkpoint/checkpoint_{}.pt'.format(save_postfix)))
        net.eval()
        pos_proba_list = []
        neg_proba_list = []
        with torch.no_grad():
            for iteration in range(test_idx_generator.num_iterations()):
                # forward
                test_idx_batch = test_idx_generator.next()
                test_pos_user_artist_batch = test_pos_user_artist[
                    test_idx_batch].tolist()
                test_neg_user_artist_batch = test_neg_user_artist[
                    test_idx_batch].tolist()
                test_pos_g_lists, test_pos_indices_lists, test_pos_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    test_pos_user_artist_batch, device, neighbor_samples,
                    no_masks, num_user)
                test_neg_g_lists, test_neg_indices_lists, test_neg_idx_batch_mapped_lists = parse_minibatch_LastFM(
                    adjlists_ua, edge_metapath_indices_list_ua,
                    test_neg_user_artist_batch, device, neighbor_samples,
                    no_masks, num_user)

                [pos_embedding_user, pos_embedding_artist], _ = net(
                    (test_pos_g_lists, features_list, type_mask,
                     test_pos_indices_lists, test_pos_idx_batch_mapped_lists))
                [neg_embedding_user, neg_embedding_artist], _ = net(
                    (test_neg_g_lists, features_list, type_mask,
                     test_neg_indices_lists, test_neg_idx_batch_mapped_lists))

                pos_embedding_user = pos_embedding_user.view(
                    -1, 1, pos_embedding_user.shape[1])
                pos_embedding_artist = pos_embedding_artist.view(
                    -1, pos_embedding_artist.shape[1], 1)
                neg_embedding_user = neg_embedding_user.view(
                    -1, 1, neg_embedding_user.shape[1])
                neg_embedding_artist = neg_embedding_artist.view(
                    -1, neg_embedding_artist.shape[1], 1)

                pos_out = torch.bmm(pos_embedding_user,
                                    pos_embedding_artist).flatten()
                neg_out = torch.bmm(neg_embedding_user,
                                    neg_embedding_artist).flatten()
                pos_proba_list.append(torch.sigmoid(pos_out))
                neg_proba_list.append(torch.sigmoid(neg_out))
            y_proba_test = torch.cat(pos_proba_list + neg_proba_list)
            y_proba_test = y_proba_test.cpu().numpy()
        auc = roc_auc_score(y_true_test, y_proba_test)
        ap = average_precision_score(y_true_test, y_proba_test)
        print('Link Prediction Test')
        print('AUC = {}'.format(auc))
        print('AP = {}'.format(ap))
        auc_list.append(auc)
        ap_list.append(ap)

    print('----------------------------------------------------------------')
    print('Link Prediction Tests Summary')
    print('AUC_mean = {}, AUC_std = {}'.format(np.mean(auc_list), np.std(auc_list)))
    print('AP_mean = {}, AP_std = {}'.format(np.mean(ap_list), np.std(ap_list)))
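# The training objective in run_model_LastFM above is the standard
# negative-sampling link-prediction loss: the bmm of a [B, 1, D] user embedding
# with a [B, D, 1] artist embedding is just a per-pair dot product, pushed up
# for observed pairs and down for sampled negatives. A minimal sketch in
# isolation (the function name and [batch, dim] input shapes are assumptions):
import torch
import torch.nn.functional as F


def lp_loss(u_pos, a_pos, u_neg, a_neg):
    pos_score = (u_pos * a_pos).sum(dim=1)  # dot product per positive pair
    neg_score = (u_neg * a_neg).sum(dim=1)  # dot product per negative pair
    # -log sigmoid(pos) - log sigmoid(-neg), averaged over the batch
    return -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score)).mean()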
def train(rank, world_size, args, cfg):
    dist.init_process_group(backend='nccl',
                            init_method=args.init_method,
                            world_size=world_size,
                            rank=rank)
    # dist.init_process_group(backend='nccl', rank=rank, )
    torch.cuda.set_device(args.local_rank)

    seed = int(time.time() * 256)
    torch.manual_seed(seed)

    logger = logging.getLogger(__name__)
    logging.basicConfig(level=20, format='%(asctime)s - %(message)s')

    # ================================================
    # 2) get data and load data
    # ================================================
    train_dataloader = construct_loader(cfg, 'train')
    val_dataloader = construct_loader(cfg, 'val')

    # ================================================
    # 3) init model/loss/optimizer
    # ================================================
    model = build_model(cfg)
    model.cuda()
    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.SOLVER.BASE_LR,
                           weight_decay=cfg.SOLVER.WEIGHT_DECAY)
    model, optimizer = amp.initialize(model, optimizer)
    model = torch.nn.parallel.DistributedDataParallel(model)
    cudnn.benchmark = True
    # F.cross_entropy is a function, not a module, so it cannot be
    # instantiated and moved to the GPU; use the module form instead
    loss_function = torch.nn.CrossEntropyLoss().cuda()

    # ================================================
    # 4) train loop
    # ================================================
    print("|------------------------|")
    print("| train on train dataset |")
    print("|------------------------|")
    early_stopping = EarlyStopping(20,
                                   verbose=True,
                                   path='checkpoints/model.pth',
                                   trace_func=logging.info)
    writer = SummaryWriter()
    start_time = time.time()

    for epoch in range(args.n_epochs):
        train_loss_lst = []
        val_loss_lst = []
        train_acc_lst = []
        val_acc_lst = []

        model.train()
        for i, train_dataset in enumerate(train_dataloader):
            train_data, train_label = train_dataset
            if cfg.NUM_GPU:
                # .cuda() is not in-place on tensors; the results must be assigned
                train_data = train_data.cuda(non_blocking=True)
                train_label = train_label.cuda(non_blocking=True)
            torch.distributed.barrier()

            optimizer.zero_grad()

            # forward + backward + optimize
            train_outputs = model(train_data)
            train_loss = loss_function(train_outputs, train_label.long())
            adjust_lr(optimizer, epoch, cfg.SOLVER.BASE_LR)
            with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # train_loss.backward()
            optimizer.step()

            train_acc = accuracy(train_outputs, train_label.long())
            train_acc_lst.append(train_acc)
            train_loss_lst.append(train_loss)

        # average over the number of batches, not the last loop index
        train_avg_loss = sum(train_loss_lst) / len(train_loss_lst)
        train_avg_acc = sum(train_acc_lst) / len(train_acc_lst)

        # ================================================
        # 5) evaluate on validation dataset
        # ================================================
        model.eval()
        for v, val_dataset in enumerate(val_dataloader):
            val_data, val_label = val_dataset
            val_outputs = model(val_data)
            val_loss = F.cross_entropy(val_outputs, val_label.long())

            val_acc = accuracy(val_outputs, val_label)
            val_acc_lst.append(val_acc)
            val_loss_lst.append(val_loss)

        val_avg_acc = sum(val_acc_lst) / len(val_acc_lst)
        val_avg_loss = sum(val_loss_lst) / len(val_loss_lst)

        logging.info(
            "Train Phase, Epoch:{}, Train_avg_loss:{}, Val_avg_loss:{}, Train_avg_acc:{}, Val_avg_acc:{}"
            .format(epoch, train_avg_loss, val_avg_loss, train_avg_acc, val_avg_acc))

        early_stopping(val_avg_loss, model)
        if early_stopping.early_stop:
            print('|------- Early Stop ------|')
            end_time = time.time()
            logging.info("Total spend time:{}s".format(end_time - start_time))
            break

        writer.add_scalar('Loss', train_avg_loss, epoch)
        writer.add_scalar('Accuracy', train_avg_acc, epoch)

    logging.FileHandler('logs/{}_log.txt'.format(
        time.strftime(r"%Y-%m-%d-%H_%M_%S", time.localtime())))
def train(model, model_type, criterion, optimizer, activate_early_stopping,
          scheduler, train_loader, val_loader, n_epochs=1, gpu=False,
          print_every=1, print_validation_every=1, early_stopping_patience=3):
    """Function to train a deep learning model.

    Input:
        model = model to train,
        model_type = (string) name of model type,
        criterion = loss function to use for training,
        optimizer = optimizer to use for training,
        activate_early_stopping = (boolean) activate early stopping if True,
        scheduler = scheduler to use for training,
        train_loader = (DataLoader) train set,
        val_loader = (DataLoader) validation set,
        n_epochs = (integer) number of epochs,
        gpu = (boolean) use GPU if True,
        print_every = (integer) periodicity for printing training loss and accuracy,
        print_validation_every = (integer) periodicity for printing validation loss and accuracy,
        early_stopping_patience = (integer) number of epochs with no improvement
            in the monitored quantity after which training will be stopped
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    if activate_early_stopping:
        early_stopping = EarlyStopping(patience=early_stopping_patience, verbose=True)

    # for plotting
    hist = {'loss': [], 'accuracy': []}
    val_hist = {'loss': [], 'accuracy': []}

    for ep in range(n_epochs):
        running_loss = 0  # used by the scheduler
        running_accuracy = 0

        if model_type == 'rnn':
            h = model.init_hidden(train_loader.batch_size)

        for it, data in enumerate(train_loader):
            # extract the right info from data
            if model_type == 'bert':
                seq, attn_masks, labels = data
            elif model_type in ['rnn', 'cnn']:
                seq, attn_masks, labels = data[0], torch.ones(1), data[1]  # attn_mask is not important here
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')
            labels = labels.type(torch.LongTensor)

            # clear gradients
            optimizer.zero_grad()

            # convert these to cuda tensors
            if gpu:
                seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            # obtain the logits from the model
            if model_type == 'rnn':
                h = tuple([e.data for e in h])
                output, h = model(seq, h)
            elif model_type == 'cnn':
                output = model(seq)
            elif model_type == 'bert':
                output, attentions = model(seq, attn_masks)
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')

            # compute loss
            loss = criterion(output.squeeze(-1), labels)
            running_loss += loss.item()

            # backpropagate the gradients
            loss.backward()

            # optimization step
            optimizer.step()

            # accuracy update
            accuracy = torch.sum(torch.argmax(output, dim=1) == labels) / float(labels.size(0))
            running_accuracy += accuracy.item()

            if (it + 1) % print_every == 0:
                print("Iteration {} of epoch {} complete. Loss : {}, Accuracy {} ".format(
                    it + 1, ep + 1, loss.item(), accuracy))

        # scheduler step
        if scheduler is not None:
            scheduler.step(running_loss)

        # update training history (mean over batches)
        hist['loss'].append(running_loss / (it + 1))
        hist['accuracy'].append(running_accuracy / (it + 1))

        # VALIDATION
        model.eval()
        n_batch_validation = 0
        loss_validation = 0
        accuracy_validation = 0

        # init hidden if rnn
        if model_type == 'rnn':
            val_h = model.init_hidden(val_loader.batch_size)

        for it, data in enumerate(val_loader):
            # extract the right info from data
            if model_type == 'bert':
                seq, attn_masks, labels = data
            elif model_type in ['rnn', 'cnn']:
                seq, attn_masks, labels = data[0], torch.ones(1), data[1]  # attn_mask is not important here
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')
            labels = labels.type(torch.LongTensor)

            if gpu:
                seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            # obtain the logits from the model
            if model_type == 'rnn':
                val_h = tuple([each.data for each in val_h])
                out, val_h = model(seq, val_h)
            elif model_type == 'cnn':
                out = model(seq)
            elif model_type == 'bert':
                out, attentions_val = model(seq, attn_masks)
            else:
                raise ValueError(f'Model type "{model_type}" not supported.')

            n_batch_validation += 1

            # compute loss
            _loss = float(criterion(out.squeeze(-1), labels))

            # compute scores
            _accu = torch.sum(torch.argmax(out, dim=1) == labels) / float(labels.size(0))

            loss_validation += _loss
            accuracy_validation += _accu

        # validation printing
        if ep % print_validation_every == 0:
            print("EVALUATION Validation set : mean loss {} || mean accuracy {}".format(
                loss_validation / n_batch_validation,
                accuracy_validation / n_batch_validation))

        val_hist['loss'].append(loss_validation / n_batch_validation)
        val_hist['accuracy'].append(accuracy_validation / n_batch_validation)

        # early stopping
        if activate_early_stopping:
            early_stopping(loss_validation, model)
            if early_stopping.early_stop:
                print("Early stopping")
                break

        model.train()

    # plot history
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    ax1.plot(hist['loss'], label='train')
    ax1.plot(val_hist['loss'], label='validation')
    ax1.set_title('Evolution of training loss')
    ax1.legend()
    ax2.plot(hist['accuracy'], label='train')
    ax2.plot(val_hist['accuracy'], label='validation')
    ax2.set_title('Evolution of training accuracy')
    ax2.legend()
    plt.tight_layout()
    plt.show()
def main():
    # define train set transformations
    train_transform = transforms.Compose([
        transforms.RandomRotation(5),
        transforms.RandomHorizontalFlip(0.3),
        transforms.RandomVerticalFlip(0.3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # define validation set transformations
    valid_transforms = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    train_dataset = ImageFolder('data/train_images/', transform=train_transform)
    valid_dataset = ImageFolder('data/train_images/', transform=valid_transforms)

    # split data: get indices of train and valid set
    valid_size = 0.2
    data_size = len(train_dataset)
    indices = list(range(data_size))

    # split indices
    train_indx, valid_indx, _, _ = train_test_split(indices,
                                                    indices,
                                                    test_size=valid_size,
                                                    random_state=44)

    # create samplers from indices for train and validation sets
    train_sampler = SubsetRandomSampler(train_indx)
    valid_sampler = SubsetRandomSampler(valid_indx)

    # create dataloaders
    train_loader = DataLoader(train_dataset,
                              batch_size=TRAIN_BATCH_SIZE,
                              sampler=train_sampler)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=VALID_BATCH_SIZE,
                              sampler=valid_sampler)

    # create model
    model = MODEL_DISPATCHER[BASE_MODEL](pretrained=True)
    # model.load_state_dict(torch.load("model/checkpoints/checkpoint.pt"))
    model.to(DEVICE)

    optimizer = optim.Adam(model.parameters())
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.2,
                                                     patience=5)
    early_stopping = EarlyStopping(patience=7, verbose=True)
    criterion = nn.CrossEntropyLoss()

    for e in range(EPOCHS):
        train(train_loader, model, optimizer, criterion)
        val_score = evaluate(valid_loader, model, criterion)

        scheduler.step(val_score)
        early_stopping(val_score, model)
        if early_stopping.early_stop:
            print("Early stopping!")
            break
def main():
    fold = str(config.fold)

    # 3.1 Create the necessary folders
    if not os.path.exists(config.submit):
        os.mkdir(config.submit)
    if not os.path.exists(config.weights):
        os.mkdir(config.weights)
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists(config.logs):
        os.mkdir(config.logs)
    if not os.path.exists(config.weights + config.model_name + os.sep + str(fold) + os.sep):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold) + os.sep)
    if not os.path.exists(config.best_models + config.model_name + os.sep + str(fold) + os.sep):
        os.makedirs(config.best_models + config.model_name + os.sep + str(fold) + os.sep)
    if not os.path.exists(config.submit + config.model_name + os.sep + str(fold) + os.sep):
        os.makedirs(config.submit + config.model_name + os.sep + str(fold) + os.sep)
    if not os.path.exists(config.logs + config.model_name + os.sep + str(fold) + os.sep):
        os.makedirs(config.logs + config.model_name + os.sep + str(fold) + os.sep)

    # 3.2 Build the model and optimizer, and initialize the loss function
    # model = resnet18(num_classes=len(config.class_list))
    # model = seresnet18()
    model = resnet18()
    model.cuda()

    # Initialize regularization
    # if config.weight_decay > 0:
    #     reg_loss = Regularization(model, config.weight_decay, p=1).cuda()  # L1/L2 regularization
    # else:
    #     print("no regularization")

    optimizer = optim.SGD(model.parameters(),
                          lr=config.lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay)
    # optimizer = optim.Adam(model.parameters(), lr=config.lr, amsgrad=True, weight_decay=config.weight_decay)

    # 3.4 Optionally resume the training process
    criterion = nn.CrossEntropyLoss().cuda()
    start_epoch = 0
    best_precision1 = 0
    resume = False
    if resume:
        checkpoint = torch.load(config.best_models + config.model_name +
                                os.sep + str(fold) + "/model_best.pth.tar")
        start_epoch = checkpoint["epoch"]
        fold = checkpoint["fold"]
        best_precision1 = checkpoint["best_precision1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])

    # 3.5 Get the files and split the dataset
    train_data_list, val_data_list = random_split_ratio(config.data_root,
                                                        config.class_list,
                                                        split_rate=0.2)
    # print(len(val_data_list))

    # 3.6 Wrap the data in DataLoaders
    train_dataloader = DataLoader(CreateImgDataset(train_data_list),
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  pin_memory=True,
                                  num_workers=4)
    val_dataloader = DataLoader(CreateImgDataset(val_data_list, train=False),
                                batch_size=1,
                                shuffle=True,
                                collate_fn=collate_fn,
                                pin_memory=False,
                                num_workers=4)

    # 4.1 Initialize learning-rate scheduling
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=config.step_size,
                                          gamma=config.gamma)

    # 4.2 Define metrics
    train_losses = AverageMeter()
    train_top1 = AverageMeter()
    valid_loss = [np.inf, 0, 0]
    model.train()

    # 5. Training module
    start = timer()
    train_list = []
    valid_list = []
    label_list = []
    y_pred = []
    target_list = []

    # 5.1 Initialize early stopping
    early_stopping = EarlyStopping(patience=config.patience, verbose=True)

    for epoch in range(start_epoch, config.epochs):
        # 5.2 Adjust the learning rate
        if get_learning_rate(optimizer) > 1e-8:
            scheduler.step(epoch)

        # 5.3 Iterate over the epoch
        train_progressor = ProgressBar(mode="Train",
                                       epoch=epoch,
                                       total_epoch=config.epochs,
                                       model_name=config.model_name,
                                       path=config.logs + config.model_name + os.sep + str(fold) + os.sep,
                                       total=len(train_dataloader))
        for batch, (input, target) in enumerate(train_dataloader):
            train_progressor.current = batch

            # 5.4 Feed the data through the network
            model.train()
            input = Variable(input).cuda()
            target = Variable(torch.from_numpy(np.array(target)).long()).cuda()
            output = model(input)

            # 5.5 Compute the training loss
            loss = criterion(output, target)
            # if config.weight_decay > 0:
            #     loss = loss + reg_loss(model)

            # 5.6 Compute accuracy
            precision1_train, precision2_train = accuracy(output, target, topk=(1, 2))
            train_losses.update(loss.item(), input.size(0))
            train_top1.update(precision1_train[0], input.size(0))
            train_progressor.current_loss = train_losses.avg
            train_progressor.current_top1 = train_top1.avg

            # 5.7 Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_progressor()

        train_progressor.done()
        train_list.append([train_losses.avg, train_top1.avg.cpu().data.item()])

        # 6 Evaluate after each epoch
        valid, target_list_t, label_list_t, y_pred_t = evaluate(
            val_dataloader, model, criterion, fold, epoch)
        valid_list.append(valid)

        # 6.1 Save the best model
        is_best = valid[1] > best_precision1
        best_precision1 = max(valid[1], best_precision1)
        if is_best:
            target_list = target_list_t
            label_list = label_list_t
            y_pred = y_pred_t
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_precision1": best_precision1,
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "valid_loss": valid[0],
            }, is_best, fold)

        # 6.2 Check early stopping on the negated validation accuracy
        early_stopping(-valid[1], model)
        # if the early-stopping condition is met
        if early_stopping.early_stop:
            print("Early stopping")
            # stop model training
            break

    print("Training time: " + time_to_str((timer() - start), 'min'))

    # 6.3 Save the best model's evaluation results to Excel
    Y_pred = np.asarray(y_pred)
    f = xlwt.Workbook()
    train_sheet = f.add_sheet(u'train', cell_overwrite_ok=True)  # create sheets
    val_sheet = f.add_sheet(u'verify', cell_overwrite_ok=True)
    result_sheet = f.add_sheet(u'result', cell_overwrite_ok=True)
    for i, t in enumerate(train_list):
        train_sheet.write(i, 0, t[0])
        train_sheet.write(i, 1, t[1])
    for i, v in enumerate(valid_list):
        val_sheet.write(i, 0, v[0])
        val_sheet.write(i, 1, v[1])
    for i, r in enumerate(target_list):
        result_sheet.write(i, 0, r)
        result_sheet.write(i, 1, int(label_list[i]))
        for c in range(0, len(config.class_list)):
            result_sheet.write(i, 2 + c, Y_pred[i, c])
    f.save(config.logs + config.model_name + os.sep + str(fold) + os.sep +
           config.model_name + '.xlsx')

    # 6.4 Plot the training curves and result figures
    plot_training(train_list, valid_list, fold, config.dpi)
    plot_result(config.class_list, fold, best_precision1, target_list,
                label_list, Y_pred, config.dpi)
def train_attention():
    param_class = get_param_class(args.data)
    run_id = args.data + '_' + str(uuid.uuid1())

    dataset_train = LeafDataset(
        data_path=args.dataset_path,
        genotype=param_class.genotype,
        inoculated=param_class.inoculated,
        dai=param_class.dai,
        test_size=param_class.test_size,
        signature_pre_clip=param_class.signature_pre_clip,
        signature_post_clip=param_class.signature_post_clip,
        max_num_balanced_inoculated=param_class.max_num_balanced_inoculated,
        num_samples_file=param_class.num_samples_file,
        split=args.split,
        mode='train',
        superpixel=True,
        bags=True,
        validation=True)  # 50000
    dataset_test = LeafDataset(
        data_path=args.dataset_path,
        genotype=param_class.genotype,
        inoculated=param_class.inoculated,
        dai=param_class.dai,
        test_size=param_class.test_size,
        signature_pre_clip=param_class.signature_pre_clip,
        signature_post_clip=param_class.signature_post_clip,
        max_num_balanced_inoculated=param_class.max_num_balanced_inoculated,  # 50000
        num_samples_file=param_class.num_samples_file,
        split=args.split,
        mode="test",
        superpixel=True,
        bags=True,
        validation=True)
    dataset_val = LeafDataset(
        data_path=args.dataset_path,
        genotype=param_class.genotype,
        inoculated=param_class.inoculated,
        dai=param_class.dai,
        test_size=param_class.test_size,
        signature_pre_clip=param_class.signature_pre_clip,
        signature_post_clip=param_class.signature_post_clip,
        max_num_balanced_inoculated=param_class.max_num_balanced_inoculated,  # 50000
        num_samples_file=param_class.num_samples_file,
        split=args.split,
        mode="validation",
        superpixel=True,
        bags=True,
        validation=True)

    print("Number of samples train", len(dataset_train))
    print("Number of samples test", len(dataset_test))
    print("Number of samples val", len(dataset_val))

    dataloader = DataLoader(dataset_train, batch_size=1, shuffle=True, num_workers=0)
    dataloader_test = DataLoader(dataset_test, batch_size=1, shuffle=False,
                                 num_workers=0, drop_last=False)
    dataloader_val = DataLoader(dataset_val, batch_size=1, shuffle=False,
                                num_workers=0, drop_last=False)
    hyperparams = dataset_train.hyperparams

    print("Number of batches train", len(dataloader))
    print("Number of batches test", len(dataloader_test))
    print("Number of batches val", len(dataloader_val))

    # Original class counts train: 67578 264112
    # Original class counts test: 68093 263597
    hyperparams['num_classes'] = param_class.num_classes
    hyperparams['hidden_layer_size'] = param_class.hidden_layer_size
    hyperparams['num_heads'] = param_class.num_heads
    hyperparams['lr'] = args.lr
    hyperparams['num_epochs'] = args.num_epochs
    hyperparams['lr_scheduler_steps'] = args.lr_scheduler_steps

    model = SANNetwork(input_size=dataset_train.input_size,
                       num_classes=hyperparams['num_classes'],
                       hidden_layer_size=hyperparams['hidden_layer_size'],
                       dropout=0.9,
                       num_heads=hyperparams['num_heads'],
                       device="cuda")
    # model = ConvNetBarley(elu=False, avgpool=False, nll=False, num_classes=param_class.num_classes)
    # model = CNNModel(num_classes=param_class.num_classes)
    num_epochs = hyperparams['num_epochs']
    optimizer = torch.optim.Adam(model.parameters(), lr=hyperparams['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                hyperparams['lr_scheduler_steps'],
                                                gamma=0.5,
                                                last_epoch=-1)
    num_params = sum(p.numel() for p in model.parameters())

    print("Number of parameters {}".format(num_params))
    print("Starting training for {} epochs".format(num_epochs))

    save_dir = "./uv_dataset/results_cv/"
    writer = SummaryWriter(log_dir=save_dir + run_id,
                           comment="_" + "_id_{}".format(run_id))

    # device = "cuda"
    # model.to(device)
    # balanced_loss_weight = torch.tensor([0.75, 0.25], device=device)
    balanced_loss_weight = torch.tensor([0.75, 0.25])
    crit = torch.nn.CrossEntropyLoss(weight=balanced_loss_weight)
    best_acc = 0
    early_stopping = EarlyStopping(patience=60, verbose=True)

    for epoch in tqdm(range(num_epochs)):
        setproctitle("Gerste_MIL" + args.mode +
                     " | epoch {} of {}".format(epoch + 1, num_epochs))
        losses_per_batch = []
        correct = 0
        target, pred = [], []
        total = 0
        for i, (features, labels) in enumerate(dataloader):
            # labels = labels[2]
            features = features.float()  # .to(device)
            # features = features.permute((1, 0, 2, 3, 4))
            labels = labels.long()  # .to(device)
            model.train()
            outputs, _ = model.forward(features)
            outputs = outputs.view(labels.shape[0], -1)
            labels = labels.view(-1)
            loss = crit(outputs, labels)
            optimizer.zero_grad()
            _, predicted = torch.max(outputs.data, 1)
            batch_pred, batch_target = getPredAndTarget(outputs, labels)
            target.append(batch_target)
            pred.append(batch_pred)
            # correct += balanced_accuracy(batch_target, batch_pred) * labels.size(0)  # mean
            # correct += (predicted == labels).sum().item()
            total += labels.size(0)
            loss.backward()
            optimizer.step()
            losses_per_batch.append(float(loss))

        mean_loss = np.mean(losses_per_batch)
        correct = balanced_accuracy(target, pred)
        writer.add_scalar('Loss/train', mean_loss, epoch)
        writer.add_scalar('Accuracy/train', 100 * correct, epoch)
        print("Epoch {}, mean loss per batch {}, train acc {}".format(
            epoch, mean_loss, 100 * correct))

        if (epoch + 1) % args.test_epoch == 0 or epoch + 1 == num_epochs:
            # Testing
            correct_test = 0
            target, pred = [], []
            total = 0
            model.eval()
            losses_per_batch = []
            attention_weights = []
            with torch.no_grad():
                for i, (features, labels) in enumerate(dataloader_test):
                    # labels = labels[2]
                    features = features.float()  # .to(device)
                    # features = features.permute((1, 0, 2, 3, 4))
                    labels = labels.long()  # .to(device)
                    outputs, att = model.forward(features)
                    attention_weights.append(att.squeeze(0).numpy())
                    outputs = outputs.view(labels.shape[0], -1)
                    labels = labels.view(-1)
                    loss = crit(outputs, labels)
                    losses_per_batch.append(float(loss))
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    batch_pred, batch_target = getPredAndTarget(outputs, labels)
                    target.append(batch_target)
                    pred.append(batch_pred)
                    # correct_test += balanced_accuracy(batch_target, batch_pred) * labels.size(0)
                    # correct += (predicted == labels).sum().item()
            mean_loss = np.mean(losses_per_batch)
            print(target, pred)
            correct_test = balanced_accuracy(target, pred)
            writer.add_scalar('Loss/test', mean_loss, epoch)
            np.save('attention_weights.npy', attention_weights)
            print('Accuracy, mean loss per batch of the network on the test samples: {} %, {}'
                  .format(100 * correct_test, mean_loss))
            writer.add_scalar('Accuracy/test', 100 * correct_test, epoch)

            # Validation
            correct_val = 0
            target, pred = [], []
            total = 0
            losses_per_batch = []
            attention_weights_val = []
            with torch.no_grad():
                for i, (features, labels) in enumerate(dataloader_val):
                    # labels = labels[2]
                    features = features.float()  # .to(device)
                    # features = features.permute((1, 0, 2, 3, 4))
                    labels = labels.long()  # .to(device)
                    outputs, att = model.forward(features)
                    attention_weights_val.append(att.squeeze(0).numpy())
                    outputs = outputs.view(labels.shape[0], -1)
                    labels = labels.view(-1)
                    loss = crit(outputs, labels)
                    losses_per_batch.append(float(loss))
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    batch_pred, batch_target = getPredAndTarget(outputs, labels)
                    target.append(batch_target)
                    pred.append(batch_pred)
                    # correct_val += balanced_accuracy(batch_target, batch_pred) * labels.size(0)
                    # correct += (predicted == labels).sum().item()
            mean_loss = np.mean(losses_per_batch)
            print(target, pred)
            correct_val = balanced_accuracy(target, pred)
            writer.add_scalar('Loss/val', mean_loss, epoch)
            # np.save('attention_weights_val.npy', attention_weights_val)
            print('Accuracy, mean loss per batch of the network on the validation samples: {} %, {}'
                  .format(100 * correct_val, mean_loss))
            writer.add_scalar('Accuracy/val', 100 * correct_val, epoch)

            early_stopping(mean_loss, model)
            if early_stopping.early_stop:
                print("Early stopping")
                break

            if correct_test >= best_acc:
                best_acc = correct_test
            model.train()

        scheduler.step()
torch.manual_seed(0)
np.random.seed(0)

device = torch.device(
    "cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(0)

model = FECNet(pretrained=args.pretrained)

Num_Param = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of Trainable Parameters= %d" % (Num_Param))

optimizer = optim.Adam(model.parameters(), lr=args.lr)
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.9)
early_stopping = EarlyStopping(patience=50, verbose=True)

running_loss = 0
print_per_epoch = 1
correct = 0
Len = 0

tr_dataloader, val_dataloader = DATALoader(csv_file='data/labels.csv', args=args)

for epoch in range(args.epochs):
    # scheduler.step()
    # Training
    for i_batch, sample_batched in enumerate(tr_dataloader):
        model.zero_grad()
def main():
    cmd_ls = sys.argv[1:]
    cmd = generate_cmd(cmd_ls)
    if "--freeze_bn False" in cmd:
        opt.freeze_bn = False
    if "--addDPG False" in cmd:
        opt.addDPG = False

    print("----------------------------------------------------------------------------------------------------")
    print("This is the model with id {}".format(save_ID))
    print(opt)
    print("Training backbone is: {}".format(opt.backbone))
    dataset_str = ""
    for k, v in config.train_info.items():
        dataset_str += k
        dataset_str += ","
    print("Training data is: {}".format(dataset_str[:-1]))
    print("Warm up end at {}".format(warm_up_epoch))
    for k, v in config.bad_epochs.items():
        if v > 1:
            raise ValueError("Wrong stopping accuracy!")
    print("----------------------------------------------------------------------------------------------------")

    exp_dir = os.path.join("exp/{}/{}".format(folder, save_ID))
    log_dir = os.path.join(exp_dir, "{}".format(save_ID))
    os.makedirs(log_dir, exist_ok=True)
    log_name = os.path.join(log_dir, "{}.txt".format(save_ID))
    train_log_name = os.path.join(log_dir, "{}_train.xlsx".format(save_ID))
    bn_file = os.path.join(log_dir, "{}_bn.txt".format(save_ID))

    # Prepare Dataset
    # Model Initialize
    if device != "cpu":
        m = createModel(cfg=model_cfg).cuda()
    else:
        m = createModel(cfg=model_cfg).cpu()
    print(m, file=open("model.txt", "w"))

    begin_epoch = 0
    pre_train_model = opt.loadModel
    flops = print_model_param_flops(m)
    print("FLOPs of current model is {}".format(flops))
    params = print_model_param_nums(m)
    print("Parameters of current model is {}".format(params))
    inf_time = get_inference_time(m, height=opt.outputResH, width=opt.outputResW)
    print("Inference time is {}".format(inf_time))
    print("----------------------------------------------------------------------------------------------------")

    if opt.freeze > 0 or opt.freeze_bn:
        if opt.backbone == "mobilenet":
            feature_layer_num = 155
            feature_layer_name = "features"
        elif opt.backbone == "seresnet101":
            feature_layer_num = 327
            feature_layer_name = "preact"
        elif opt.backbone == "seresnet18":
            feature_layer_num = 75
            feature_layer_name = "seresnet18"
        elif opt.backbone == "shufflenet":
            feature_layer_num = 167
            feature_layer_name = "shuffle"
        else:
            raise ValueError("Not a correct name")

        feature_num = int(opt.freeze * feature_layer_num)
        for idx, (n, p) in enumerate(m.named_parameters()):
            if len(p.shape) == 1 and opt.freeze_bn:
                p.requires_grad = False
            elif feature_layer_name in n and idx < feature_num:
                p.requires_grad = False
            else:
                p.requires_grad = True

    writer = SummaryWriter('exp/{}/{}'.format(folder, save_ID), comment=cmd)

    if device != "cpu":
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True).cuda()
        rnd_inps = torch.rand(3, 3, 224, 224).cuda()
    else:
        rnd_inps = torch.rand(3, 3, 224, 224)
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True)
    try:
        writer.add_graph(m, (rnd_inps, ))
    except:
        pass

    shuffle_dataset = False
    for k, v in config.train_info.items():
        if k not in open_source_dataset:
            shuffle_dataset = True

    train_dataset = MyDataset(config.train_info, train=True)
    val_dataset = MyDataset(config.train_info, train=False)
    if shuffle_dataset:
        val_dataset.img_val, val_dataset.bbox_val, val_dataset.part_val = \
            train_dataset.img_val, train_dataset.bbox_val, train_dataset.part_val

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.trainBatch,
                                               shuffle=True,
                                               num_workers=opt.trainNW,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opt.validBatch,
                                             shuffle=True,
                                             num_workers=opt.valNW,
                                             pin_memory=True)

    # for k, v in config.train_info.items():
    #     train_dataset = Mscoco([v[0], v[1]], train=True, val_img_num=v[2])
    #     val_dataset = Mscoco([v[0], v[1]], train=False, val_img_num=v[2])
    #
    #     train_loaders[k] = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=config.train_batch, shuffle=True,
    #         num_workers=config.train_mum_worker, pin_memory=True)
    #
    #     val_loaders[k] = torch.utils.data.DataLoader(
    #         val_dataset, batch_size=config.val_batch, shuffle=False,
    #         num_workers=config.val_num_worker, pin_memory=True)
    #
    #     train_loader = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=config.train_batch, shuffle=True,
    #         num_workers=config.train_mum_worker, pin_memory=True)
    #     val_loader = torch.utils.data.DataLoader(
    #         val_dataset, batch_size=config.val_batch, shuffle=False,
    #         num_workers=config.val_num_worker, pin_memory=True)

    # assert train_loaders != {}, "Your training data has not been specified!"

    os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    if pre_train_model:
        if "duc_se.pth" not in pre_train_model:
            if "pretrain" not in pre_train_model:
                try:
                    info_path = os.path.join("exp", folder, save_ID, "option.pkl")
                    info = torch.load(info_path)
                    opt.trainIters = info.trainIters
                    opt.valIters = info.valIters
                    begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                except:
                    # begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                    with open(log_name, "a+") as f:
                        f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
        else:
            with open(log_name, "a+") as f:
                f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
            m.conv_out = nn.Conv2d(m.DIM, opt.kps, kernel_size=3, stride=1, padding=1)
            if device != "cpu":
                m.conv_out.cuda()
        os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    else:
        print('Create new model')
        with open(log_name, "a+") as f:
            f.write(cmd)
            print(opt, file=f)
            f.write("FLOPs of current model is {}\n".format(flops))
            f.write("Parameters of current model is {}\n".format(params))

    with open(os.path.join(log_dir, "tb.py"), "w") as pyfile:
        pyfile.write("import os\n")
        pyfile.write("os.system('conda init bash')\n")
        pyfile.write("os.system('conda activate py36')\n")
        pyfile.write("os.system('tensorboard --logdir=../../../../exp/{}/{}')".format(folder, save_ID))

    params_to_update, layers = [], 0
    for name, param in m.named_parameters():
        layers += 1
        if param.requires_grad:
            params_to_update.append(param)
    print("Training {} layers out of {}".format(len(params_to_update), layers))

    if optimize == 'rmsprop':
        optimizer = torch.optim.RMSprop(params_to_update,
                                        lr=opt.LR,
                                        momentum=opt.momentum,
                                        weight_decay=opt.weightDecay)
    elif optimize == 'adam':
        optimizer = torch.optim.Adam(params_to_update,
                                     lr=opt.LR,
                                     weight_decay=opt.weightDecay)
    elif optimize == 'sgd':
        optimizer = torch.optim.SGD(params_to_update,
                                    lr=opt.LR,
                                    momentum=opt.momentum,
                                    weight_decay=opt.weightDecay)
    else:
        raise Exception

    if mix_precision:
        m, optimizer = amp.initialize(m, optimizer, opt_level="O1")

    # Model Transfer
    if device != "cpu":
        m = torch.nn.DataParallel(m).cuda()
        criterion = torch.nn.MSELoss().cuda()
    else:
        m = torch.nn.DataParallel(m)
        criterion = torch.nn.MSELoss()

    # loss, acc = valid(val_loader, m, criterion, optimizer, writer)
    # print('Valid:-{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f}'.format(
    #     idx=-1, loss=loss, acc=acc))

    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    train_acc, val_acc, train_loss, val_loss, best_epoch, train_dist, val_dist, train_auc, val_auc, train_PR, val_PR = \
        0, 0, float("inf"), float("inf"), 0, float("inf"), float("inf"), 0, 0, 0, 0
    train_acc_ls, val_acc_ls, train_loss_ls, val_loss_ls, train_dist_ls, val_dist_ls, train_auc_ls, val_auc_ls, \
        train_pr_ls, val_pr_ls, epoch_ls, lr_ls = [], [], [], [], [], [], [], [], [], [], [], []
    decay, decay_epoch, lr, i = 0, [], opt.LR, begin_epoch
    stop = False
    m_best = m

    train_log = open(train_log_name, "w", newline="")
    bn_log = open(bn_file, "w")
    csv_writer = csv.writer(train_log)
    csv_writer.writerow(write_csv_title())
    begin_time = time.time()

    os.makedirs("result", exist_ok=True)
    result = os.path.join("result", "{}_result_{}.csv".format(opt.expFolder, config.computer))
    exist = os.path.exists(result)

    # Start Training
    try:
        for i in range(opt.nEpochs)[begin_epoch:]:
            opt.epoch = i
            epoch_ls.append(i)
            train_log_tmp = [save_ID, i, lr]

            log = open(log_name, "a+")
            print('############# Starting Epoch {} #############'.format(i))
            log.write('############# Starting Epoch {} #############\n'.format(i))

            # optimizer, lr = adjust_lr(optimizer, i, config.lr_decay, opt.nEpochs)
            # writer.add_scalar("lr", lr, i)
            # print("epoch {}: lr {}".format(i, lr))

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = \
                train(train_loader, m, criterion, optimizer, writer)
            train_log_tmp.append(" ")
            train_log_tmp.append(loss)
            train_log_tmp.append(acc.tolist())
            train_log_tmp.append(dist.tolist())
            train_log_tmp.append(auc)
            train_log_tmp.append(pr)
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            train_acc_ls.append(acc)
            train_loss_ls.append(loss)
            train_dist_ls.append(dist)
            train_auc_ls.append(auc)
            train_pr_ls.append(pr)
            train_acc = acc if acc > train_acc else train_acc
            train_loss = loss if loss < train_loss else train_loss
            train_dist = dist if dist < train_dist else train_dist
            train_auc = auc if auc > train_auc else train_auc
            train_PR = pr if pr > train_PR else train_PR

            log.write(
                'Train:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(idx=i, loss=loss, acc=acc, dist=dist, AUC=auc, PR=pr))

            opt.acc = acc
            opt.loss = loss
            m_dev = m.module

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = valid(
                val_loader, m, criterion, writer)
            train_log_tmp.insert(9, loss)
            train_log_tmp.insert(10, acc.tolist())
            train_log_tmp.insert(11, dist.tolist())
            train_log_tmp.insert(12, auc)
            train_log_tmp.insert(13, pr)
            train_log_tmp.insert(14, " ")
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            val_acc_ls.append(acc)
            val_loss_ls.append(loss)
            val_dist_ls.append(dist)
            val_auc_ls.append(auc)
            val_pr_ls.append(pr)
            if acc > val_acc:
                best_epoch = i
                val_acc = acc
                torch.save(m_dev.state_dict(),
                           'exp/{0}/{1}/{1}_best_acc.pkl'.format(folder, save_ID))
                m_best = copy.deepcopy(m)
            val_loss = loss if loss < val_loss else val_loss
            if dist < val_dist:
                val_dist = dist
                torch.save(m_dev.state_dict(),
                           'exp/{0}/{1}/{1}_best_dist.pkl'.format(folder, save_ID))
            if auc > val_auc:
                val_auc = auc
                torch.save(m_dev.state_dict(),
                           'exp/{0}/{1}/{1}_best_auc.pkl'.format(folder, save_ID))
            if pr > val_PR:
                val_PR = pr
                torch.save(m_dev.state_dict(),
                           'exp/{0}/{1}/{1}_best_pr.pkl'.format(folder, save_ID))

            log.write(
                'Valid:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(idx=i, loss=loss, acc=acc, dist=dist, AUC=auc, PR=pr))

            bn_sum, bn_num = 0, 0
            for mod in m.modules():
                if isinstance(mod, nn.BatchNorm2d):
                    bn_num += mod.num_features
                    bn_sum += torch.sum(abs(mod.weight))
                    writer.add_histogram("bn_weight", mod.weight.data.cpu().numpy(), i)
            bn_ave = bn_sum / bn_num
            bn_log.write("{} --> {}".format(i, bn_ave))
            print("Current bn : {} --> {}".format(i, bn_ave))
            bn_log.write("\n")

            log.close()
            csv_writer.writerow(train_log_tmp)

            writer.add_scalar("lr", lr, i)
            print("epoch {}: lr {}".format(i, lr))
            lr_ls.append(lr)

            torch.save(opt, 'exp/{}/{}/option.pkl'.format(folder, save_ID, i))
            if i % opt.save_interval == 0 and i != 0:
                torch.save(m_dev.state_dict(),
                           'exp/{0}/{1}/{1}_{2}.pkl'.format(folder, save_ID, i))
                # torch.save(optimizer, 'exp/{}/{}/optimizer.pkl'.format(dataset, save_folder))

            if i < warm_up_epoch:
                optimizer, lr = warm_up_lr(optimizer, i)
            elif i == warm_up_epoch:
                lr = opt.LR
                early_stopping(acc)
            else:
                early_stopping(acc)
                if early_stopping.early_stop:
                    optimizer, lr = lr_decay(optimizer, lr)
                    decay += 1
                    # if decay == 2:
                    #     draw_pred_img = False
                    if decay > opt.lr_decay_time:
                        stop = True
                    else:
                        decay_epoch.append(i)
                        early_stopping.reset(int(opt.patience * patience_decay[decay]))
                        # torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_decay{2}.pkl'.format(folder, save_ID, decay))
                        m = m_best

            for epo, ac in config.bad_epochs.items():
                if i == epo and val_acc < ac:
                    stop = True

            if stop:
                print("Training finished at epoch {}".format(i))
                break

        training_time = time.time() - begin_time
        writer.close()
        train_log.close()

        # draw_graph(epoch_ls, train_loss_ls, val_loss_ls, train_acc_ls, val_acc_ls, train_dist_ls, val_dist_ls, log_dir)
        draw_graph(epoch_ls, train_loss_ls, val_loss_ls, "loss", log_dir)
        draw_graph(epoch_ls, train_acc_ls, val_acc_ls, "acc", log_dir)
        draw_graph(epoch_ls, train_auc_ls, val_auc_ls, "AUC", log_dir)
        draw_graph(epoch_ls, train_dist_ls, val_dist_ls, "dist", log_dir)
        draw_graph(epoch_ls, train_pr_ls, val_pr_ls, "PR", log_dir)

        with open(result, "a+") as f:
            if not exist:
                title_str = "id,backbone,structure,DUC,params,flops,time,loss_param,addDPG,kps,batch_size,optimizer," \
                            "freeze_bn,freeze,sparse,sparse_decay,epoch_num,LR,Gaussian,thresh,weightDecay,loadModel," \
                            "model_location, ,folder_name,training_time,train_acc,train_loss,train_dist,train_AUC," \
                            "train_PR,val_acc,val_loss,val_dist,val_AUC,val_PR,best_epoch,final_epoch"
                title_str = write_decay_title(len(decay_epoch), title_str)
                f.write(title_str)
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".\
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate,
                       opt.addDPG, opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s,
                       opt.sparse_decay, opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel,
                       config.computer, os.path.join(folder, save_ID), training_time, train_acc, train_loss,
                       train_dist, train_auc, train_PR, val_acc, val_loss, val_dist, val_auc, val_PR, best_epoch, i)
            info_str = write_decay_info(decay_epoch, info_str)
            f.write(info_str)
    except IOError:
        pass  # the original IOError handler is commented out below
    # except IOError:
    #     with open(result, "a+") as f:
    #         training_time = time.time() - begin_time
    #         writer.close()
    #         train_log.close()
    #         info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n".
\
    #             format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
    #                    opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
    #                    opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
    #                    os.path.join(folder, save_ID), training_time, "Some file is closed")
    #         f.write(info_str)
    except ZeroDivisionError:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate,
                       opt.addDPG, opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s,
                       opt.sparse_decay, opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel,
                       config.computer, os.path.join(folder, save_ID), training_time,
                       "Gradient flow error (division by zero)")
            f.write(info_str)
    except KeyboardInterrupt:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate,
                       opt.addDPG, opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s,
                       opt.sparse_decay, opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel,
                       config.computer, os.path.join(folder, save_ID), training_time,
                       "Killed by user (KeyboardInterrupt)")
            f.write(info_str)

    print("Model {} training finished".format(save_ID))
    print(
        "----------------------------------------------------------------------------------------------------"
    )
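# The loop above warms the learning rate up for the first `warm_up_epoch` epochs and
# then, each time EarlyStopping fires, decays the LR and resets the patience budget
# until `opt.lr_decay_time` decays are spent. A minimal, self-contained sketch of that
# schedule; warm_up_lr and lr_decay here are hypothetical stand-ins for the helpers
# used above, assuming a linear warm-up and a fixed 0.1 decay factor.
import torch


def warm_up_lr(optimizer, epoch, base_lr=1e-3, warm_up_epochs=5):
    # Ramp the LR linearly from base_lr / warm_up_epochs up to base_lr.
    lr = base_lr * (epoch + 1) / warm_up_epochs
    for group in optimizer.param_groups:
        group['lr'] = lr
    return optimizer, lr


def lr_decay(optimizer, lr, factor=0.1):
    # Multiply the current LR by a fixed factor once validation stalls.
    lr *= factor
    for group in optimizer.param_groups:
        group['lr'] = lr
    return optimizer, lr


# usage sketch: warm up for 5 epochs, then decay on an early-stopping signal
model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
lr = 1e-3
for epoch in range(20):
    if epoch < 5:
        optimizer, lr = warm_up_lr(optimizer, epoch)
    # ... train / validate; on an early-stopping signal:
    # optimizer, lr = lr_decay(optimizer, lr)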
def run_model_IMDB(feats_type, num_layers, hidden_dim, num_heads, attn_vec_dim, rnn_type, num_epochs, patience, repeat, save_postfix): nx_G_lists, edge_metapath_indices_lists, features_list, adjM, type_mask, labels, train_val_test_idx = load_IMDB_data( ) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') features_list = [ torch.FloatTensor(features.todense()).to(device) for features in features_list ] if feats_type == 0: in_dims = [features.shape[1] for features in features_list] elif feats_type == 1: in_dims = [features_list[0].shape[1]] + [10] * (len(features_list) - 1) for i in range(1, len(features_list)): features_list[i] = torch.zeros( (features_list[i].shape[0], 10)).to(device) elif feats_type == 2: in_dims = [features.shape[0] for features in features_list] in_dims[0] = features_list[0].shape[1] for i in range(1, len(features_list)): dim = features_list[i].shape[0] indices = np.vstack((np.arange(dim), np.arange(dim))) indices = torch.LongTensor(indices) values = torch.FloatTensor(np.ones(dim)) features_list[i] = torch.sparse.FloatTensor( indices, values, torch.Size([dim, dim])).to(device) elif feats_type == 3: in_dims = [features.shape[0] for features in features_list] for i in range(len(features_list)): dim = features_list[i].shape[0] indices = np.vstack((np.arange(dim), np.arange(dim))) indices = torch.LongTensor(indices) values = torch.FloatTensor(np.ones(dim)) features_list[i] = torch.sparse.FloatTensor( indices, values, torch.Size([dim, dim])).to(device) edge_metapath_indices_lists = [[ torch.LongTensor(indices).to(device) for indices in indices_list ] for indices_list in edge_metapath_indices_lists] labels = torch.LongTensor(labels).to(device) g_lists = [] for nx_G_list in nx_G_lists: g_lists.append([]) for nx_G in nx_G_list: g = dgl.DGLGraph(multigraph=True) g.add_nodes(nx_G.number_of_nodes()) g.add_edges(*list( zip(*sorted( map(lambda tup: (int(tup[0]), int(tup[1])), nx_G.edges()))))) g_lists[-1].append(g) train_idx = train_val_test_idx['train_idx'] val_idx = train_val_test_idx['val_idx'] test_idx = train_val_test_idx['test_idx'] svm_macro_f1_lists = [] svm_micro_f1_lists = [] nmi_mean_list = [] nmi_std_list = [] ari_mean_list = [] ari_std_list = [] for _ in range(repeat): net = MAGNN_nc(num_layers, [2, 2, 2], 4, etypes_lists, in_dims, hidden_dim, out_dim, num_heads, attn_vec_dim, rnn_type, dropout_rate) net.to(device) optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay) target_node_indices = np.where(type_mask == 0)[0] # training loop net.train() early_stopping = EarlyStopping( patience=patience, verbose=True, save_path='checkpoint/checkpoint_{}.pt'.format(save_postfix)) dur1 = [] dur2 = [] dur3 = [] for epoch in range(num_epochs): t0 = time.time() # training forward net.train() logits, embeddings = net((g_lists, features_list, type_mask, edge_metapath_indices_lists), target_node_indices) logp = F.log_softmax(logits, 1) train_loss = F.nll_loss(logp[train_idx], labels[train_idx]) t1 = time.time() dur1.append(t1 - t0) # autograd optimizer.zero_grad() train_loss.backward() optimizer.step() t2 = time.time() dur2.append(t2 - t1) # validation forward net.eval() with torch.no_grad(): logits, embeddings = net((g_lists, features_list, type_mask, edge_metapath_indices_lists), target_node_indices) logp = F.log_softmax(logits, 1) val_loss = F.nll_loss(logp[val_idx], labels[val_idx]) t3 = time.time() dur3.append(t3 - t2) # print info print( "Epoch {:05d} | Train_Loss {:.4f} | Val_Loss {:.4f} | Time1(s) {:.4f} | Time2(s) {:.4f} | Time3(s) 
{:.4f}" .format(epoch, train_loss.item(), val_loss.item(), np.mean(dur1), np.mean(dur2), np.mean(dur3))) # early stopping early_stopping(val_loss, net) if early_stopping.early_stop: print('Early stopping!') break # testing with evaluate_results_nc net.load_state_dict( torch.load('checkpoint/checkpoint_{}.pt'.format(save_postfix))) net.eval() with torch.no_grad(): logits, embeddings = net((g_lists, features_list, type_mask, edge_metapath_indices_lists), target_node_indices) svm_macro_f1_list, svm_micro_f1_list, nmi_mean, nmi_std, ari_mean, ari_std = evaluate_results_nc( embeddings[test_idx].cpu().numpy(), labels[test_idx].cpu().numpy(), num_classes=out_dim) svm_macro_f1_lists.append(svm_macro_f1_list) svm_micro_f1_lists.append(svm_micro_f1_list) nmi_mean_list.append(nmi_mean) nmi_std_list.append(nmi_std) ari_mean_list.append(ari_mean) ari_std_list.append(ari_std) # print out a summary of the evaluations svm_macro_f1_lists = np.transpose(np.array(svm_macro_f1_lists), (1, 0, 2)) svm_micro_f1_lists = np.transpose(np.array(svm_micro_f1_lists), (1, 0, 2)) nmi_mean_list = np.array(nmi_mean_list) nmi_std_list = np.array(nmi_std_list) ari_mean_list = np.array(ari_mean_list) ari_std_list = np.array(ari_std_list) print('----------------------------------------------------------------') print('SVM tests summary') print('Macro-F1: ' + ', '.join([ '{:.6f}~{:.6f} ({:.1f})'.format(macro_f1[:, 0].mean(), macro_f1[:, 1].mean(), train_size) for macro_f1, train_size in zip(svm_macro_f1_lists, [0.8, 0.6, 0.4, 0.2]) ])) print('Micro-F1: ' + ', '.join([ '{:.6f}~{:.6f} ({:.1f})'.format(micro_f1[:, 0].mean(), micro_f1[:, 1].mean(), train_size) for micro_f1, train_size in zip(svm_micro_f1_lists, [0.8, 0.6, 0.4, 0.2]) ])) print('K-means tests summary') print('NMI: {:.6f}~{:.6f}'.format(nmi_mean_list.mean(), nmi_std_list.mean())) print('ARI: {:.6f}~{:.6f}'.format(ari_mean_list.mean(), ari_std_list.mean()))
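# run_model_IMDB above delegates checkpointing and stopping to an EarlyStopping
# object constructed with (patience, verbose, save_path) and called with
# (val_loss, model). A minimal sketch compatible with those calls; the actual
# class used by the repository may differ in details.
import numpy as np
import torch


class EarlyStopping:
    def __init__(self, patience=10, verbose=False, save_path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.save_path = save_path
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss:
            # Improvement: checkpoint the model and reset the patience counter.
            if self.verbose:
                print('Val loss improved {:.4f} -> {:.4f}; saving model'.format(
                    self.best_loss, val_loss))
            torch.save(model.state_dict(), self.save_path)
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True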
def main():
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        'Implementation of COMPACT GRAPH ARCHITECTURE FOR SPEECH EMOTION RECOGNITION paper'
    )
    parser.add_argument('--dataset',
                        type=str,
                        default="IEMOCAP",
                        help='name of dataset (default: IEMOCAP)')
    parser.add_argument('--device',
                        type=int,
                        default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='input batch size for training (default: 128)')
    parser.add_argument(
        '--iters_per_epoch',
        type=int,
        default=50,
        help='number of iterations per each epoch (default: 50)')
    parser.add_argument('--epochs',
                        type=int,
                        default=1000,
                        help='number of epochs to train (default: 1000)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0005,
                        help='learning rate (default: 0.0005)')
    parser.add_argument(
        '--seed',
        type=int,
        default=0,
        help='random seed for splitting the dataset into 10 (default: 0)')
    parser.add_argument(
        '--fold_idx',
        type=int,
        default=5,
        help='the index of fold in 10-fold validation. Should be less than 10.'
    )
    parser.add_argument(
        '--num_layers',
        type=int,
        default=2,
        help='number of layers INCLUDING the input one (default: 2)')
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=64,
                        help='number of hidden units (default: 64)')
    parser.add_argument('--final_dropout',
                        type=float,
                        default=0.5,
                        help='final layer dropout (default: 0.5)')
    parser.add_argument(
        '--graph_pooling_type',
        type=str,
        default="sum",
        choices=["sum", "average"],
        help=
        'Pooling over nodes in a graph to get graph embedding: sum or average')
    parser.add_argument('--graph_type',
                        type=str,
                        default="line",
                        choices=["line", "cycle"],
                        help='Graph construction options')
    parser.add_argument('--Normalize',
                        type=bool,
                        default=True,
                        choices=[True, False],
                        help='Normalizing data')
    parser.add_argument('--patience',
                        type=int,
                        default=10,
                        help='early stopping patience (default: 10)')
    parser.add_argument('--beta1',
                        default=0.9,
                        type=float,
                        help='beta1 for adam')
    parser.add_argument('--beta2',
                        default=0.999,
                        type=float,
                        help='beta2 for adam')
    parser.add_argument('--weight-decay',
                        '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay (default: 1e-4)')
    args = parser.parse_args()

    # set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    # load data
    graphs, num_classes = load_data(args.dataset, args.Normalize)

    # 10-fold cross validation. Conduct an experiment on the fold specified by args.fold_idx.
    train_graphs, test_graphs = separate_data(graphs, args.seed,
                                              args.fold_idx)

    A = nx.to_numpy_matrix(train_graphs[0][0].g)
    if args.graph_type == 'cycle':
        A[0, -1] = 1
        A[-1, 0] = 1
    A = torch.Tensor(A).to(device)

    model = Graph_CNN_ortega(args.num_layers,
                             train_graphs[0][0].node_features.shape[1],
                             args.hidden_dim, num_classes, args.final_dropout,
                             args.graph_pooling_type, device, A).to(device)

    Num_Param = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Number of Trainable Parameters= %d" % (Num_Param))

    criterion = nn.CrossEntropyLoss().to(device)  # used for the validation loss below

    acc_train_sum = 0
    acc_test_sum = 0
    for i in range(args.fold_idx):
        train_data = train_graphs[i]
        test_data = test_graphs[i]
        # optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
        #                   weight_decay=args.weight_decay)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        # optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
        #                   weight_decay=args.weight_decay)
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=50,
                                              gamma=0.5)
        early_stopping = EarlyStopping(patience=args.patience, verbose=True)
        for epoch in range(1, args.epochs + 1):
            avg_loss = train(args, model, device, train_data, optimizer,
                             epoch, A)
            scheduler.step()  # step the LR schedule after the optimizer updates

            if epoch > 1:
                #### Validation check
                with torch.no_grad():
                    val_out = pass_data_iteratively(model, test_data)
                    val_labels = torch.LongTensor(
                        [graph.label for graph in test_data]).to(device)
                    val_loss = criterion(val_out, val_labels)
                    val_loss = np.average(val_loss.detach().cpu().numpy())

                #### Check early stopping
                early_stopping(val_loss, model)
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

            if ((epoch > 300) and (epoch % 20 == 0)) or (epoch % 10 == 0):
                acc_train, acc_test, _, _ = test(args, model, device,
                                                 train_data, test_data,
                                                 num_classes)

        model.load_state_dict(torch.load('checkpoint.pt'))
        acc_train, acc_test, output, label = test(args, model, device,
                                                  train_data, test_data,
                                                  num_classes)
        acc_train_sum += acc_train
        acc_test_sum += acc_test

        model = Graph_CNN_ortega(args.num_layers,
                                 train_graphs[0][0].node_features.shape[1],
                                 args.hidden_dim, num_classes,
                                 args.final_dropout, args.graph_pooling_type,
                                 device, A).to(device)

    print('Average train acc: %f, Average test acc: %f' %
          (acc_train_sum / args.fold_idx, acc_test_sum / args.fold_idx))
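# Each fold above gets a fresh optimizer, scheduler and EarlyStopping instance, and
# the best checkpoint is reloaded before testing. A compact, runnable sketch of that
# per-fold skeleton; run_fold and the toy tensors below are illustrative, not part
# of the original script.
import copy
import torch
import torch.nn as nn


def run_fold(model, train_x, train_y, val_x, val_y, epochs=20, patience=3):
    # Fresh optimizer per fold; keep the best weights seen on the validation split.
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    criterion = nn.CrossEntropyLoss()
    best_loss, best_state, bad_epochs = float('inf'), None, 0
    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        loss = criterion(model(train_x), train_y)
        loss.backward()
        optimizer.step()
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(val_x), val_y).item()
        if val_loss < best_loss:
            best_loss, best_state, bad_epochs = val_loss, copy.deepcopy(
                model.state_dict()), 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:  # early stopping, as in the loop above
                break
    model.load_state_dict(best_state)  # restore the best checkpoint before testing
    return best_loss


torch.manual_seed(0)
toy_model = nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 4))
x, y = torch.randn(64, 10), torch.randint(0, 4, (64,))
print(run_fold(toy_model, x[:48], y[:48], x[48:], y[48:]))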
def run_model_DBLP(feats_type, hidden_dim, num_heads, attn_vec_dim, rnn_type, num_epochs, patience, batch_size, neighbor_samples, repeat, save_postfix): adjlists, edge_metapath_indices_list, features_list, adjM, type_mask, labels, train_val_test_idx = load_DBLP_data( ) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') features_list = [ torch.FloatTensor(features).to(device) for features in features_list ] if feats_type == 0: in_dims = [features.shape[1] for features in features_list] elif feats_type == 1: in_dims = [features_list[0].shape[1]] + [10] * (len(features_list) - 1) for i in range(1, len(features_list)): features_list[i] = torch.zeros( (features_list[i].shape[0], 10)).to(device) elif feats_type == 2: in_dims = [features.shape[0] for features in features_list] in_dims[0] = features_list[0].shape[1] for i in range(1, len(features_list)): dim = features_list[i].shape[0] indices = np.vstack((np.arange(dim), np.arange(dim))) indices = torch.LongTensor(indices) values = torch.FloatTensor(np.ones(dim)) features_list[i] = torch.sparse.FloatTensor( indices, values, torch.Size([dim, dim])).to(device) elif feats_type == 3: in_dims = [features.shape[0] for features in features_list] for i in range(len(features_list)): dim = features_list[i].shape[0] indices = np.vstack((np.arange(dim), np.arange(dim))) indices = torch.LongTensor(indices) values = torch.FloatTensor(np.ones(dim)) features_list[i] = torch.sparse.FloatTensor( indices, values, torch.Size([dim, dim])).to(device) labels = torch.LongTensor(labels).to(device) train_idx = train_val_test_idx['train_idx'] train_idx = np.sort(train_idx) val_idx = train_val_test_idx['val_idx'] val_idx = np.sort(val_idx) test_idx = train_val_test_idx['test_idx'] test_idx = np.sort(test_idx) svm_macro_f1_lists = [] svm_micro_f1_lists = [] nmi_mean_list = [] nmi_std_list = [] ari_mean_list = [] ari_std_list = [] for _ in range(repeat): net = MAGNN_nc_mb(3, 6, etypes_list, in_dims, hidden_dim, out_dim, num_heads, attn_vec_dim, rnn_type, dropout_rate) net.to(device) optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay) # training loop net.train() early_stopping = EarlyStopping( patience=patience, verbose=True, save_path='checkpoint/checkpoint_{}.pt'.format(save_postfix)) dur1 = [] dur2 = [] dur3 = [] train_idx_generator = index_generator(batch_size=batch_size, indices=train_idx) val_idx_generator = index_generator(batch_size=batch_size, indices=val_idx, shuffle=False) for epoch in range(num_epochs): t_start = time.time() # training net.train() for iteration in range(train_idx_generator.num_iterations()): # forward t0 = time.time() train_idx_batch = train_idx_generator.next() train_idx_batch.sort() train_g_list, train_indices_list, train_idx_batch_mapped_list = parse_minibatch( adjlists, edge_metapath_indices_list, train_idx_batch, device, neighbor_samples) t1 = time.time() dur1.append(t1 - t0) logits, embeddings = net( (train_g_list, features_list, type_mask, train_indices_list, train_idx_batch_mapped_list)) logp = F.log_softmax(logits, 1) train_loss = F.nll_loss(logp, labels[train_idx_batch]) t2 = time.time() dur2.append(t2 - t1) # autograd optimizer.zero_grad() train_loss.backward() optimizer.step() t3 = time.time() dur3.append(t3 - t2) # print training info if iteration % 50 == 0: print( 'Epoch {:05d} | Iteration {:05d} | Train_Loss {:.4f} | Time1(s) {:.4f} | Time2(s) {:.4f} | Time3(s) {:.4f}' .format(epoch, iteration, train_loss.item(), np.mean(dur1), np.mean(dur2), np.mean(dur3))) # validation net.eval() 
val_logp = [] with torch.no_grad(): for iteration in range(val_idx_generator.num_iterations()): # forward val_idx_batch = val_idx_generator.next() val_g_list, val_indices_list, val_idx_batch_mapped_list = parse_minibatch( adjlists, edge_metapath_indices_list, val_idx_batch, device, neighbor_samples) logits, embeddings = net( (val_g_list, features_list, type_mask, val_indices_list, val_idx_batch_mapped_list)) logp = F.log_softmax(logits, 1) val_logp.append(logp) val_loss = F.nll_loss(torch.cat(val_logp, 0), labels[val_idx]) t_end = time.time() # print validation info print('Epoch {:05d} | Val_Loss {:.4f} | Time(s) {:.4f}'.format( epoch, val_loss.item(), t_end - t_start)) # early stopping early_stopping(val_loss, net) if early_stopping.early_stop: print('Early stopping!') break # testing with evaluate_results_nc test_idx_generator = index_generator(batch_size=batch_size, indices=test_idx, shuffle=False) net.load_state_dict( torch.load('checkpoint/checkpoint_{}.pt'.format(save_postfix))) net.eval() test_embeddings = [] with torch.no_grad(): for iteration in range(test_idx_generator.num_iterations()): # forward test_idx_batch = test_idx_generator.next() test_g_list, test_indices_list, test_idx_batch_mapped_list = parse_minibatch( adjlists, edge_metapath_indices_list, test_idx_batch, device, neighbor_samples) logits, embeddings = net( (test_g_list, features_list, type_mask, test_indices_list, test_idx_batch_mapped_list)) test_embeddings.append(embeddings) test_embeddings = torch.cat(test_embeddings, 0) svm_macro_f1_list, svm_micro_f1_list, nmi_mean, nmi_std, ari_mean, ari_std = evaluate_results_nc( test_embeddings.cpu().numpy(), labels[test_idx].cpu().numpy(), num_classes=out_dim) svm_macro_f1_lists.append(svm_macro_f1_list) svm_micro_f1_lists.append(svm_micro_f1_list) nmi_mean_list.append(nmi_mean) nmi_std_list.append(nmi_std) ari_mean_list.append(ari_mean) ari_std_list.append(ari_std) # print out a summary of the evaluations svm_macro_f1_lists = np.transpose(np.array(svm_macro_f1_lists), (1, 0, 2)) svm_micro_f1_lists = np.transpose(np.array(svm_micro_f1_lists), (1, 0, 2)) nmi_mean_list = np.array(nmi_mean_list) nmi_std_list = np.array(nmi_std_list) ari_mean_list = np.array(ari_mean_list) ari_std_list = np.array(ari_std_list) print('----------------------------------------------------------------') print('SVM tests summary') print('Macro-F1: ' + ', '.join([ '{:.6f}~{:.6f} ({:.1f})'.format(macro_f1[:, 0].mean(), macro_f1[:, 1].mean(), train_size) for macro_f1, train_size in zip(svm_macro_f1_lists, [0.8, 0.6, 0.4, 0.2]) ])) print('Micro-F1: ' + ', '.join([ '{:.6f}~{:.6f} ({:.1f})'.format(micro_f1[:, 0].mean(), micro_f1[:, 1].mean(), train_size) for micro_f1, train_size in zip(svm_micro_f1_lists, [0.8, 0.6, 0.4, 0.2]) ])) print('K-means tests summary') print('NMI: {:.6f}~{:.6f}'.format(nmi_mean_list.mean(), nmi_std_list.mean())) print('ARI: {:.6f}~{:.6f}'.format(ari_mean_list.mean(), ari_std_list.mean()))
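# run_model_DBLP above batches node indices through an index_generator exposing
# num_iterations() and next(). A minimal sketch matching that interface, assuming
# an optional shuffle per pass and wrap-around reshuffling; the repository's real
# implementation may differ.
import numpy as np


class index_generator:
    def __init__(self, batch_size, indices, shuffle=True):
        self.batch_size = batch_size
        self.indices = np.array(indices)
        self.shuffle = shuffle
        self.pos = 0
        if self.shuffle:
            np.random.shuffle(self.indices)

    def num_iterations(self):
        # Number of batches needed to cover all indices once.
        return int(np.ceil(len(self.indices) / self.batch_size))

    def next(self):
        # Wrap around (and reshuffle, if enabled) once the index list is exhausted.
        if self.pos >= len(self.indices):
            self.pos = 0
            if self.shuffle:
                np.random.shuffle(self.indices)
        batch = self.indices[self.pos:self.pos + self.batch_size]
        self.pos += self.batch_size
        return np.copy(batch)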
# Iterative optimization routine; other optimizers (e.g., Adam) could be used instead
optimizer_conv = optim.SGD(filter(lambda p: p.requires_grad,
                                  model_conv.parameters()),
                           lr=lr,
                           momentum=momentum)  # only the unfrozen weights are updated

# Learning rate decay scheduler
exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_conv,
                                                  'min',
                                                  factor=gamma,
                                                  verbose=True,
                                                  patience=8)

# Try to reduce overfitting by using early stopping
early_stopping = EarlyStopping()


## Note: the following function is adapted from the PyTorch tutorial
# https://github.com/pytorch/tutorials/blob/master/beginner_source/transfer_learning_tutorial.py
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    since = time.time()

    # Lists to record losses/accuracies and monitor learning convergence
    val_loss = []
    val_acc = []
    train_loss = []
    train_acc = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
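# The snippet above optimizes only the parameters left unfrozen and lets
# ReduceLROnPlateau cut the LR when the monitored validation loss stops improving.
# A self-contained sketch of that pairing with a toy model (the layer sizes and
# hyper-parameters below are placeholders).
import torch
import torch.nn as nn
from torch.optim import lr_scheduler as lrs

toy_conv = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 2))
for p in toy_conv[0].parameters():  # freeze the first layer
    p.requires_grad = False

toy_optimizer = torch.optim.SGD(
    filter(lambda p: p.requires_grad, toy_conv.parameters()),
    lr=0.01, momentum=0.9)
toy_scheduler = lrs.ReduceLROnPlateau(toy_optimizer, 'min', factor=0.1, patience=8)

for epoch in range(3):
    val_loss = 1.0 / (epoch + 1)  # placeholder for a real validation pass
    toy_scheduler.step(val_loss)  # step with the metric being monitored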
def train(): # initialization device = t.device("cuda:0" if t.cuda.is_available() else "cpu") opt = DefaultConfig() train_losses, valid_losses, avg_train_losses, avg_valid_losses = [], [], [], [] writer = SummaryWriter('logs') criterion = nn.CrossEntropyLoss().to(device) config_data = [['Key', 'Value'], ['device', device]] # config config_generator(config_data, opt) # data train_data, train_dataloader, val_data, val_dataloader = data_generator( opt) # model model = model_generator(device, opt) # optimizer & lr_scheduler & early_stopping optimizer = Adam(model.fc.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) early_stopping = EarlyStopping(patience=5, verbose=False, path='checkpoints/%s_final_checkpoint.pth' % opt.model) print('Starting training on %d images:' % len(train_data)) # Train with frozen layers first, to get a stable loss. # Adjust num epochs to your dataset. This step is enough to obtain a not bad model. if opt.freeze: for epoch in range(opt.freeze_epoch): print('Epoch {}/{} :'.format(epoch, opt.freeze_epoch + opt.unfreeze_epoch)) model.train() train_loss, loss_meter, correct = 0, 0, 0 # train epoch for batch_i, (data, label, _) in enumerate(tqdm(train_dataloader)): input = Variable(data).to(device) target = Variable(label).to(device) optimizer.zero_grad() predict = model(input) loss = criterion(predict, target) loss.backward() optimizer.step() train_losses.append(loss.item()) loss_meter += loss.item() logits = t.relu(predict) pred = logits.data.max(1)[1] correct += pred.eq(target.data).sum() train_acc = correct.cpu().detach().numpy() * 1.0 / len( train_dataloader.dataset) # ending of train epoch # validation epoch if epoch % opt.evaluation_interval == 0: if t.cuda.device_count() > 1: model = nn.DataParallel(model, device_ids=[0]) model.eval() loss_meter, correct = 0, 0 with t.no_grad(): print('Validating on %d images:' % len(val_data)) for inputs, target, _ in tqdm(val_dataloader): inputs = inputs.to(device) target = target.to(device) output = model(inputs) loss = criterion(output, target) loss_meter += loss.item() valid_losses.append(loss.item()) logits = t.relu(output) pred = logits.data.max(1)[1] correct += pred.eq(target.data).sum() val_acc = correct.cpu().detach().numpy() * 1.0 / len( val_dataloader.dataset) # ending of validation epoch train_loss = np.average(train_losses) valid_loss = np.average(valid_losses) avg_train_losses.append(train_loss) avg_valid_losses.append(valid_loss) print('train_loss: %.3f, train_acc: %.3f' % (train_loss, train_acc)) print('val_loss: %.3f, val_acc: %.3f' % (valid_loss, val_acc)) writer.add_scalar('train_loss', train_loss, global_step=epoch) writer.add_scalar('train_acc', train_acc, global_step=epoch) writer.add_scalar('valid_loss', valid_loss, global_step=epoch) writer.add_scalar('val_acc', val_acc, global_step=epoch) # clear lists to track next epoch train_losses.clear() valid_losses.clear() # early_stopping needs the validation loss to check if it has decresed, # and if it has, it will make a checkpoint of the current model early_stopping(valid_loss, model) if early_stopping.early_stop: print("Early stopping") opt.unfreeze = False break if epoch % opt.checkpoint_interval == 0: t.save(model.state_dict(), 'checkpoints/' + '%s_ckpt_%d.pth' % (opt.model, epoch)) print_separator() # load the last checkpoint with the best model model.load_state_dict( t.load('checkpoints/' + '%s_final_checkpoint.pth' % opt.model)) # Unfreeze and continue training, to fine-tune. # Train longer if the result is not good. 
if opt.unfreeze: print('Unfreeze all layers:') for param in model.parameters(): param.requires_grad = True optimizer = Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=opt.lr_decay, patience=3, verbose=True) for epoch in range(opt.freeze_epoch, opt.unfreeze_epoch): print('Epoch {}/{} :'.format(epoch, opt.unfreeze_epoch + opt.freeze_epoch)) model.train() train_loss, loss_meter, correct = 0, 0, 0 # train epoch for batch_i, (data, label, _) in enumerate(tqdm(train_dataloader)): input = Variable(data).to(device) target = Variable(label).to(device) optimizer.zero_grad() predict = model(input) loss = criterion(predict, target) loss.backward() optimizer.step() train_losses.append(loss.item()) loss_meter += loss.item() logits = t.relu(predict) pred = logits.data.max(1)[1] correct += pred.eq(target.data).sum() train_acc = correct.cpu().detach().numpy() * 1.0 / len( train_dataloader.dataset) # ending of train epoch # validation epoch if epoch % opt.evaluation_interval == 0: if t.cuda.device_count() > 1: model = nn.DataParallel(model, device_ids=[0]) model.eval() loss_meter, correct = 0, 0 with t.no_grad(): print('Validating on %d images:' % len(val_data)) for inputs, target, _ in tqdm(val_dataloader): inputs = inputs.to(device) target = target.to(device) output = model(inputs) loss = criterion(output, target) loss_meter += loss.item() valid_losses.append(loss.item()) logits = t.relu(output) pred = logits.data.max(1)[1] correct += pred.eq(target.data).sum() val_acc = correct.cpu().detach().numpy() * 1.0 / len( val_dataloader.dataset) # ending of validation epoch lr_scheduler.step(loss.item()) writer.add_scalar('learning_rate', lr_scheduler.state_dict()['_last_lr'], global_step=epoch) train_loss = np.average(train_losses) valid_loss = np.average(valid_losses) avg_train_losses.append(train_loss) avg_valid_losses.append(valid_loss) print('train_loss: %.3f, train_acc: %.3f' % (train_loss, train_acc)) print('val_loss: %.3f, val_acc: %.3f' % (valid_loss, val_acc)) writer.add_scalar('train_loss', train_loss, global_step=epoch) writer.add_scalar('train_acc', train_acc, global_step=epoch) writer.add_scalar('valid_loss', valid_loss, global_step=epoch) writer.add_scalar('val_acc', val_acc, global_step=epoch) # clear lists to track next epoch train_losses.clear() valid_losses.clear() # early_stopping needs the validation loss to check if it has decresed, # and if it has, it will make a checkpoint of the current model early_stopping(valid_loss, model) if early_stopping.early_stop: print("Early stopping") break if epoch % opt.checkpoint_interval == 0: t.save(model.state_dict(), 'checkpoints/' + '%s_ckpt_%d.pth' % (opt.model, epoch)) print_separator() # load the last checkpoint with the best model model.load_state_dict( t.load('checkpoints/' + '%s_final_checkpoint.pth' % opt.model))
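# train() above fine-tunes in two stages: head-only while the backbone is frozen,
# then all layers with a fresh optimizer and plateau-based LR decay. A compact
# sketch of that pattern on an assumed torchvision resnet18; the dataset-specific
# head size (10 classes) is a placeholder.
import torch
import torch.nn as nn
from torchvision import models

net = models.resnet18(weights=None)
net.fc = nn.Linear(net.fc.in_features, 10)

# stage 1: freeze everything except the new head
for p in net.parameters():
    p.requires_grad = False
for p in net.fc.parameters():
    p.requires_grad = True
stage1_opt = torch.optim.Adam(net.fc.parameters(), lr=1e-3)
# ... train a few epochs here to get a stable loss ...

# stage 2: unfreeze all layers and continue with a smaller LR plus plateau decay
for p in net.parameters():
    p.requires_grad = True
stage2_opt = torch.optim.Adam(net.parameters(), lr=1e-4)
stage2_sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    stage2_opt, 'min', factor=0.1, patience=3)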
def train_test(args, data): user_history_dict, entity_embedding, relation_embedding, entity_adj, relation_adj, doc_feature_dict, entity_num, position_num, type_num, user2item_train, user2item_test, vert_train, vert_test, local_train, local_test, pop_train, pop_test, item2item_train, item2item_test = data #user2item_train, user2item_test, vert_train, vert_test, local_train, local_test, pop_train, pop_test, item2item_train, item2item_test = data train_data_u2i = NewsDataset(user2item_train) train_sampler_u2i = RandomSampler(train_data_u2i) train_dataloader_u2i = DataLoader(train_data_u2i, sampler=train_sampler_u2i, batch_size=args.batch_size, collate_fn=my_collate_fn, pin_memory=False) train_data_vert = NewsDataset(vert_train) train_sampler_vert = RandomSampler(train_data_vert) train_dataloader_vert = DataLoader(train_data_vert, sampler=train_sampler_vert, batch_size=args.batch_size, pin_memory=False) train_data_local = NewsDataset(local_train) train_sampler_local = RandomSampler(train_data_local) train_dataloader_local = DataLoader(train_data_local, sampler=train_sampler_local, batch_size=args.batch_size, pin_memory=False) train_data_pop = NewsDataset(pop_train) train_sampler_pop = RandomSampler(train_data_pop) train_dataloader_pop = DataLoader(train_data_pop, sampler=train_sampler_pop, batch_size=args.batch_size, pin_memory=False) train_data_i2i = NewsDataset(item2item_train) train_sampler_i2i = RandomSampler(train_data_i2i) train_dataloader_i2i = DataLoader(train_data_i2i, sampler=train_sampler_i2i, batch_size=args.batch_size, pin_memory=False) valid_scores = [] early_stopping = EarlyStopping(patience=2, verbose=True) print("learning rate {} l2_regular {}".format(args.learning_rate, args.l2_regular)) model = KRED(args, user_history_dict, doc_feature_dict, entity_embedding, relation_embedding, entity_adj, relation_adj, entity_num, position_num, type_num).cuda() if args.training_type == "multi-task": pretrain_epoch = 0 while(pretrain_epoch < 5): model.train() criterion = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=0) total_loss_vert = 0 model.train() for step, batch in enumerate(train_dataloader_vert): out = model(batch['item1'], batch['item2'], "vert_classify")[1] loss = criterion(out, torch.tensor(batch['label']).cuda()) total_loss_vert = total_loss_vert + loss optimizer.zero_grad() loss.backward() optimizer.step() print('epoch {} loss {}'.format(pretrain_epoch, total_loss_vert)) total_loss_pop = 0 model.train() for step, batch in enumerate(train_dataloader_pop): out = model(batch['item1'], batch['item2'], "pop_predict")[3] loss = criterion(out, torch.tensor(batch['label']).cuda()) total_loss_pop = total_loss_pop + loss optimizer.zero_grad() loss.backward() optimizer.step() print('epoch {} loss {}'.format(pretrain_epoch, total_loss_pop)) criterion = nn.BCELoss() total_loss_local = 0 model.train() for step, batch in enumerate(train_dataloader_local): out = model(batch['item1'], batch['item2'], "local_news")[2] loss = criterion(out, torch.tensor(batch['label']).float().cuda()) total_loss_local = total_loss_local + loss optimizer.zero_grad() loss.backward() optimizer.step() print('epoch {} loss {}'.format(pretrain_epoch, total_loss_local)) criterion = Softmax_BCELoss(args) total_loss_i2i = 0 model.train() for step, batch in enumerate(train_dataloader_i2i): out = model(batch['item1'], batch['item2'], "item2item")[4] loss = criterion(out, torch.stack(batch['label']).float().cuda()) total_loss_i2i = total_loss_i2i + loss 
optimizer.zero_grad() loss.backward() optimizer.step() print('epoch {} loss {}'.format(pretrain_epoch, total_loss_i2i)) optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.l2_regular) total_loss_u2i = 0 model.train() for step, batch in enumerate(train_dataloader_u2i): batch = real_batch(batch) out = model(batch['item1'], batch['item2'], "user2item")[0] loss = criterion(out, torch.tensor(batch['label']).cuda()) total_loss_u2i = total_loss_u2i + loss optimizer.zero_grad() loss.backward() optimizer.step() print('epoch {} loss {}'.format(pretrain_epoch, total_loss_u2i)) pretrain_epoch = pretrain_epoch + 1 for epoch in range(args.epoch): if args.task == "user2item": test_data = user2item_test criterion = Softmax_BCELoss(args) train_data_loader = train_dataloader_u2i task_index = 0 elif args.task == "item2item": test_data = item2item_test criterion = Softmax_BCELoss(args) train_data_loader = train_dataloader_i2i task_index = 4 elif args.task == "vert_classify": test_data = user2item_test criterion = nn.CrossEntropyLoss() train_data_loader = train_dataloader_vert task_index = 1 elif args.task == "pop_predict": test_data = user2item_test criterion = nn.CrossEntropyLoss() train_data_loader = train_dataloader_pop task_index = 3 elif args.task == "local_news": test_data = user2item_test criterion = nn.BCELoss() train_data_loader = train_dataloader_local task_index = 2 else: print("Error: task name error.") break optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.l2_regular) total_loss = 0 model.train() for step, batch in enumerate(train_data_loader): if task_index == 0: batch = real_batch(batch) if task_index == 4: out = model(batch['item1'], batch['item2'], "item2item")[task_index] loss = criterion(out, torch.stack(batch['label']).float().cuda()) elif task_index == 2: out = model(batch['item1'], batch['item2'], "local_news")[task_index] loss = criterion(out, torch.tensor(batch['label']).float().cuda()) else: out = model(batch['item1'], batch['item2'], args.task)[task_index] loss = criterion(out, torch.tensor(batch['label']).cuda()) total_loss = total_loss + loss optimizer.zero_grad() loss.backward() optimizer.step() print('epoch {} loss {}'.format(epoch, total_loss)) model.eval() y_pred = [] start_list = list(range(0, len(test_data['label']), args.batch_size)) for start in start_list: if start + args.batch_size <= len(test_data['label']): end = start + args.batch_size else: end = len(test_data['label']) #out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].view(end-start).cpu().data.numpy() #test = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].cpu().data.numpy() out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].cpu().data.numpy() #y_pred = y_pred + out.tolist() y_pred.extend(out) truth = test_data['label'] score = evaulate(y_pred, truth, test_data, args.task) valid_scores.append(score) early_stopping(score, model) if early_stopping.early_stop: print("Early stopping") break model.load_state_dict(torch.load('checkpoint.pt')) y_pred = [] start_list = list(range(0, len(test_data['label']), args.batch_size)) for start in start_list: if start + args.batch_size <= len(test_data['label']): end = start + args.batch_size else: end = len(test_data['label']) #out = model(test_data['user_id'][start:end], test_data['news_id'][start:end], args.task)[task_index].view(end - start).cpu().data.numpy() out = 
model(test_data['user_id'][start:end], test_data['news_id'][start:end],
              args.task)[task_index].cpu().data.numpy()
        # y_pred = y_pred + out.tolist()
        y_pred.extend(out)

    result_path = "./result_log/" + args.logdir + '/'
    os.makedirs(result_path, exist_ok=True)  # create parent dirs too; no error if it exists
    result_file_path = result_path + "predict_result.txt"
    with open(result_file_path, 'w') as fp:  # context manager ensures the file is closed
        for line_index in range(len(y_pred)):
            fp.write(str(y_pred[line_index]) + '\t' + str(truth[line_index]) + '\n')
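# The multi-task branch of train_test cycles a shared encoder through one pass per
# task each pretrain epoch, each task using its own output head and loss. A toy,
# runnable sketch of that round-robin scheme; MultiHead and its dimensions are
# illustrative stand-ins for KRED's task heads.
import torch
import torch.nn as nn


class MultiHead(nn.Module):
    def __init__(self, dim=16, n_tasks=3):
        super().__init__()
        self.shared = nn.Linear(8, dim)  # shared trunk updated by every task
        self.heads = nn.ModuleList([nn.Linear(dim, 2) for _ in range(n_tasks)])

    def forward(self, x, task_index):
        return self.heads[task_index](torch.relu(self.shared(x)))


mt_model = MultiHead()
mt_optimizer = torch.optim.Adam(mt_model.parameters(), lr=1e-3)
mt_criterion = nn.CrossEntropyLoss()
x = torch.randn(32, 8)
y = torch.randint(0, 2, (32,))
for epoch in range(5):
    for task_index in range(3):  # one pass per task, round-robin
        loss = mt_criterion(mt_model(x, task_index), y)
        mt_optimizer.zero_grad()
        loss.backward()
        mt_optimizer.step()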
class Model:
    """
    This class handles basic methods for handling the model:
    1. Fit the model
    2. Make predictions
    3. Save
    4. Load
    """
    def __init__(self, input_size, n_channels, hparams, gpu, inference=False):

        self.hparams = hparams

        if inference:
            self.device = torch.device('cpu')
            self.model = ECGNet(n_channels=n_channels,
                                hparams=self.hparams).to(self.device)
        else:
            if torch.cuda.device_count() > 1:
                if len(gpu) > 0:
                    print("Number of GPUs will be used: ", len(gpu))
                    self.device = torch.device(f"cuda:{gpu[0]}" if torch.cuda.
                                               is_available() else "cpu")
                    self.model = ECGNet(n_channels=n_channels,
                                        hparams=self.hparams).to(self.device)
                    self.model = DP(self.model,
                                    device_ids=gpu,
                                    output_device=gpu[0])
                else:
                    print("Number of GPUs will be used: ",
                          torch.cuda.device_count() - 5)
                    self.device = torch.device(
                        "cuda:0" if torch.cuda.is_available() else "cpu")
                    self.model = ECGNet(n_channels=n_channels,
                                        hparams=self.hparams).to(self.device)
                    self.model = DP(self.model,
                                    device_ids=list(
                                        range(torch.cuda.device_count() - 5)))
            else:
                self.device = torch.device(
                    "cuda:0" if torch.cuda.is_available() else "cpu")
                self.model = ECGNet(n_channels=n_channels,
                                    hparams=self.hparams).to(self.device)
                print('Only one GPU is available')

        # define the models
        # summary(self.model, (input_size, n_channels))
        # print(torch.cuda.is_available())

        self.metric = Metric()
        self.num_workers = 18
        self.threshold = 0.5

        ########################## compile the model ###############################

        # define optimizer
        self.optimizer = torch.optim.Adam(params=self.model.parameters(),
                                          lr=self.hparams['lr'])

        weights = torch.Tensor([
            1., 1., 1., 1., 0.5, 1., 1., 1., 1., 1., 1., 1., 0.5, 0.5, 1., 1.,
            1., 1., 0.5, 1., 1., 1., 1., 0.5, 1., 1., 0.5
        ]).to(self.device)
        self.loss = nn.BCELoss(weight=weights)  # CompLoss(self.device)
        self.decoder_loss = nn.MSELoss()  # reconstruction loss for the decoder head, used in fit()

        # define early stopping
        self.early_stopping = EarlyStopping(
            checkpoint_path=self.hparams['checkpoint_path'] + '/checkpoint' +
            str(self.hparams['start_fold']) + '.pt',
            patience=self.hparams['patience'],
            delta=self.hparams['min_delta'],
            is_maximize=True,
        )

        # LR scheduler
        self.scheduler = ReduceLROnPlateau(
            optimizer=self.optimizer,
            mode='max',
            factor=0.2,
            patience=1,
            verbose=True,
            threshold=self.hparams['min_delta'],
            threshold_mode='abs',
            cooldown=0,
            eps=0,
        )

        self.seed_everything(42)

        self.postprocessing = PostProcessing(fold=self.hparams['start_fold'])

        self.scaler = torch.cuda.amp.GradScaler()

    def seed_everything(self, seed):
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        torch.manual_seed(seed)

    def fit(self, train, valid):

        train_loader = DataLoader(train,
                                  batch_size=self.hparams['batch_size'],
                                  shuffle=True,
                                  num_workers=self.num_workers)
        valid_loader = DataLoader(valid,
                                  batch_size=self.hparams['batch_size'],
                                  shuffle=False,
                                  num_workers=self.num_workers)

        # tensorboard object
        writer = SummaryWriter()

        for epoch in range(self.hparams['n_epochs']):

            # train the model
            self.model.train()
            avg_loss = 0.0

            train_preds, train_true = torch.Tensor([]), torch.Tensor([])

            for (X_batch, y_batch) in tqdm(train_loader):
                y_batch = y_batch.float().to(self.device)
                X_batch = X_batch.float().to(self.device)

                self.optimizer.zero_grad()
                # get model predictions (the forward pass would normally run under
                # torch.cuda.amp.autocast when training with GradScaler)
                pred, pred_decoder = self.model(X_batch)

                # process loss_1
                pred = pred.view(-1, pred.shape[-1])
                pred = pred**2
                y_batch = y_batch.view(-1, y_batch.shape[-1])
                train_loss = self.loss(pred, y_batch)
                y_batch = y_batch.float().cpu().detach()
                pred = pred.float().cpu().detach()

                # process loss_2
                pred_decoder = pred_decoder.view(-1, pred_decoder.shape[-1])
                X_batch = X_batch.view(-1, X_batch.shape[-1])
                decoder_train_loss = self.decoder_loss(pred_decoder, X_batch)
                X_batch = X_batch.float().cpu().detach()
                pred_decoder = pred_decoder.float().cpu().detach()

                # calc loss
                avg_loss += train_loss.item() / len(train_loader)

                # sum up multi-head losses
                train_loss = train_loss + decoder_train_loss

                self.scaler.scale(train_loss).backward()  # train_loss.backward()
                self.scaler.unscale_(self.optimizer)  # unscale before clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                torch.nn.utils.clip_grad_value_(self.model.parameters(), 0.5)
                self.scaler.step(self.optimizer)  # self.optimizer.step()
                self.scaler.update()

                train_true = torch.cat([train_true, y_batch], 0)
                train_preds = torch.cat([train_preds, pred], 0)

            # calc training metric
            train_preds = train_preds.numpy()
            train_true = train_true.numpy()
            threshold = self.postprocessing.find_opt_thresold(
                train_true, train_preds)
            self.postprocessing.update_threshold(threshold)
            train_preds = self.postprocessing.run(train_preds)
            metric_train = self.metric.compute(labels=train_true,
                                               outputs=train_preds)

            # evaluate the model
            print('Model evaluation...')
            self.model.eval()
            val_preds, val_true = torch.Tensor([]), torch.Tensor([])
            avg_val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in valid_loader:
                    y_batch = y_batch.float().to(self.device)
                    X_batch = X_batch.float().to(self.device)

                    pred, pred_decoder = self.model(X_batch)
                    pred_decoder = pred_decoder.float().cpu().detach()
                    X_batch = X_batch.float().cpu().detach()

                    pred = pred.reshape(-1, pred.shape[-1])
                    pred = pred**2
                    y_batch = y_batch.view(-1, y_batch.shape[-1])

                    avg_val_loss += self.loss(
                        pred, y_batch).item() / len(valid_loader)
                    y_batch = y_batch.float().cpu().detach()
                    pred = pred.float().cpu().detach()

                    val_true = torch.cat([val_true, y_batch], 0)
                    val_preds = torch.cat([val_preds, pred], 0)

            # evaluate the metric
            val_preds = val_preds.numpy()
            val_true = val_true.numpy()
            # val_true, val_preds = self.metric.find_opt_thresold(val_true, val_preds)
            val_preds = self.postprocessing.run(val_preds)
            metric_val = self.metric.compute(val_true, val_preds)

            self.scheduler.step(metric_val)  # avg_val_loss)

            res = self.early_stopping(score=metric_val,
                                      model=self.model,
                                      threshold=threshold)

            # print statistics
            if self.hparams['verbose_train']:
                print(
                    '| Epoch: ',
                    epoch + 1,
                    '| Train_loss: ',
                    avg_loss,
                    '| Val_loss: ',
                    avg_val_loss,
                    '| Metric_train: ',
                    metric_train,
                    '| Metric_val: ',
                    metric_val,
                    '| Current LR: ',
                    self.__get_lr(self.optimizer),
                )

            # add history to tensorboard
            writer.add_scalars(
                'Loss',
                {
                    'Train_loss': avg_loss,
                    'Val_loss': avg_val_loss
                },
                epoch,
            )

            writer.add_scalars('Metric', {
                'Metric_train': metric_train,
                'Metric_val': metric_val
            }, epoch)

            if res == 2:
                print("Early Stopping")
                print(
                    f'global best max val_loss model score {self.early_stopping.best_score}'
                )
                break
            elif res == 1:
                print(f'save global val_loss model score {metric_val}')

        writer.close()

        self.model = self.early_stopping.load_best_weights()
        self.postprocessing.update_threshold(self.early_stopping.threshold)

        return True

    def predict(self, X_test):

        # evaluate the model
        self.model.eval()

        test_loader = torch.utils.data.DataLoader(
            X_test,
            batch_size=self.hparams['batch_size'],
            shuffle=False,
            num_workers=self.num_workers)  # ,collate_fn=train.my_collate

        test_preds = torch.Tensor([])
        test_val = torch.Tensor([])
        print('Start generation of predictions')
        with torch.no_grad():
            for i, (X_batch, y_batch) in enumerate(tqdm(test_loader)):
                X_batch = X_batch.float().to(self.device)

                pred, pred_decoder = self.model(X_batch)
                pred = pred**2

                X_batch = X_batch.float().cpu().detach()

                test_preds = torch.cat([test_preds, pred.cpu().detach()], 0)
                test_val = torch.cat([test_val, y_batch.cpu().detach()], 0)

        return test_val.numpy(), test_preds.numpy()

    def get_heatmap(self, X_test):

        # evaluate the model
        self.model.eval()

        test_loader = torch.utils.data.DataLoader(
            X_test,
            batch_size=self.hparams['batch_size'],
            shuffle=False,
            num_workers=self.num_workers)  # ,collate_fn=train.my_collate

        test_preds = torch.Tensor([])
        with torch.no_grad():
            for i, X_batch in enumerate(test_loader):
                X_batch = X_batch.float().to(self.device)

                pred = self.model.activatations(X_batch)
                pred = torch.sigmoid(pred)
                pred = pred**2

                X_batch = X_batch.float().cpu().detach()

                test_preds = torch.cat([test_preds, pred.cpu().detach()], 0)

        return test_preds.numpy()

    def model_save(self, model_path):
        torch.save(self.model.state_dict(), model_path)
        # self.model.module.state_dict(), PATH
        # torch.save(self.model, model_path)
        return True

    def model_load(self, model_path):
        self.model.load_state_dict(
            torch.load(model_path, map_location=self.device))
        return True

    def model_load_old(self, model_path):
        self.model = torch.load(model_path, map_location=self.device)
        return True

    def inference(self, X, y):
        preprocessing = Preprocessing(aug=False)
        X = preprocessing.run(X, y, label_process=False)
        X = X.reshape(1, -1, X.shape[1])

        self.model.eval()
        predictions, pred_decoder = self.model(torch.Tensor(X))
        predictions = predictions**2
        predictions = predictions.detach().numpy()
        print(np.round(predictions, 3))

        return predictions

    ################## Utils #####################

    def __get_lr(self, optimizer):
        for param_group in optimizer.param_groups:
            return param_group['lr']
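# Model.fit above scales the loss with torch.cuda.amp.GradScaler. The full
# mixed-precision recipe also runs the forward pass under autocast and unscales
# before clipping; a minimal sketch of that loop with a toy model (the shapes and
# loss are placeholders, and AMP is simply disabled on CPU).
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
use_amp = device.type == 'cuda'

amp_model = nn.Linear(64, 27).to(device)
amp_optimizer = torch.optim.Adam(amp_model.parameters(), lr=1e-3)
amp_scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
amp_loss_fn = nn.BCEWithLogitsLoss()

x = torch.randn(8, 64, device=device)
y = torch.randint(0, 2, (8, 27), device=device).float()

amp_optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=use_amp):
    loss = amp_loss_fn(amp_model(x), y)  # forward + loss in mixed precision
amp_scaler.scale(loss).backward()        # backward on the scaled loss
amp_scaler.unscale_(amp_optimizer)       # unscale so clipping sees true gradients
torch.nn.utils.clip_grad_norm_(amp_model.parameters(), 1.0)
amp_scaler.step(amp_optimizer)
amp_scaler.update()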
def train_velonet(args, dataset): training_data, training_label, valid_data, valid_label = create_dataset_Relative_Kinematic( args, dataset, windows_size) valid_data.requires_grad = False valid_label.requires_grad = False training_data = quick_std(training_data) valid_data = quick_std(valid_data) #training_label = quick_norm(training_label) #valid_label = quick_norm(valid_label) early_stopping = EarlyStopping(patience=35, verbose=True) device = torch.device( 'cuda:0' if torch.cuda.is_available() and not args.cpu else 'cpu') #device = torch.device('cpu') network = get_Relative_Kinematic().to(device) print('Number of train samples: {}'.format(training_data.shape[0])) print('Number of val samples: {}'.format(valid_data.shape[0])) total_params = network.get_num_params() print('Total number of parameters: ', total_params) optimizer = torch.optim.Adam(network.parameters(), lr) #If after 25 epochs the validation loss did not improve we reduce the learning rate to converge towards optimal solution scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=25, verbose=True, eps=1e-12) if load_model: dictionary = torch.load(model_path_VeloNet) network.load_state_dict(dictionary.get('model_state_dict')) optimizer.load_state_dict(dictionary.get('optimizer_state_dict')) start_time = time.time() avg_train_losses = [] avg_valid_losses = [] writer = SummaryWriter() for epoch in range(1, epoch_len + 1): train_loss, valid_loss = train_loop_Relative_Kinematic( args, dataset, network, device, optimizer, scheduler, training_data, valid_data, training_label, valid_label, batch_size, writer) avg_train_losses.append(train_loss) avg_valid_losses.append(valid_loss) writer.add_scalars(f'Train_loss/Validation_loss', { 'Train_loss': train_loss, 'Valid_loss': valid_loss, }, epoch) save = early_stopping(avg_valid_losses[-1], network) if save: #Save the model if the validation loss improved torch.save( { 'model_state_dict': network.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'epoch': epoch_len }, model_path_VeloNet) print('SAVE') if early_stopping.early_stop: #Otherwise if after patience = 35 epoch, the validation loss did not improve we stop the training print('Early stopping') break print("Epoch number : " + str(epoch) + "/" + str(epoch_len)) print('\tTrain_Loss: {:.9f}'.format(avg_train_losses[-1])) print('\tValid_Loss: {:.9f}'.format(avg_valid_losses[-1])) print("Amount of time spent for 1 epoch: {}s\n".format( int(time.time() - start_time))) start_time = time.time() writer.close()
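# train_velonet above checkpoints both the network and the optimizer state so a run
# can resume exactly where it stopped. A minimal save/restore sketch of that
# pattern; the path and the toy module are placeholders.
import torch
import torch.nn as nn

ckpt_net = nn.Linear(6, 3)
ckpt_opt = torch.optim.Adam(ckpt_net.parameters(), lr=1e-3)
ckpt_path = 'checkpoint_velonet.pt'

torch.save({
    'model_state_dict': ckpt_net.state_dict(),
    'optimizer_state_dict': ckpt_opt.state_dict(),
    'epoch': 0,
}, ckpt_path)

# ... later, to resume training:
state = torch.load(ckpt_path)
ckpt_net.load_state_dict(state['model_state_dict'])
ckpt_opt.load_state_dict(state['optimizer_state_dict'])
start_epoch = state['epoch'] + 1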