def train(self, data, emb_dim, hidden_size, p_dropout, lr, l2_penalty, epochs):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    encoder, dual_encoder = self.create_model(emb_dim, hidden_size, p_dropout)
    encoder.to(device)
    dual_encoder.to(device)
    self.initialize_df(data)
    training_dataframe, validation_dataframe = train_test_split(
        self.df, test_size=0.05)
    optimizer = torch.optim.Adam(dual_encoder.parameters(),
                                 lr=lr,
                                 weight_decay=l2_penalty)
    loss_func = torch.nn.BCEWithLogitsLoss()
    loss_func.to(device)
    for epoch in range(epochs):
        # one training pass followed by a validation pass per epoch
        train(epoch, training_dataframe, self.embed_dict, dual_encoder,
              optimizer, loss_func, device)
        val(epoch, validation_dataframe, self.embed_dict, dual_encoder,
            optimizer, loss_func, device)
    self.dual_encoder = dual_encoder
def fit(self, x_train, y_train, x_val, y_val):
    x_train, x_val = self.preprocess_data(x_train, x_val)
    train((x_train, x_train, x_val, x_val),
          self.network,
          self.train_fn,
          self.val_fn,
          hlayer_fn=self.hlayer_fn,
          pred_fn=self.pred_fn,
          salmap_fn=self.salmap_fn,
          epochs=self.network_kwargs['epochs'],
          batchsize=self.network_kwargs['batch_size'],
          save_path=self.save_dir)
def main():
    global args, best_metric

    # specify dataset
    if args.dataset == 'ucf101':
        num_class = 101
    elif args.dataset == 'hmdb51':
        num_class = 51
    elif args.dataset == 'kinetics400':
        num_class = 400
    elif args.dataset == 'kinetics200':
        num_class = 200
    else:
        raise ValueError('Unknown dataset ' + args.dataset)
    data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "data/{}/access".format(args.dataset))

    # create model
    org_model = VideoModule(num_class=num_class,
                            base_model_name=args.arch,
                            dropout=args.dropout,
                            pretrained=args.pretrained,
                            pretrained_model=args.pretrained_model)
    num_params = 0
    for param in org_model.parameters():
        num_params += param.reshape((-1, 1)).shape[0]
    print("Model Size is {:.3f}M".format(num_params / 1000000))
    model = torch.nn.DataParallel(org_model).cuda()
    # model = org_model

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print(("=> loading checkpoint '{}'".format(args.resume)))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_metric = checkpoint['best_metric']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print(("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch'])))
        else:
            print(("=> no checkpoint found at '{}'".format(args.resume)))

    # Data loading code
    ## train data
    train_transform = torchvision.transforms.Compose([
        org_model.get_augmentation(),
        Stack(mode=args.mode),
        ToTorchFormatTensor(),
        GroupNormalize(),
    ])
    train_dataset = VideoDataSet(root_path=data_root,
                                 list_file=args.train_list,
                                 t_length=args.t_length,
                                 t_stride=args.t_stride,
                                 num_segments=args.num_segments,
                                 image_tmpl=args.image_tmpl,
                                 transform=train_transform,
                                 phase="Train")
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               drop_last=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    ## val data
    val_transform = torchvision.transforms.Compose([
        GroupScale(256),
        GroupCenterCrop(224),
        Stack(mode=args.mode),
        ToTorchFormatTensor(),
        GroupNormalize(),
    ])
    val_dataset = VideoDataSet(root_path=data_root,
                               list_file=args.val_list,
                               t_length=args.t_length,
                               t_stride=args.t_stride,
                               num_segments=args.num_segments,
                               image_tmpl=args.image_tmpl,
                               transform=val_transform,
                               phase="Val")
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.mode != "3D":
        cudnn.benchmark = True

    validate(val_loader, model, criterion, args.print_freq, args.start_epoch)
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, args.lr, epoch, args.lr_steps)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args.print_freq)

        # evaluate on validation set
        if (epoch + 1) % args.eval_freq == 0 or epoch == args.epochs - 1:
            metric = validate(val_loader, model, criterion, args.print_freq,
                              epoch + 1)

            # remember best prec@1 and save checkpoint
            is_best = metric > best_metric
            best_metric = max(metric, best_metric)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_metric': best_metric,
                    'optimizer': optimizer.state_dict(),
                }, is_best, epoch + 1, args.experiment_root)
def main():
    parser = init_parser()
    args = parser.parse_args()
    if not os.path.exists(args.result_dir):
        os.makedirs(args.result_dir)

    model = VarPred(in_channels=args.in_channels,
                    out_dim=args.out_dim,
                    input_mode=args.input_mode)
    model.cuda()
    model = nn.DataParallel(model)
    cudnn.benchmark = True

    # optimizer = torch.optim.SGD(model.module.parameters(),
    #                             lr=args.lr,
    #                             momentum=0.9)
    optimizer = torch.optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss().cuda()

    train_list, test_list = split_indices(args.data_dir, args.test_ratio)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    train_loader = torch.utils.data.DataLoader(PairDataset(
        args.data_dir,
        train_list,
        image_tmpl='pair_{:06d}.jpg',
        transform=transform),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               worker_init_fn=worker_init_fn)
    test_loader = torch.utils.data.DataLoader(PairDataset(
        args.data_dir,
        test_list,
        image_tmpl='pair_{:06d}.jpg',
        transform=transform),
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True,
                                              worker_init_fn=worker_init_fn)

    train_logger = os.path.join(args.result_dir, 'train.log')
    val_logger = os.path.join(args.result_dir, 'val.log')

    best_prec1 = 0
    for epoch in range(args.epochs):
        # adjust_learning_rate(optimizer, epoch, args.lr_steps)

        # train for one epoch
        train(train_loader,
              model,
              criterion,
              optimizer,
              epoch,
              train_logger=train_logger,
              args=args)
        with open(train_logger, 'a') as f:
            f.write('\n')
        save_checkpoint(state={
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        },
                        is_best=False,
                        result_dir=args.result_dir,
                        filename='ep_' + str(epoch) + '_checkpoint.pth.tar')

        # evaluate on validation set
        if (epoch + 1) % 1 == 0 or epoch == args.epochs - 1:
            prec1 = validate(test_loader,
                             model,
                             criterion,
                             val_logger=val_logger,
                             epoch=epoch)

            # remember best prec@1 and save checkpoint
            if prec1 > best_prec1:
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best=is_best,
                    result_dir=args.result_dir)
def main(config_path):
    path_to_config = Path(config_path)
    if not path_to_config.exists():
        raise ValueError('{} doesn\'t exist'.format(path_to_config))
    elif path_to_config.suffix.lower() != '.json' or not path_to_config.is_file():
        raise ValueError('{} is not a .json config file'.format(path_to_config))

    model_configs = load_json(path_to_config)
    path_to_data = model_configs['path_to_data']
    train_model = model_configs['train_model']
    workers_num = model_configs['workers_num']
    batch_size = model_configs['batch_size']
    img_size = model_configs['img_size']

    transforms = Compose([
        Resize(*img_size),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    transforms_val = Compose([
        Resize(*img_size),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    data_loaders = get_data_loaders(path_to_data, transforms, transforms_val,
                                    batch_size, workers_num)

    model_selector = get_models_selector()
    model_type = model_configs['model_type']
    model = model_selector[model_type](True)

    criterion = nn.BCEWithLogitsLoss()
    metric = ConfMatrix()
    device = 'cpu'
    if cuda.is_available() and model_configs['cuda_usage']:
        device = 'cuda'
    criterion.to(device)
    metric.to(device)
    if device != 'cpu' and cuda.device_count() > 1:
        model = nn.DataParallel(model).cuda()
    elif device != 'cpu':
        model = model.cuda()

    optimizer = optim.SGD(model.parameters(),
                          lr=model_configs['learning_rate'],
                          momentum=0.9)

    info_paths = model_configs['info_paths']
    writer = SummaryWriter(logdir=info_paths['log_dir'])

    total_epochs = model_configs['epochs']
    best_f1_score = 0.
    for epoch in range(total_epochs):
        model.train()
        train(model, data_loaders['train'], epoch, optimizer, criterion,
              metric, writer, device=device)
        model.eval()
        pr, recall, f1_score = val(model, criterion, metric,
                                   data_loaders['val'], epoch, writer,
                                   device=device)
        # keep only the checkpoint with the best validation F1 score
        if f1_score > best_f1_score:
            best_f1_score = f1_score
            path_to_save = os.path.join(model_configs['path_to_save_model'],
                                        'best_model_{}.pth'.format(epoch))
            save(model.state_dict(), path_to_save)
" training " criterion = torch.nn.L1Loss(reduction='mean') #MSELoss() sum optimizer = optim.Adam(model.parameters()) CLIP = 1 N_EPOCHS = 100 batch_size = 1 val_batch_size = 1 train_loss_list = [] valid_loss_list = [] for epoch in range(N_EPOCHS): start_time = time.time() train_loss = train(model, input_data, label_data, optimizer, criterion, CLIP, batch_size, device, epoch, N_EPOCHS) valid_loss = evaluate(model, input_data, label_data, criterion, val_batch_size, device, epoch, N_EPOCHS) end_time = time.time() epoch_mins, epoch_secs = epoch_time(start_time, end_time) print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s') print(f'\tTrain Loss: {train_loss:.6f}') print(f'\t Val. Loss: {valid_loss:.6f}') train_loss_list.append(train_loss) valid_loss_list.append(valid_loss) torch.save(model.state_dict(), 'tut2-model.pt')
assert os.path.isfile(args.resume), \
    "=> no checkpoint found at '{}'".format(args.resume)
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(args.resume,
                        map_location=lambda storage, loc: storage)
args.start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])
print("=> loaded checkpoint '{}' (epoch {})".format(args.resume,
                                                    checkpoint['epoch']))

if args.multigpu:
    model = nn.DataParallel(model)

iteration = (args.start_epoch - 1) * len(train_loader)

# Training & Validation
for epoch in range(1, args.epochs + 1):
    print("\nepoch {}".format(epoch))
    train(args, model, device, train_loader, optimizer, epoch, iteration)
    scheduler.step()
    iteration += len(train_loader)
    validation_loss, validation_accuracy = val(args, model, device,
                                               test_loader, iteration)

    if epoch % args.save_interval == 0:
        saved_weight = os.path.join(
            args.path2weight,
            "pt_" + args.dataset + "_ft_" + args.ft_dataset + "_" +
            args.usenet + "_epoch" + str(epoch) + ".pth")
        if args.multigpu:
            torch.save(model.module.cpu().state_dict(), saved_weight)
            model_state = model.module.cpu().state_dict()
        else:
            torch.save(model.cpu().state_dict(), saved_weight)
            model_state = model.cpu().state_dict()

        # Save checkpoint
        checkpoint = "{}/{}_{}_checkpoint.pth.tar".format(
            args.path2weight, args.dataset, args.usenet)
        torch.save({'epoch': epoch + 1,
print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage) args.start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) if not args.no_multigpu: model = nn.DataParallel(model) # FractalDB Pre-training iteration = (args.start_epoch - 1) * len(train_loader) for epoch in range(args.start_epoch, args.epochs + 1): train(args, model, device, train_loader, optimizer, criterion, epoch) scheduler.step() iteration += len(train_loader) if args.val: validation_loss = validate(args, model, device, val_loader, criterion, iteration) if epoch % args.save_interval == 0: if args.no_multigpu: model_state = model.cpu().state_dict() else: model_state = model.module.cpu().state_dict() saved_weight = "{}/{}_{}_epoch{}.pth.tar".format( args.path2weight, args.dataset, args.usenet, epoch) torch.save(model_state, saved_weight.replace('.tar', '')) checkpoint = "{}/{}_{}_checkpoint.pth.tar".format( args.path2weight, args.dataset, args.usenet)
def train_model(args):
    global best_metric, epoch_resume
    epoch_resume = 0
    best_metric = 0

    model = get_model(args)
    if args.distribute:
        model = model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank])
    else:
        model = torch.nn.DataParallel(model).cuda()

    writer = None
    if is_main_process():
        log_file = args.model_name + '_' + args.dataset + '_t_length_' + str(
            args.t_length) + '_t_stride_' + str(
                args.t_stride) + '_batch_' + str(
                    args.batch_size) + '_lr_' + str(
                        args.lr) + "_logfile_" + time.strftime(
                            "%d_%b_%Y_%H:%M:%S", time.localtime())
        log_file = os.path.join(args.log_dir, args.model_name, log_file)
        writer = SummaryWriter(log_dir=log_file)
        print(model)

    dataloaders, dataset_sizes, samplers = get_dataloader(args)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                weight_decay=1e-4,
                                momentum=0.9)
    criterion = nn.CrossEntropyLoss().cuda()
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.num_epochs)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        epoch_resume = checkpoint['epoch']
        best_metric = checkpoint['best_metric']
        model_dict = model.state_dict()
        idx = 0
        print(len(model_dict))
        print(len(checkpoint['state_dict']))
        # copy over only parameters whose name and shape match the model
        for k, v in checkpoint['state_dict'].items():
            k = k.replace('module.', '')
            if k in model_dict:
                if v.shape == model_dict[k].shape:
                    model_dict[k] = v.cuda()
                    idx += 1
        print(idx)
        print('upload parameter already')
        model.load_state_dict(model_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print(("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch'])))
        print(best_metric)
    elif is_main_process():
        print(("=> no checkpoint found at '{}'".format(args.resume)))

    for epoch in range(epoch_resume, args.num_epochs):
        if args.distribute:
            samplers['train'].set_epoch(epoch)
            samplers['val'].set_epoch(epoch)
        end = time.time()
        train(dataloaders['train'], model, criterion, optimizer, epoch,
              args.print_freq, writer, args=args)
        scheduler.step()
        if epoch >= 0:
            metric = validate(dataloaders['val'], model, criterion,
                              args.print_freq, epoch + 1, writer, args=args)
            if is_main_process():
                print(metric)
                # remember best prec@1 and save checkpoint
                is_best = metric > best_metric
                best_metric = max(metric, best_metric)
                print(best_metric)
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': model.state_dict(),
                        'best_metric': best_metric,
                        'optimizer': optimizer.state_dict(),
                    },
                    is_best,
                    str('current'),
                    args.check_dir,
                    args=args,
                    name=args.model_name)
        time_elapsed = time.time() - end
        if is_main_process():
            print(
                f"Training complete in {time_elapsed//3600}h {(time_elapsed%3600)//60}m {time_elapsed%60}s"
            )
def main(args, models_mngr):
    best_prec1 = 0

    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    model = torch.nn.DataParallel(models_mngr.get_model(args.arch))
    if args.cpu:
        model.cpu()
    else:
        model.cuda()

    if args.logs:
        if args.logs_dir == 'logs_dir':
            writer = SummaryWriter(f'log_dir/{args.arch}')
        else:
            writer = SummaryWriter(args.logs_dir)
    else:
        writer = None

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    trn_loader = train_loader(args.workers, args.batch_size, normalize)
    val_loader = validate_loader(args.workers, args.batch_size, normalize)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    if args.cpu:
        criterion = criterion.cpu()
    else:
        criterion = criterion.cuda()

    if args.half:
        model.half()
        criterion.half()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.evaluate:
        validate(val_loader, model, criterion, args.cpu, args.half,
                 args.print_freq)
        return

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args.lr)

        # train for one epoch
        train(trn_loader, model, criterion, optimizer, epoch, args.cpu,
              args.half, args.print_freq, writer)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, args.cpu, args.half,
                         args.print_freq)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)

        if epoch > 0 and epoch % args.save_every == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best, filename=os.path.join(args.save_dir,
                                              'checkpoint_{}.tar'.format(epoch)))

        save_checkpoint({
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, filename=os.path.join(args.save_dir, 'model.th'))