if args.use_weights: softmax_proteins, *_ = next(iter(train_loader)) softmax_proteins = softmax_proteins[:4] else: softmax_proteins = next(iter(train_loader))[:4] softmax_name = args.results_dir / Path("softmax.png") if args.anneal_learning_rates and args.plot_learning_rates: learning_rates = [] learning_rates_name = args.results_dir / Path("learning_rates.png") try: for epoch in range(1, args.epochs + 1): start_time = time.time() train_loss, train_metrics = train_epoch( epoch, model, optimizer, train_loader, args.log_interval, args.clip_grad_norm, args.clip_grad_value, scheduler) if args.val_ratio > 0: val_loss, val_metrics = validate(epoch, model, val_loader) loss_str = "Validation" loss_value_str = f"{val_loss:.5f}" val_str = f"{loss_str} loss: {loss_value_str} " improved = val_loss < best_loss else: loss_str = "Training" loss_value_str = f"{train_loss:.5f}" val_str = "" improved = train_loss < best_loss
def main_worker(index, opt):
    """Per-process training entry point (STRG variant).

    Seeds all RNGs, optionally joins an NCCL process group, builds the
    backbone (wrapped in STRG plus an RPN when ``opt.strg`` is set), then
    runs the train/validate loop and, finally, optional inference.

    Args:
        index: worker index from the spawner; >= 0 selects this process's
            CUDA device when running on GPU.
        opt: options namespace, mutated in place (``device``, ``dist_rank``,
            ``batch_size``, ``n_threads``, ``is_master_node``,
            ``begin_epoch``).
    """
    # Seed every RNG source with the same manual seed for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Pin this worker to its own GPU.
        # opt.device = torch.device(f'cuda:{index}')
        opt.device = torch.device('cuda:{}'.format(index))

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the global batch size / worker threads across the GPUs of
        # this node (thread count rounded up).
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    # Rank 0 (or the only process when not distributed) does logging/saving.
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        # SyncBatchNorm requires DDP; plain DataParallel is unsupported.
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes, opt.strg)

    if opt.strg:
        # Wrap the backbone with the spatio-temporal relation graph head and
        # build a region-proposal network for ROI extraction.
        model = STRG(model, nclass=opt.n_classes, nrois=opt.nrois)
        rpn = RPN(nrois=opt.nrois)
        rpn = make_data_parallel(rpn, opt.distributed, opt.device)
    else:
        rpn = None

    if opt.resume_path is not None:
        # Restore model weights before parallel wrapping so key names match.
        model = resume_model(opt.resume_path, opt.arch, model)

    model = make_data_parallel(model, opt.distributed, opt.device)

    # Fine-tuning parameter selection is disabled in this variant; the whole
    # model is optimized.
    # if opt.pretrain_path:
    #     parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    # else:
    parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            # Restore epoch counter and optimizer/scheduler state.
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        # This variant uses tensorboardX instead of torch's built-in writer.
        # from torch.utils.tensorboard import SummaryWriter
        from tensorboardX import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Drop stale events past the resume point.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.wandb:
        name = str(opt.result_path)
        wandb.init(
            project='strg',
            name=name,
            config=opt,
            dir=name,
            # resume=str(opt.resume_path) != '',
            sync_tensorboard=True)

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards per epoch so every rank sees new data.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed,
                        rpn=rpn, det_interval=opt.det_interval,
                        nrois=opt.nrois)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed, rpn=rpn,
                                      det_interval=opt.det_interval,
                                      nrois=opt.nrois)

        # Step the LR schedule once per epoch; 'plateau' needs the val loss.
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Per-process training entry point (baseline variant).

    Seeds RNGs, optionally joins an NCCL process group, builds/loads the
    model, then runs the train/validate loop and optional inference.

    Args:
        index: worker index; >= 0 selects this process's CUDA device.
        opt: options namespace, mutated in place (``device``, ``dist_rank``,
            ``batch_size``, ``n_threads``, ``is_master_node``,
            ``begin_epoch``).
    """
    # Seed every RNG source for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        # Pin this worker to its own GPU.
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split batch size / worker threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)

    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        # Fine-tune only from ft_begin_module onward when starting from a
        # pretrained checkpoint.
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.resume_path is not None:
        if not opt.no_train:
            # Full resume: epoch counter, weights, optimizer and scheduler.
            opt.begin_epoch, model, optimizer, scheduler = resume(
                opt.resume_path, opt.arch, opt.begin_epoch, model, optimizer,
                scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
        else:
            # Eval-only resume: weights and epoch counter only.
            opt.begin_epoch, model, _, _ = resume(opt.resume_path, opt.arch,
                                                  opt.begin_epoch, model)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Drop stale events past the resume point.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards per epoch across ranks.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # Step the LR schedule once per epoch; 'plateau' needs the val loss.
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Debug variant of the training entry point.

    Builds the model, prints its classifier-head dimensions before/after
    resume, renders the computation graph of a single forward pass to
    ``/apollo/data/model.png`` via ``make_dot``, then RETURNS EARLY.
    Everything after the ``return`` below is the normal training pipeline
    left in place but currently unreachable.
    """
    # Seed every RNG source for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    # Debug: dump the classifier head shape right after construction.
    print('after generating model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        # Debug: confirm the head shape survived the checkpoint load.
        print('after resume model:', model.fc.in_features, ':',
              model.fc.out_features)
        print('feature weights:', model.fc.weight.shape, ':',
              model.fc.bias.shape)
    # summary(model, input_size=(3, 112, 112))
    # if opt.pretrain_path:
    #     model = load_pretrained_model(model, opt.pretrain_path, opt.model,
    #                                   opt.n_finetune_classes)
    print('after pretrained model:', model.fc.in_features, ':',
          model.fc.out_features)
    print('feature weights:', model.fc.weight.shape, ':', model.fc.bias.shape)
    print(torch_summarize(model))
    # parameters = model.parameters()
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data)
    # summary(model, (3, 112, 112))
    # return
    # print('model parameters shape', parameters.shape)

    (train_loader, train_sampler, train_logger, train_batch_logger,
     optimizer, scheduler) = get_train_utils(opt, model.parameters())

    # Run exactly one batch to visualize the computation graph, then stop.
    for i, (inputs, targets) in enumerate(train_loader):
        print('input shape:', inputs.shape)
        print('targets shape:', targets.shape)
        outputs = model(inputs)
        print("output shape", outputs.shape)
        model_arch = make_dot(outputs, params=dict(model.named_parameters()))
        print(model_arch)
        model_arch.render("/apollo/data/model.png", format="png")
        # Source(model_arch).render('/apollo/data/model.png')
        # print("generating /apollo/data/model.png")
        break
    # make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")
    return

    # --- NOTE(review): dead code below — unreachable because of the early
    # --- return above. Kept as the original full training pipeline.
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()
    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main():
    """Fine-tune a BERT classifier on the 'thedeep' dataset.

    Reads the CSV, tokenizes with bert-base-uncased, trains for ``Epochs``
    epochs, and checkpoints the best model by validation accuracy to
    ``best_model.bin``.
    """
    # Read data.
    # NOTE(review): header=1 treats the SECOND row as the header, so the
    # first data row is discarded — confirm the file really has a two-line
    # preamble, otherwise header=0 or header=None is likely intended.
    df = pd.read_csv("../data/thedeep.data.txt",
                     sep=",",
                     header=1,
                     names=['sentence_id', 'text', 'label'])

    # Load the BERT tokenizer.
    print('Loading BERT Tokenizer....')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Use model selection library to split data: training data and
    # validating data (70/30 split, fixed seed).
    training_inputs, validation_inputs = train_test_split(df,
                                                          random_state=2018,
                                                          test_size=0.3)

    # Hyper-parameters.
    batch_size = 32
    max_length = 80   # max token length per example
    Epochs = 1

    train_dataset = dataset.BertClassificationDataset(
        text=training_inputs.text.values,
        label=training_inputs.label.values,
        tokenizer=tokenizer,
        max_length=max_length)
    valid_dataset = dataset.BertClassificationDataset(
        text=validation_inputs.text.values,
        label=validation_inputs.label.values,
        tokenizer=tokenizer,
        max_length=max_length)

    # Create a dataloader for training and validation data.
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)
    valid_dataloader = DataLoader(valid_dataset,
                                  sampler=RandomSampler(valid_dataset),
                                  batch_size=batch_size)

    # Create a instance of bert model, optimizer and scheduler.
    device = torch.device("cuda")
    print('Loading BERT Model....')
    model = BertTextClassification('bert-base-uncased')
    model = model.to(device)

    # Exclude bias and LayerNorm parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.1
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]

    # NOTE(review): this sizes the LR schedule by number of SAMPLES, not
    # number of optimizer steps (len(train_dataloader) * Epochs). If the
    # scheduler is stepped once per batch inside training.train_epoch, the
    # decay will run ~batch_size times slower than intended — verify.
    num_train_steps = int(len(train_dataset) * Epochs)
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    loss_fn = nn.CrossEntropyLoss().to(device)

    # Training and evaluating the model on the validation dataset.
    training_stats = []
    best_accuracy = 0
    for epoch in range(Epochs):
        print(f'Epoch {epoch + 1}/{Epochs}')
        print('-' * 10)
        train_accuracy, train_loss = training.train_epoch(
            model, train_dataloader, loss_fn, optimizer, device, scheduler,
            training_inputs)
        print(f'Train loss {train_loss} accuracy {train_accuracy}')
        val_accuracy, val_loss = training.eval_model(model, valid_dataloader,
                                                     loss_fn, device,
                                                     validation_inputs)
        print(f'Val loss {val_loss} accuracy {val_accuracy}')
        print()
        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch + 1,
            'Training Loss': train_loss,
            'Training Accuracy': train_accuracy,
            'Valid. Loss': val_loss,
            'Valid. Accur.': val_accuracy,
        })
        # Keep only the best-so-far checkpoint by validation accuracy.
        if val_accuracy > best_accuracy:
            torch.save(model.state_dict(), 'best_model.bin')
            best_accuracy = val_accuracy
def main_worker(index, opt):
    """Per-process training entry point (dropout / label-smoothing /
    LR-finder variant).

    Seeds RNGs, optionally joins an NCCL process group, builds the model
    (optionally replacing the FC head with a dropout variant), then either
    runs an LR range test (``opt.lr_finder``) and returns, or runs the
    train/validate loop and optional inference.

    Args:
        index: worker index; >= 0 selects this process's CUDA device.
        opt: options namespace, mutated in place (``device``, ``dist_rank``,
            ``batch_size``, ``n_threads``, ``is_master_node``,
            ``begin_epoch``).
    """
    # Seed every RNG source for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split batch size / worker threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.dropout:
        # Swap in a dropout-augmented classifier head; size it for the
        # fine-tune class count when starting from a pretrained checkpoint.
        n_classes = opt.n_classes
        if opt.pretrain_path is not None:
            n_classes = opt.n_finetune_classes
        model = replace_fc_layer(model=model,
                                 dropout_factor=opt.dropout_factor,
                                 n_classes=n_classes)

    if opt.resume_path is not None:
        # Restore weights before parallel wrapping so key names match.
        model = resume_model(opt.resume_path, opt.arch, model)

    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    if opt.labelsmoothing:
        criterion = LabelSmoothingCrossEntropy().to(opt.device)
    else:
        criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Drop stale events past the resume point.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    if opt.lr_finder and not opt.no_train and not opt.no_val:
        # LR range test (Leslie Smith): sweep the LR, dump history to JSON,
        # restore the model/optimizer state, and exit without training.
        print(
            "Performing Learning Rate Search\nWith Leslie Smith's approach...")
        lr_finder = LRFinder(model, optimizer, criterion, device=opt.device)
        lr_finder.range_test(train_loader,
                             val_loader=val_loader,
                             start_lr=opt.learning_rate,
                             end_lr=opt.lrf_end_lr,
                             num_iter=opt.lrf_num_it,
                             step_mode=opt.lrf_mode)
        lr_finder.plot(log_lr=False)

        with (opt.result_path / 'lr_search.json').open('w') as results_file:
            json.dump(lr_finder.history, results_file, default=json_serial)

        lr_finder.reset()
        return

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards per epoch across ranks.
                train_sampler.set_epoch(i)
            # current_lr = get_lr(optimizer)
            # NOTE: this variant passes the scheduler into train_epoch so it
            # can be stepped per-iteration for per-batch schedules.
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, train_logger, train_batch_logger,
                        scheduler, opt.lr_scheduler, tb_writer,
                        opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        # Per-epoch scheduler step; 'plateau' needs the validation loss.
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)
        elif not opt.no_train and opt.lr_scheduler == 'cosineannealing':
            scheduler.step()

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main_worker(index, opt):
    """Per-process training entry point.

    Seeds RNGs, optionally joins an NCCL process group, builds the model,
    then runs the train/validate loop.

    Args:
        index: worker index; >= 0 selects this process's CUDA device.
        opt: options namespace, mutated in place (``device``, ``dist_rank``,
            ``batch_size``, ``is_master_node``).
    """
    # Seed every RNG source for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        # opt.n_threads = int(
        #     (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    # NOTE: 'genarate_model' spelling matches the helper defined elsewhere
    # in this project — do not "fix" without renaming the helper too.
    model = genarate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    if opt.distributed:
        model = make_data_parallel(model, opt.device)
    else:
        model.to(opt.device)
        # model = nn.DataParallel(model).cuda()
    print('Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    if opt.is_master_node:
        print(model)

    parameters = model.parameters()
    criterion = CrossEntropyLoss().to(opt.device)

    (train_loader, train_sampler, train_logger, train_batch_logger,
     optimizer, scheduler) = get_train_utils(opt, parameters)
    val_loader, val_logger = get_val_utils(opt)

    # BUG FIX: was `if not opt.tensorboard and ...`, which created the
    # writer only when TensorBoard was DISABLED and left tb_writer=None
    # when it was enabled. The condition is now the intended one.
    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Drop stale events past the resume point.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    print('数据加载完毕')  # "data loading finished"

    # BUG FIX: prev_val_loss must exist even when validation is skipped,
    # otherwise `scheduler.step(prev_val_loss)` below raised NameError for
    # the 'plateau' scheduler under --no_val.
    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards per epoch across ranks.
                train_sampler.set_epoch(i)
                # train_sampler2.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, opt.is_master_node, tb_writer,
                        opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger,
                                      opt.is_master_node, tb_writer,
                                      opt.distributed)

        # Per-epoch scheduler step; 'plateau' needs the validation loss.
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)
def main(opt):
    """Train/validate a model under PaddlePaddle's dygraph mode.

    Seeds RNGs, builds (and optionally restores) the model, then runs the
    train/validate loop, checkpointing on a fixed interval and whenever the
    validation accuracy beats ``best_acc``; optionally runs inference.

    Args:
        opt: options namespace (``begin_epoch`` may be mutated on resume).
    """
    place = fluid.CPUPlace() if opt.no_cuda else fluid.CUDAPlace(0)
    with fluid.dygraph.guard(place):
        print(place)
        # Seed every RNG source for reproducibility.
        random.seed(opt.manual_seed)
        np.random.seed(opt.manual_seed)
        prog = fluid.default_main_program()
        prog.global_seed(opt.manual_seed)
        os.environ['PYTHONHASHSEED'] = str(opt.manual_seed)

        model = generate_model(opt)
        if opt.pretrain_path:
            model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                          opt.n_finetune_classes)
        if opt.resume_path is not None:
            model = resume_model(opt.resume_path, model)
        if opt.pretrain_path:
            # Fine-tune only from ft_begin_module onward.
            parameters = get_fine_tuning_parameters(model,
                                                    opt.ft_begin_module)
        else:
            parameters = model.parameters()

        if not opt.no_train:
            (train_loader, train_logger, train_batch_logger, optimizer,
             scheduler) = get_train_utils(opt, parameters)
            if opt.resume_path is not None:
                opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                    opt.resume_path, optimizer, scheduler)
                if opt.overwrite_milestones:
                    scheduler.milestones = opt.multistep_milestones
        if not opt.no_val:
            val_loader, val_logger = get_val_utils(opt)

        # Only checkpoints beating this accuracy are saved as "best".
        best_acc = 0.88
        # BUG FIX: prev_val_loss is only assigned when validation runs;
        # initialize it so --no_val does not raise NameError below.
        prev_val_loss = None
        for epoch in range(opt.begin_epoch, opt.n_epochs + 1):
            if not opt.no_train:
                train_epoch(epoch, train_loader, model, optimizer, scheduler,
                            train_logger, train_batch_logger)

                if epoch % opt.checkpoint == 0:
                    save_file_path = str(
                        opt.result_path) + 'save_{}_{}_{}'.format(
                            epoch, opt.train_crop, opt.batch_size)
                    save_checkpoint(save_file_path, model, optimizer)

            if not opt.no_val:
                prev_val_loss, val_acc = val_epoch(epoch, val_loader, model,
                                                   val_logger)

            # Per-epoch LR schedule update. Paddle's MultiStepDecay advances
            # via .epoch(); ReduceLROnPlateau takes the validation loss.
            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.epoch()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                scheduler.step(prev_val_loss)

            if not opt.no_val:
                if val_acc > best_acc:
                    best_acc = val_acc
                    save_file_path = str(
                        opt.result_path) + 'save_{}_{}_best_val_acc'.format(
                            epoch, opt.train_crop)
                    save_checkpoint(save_file_path, model, optimizer)

            if not opt.no_train:
                current_lr = optimizer.current_step_lr()
                # BUG FIX: previously this print ran even under --no_val,
                # where prev_val_loss was undefined (NameError). Only print
                # when a validation loss tensor exists.
                if prev_val_loss is not None:
                    print("current val_loss is %s, current lr is %s" %
                          (prev_val_loss.numpy()[0], current_lr))

        if opt.inference:
            inference_loader, inference_class_names = get_inference_utils(opt)
            inference_result_path = opt.result_path / '{}_{}.json'.format(
                opt.inference_subset, opt.train_crop)

            inference.inference(inference_loader, model,
                                inference_result_path, inference_class_names,
                                opt.inference_no_average, opt.output_topk)
def main():
    """Fine-tune a BERT sentiment classifier on the reviews dataset.

    Downloads the dataset if missing, trains for ``EPOCHS`` epochs while
    tracking history, saves the best checkpoint by validation accuracy,
    reports final metrics, and runs one sample prediction.
    """
    # Fetch the dataset on first run.
    if not os.path.exists(DATASET_PATH):
        download_dataset()
    reviews = pd.read_csv(DATASET_PATH)

    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    # Shift 1..5 star ratings to 0-based class labels.
    reviews['overall'] -= 1

    # Stratified 75/25 split so class proportions are preserved.
    train_df, test_df = train_test_split(reviews,
                                         test_size=0.25,
                                         random_state=RANDOM_SEED,
                                         stratify=reviews[['overall']])

    train_loader = create_data_loader(train_df, tokenizer, TOKEN_MAX_LEN,
                                      BATCH_SIZE)
    test_loader = create_data_loader(test_df, tokenizer, TOKEN_MAX_LEN,
                                     BATCH_SIZE)

    # Model, optimizer and linear-warmup LR schedule.
    model = SentimentClassifier(len(class_names), PRE_TRAINED_MODEL_NAME)
    model = model.to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    # Balanced class weights to compensate for label imbalance.
    weight_values = compute_class_weight(classes=[0, 1, 2, 3, 4],
                                         y=train_df['overall'],
                                         class_weight='balanced')
    weight_tensor = torch.FloatTensor(weight_values).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=weight_tensor).to(device)

    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_loss = train_epoch(model, train_loader, loss_fn,
                                            optimizer, device, scheduler,
                                            len(train_df))
        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(model, test_loader, loss_fn, device,
                                       len(test_df))
        print(f'Val loss {val_loss} accuracy {val_acc}')
        print()

        # Accumulate per-epoch curves for plotting.
        for metric_name, metric_value in (('train_acc', train_acc),
                                          ('train_loss', train_loss),
                                          ('val_acc', val_acc),
                                          ('val_loss', val_loss)):
            history[metric_name].append(metric_value)

        # Keep only the best-so-far checkpoint by validation accuracy.
        if val_acc > best_accuracy:
            torch.save(model.state_dict(), 'best_model_state.bin')
            best_accuracy = val_acc

    plot_history(history)

    # Final evaluation and detailed metrics on the held-out split.
    test_acc, _ = eval_model(model, test_loader, loss_fn, device,
                             len(test_df))
    y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
        model, test_loader, device)

    os.makedirs("model", exist_ok=True)
    torch.save(model.state_dict(), "model/model.pt")

    show_metrics(y_pred, y_pred_probs, y_test)

    # Smoke-test a single prediction end to end.
    preprocessing = Preprocessing()
    predict_single_review("I like it, perfect", preprocessing, tokenizer,
                          model, device)
def main_worker(index, opt):
    """Per-process training entry point (MLflow-instrumented variant).

    Seeds RNGs, optionally joins an NCCL process group, builds/loads the
    model (optionally from an MLflow model-registry version), then runs the
    train/validate loop — logging checkpoints and the validation loss to
    MLflow — and optional inference.

    Args:
        index: worker index; >= 0 selects this process's CUDA device.
        opt: options namespace, mutated in place (``device``, ``dist_rank``,
            ``batch_size``, ``n_threads``, ``is_master_node``,
            ``begin_epoch``).
    """
    # Seed every RNG source for reproducibility.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split batch size / worker threads across this node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
        print('resume model from ', opt.resume_path)
        print('model after resume:', model)

    # save model to current running id
    # mlflow.pytorch.log_model(model, "action_model")
    # model_path = mlflow.get_artifact_uri("action_model")
    # print('mlflow action model path: ', model_path)
    # model = mlflow.pytorch.load_model(model_path)

    # Tag the current MLflow run when both tag name and value are supplied.
    if opt.ml_tag_name != '' and opt.ml_tag_value != '':
        # mlflow.set_tag("test_tag", 'inference_test')
        mlflow.set_tag(opt.ml_tag_name, opt.ml_tag_value)

    # load from previous published model version
    # NOTE: this overrides any weights loaded above (pretrain/resume).
    if opt.ml_model_name != '' and opt.ml_model_version != '':
        # model_name = 'action_model'
        # model_version = '1'
        model_uri = "models:/{}/{}".format(opt.ml_model_name,
                                           opt.ml_model_version)
        model = mlflow.pytorch.load_model(model_uri)

    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # Drop stale events past the resume point.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Re-shuffle shards per epoch across ranks.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                # Also publish the checkpoint to MLflow when a name is set.
                if opt.ml_model_name != '':
                    mlflow.pytorch.log_model(model, opt.ml_model_name)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)
            mlflow.log_metric("loss", prev_val_loss)

        # Per-epoch scheduler step; 'plateau' needs the validation loss.
        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def main(args):
    """Train LeNet-5 on the selected dataset with TensorBoard logging.

    Builds the model from CLI-selected components, optionally resumes a
    previous training state, trains for ``args.nb_epochs`` epochs, and
    saves the state whenever the validation loss improves.

    Args:
        args: parsed CLI namespace (dataset, batch_size, workers, activation,
            pooling, drop_rate, gpu, init_weights, resume, criterion,
            optimizer and its hyper-parameters, output_folder,
            training_name, nb_epochs).

    Raises:
        NotImplementedError: for any unsupported dataset / activation /
            pooling / criterion / optimizer choice.
    """
    set_seed(SEED)

    train_transforms, test_transforms = get_transforms(args.dataset)
    print(f"Data transformations:\n{train_transforms}\n")

    # Get the dataloaders
    train_loader, test_loader = get_dataloaders(args.dataset, args.batch_size,
                                                args.workers,
                                                train_transforms,
                                                test_transforms)

    # Architecture components selected from CLI flags.
    if args.dataset == 'mnist':
        in_channels = 1
    else:
        raise NotImplementedError()
    if args.activation == 'relu':
        activation = nn.ReLU(inplace=True)
    else:
        raise NotImplementedError()
    if args.pooling == 'max':
        pooling = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
    else:
        raise NotImplementedError()
    drop_rate = args.drop_rate

    # Build model
    model = LeNet5(in_channels, activation, pooling, drop_rate)
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        model = model.cuda()

    # Weight normal initialization
    if args.init_weights:
        model.apply(normal_initialization)

    # Loss function & optimizer
    if args.criterion == 'ce':
        criterion = nn.CrossEntropyLoss()
    else:
        raise NotImplementedError()
    if args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay,
                              nesterov=args.nesterov)
    elif args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NotImplementedError()

    # BUG FIX: the resume block previously ran BEFORE `optimizer` was
    # created, so any `--resume` invocation raised NameError. It now runs
    # after the optimizer exists and before the scheduler wraps it, so the
    # scheduler observes the restored optimizer state.
    start_epoch = 0
    if args.resume is not None:
        model, optimizer, start_epoch = load_training_state(
            model, optimizer, args.resume)

    # Halve the LR whenever validation loss stops improving.
    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=0.5,
                                  patience=0,
                                  threshold=1e-2,
                                  verbose=True)

    # Output folder for checkpoints; TensorBoard logs are reset per run.
    output_folder = os.path.join(args.output_folder, args.training_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    log_path = os.path.join(args.output_folder, 'logs', args.training_name)
    if os.path.exists(log_path):
        rmtree(log_path)
    logger = SummaryWriter(log_path)

    # Train
    best_loss = math.inf
    mb = master_bar(range(args.nb_epochs))
    for epoch_idx in mb:
        # Training
        train_epoch(model, train_loader, optimizer, criterion, mb,
                    tb_logger=logger, epoch=start_epoch + epoch_idx)

        # Evaluation
        val_loss, accuracy = evaluate(model, test_loader, criterion)

        mb.first_bar.comment = f"Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs}"
        mb.write(
            f'Epoch {start_epoch+epoch_idx+1}/{start_epoch+args.nb_epochs} - Validation loss: {val_loss:.4} (Acc@1: {accuracy:.2%})'
        )

        # State saving — keep only the best model by validation loss.
        if val_loss < best_loss:
            print(
                f"Validation loss decreased {best_loss:.4} --> {val_loss:.4}: saving state..."
            )
            best_loss = val_loss
            torch.save(
                dict(epoch=start_epoch + epoch_idx,
                     model_state_dict=model.state_dict(),
                     optimizer_state_dict=optimizer.state_dict(),
                     val_loss=val_loss),
                os.path.join(output_folder, "training_state.pth"))

        if logger is not None:
            current_iter = (start_epoch + epoch_idx + 1) * len(train_loader)
            logger.add_scalar(f"Validation loss", val_loss, current_iter)
            logger.add_scalar(f"Error rate", 1 - accuracy, current_iter)
            logger.flush()

        scheduler.step(val_loss)
def run(*configs, group=None):
    """Fine-tune a transformers sequence-classification model for ASAG.

    Loads config from one or more sources, optionally logs to wandb,
    trains for up to ``config.max_epochs`` epochs with early stopping on
    weighted F1, and frees GPU memory on normal exit or Ctrl-C.

    configs: configuration sources merged by ``configuration.load``.
    group:   optional extra tag appended to the wandb group name.
    """
    config = configuration.load(*configs)
    # Build the wandb group name: "<data_source>[-<config.group>][-<group>]",
    # prefixed with "scratch-" when training from randomly initialized weights.
    if config.group:
        config.group = config.data_source + '-' + config.group
    else:
        config.group = config.data_source
    if group:
        config.group = config.group + "-" + str(group)
    if config.from_scratch:
        config.group = 'scratch-' + config.group
        config.name = 'scratch-' + config.name
    if config.log:
        wandb.init(project='explainable-asag',
                   group=config.group,
                   name=config.name,
                   config=config)
        # From here on read hyperparameters through wandb (sweep support).
        config = wandb.config
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        config.model_name, num_labels=config.num_labels)
    if config.token_types:
        # Some architectures keep a separate 'embedding_size' (e.g. ALBERT);
        # fall back to hidden_size when the attribute is absent.
        embedding_size = model.config.__dict__.get('embedding_size',
                                                   model.config.hidden_size)
        update_token_type_embeddings(model, embedding_size,
                                     model.config.initializer_range)
    if config.from_scratch:
        # Re-initialize: keep the architecture, drop the pretrained weights.
        model.init_weights()
    cuda = torch.cuda.is_available()
    if cuda:
        model.cuda()
    train_dataloader = dataset.dataloader(
        val_mode=False,
        data_file=config.train_data,
        data_source=config.data_source,
        vocab_file=config.model_name,
        num_labels=config.num_labels,
        train_percent=config.train_percent,
        batch_size=config.batch_size,
        drop_last=config.drop_last,
        num_workers=config.num_workers)
    val_dataloader = dataset.dataloader(
        val_mode=True,
        data_file=config.val_data,
        data_source=config.data_source,
        vocab_file=config.model_name,
        num_labels=config.num_labels,
        train_percent=config.val_percent,
        batch_size=config.batch_size,
        drop_last=config.drop_last,
        num_workers=config.num_workers)
    # Look the optimizer class up by name inside torch.optim.
    optimizer = torch.optim.__dict__[config.optimizer](
        model.parameters(), lr=config.learn_rate, **config.optimizer_kwargs)
    # Hack to get any scheduler we want. transformers.get_scheduler does not implement e.g. linear_with_warmup.
    get_scheduler = {
        'linear_with_warmup': transformers.get_linear_schedule_with_warmup,
        'cosine_with_warmup': transformers.get_cosine_schedule_with_warmup,
        'constant_with_warmup': transformers.get_constant_schedule_with_warmup,
        'cosine_with_hard_restarts_with_warmup':
        transformers.get_cosine_with_hard_restarts_schedule_with_warmup
    }
    lr_scheduler = get_scheduler[config.scheduler](optimizer,
                                                   *config.scheduler_args,
                                                   **config.scheduler_kwargs)
    best_f1 = 0.0
    patience = 0
    epoch = 0
    log_line = ''
    try:
        #while lr_scheduler.last_epoch <= total_steps:
        while epoch < config.max_epochs:
            epoch += 1
            av_epoch_loss = training.train_epoch(
                train_dataloader,
                model,
                optimizer,
                lr_scheduler,
                config.num_labels,
                cuda,
                log=config.log,
                token_types=config.token_types)
            #tidy stuff up every epoch
            gc.collect()
            torch.cuda.empty_cache()
            metrics_weighted, metrics_macro = training.val_loop(
                model, val_dataloader, cuda, token_types=config.token_types)
            # Weighted and macro-averaged (precision, recall, f1, accuracy).
            p, r, f1, val_acc = metrics_weighted
            p_m, r_m, f1_m, val_acc_m = metrics_macro
            log_line = f'model: {config.model_name} | epoch: {epoch} | av_epoch_loss {av_epoch_loss:.5f} | f1: {f1:.5f} | accuracy: {val_acc:.5f} \n'
            print(log_line[:-1])
            if config.log:
                wandb.log({
                    'precision': p,
                    'recall': r,
                    'f1': f1,
                    'accuracy': val_acc,
                    'av_epoch_loss': av_epoch_loss
                })
                wandb.log({
                    'precision-macro': p_m,
                    'recall-macro': r_m,
                    'f1-macro': f1_m,
                    'accuracy-macro': val_acc_m
                })
            # Early stopping: track best weighted F1; save a checkpoint to
            # the wandb run dir whenever it improves, otherwise count down
            # the patience budget.
            if f1 > best_f1:
                if config.log:
                    this_model = os.path.join(wandb.run.dir, 'best_f1.pt')
                    print("saving to: ", this_model)
                    torch.save([model.state_dict(), config.__dict__],
                               this_model)
                    wandb.save('*.pt')
                best_f1 = f1
                patience = 0  #max((0, patience-1))
            elif config.max_patience:
                patience += 1
                if patience >= config.max_patience:
                    break
        # Move stuff off the gpu
        model.cpu()
        #This is for sure a kinda dumb way of doing it, but the least mentally taxing right now
        # (re-creating the optimizer drops its CUDA state tensors)
        optimizer = torch.optim.__dict__[config.optimizer](
            model.parameters(), lr=config.learn_rate)
        gc.collect()
        torch.cuda.empty_cache()
        #return model #Gives Error
    except KeyboardInterrupt:
        # On Ctrl-C: persist checkpoints (when logging) and free GPU memory.
        if config.log:
            wandb.save('*.pt')
        #Move stuff off the gpu
        model.cpu()
        optimizer = torch.optim.__dict__[config.optimizer](
            model.parameters(), lr=config.learn_rate)
        gc.collect()
        torch.cuda.empty_cache()
def main_worker(index, opt):
    """Training/validation entry point for one (possibly distributed) worker.

    index: process index; >= 0 selects the CUDA device when spawned per-GPU.
    opt:   namespace of all CLI options; mutated in place for this worker
           (device, per-GPU batch size, global rank).
    """
    # Seed every RNG so each worker run is reproducible.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    # Pin this worker to its own GPU when spawned per-GPU.
    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the global batch and loader threads across the node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # When fine-tuning from a pretrained net, only optimize the tail modules.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    #criterion = CrossEntropyLoss().to(opt.device)
    # ADDED for 231n: focal loss instead of plain cross-entropy.
    criterion = FocalLoss().to(opt.device)

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            # purge_step drops events logged after the resume point.
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    conf_mtx_dict = {}  # ADDED for CS231n: per-epoch confusion matrices
    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                # Reshuffle the distributed sampler each epoch.
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed,
                                      conf_mtx_dict)  # ADDED for CS231n

        # ADDED for 231n - uncomment if using cross entropy loss
        #if not opt.no_train and opt.lr_scheduler == 'multistep':
        #    scheduler.step()
        #elif not opt.no_train and opt.lr_scheduler == 'plateau':
        #    scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)
        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)

    # ADDED for CS231n: dump the collected confusion matrices.
    # BUGFIX: use a context manager so the CSV handle is flushed and closed
    # (the original leaked an open 'w+' handle); newline='' per the csv docs.
    with open("conf_mtxs.csv", "w+", newline="") as f:
        conf_mtx_file = csv.writer(f)
        for key, val in conf_mtx_dict.items():
            conf_mtx_file.writerow([key, val])
# NOTE(review): this is an interior fragment of a larger training routine —
# its enclosing function definition and the remainder of the epoch loop fall
# outside this chunk. Names such as `save`, `show`, `subset_labels`,
# `all_data` and the `val_*_losses` lists are presumably defined earlier in
# that routine — TODO confirm against the full file.

# Optionally plot the initial (epoch-0, untrained) model state.
# Note the precedence: the conditional expression selects the *whole first
# argument* — `(args.results_dir / Path(...)) if save else None`.
if args.visualize_interval != "never":
    plot_data(args.results_dir /
              Path(f"epoch_0_val_loss_inf.png") if save else None,
              args.figure_type,
              model,
              all_data,
              args.batch_size,
              show=show,
              only_subset_labels=subset_labels)

for epoch in range(1, args.epochs + 1):
    start_time = time.time()
    # One pass over the training set with optional gradient clipping and
    # optional random weighted sampling.
    train_loss, train_metrics = train_epoch(
        epoch=epoch,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        log_interval=args.log_interval,
        clip_grad_norm=args.clip_grad_norm,
        clip_grad_value=args.clip_grad_value,
        random_weighted_sampling=args.random_weighted_sampling)
    if args.val_ratio > 0:
        # Evaluate on the held-out split and record the loss components
        # (negative log-likelihood, KL divergence, parameter KL).
        val_loss, val_metrics = validate(epoch, model, val_loader)
        loss_str = "Validation"
        loss_value_str = f"{val_loss:.5f}"
        val_str = f"{loss_str} loss: {loss_value_str} "
        val_nll_losses.append(val_metrics["nll_loss"])
        val_kld_losses.append(val_metrics["kld_loss"])
        val_param_klds.append(val_metrics["param_kld"])
        val_total_losses.append(val_loss)
def main_worker(index, opt):
    """Train a video model, optionally distilled from audio and/or image
    teacher branches, for one (possibly distributed) worker process.

    index: process index; >= 0 selects the CUDA device for this worker.
    opt:   namespace of all CLI options; mutated in place (device,
           per-GPU batch size, global rank) for this worker.
    """
    # Seed every RNG so each worker run is reproducible.
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)
    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        # Global rank = node rank * GPUs-per-node + local index.
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # Split the global batch and loader threads across the node's GPUs.
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    # Training additionally needs intermediate features for distillation;
    # plain inference does not.
    if opt.inference:
        model = generate_model(opt)
    else:
        model = generate_model(opt, use_features=True)
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    # When fine-tuning from a pretrained net, only optimize the tail modules.
    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    #####################################################################################
    ### here add a classifier to predict videos and audios
    if opt.inference is False:
        ### define loss
        criterion = CrossEntropyLoss().to(opt.device)
        if opt.use_audio or opt.use_image:
            # Jensen-Shannon divergence between student and teacher outputs.
            criterion_jsd = JSDLoss(weight=0.5)
        #################################################################################
        if opt.use_audio:
            ### define loss (contrastive, audio-video)
            criterion_ct_av = NCELoss(temperature=0.5)
            ### audio teacher model
            feature_dim = 512 * 2
            # Prediction head sized to the fine-tune or base class count.
            if opt.pretrain_path is not None:
                joint_prediction_aud = generate_prediction(
                    feature_dim, opt.n_finetune_classes, normalization=True)
            else:
                joint_prediction_aud = generate_prediction(feature_dim,
                                                           opt.n_classes,
                                                           normalization=True)
            if opt.resume_path is not None:
                # Auxiliary head checkpoint sits next to the main one with
                # an '_audio' suffix (e.g. save_10.pth -> save_10_audio.pth).
                aux_checkpoint = Path(
                    os.path.join(
                        str(opt.resume_path.parent),
                        str(opt.resume_path.name[:-4] + '_audio.pth')))
                joint_prediction_aud = resume_model(aux_checkpoint, opt.arch,
                                                    joint_prediction_aud)
            joint_prediction_aud = make_data_parallel(joint_prediction_aud,
                                                      opt.distributed,
                                                      opt.device)
            aud_para = joint_prediction_aud.parameters()
            joint_prediction_aud.cuda()
        else:
            aud_para = None
        #################################################################################
        if opt.use_image:
            ### define loss (contrastive, image-video)
            criterion_ct_iv = NCELoss(temperature=0.1)
            ### image teacher model
            image_model = torchvision.models.resnet34(pretrained=True)
            # remove the fc layers (only use the image features)
            image_model = torch.nn.Sequential(
                *list(image_model.children())[:-1])
            image_model = make_data_parallel(image_model, opt.distributed,
                                             opt.device)
            feature_dim = 512 * 2
            if opt.pretrain_path is not None:
                joint_prediction_img = generate_prediction(
                    feature_dim, opt.n_finetune_classes, normalization=True)
            else:
                joint_prediction_img = generate_prediction(feature_dim,
                                                           opt.n_classes,
                                                           normalization=True)
            if opt.resume_path is not None:
                # Same naming scheme as the audio head, '_image' suffix.
                aux_checkpoint = Path(
                    os.path.join(
                        str(opt.resume_path.parent),
                        str(opt.resume_path.name[:-4] + '_image.pth')))
                joint_prediction_img = resume_model(aux_checkpoint, opt.arch,
                                                    joint_prediction_img)
            joint_prediction_img = make_data_parallel(joint_prediction_img,
                                                      opt.distributed,
                                                      opt.device)
            img_para = joint_prediction_img.parameters()
            joint_prediction_img.cuda()
        else:
            img_para = None
        #################################################################################
        # get_train_utils returns separate optimizers for the audio-video
        # and image-video heads (None when the branch is disabled).
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, optimizer_av, optimizer_iv, scheduler) = \
            get_train_utils(opt,
                            model_parameters=parameters,
                            av_parameters=aud_para,
                            iv_parameters=img_para)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
        if not opt.no_val:
            val_loader, val_logger = get_val_utils(opt)
        if opt.tensorboard and opt.is_master_node:
            from torch.utils.tensorboard import SummaryWriter
            if opt.begin_epoch == 1:
                tb_writer = SummaryWriter(log_dir=opt.result_path)
            else:
                # purge_step drops events logged after the resume point.
                tb_writer = SummaryWriter(log_dir=opt.result_path,
                                          purge_step=opt.begin_epoch)
        else:
            tb_writer = None

        prev_val_loss = None
        pre_val_acc = 0.0
        # Optional larger rendering size for the image teacher input.
        if opt.image_size > opt.sample_size:
            image_size = opt.image_size
        else:
            image_size = None

        for i in range(opt.begin_epoch, opt.n_epochs + 1):
            if not opt.no_train:
                if opt.distributed:
                    # Reshuffle the distributed sampler each epoch.
                    train_sampler.set_epoch(i)
                current_lr = get_lr(optimizer)
                # Dispatch on which teacher branches are active, inferred
                # from which auxiliary optimizers exist.
                if optimizer_av is None and optimizer_iv is None:
                    # Video-only training.
                    train_epoch(epoch=i,
                                data_loader=train_loader,
                                model=model,
                                criterion=criterion,
                                optimizer=optimizer,
                                device=opt.device,
                                current_lr=current_lr,
                                epoch_logger=train_logger,
                                batch_logger=train_batch_logger,
                                tb_writer=tb_writer,
                                distributed=opt.distributed)
                elif optimizer_av is not None and optimizer_iv is None:
                    # Audio teacher only.
                    train_a_epoch(epoch=i,
                                  data_loader=train_loader,
                                  model=model,
                                  joint_prediction_aud=joint_prediction_aud,
                                  criterion=criterion,
                                  criterion_jsd=criterion_jsd,
                                  criterion_ct_av=criterion_ct_av,
                                  optimizer=optimizer,
                                  optimizer_av=optimizer_av,
                                  device=opt.device,
                                  current_lr=current_lr,
                                  epoch_logger=train_logger,
                                  batch_logger=train_batch_logger,
                                  tb_writer=tb_writer,
                                  distributed=opt.distributed)
                elif optimizer_av is None and optimizer_iv is not None:
                    # Image teacher only.
                    train_i_epoch(epoch=i,
                                  data_loader=train_loader,
                                  model=model,
                                  image_model=image_model,
                                  joint_prediction_img=joint_prediction_img,
                                  criterion=criterion,
                                  criterion_jsd=criterion_jsd,
                                  criterion_ct_iv=criterion_ct_iv,
                                  optimizer=optimizer,
                                  optimizer_iv=optimizer_iv,
                                  device=opt.device,
                                  current_lr=current_lr,
                                  epoch_logger=train_logger,
                                  batch_logger=train_batch_logger,
                                  tb_writer=tb_writer,
                                  distributed=opt.distributed,
                                  image_size=image_size)
                else:
                    # Both audio and image teachers.
                    train_ai_epoch(epoch=i,
                                   data_loader=train_loader,
                                   model=model,
                                   image_model=image_model,
                                   joint_prediction_aud=joint_prediction_aud,
                                   joint_prediction_img=joint_prediction_img,
                                   criterion=criterion,
                                   criterion_jsd=criterion_jsd,
                                   criterion_ct_av=criterion_ct_av,
                                   criterion_ct_iv=criterion_ct_iv,
                                   optimizer=optimizer,
                                   optimizer_av=optimizer_av,
                                   optimizer_iv=optimizer_iv,
                                   device=opt.device,
                                   current_lr=current_lr,
                                   epoch_logger=train_logger,
                                   batch_logger=train_batch_logger,
                                   tb_writer=tb_writer,
                                   distributed=opt.distributed,
                                   image_size=image_size,
                                   loss_weight=opt.loss_weight)

                # Periodic checkpointing of the model and any active heads.
                if i % opt.checkpoint == 0 and opt.is_master_node:
                    save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                    save_checkpoint(save_file_path, i, opt.arch, model,
                                    optimizer, scheduler)
                    if opt.use_audio:
                        save_file_path = opt.result_path / 'save_{}_audio.pth'.format(
                            i)
                        save_checkpoint(save_file_path, i, opt.arch,
                                        joint_prediction_aud, optimizer,
                                        scheduler)
                    if opt.use_image:
                        save_file_path = opt.result_path / 'save_{}_image.pth'.format(
                            i)
                        save_checkpoint(save_file_path, i, opt.arch,
                                        joint_prediction_img, optimizer,
                                        scheduler)

            # Validate every val_freq epochs; keep a 'best accuracy' model.
            if not opt.no_val and i % opt.val_freq == 0:
                prev_val_loss, val_acc = val_epoch(i, val_loader, model,
                                                   criterion, opt.device,
                                                   val_logger, tb_writer,
                                                   opt.distributed)
                if pre_val_acc < val_acc:
                    pre_val_acc = val_acc
                    save_file_path = opt.result_path / 'save_model.pth'
                    save_checkpoint(save_file_path, i, opt.arch, model,
                                    optimizer, scheduler)

            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                # Plateau scheduler needs a validation loss to monitor.
                if prev_val_loss is not None:
                    scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)
        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)