def training_step(self, batch, batch_idx):
    # mixup + label smoothing + circle loss (the mixup path is kept commented out)
    data, labels = batch
    # mixed_x, labels_a, labels_b, lam = mixup_data(data, labels, 0.2)
    # output = self(mixed_x)
    # loss = mixup_criterion(LabelSmoothingLoss(4, smoothing=0.1), output,
    #                        labels_a, labels_b, lam)
    output = self(data)
    loss = LabelSmoothingLoss(dc.num_classes, smoothing=0.1)(output, labels)
    return {'loss': loss}
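None of the snippets in this section actually define LabelSmoothingLoss. For reference, a minimal sketch of the usual implementation: cross-entropy against a softened target distribution that keeps 1 - smoothing on the true class and spreads the remaining mass uniformly over the other classes. The class name and constructor signature mirror the calls in these examples, but treat it as an illustration, not the exact class each project imports.

import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against a smoothed one-hot target (illustrative sketch)."""

    def __init__(self, classes, smoothing=0.1, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing   # mass kept on the true class
        self.smoothing = smoothing          # mass spread over the other classes
        self.classes = classes
        self.dim = dim

    def forward(self, pred, target):
        # pred: raw logits of shape (batch, classes); target: class indices
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.full_like(pred, self.smoothing / (self.classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))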
def main():
    fold = 0
    # 4.1 mkdirs
    if not os.path.exists(config.submit):
        os.mkdir(config.submit)
    if not os.path.exists(config.weights):
        os.mkdir(config.weights)
    if not os.path.exists(config.best_models):
        os.mkdir(config.best_models)
    if not os.path.exists(config.logs):
        os.mkdir(config.logs)
    if not os.path.exists(config.weights + config.model_name + os.sep + str(fold) + os.sep):
        os.makedirs(config.weights + config.model_name + os.sep + str(fold) + os.sep)
    if not os.path.exists(config.best_models + config.model_name + os.sep + str(fold) + os.sep):
        os.makedirs(config.best_models + config.model_name + os.sep + str(fold) + os.sep)

    # define model
    model_name = 'mynet'
    model = base_net()
    print(model)
    if cuda_avail:
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=config.lr,
                          momentum=0.9, weight_decay=config.weight_decay)
    # optimizer = Lookahead(optimizer)
    Loss = LabelSmoothingLoss(config.num_classes, smoothing=0.1).cuda()
    # lr_scheduler = CosineWarmupLr(optimizer, 320, 40,
    #                               base_lr=config.lr, warmup_epochs=1)
    # optimizer = optim.Adam(model.parameters(), lr=config.lr, amsgrad=True,
    #                        weight_decay=config.weight_decay)
    weights = torch.tensor([1., 5.])
    criterion = nn.CrossEntropyLoss(weight=weights).cuda()
    # criterion = CircleLoss(m=0.25, gamma=30)
    # criterion = FocalLoss().cuda()

    log = Logger()
    log.open(config.logs + "log_train.txt", mode="a")
    log.write("\n----------------------------------------------- [START %s] %s\n\n" %
              (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 51))

    # 4.3 some parameters for K-fold and restarting the model
    start_epoch = 0
    best_precision1 = 0
    best_precision_save = 0
    resume = False
    test_only = False
    eval_only = True  # True / False

    # 4.4 restart the training process
    if resume:
        checkpoint = torch.load(config.weights + config.model_name + '/' + str(fold) + "/checkpoint.pth.tar")
        start_epoch = checkpoint["epoch"]
        fold = checkpoint["fold"]
        best_precision1 = checkpoint["best_precision1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])

    # 4.5 get files and split for the K-fold dataset
    # 4.5.1 read files
    train_list = get_files(config.train_data, "train")
    test_files = get_files(config.test_data, "test")
    # note: this random split is overwritten by the K-fold split below
    train_data_list, val_data_list = train_test_split(train_list, test_size=0.6,
                                                      stratify=train_list["label"])
    print(val_data_list)

    # 4.5.2 split 5 folds
    split_fold = StratifiedKFold(n_splits=5)
    folds_indexes = list(split_fold.split(X=train_list["filename"],
                                          y=train_list["label"]))
    fold_index = folds_indexes[fold]

    train_im, train_label = [], []
    val_im, val_label = [], []
    print(train_list['filename'])
    print(len(fold_index[0]))
    for i in fold_index[0]:
        i = int(i)
        train_im.append(train_list["filename"][i])
        train_label.append(train_list["label"][i])
    train_data_list = pd.DataFrame({"filename": train_im, 'label': train_label})
    for i in fold_index[1]:
        val_im.append(train_list["filename"][i])
        val_label.append(train_list["label"][i])
    val_data_list = pd.DataFrame({"filename": val_im, 'label': val_label})
    # print(fold_index[0])

    # 4.5.3 using the fold index to split train data and val data
    # train_data_list = pd.concat([train_data_list["filename"][fold_index[0]],
    #                              train_data_list["label"][fold_index[0]]], axis=1)
    # val_data_list = pd.concat([train_data_list["filename"][fold_index[1]],
    #                            train_data_list["label"][fold_index[1]]], axis=1)

    # 4.5.4 load dataset
    train_dataloader = DataLoader(ChaojieDataset(train_data_list),
                                  batch_size=config.batch_size, shuffle=True,
                                  collate_fn=collate_fn, pin_memory=True,
                                  num_workers=4)
    # val_list for x-ray
    val_dataloader = DataLoader(ChaojieDataset(val_data_list, train=False),
                                batch_size=config.batch_size, shuffle=True,
                                collate_fn=collate_fn, pin_memory=False,
                                num_workers=4)
    test_dataloader = DataLoader(ChaojieDataset(test_files, test=True),
                                 batch_size=config.batch_size * 2,
                                 shuffle=False, pin_memory=False)
    # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "max", verbose=1, patience=3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    # 4.5.5.1 define metrics
    train_losses = AverageMeter()
    train_top1 = AverageMeter()
    train_top2 = AverageMeter()
    valid_loss = [np.inf, 0, 0]
    model.train()

    # logs
    log.write('** start training here! **\n')
    log.write('                          |------------ VALID -------------|----------- TRAIN -------------|------Accuracy------|------------|\n')
    log.write('lr       iter   epoch     | loss    top-1    top-2         | loss    top-1    top-2        | Current Best       | time       |\n')
    log.write('-------------------------------------------------------------------------------------------------------------------------------\n')

    # 4.5.5 train
    start = timer()
    if eval_only:
        # best_model = torch.load(config.best_models + config.model_name + os.sep + str(fold) + os.sep + 'model_best.pth.tar')
        best_model = torch.load(config.weights + 'x-ray/' + 'model_best.pth.tar')
        model.load_state_dict(best_model["state_dict"])
        # valid_loss = evaluate(val_dataloader, model, criterion, 0.5)
        # trues/prob and the ROC/PRC lists below are assumed to be
        # module-level collections populated by evaluate()
        df = pd.DataFrame({'true': trues, 'prob': prob})
        df.to_csv('gt.csv', index=False)
        # sweep the decision threshold from 0 to 1.0 in steps of 0.005
        for i in tqdm(range(201)):
            valid_loss = evaluate(val_dataloader, model, criterion, i * 0.005)
        df = pd.DataFrame({'Sensitivity': Sensitivity, 'Specificity': Specificity})
        df.to_csv('roc.csv', index=False)
        df = pd.DataFrame({'Precisions': Precisions, 'Recalls': Recalls})
        df.to_csv('prc.csv', index=False)
        return
    if test_only:
        best_model = torch.load(config.best_models + config.model_name + os.sep + str(fold) + os.sep + 'model_best.pth.tar')
        model.load_state_dict(best_model["state_dict"])
        test(test_dataloader, model, fold)

    total_loss = 10
    for epoch in range(start_epoch, config.epochs):
        scheduler.step(epoch)
        # train
        # global iter
        for iter, (input, target) in enumerate(train_dataloader):
            # 4.5.5 switch back to train mode to continue the training process
            model.train()
            input = Variable(input).cuda()
            target = Variable(torch.from_numpy(np.array(target)).long()).cuda()
            # target = Variable(target).cuda()
            output = model(input)
            loss = criterion(output, target)
            # loss = Loss(output, target)

            precision1_train, precision2_train = accuracy(output, target, topk=(1, 2))
            train_losses.update(loss.item(), input.size(0))
            train_top1.update(precision1_train[0], input.size(0))
            train_top2.update(precision2_train[0], input.size(0))
            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            lr = get_learning_rate(optimizer)
            print('\r', end='', flush=True)
            print('%0.4f %5.1f %6.1f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s | %s' % (
                lr, iter / len(train_dataloader) + epoch, epoch,
                valid_loss[0], valid_loss[1], valid_loss[2],
                train_losses.avg, train_top1.avg, train_top2.avg,
                str(best_precision_save), time_to_str((timer() - start), 'min')),
                end='', flush=True)

        # evaluate
        lr = get_learning_rate(optimizer)
        # evaluate every half epoch
        valid_loss = evaluate(val_dataloader, model, criterion, 0.5)  # criterion / Loss

        loss_min = False
        if valid_loss[0] < total_loss:
            total_loss = valid_loss[0]
            loss_min = True
        # valid_loss = [0.5, 0.5, 0.5]
        is_best = valid_loss[1] >= best_precision1
        best_precision1 = max(valid_loss[1], best_precision1)
        try:
            best_precision_save = best_precision1.cpu().data.numpy()
        except Exception:
            pass
        if is_best:
            save_checkpoint({
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_precision1": best_precision1,
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "valid_loss": valid_loss,
            }, is_best, loss_min, fold)
        if loss_min:
            save_checkpoint({
                "epoch": epoch + 1,
                "model_name": config.model_name,
                "state_dict": model.state_dict(),
                "best_precision1": best_precision1,
                "optimizer": optimizer.state_dict(),
                "fold": fold,
                "valid_loss": valid_loss,
            }, is_best, loss_min, fold)
        # adjust learning rate
        # scheduler.step(valid_loss[1])
        print("\r", end="", flush=True)
        log.write('%0.4f %5.1f %6.1f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s | %s' % (
            lr, 0 + epoch, epoch,
            valid_loss[0], valid_loss[1], valid_loss[2],
            train_losses.avg, train_top1.avg, train_top2.avg,
            str(best_precision_save), time_to_str((timer() - start), 'min')))
        log.write('\n')
        time.sleep(0.01)
parser.add_argument('--epochs', default=3, type=int,
                    help='Number of epochs for training')
parser.add_argument('--train_batch_size', default=256, type=int,
                    help='Batch size for training')
parser.add_argument('--test_batch_size', default=128, type=int,
                    help='Batch size for testing')
parser.add_argument('--save_dir', default='../weights', type=str,
                    help='Directory to save the model')
args = parser.parse_args()

LSLoss = LabelSmoothingLoss(3, smoothing=0.1)

def loss_fn(outputs, target):
    loss = nn.CrossEntropyLoss()(outputs, target)
    return loss

def train(dataset, dataloader, model, optimizer, device, loss_fn):
    model.train()
    final_loss = 0
    counter = 0
    # note: the int() must wrap the whole expression, not just len(dataset)
    for batch_ind, d in tqdm(enumerate(dataloader),
                             total=int(len(dataset) / dataloader.batch_size)):
        counter += 1
        image = d['image']
        label = d['label']
        image = image.to(device, dtype=torch.float)
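        # -- plausible completion of the truncated loop body; everything from
        # here to the return is an assumption, not in the original snippet --
        label = label.to(device, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(image)
        loss = loss_fn(outputs, label)  # could equally be LSLoss(outputs, label)
        loss.backward()
        optimizer.step()
        final_loss += loss.item()
    return final_loss / counter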
else:
    model = architecture.HighDimensionalModel(model_name, num_classes)
model = nn.DataParallel(model).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1,
                            momentum=0.9, weight_decay=1e-4)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[299, 449],
                                           gamma=0.1)
epoch_stop = 600

if "category" in label:
    if smoothing > 0:
        criterion = LabelSmoothingLoss(num_classes, smoothing=smoothing)
    else:
        criterion = nn.CrossEntropyLoss()
else:
    criterion = nn.SmoothL1Loss()

# Initializes training
load_from_checkpoint = False
if load_from_checkpoint:
    checkpoint = torch.load(checkpoint_path)
    epoch_start = checkpoint["epoch"]
    train_loss = checkpoint["train_loss"]
    valid_loss = checkpoint["valid_loss"]
    valid_acc = checkpoint["valid_acc"]
    model.load_state_dict(checkpoint["model_state_dict"])
                              base_lr=args.lr, warmup_epochs=args.warmup_epochs)
if resume_epoch > 0:
    checkpoint = torch.load(args.resume_param)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    amp.load_state_dict(checkpoint['amp'])
    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
    print("Finished loading resume params.")

top1_acc = metric.Accuracy(name='Top1 Accuracy')
top5_acc = metric.TopKAccuracy(top=5, name='Top5 Accuracy')
loss_record = metric.NumericalCost(name='Loss')

Loss = nn.CrossEntropyLoss() if not args.label_smoothing else \
    LabelSmoothingLoss(classes, smoothing=0.1)

@torch.no_grad()
def test(epoch=0, save_status=False):
    top1_acc.reset()
    top5_acc.reset()
    loss_record.reset()
    model.eval()
    for data, labels in val_data:
        data = data.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        outputs = model(data)
        losses = Loss(outputs, labels)
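        # -- assumed continuation; the original snippet is cut off here. The
        # update() calls below are a guess at these metric objects'
        # convention and are not confirmed by the source. --
        top1_acc.update(outputs, labels)
        top5_acc.update(outputs, labels)
        loss_record.update(losses)
    # a typical epilogue would log top1_acc.get(), top5_acc.get(), and
    # loss_record.get(), and save a checkpoint when save_status is set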
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    logger = get_logger(args.logging_file)
    logger.info("Use GPU: {} for training".format(args.gpu))
    args.rank = args.rank * ngpus_per_node + gpu
    torch.distributed.init_process_group(backend="nccl", init_method=args.dist_url,
                                         world_size=args.world_size, rank=args.rank)

    epochs = args.epochs
    input_size = args.input_size
    resume_epoch = args.resume_epoch

    initializer = KaimingInitializer()
    zero_gamma = ZeroLastGamma()
    mix_precision_training = args.mix_precision_training
    is_first_rank = True if args.rank % ngpus_per_node == 0 else False
    batches_per_epoch = args.num_training_samples // (args.batch_size * ngpus_per_node)
    lr = 0.1 * (args.batch_size * ngpus_per_node // 32) if args.lr == 0 else args.lr

    model = get_model(models, args.model)
    model.apply(initializer)
    if args.last_gamma:
        model.apply(zero_gamma)
        logger.info('Apply zero last gamma init.')

    if is_first_rank and args.model_info:
        summary(model, torch.rand((1, 3, input_size, input_size)))

    parameters = model.parameters() if not args.no_wd else no_decay_bias(model)
    if args.sgd_gc:
        logger.info('Use SGD_GC optimizer.')
        optimizer = SGD_GC(parameters, lr=lr, momentum=args.momentum,
                           weight_decay=args.wd, nesterov=True)
    else:
        optimizer = optim.SGD(parameters, lr=lr, momentum=args.momentum,
                              weight_decay=args.wd, nesterov=True)

    lr_scheduler = CosineWarmupLr(optimizer, batches_per_epoch, epochs,
                                  base_lr=args.lr, warmup_epochs=args.warmup_epochs)
    # dropblock_scheduler = DropBlockScheduler(model, batches_per_epoch, epochs)

    if args.lookahead:
        optimizer = Lookahead(optimizer)
        logger.info('Use lookahead optimizer.')

    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    args.num_workers = int((args.num_workers + ngpus_per_node - 1) / ngpus_per_node)

    if args.mix_precision_training and is_first_rank:
        logger.info('Train with FP16.')
    scaler = GradScaler(enabled=args.mix_precision_training)

    model = nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
    Loss = nn.CrossEntropyLoss().cuda(args.gpu) if not args.label_smoothing else \
        LabelSmoothingLoss(args.classes, smoothing=0.1).cuda(args.gpu)

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    if args.autoaugment:
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            ImageNetPolicy,
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            # Cutout(),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(0.4, 0.4, 0.4),
            transforms.ToTensor(),
            normalize,
        ])
    val_transform = transforms.Compose([
        transforms.Resize(int(input_size / 0.875)),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        normalize,
    ])

    train_set = ImageNet(args.data_path, split='train', transform=train_transform)
    val_set = ImageNet(args.data_path, split='val', transform=val_transform)
    train_sampler = DistributedSampler(train_set)
    train_loader = DataLoader(train_set, args.batch_size, False, pin_memory=True,
                              num_workers=args.num_workers, drop_last=True,
                              sampler=train_sampler)
    val_loader = DataLoader(val_set, args.batch_size, False, pin_memory=True,
                            num_workers=args.num_workers, drop_last=False)

    if resume_epoch > 0:
        loc = 'cuda:{}'.format(args.gpu)
        checkpoint = torch.load(args.resume_param, map_location=loc)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scaler.load_state_dict(checkpoint['scaler'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        print("Finished loading resume params.")

    torch.backends.cudnn.benchmark = True
    top1_acc = metric.Accuracy(name='Top1 Accuracy')
    top5_acc = metric.TopKAccuracy(top=5, name='Top5 Accuracy')
    loss_record = metric.NumericalCost(name='Loss')

    for epoch in range(resume_epoch, epochs):
        tic = time.time()
        train_sampler.set_epoch(epoch)
        if not args.mixup:
            train_one_epoch(model, train_loader, Loss, optimizer, epoch,
                            lr_scheduler, logger, top1_acc, loss_record,
                            scaler, args)
        else:
            train_one_epoch_mixup(model, train_loader, Loss, optimizer, epoch,
                                  lr_scheduler, logger, loss_record, scaler, args)
        train_speed = int(args.num_training_samples // (time.time() - tic))
        if is_first_rank:
            logger.info('Finished one epoch, speed: {} samples/s'.format(train_speed))
        test(model, val_loader, Loss, epoch, logger, top1_acc, top5_acc,
             loss_record, args)

        if args.rank % ngpus_per_node == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scaler': scaler.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
            }
            torch.save(checkpoint, '{}/{}_{}_{:.5}.pt'.format(
                args.save_dir, args.model, epoch, top1_acc.get()))
def forward(self, predictions, targets):
    """Multibox Loss

    Args:
        predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
            conf shape:   torch.size(batch_size, num_priors, num_classes)
            loc shape:    torch.size(batch_size, num_priors, 4)
            priors shape: torch.size(num_priors, 4)
        targets (tensor): Ground truth boxes and labels for a batch,
            shape: [batch_size, num_objs, 5] (last idx is the label).
    """
    arm_loc_data, arm_conf_data, odm_loc_data, odm_conf_data, priors = predictions
    if self.use_ARM:
        loc_data, conf_data = odm_loc_data, odm_conf_data
    else:
        loc_data, conf_data = arm_loc_data, arm_conf_data
    num = loc_data.size(0)
    priors = priors[:loc_data.size(1), :]
    num_priors = priors.size(0)
    num_classes = self.num_classes

    # match priors (default boxes) and ground truth boxes
    loc_t = torch.Tensor(num, num_priors, 4)
    conf_t = torch.LongTensor(num, num_priors)
    for idx in range(num):
        truths = targets[idx][:, :-1].data
        labels = targets[idx][:, -1].data
        if num_classes == 2:
            labels = labels >= 0
        defaults = priors.data
        if self.use_ARM:
            refine_match(self.threshold, truths, defaults, self.variance,
                         labels, loc_t, conf_t, idx, arm_loc_data[idx].data)
        else:
            refine_match(self.threshold, truths, defaults, self.variance,
                         labels, loc_t, conf_t, idx)
    if self.use_gpu:
        loc_t = loc_t.cuda()
        conf_t = conf_t.cuda()
    # wrap targets
    loc_t.requires_grad = False
    conf_t.requires_grad = False

    if self.use_ARM:
        P = F.softmax(arm_conf_data, 2)
        arm_conf_tmp = P[:, :, 1]
        object_score_index = arm_conf_tmp <= self.theta
        pos = conf_t > 0
        pos[object_score_index.data] = 0
    else:
        pos = conf_t > 0

    # Localization Loss (Smooth L1)
    # Shape: [batch, num_priors, 4]
    pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
    loc_p = loc_data[pos_idx].view(-1, 4)
    loc_t = loc_t[pos_idx].view(-1, 4)
    loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

    # Compute max conf across batch for hard negative mining
    batch_conf = conf_data.view(-1, self.num_classes)
    loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))
    # print(loss_c.size())

    # Hard Negative Mining
    loss_c[pos.view(-1, 1)] = 0  # filter out pos boxes for now
    loss_c = loss_c.view(num, -1)
    _, loss_idx = loss_c.sort(1, descending=True)
    _, idx_rank = loss_idx.sort(1)
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(self.negpos_ratio * num_pos, max=pos.size(1) - 1)
    neg = idx_rank < num_neg.expand_as(idx_rank)

    # Confidence Loss Including Positive and Negative Examples
    pos_idx = pos.unsqueeze(2).expand_as(conf_data)
    neg_idx = neg.unsqueeze(2).expand_as(conf_data)
    conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view(-1, self.num_classes)
    targets_weighted = conf_t[(pos + neg).gt(0)]
    # print(pos_idx.size(), neg_idx.size(), conf_p.size(), targets_weighted.size())
    Loss_label = LabelSmoothingLoss(self.num_classes, 0.1)
    loss_c = Loss_label(conf_p, targets_weighted)

    # Sum of losses: L(x,c,l,g) = (Lconf(x,c) + αLloc(x,l,g)) / N
    N = num_pos.data.sum().float()
    # N = max(num_pos.data.sum().float(), 1)
    loss_l /= N
    loss_c /= N
    # print(N, loss_l, loss_c)
    return loss_l, loss_c
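The forward pass above returns the localization and confidence terms separately, each already normalized by the number of matched priors N. A hedged sketch of how a training step typically combines them; net, criterion, images, targets, and optimizer are illustrative names, not taken from the original code:

# illustrative usage; only the criterion call mirrors the forward() above
predictions = net(images)  # (arm_loc, arm_conf, odm_loc, odm_conf, priors)
loss_l, loss_c = criterion(predictions, targets)
loss = loss_l + loss_c     # L(x,c,l,g) = (Lconf + αLloc) / N with α = 1
optimizer.zero_grad()
loss.backward()
optimizer.step()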