def train():
    """Train the item-icon CNN classifier and export it.

    Reads module-level state (NUM_CLASS, weights_t, device, img_files,
    item_id_map, circle_map, img_map) and runs at least `target_step`
    optimization steps; keeps going past that until one checkpoint has
    passed `test(model)` and been saved (state dict + ONNX export).
    """
    criterion = FocalLoss(NUM_CLASS, alpha=weights_t)
    criterion.to(device)

    def compute_loss(x, label):
        # Returns (focal loss, top-1 precision) for one batch.
        loss = criterion(x, label)
        prec = (x.argmax(1) == label).float().mean()
        return loss, prec

    print('train on:', device)
    model = Cnn().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    model.train()
    step = 0
    prec = 0
    target_step = 1200
    last_time = time.monotonic()
    is_saved = False
    best = 999  # sentinel "worse than any real loss" starting value
    # Loop until the step budget is reached AND at least one model passed
    # the acceptance test and was saved.
    while step < target_step or not is_saved:
        images_t, labels_t = get_data(img_files, item_id_map, circle_map, img_map)
        optim.zero_grad()
        score = model(images_t)
        loss, prec = compute_loss(score, labels_t)
        loss.backward()
        optim.step()
        if step < 10 or step % 50 == 0:
            # time printed is the interval since the previous report
            print(step, loss.item(), prec.item(), time.monotonic() - last_time)
            last_time = time.monotonic()
        step += 1
        # Only consider saving in the last 300 steps, and only when the
        # current loss beats the best saved so far.
        if step > target_step - 300 and best > loss.item():
            model.eval()
            if test(model):  # acceptance test gates every save
                best = loss.item()
                print(f'save best {best}')
                model.train()
                torch.save(model.state_dict(), './model.pth')
                # Export expects a single 3x60x60 RGB icon as input.
                torch.onnx.export(model,
                                  torch.rand((1, 3, 60, 60)).to(device),
                                  'ark_material.onnx')
                is_saved = True
            else:
                model.train()


from dl_data import request_get

# Purge the jsDelivr CDN cache so the freshly exported artifacts are
# re-fetched by downstream consumers.
request_get(
    'https://purge.jsdelivr.net/gh/triwinds/arknights-ml@latest/inventory/index_itemid_relation.json',
    True)
request_get(
    'https://purge.jsdelivr.net/gh/triwinds/arknights-ml@latest/inventory/ark_material.onnx',
    True)
def train_model(epochs, model, dl_train, device, dl_val=None, path=model_path, file_name='', print_freq=50):
    """Train `model` for `epochs` epochs with focal loss and a linear-warmup schedule.

    When `dl_val` is given, the model is evaluated every epoch and the state
    dict with the best strong-confidence prediction ratio is written to
    `path/file_name` (saving only when `file_name` is non-empty).

    Args:
        epochs: number of training epochs.
        model: the torch module to train.
        dl_train: training DataLoader.
        device: torch device for the loss.
        dl_val: optional validation DataLoader; enables evaluation/saving.
        path: directory for the saved weights (defaults to module-level model_path).
        file_name: weights file name; empty string disables saving.
        print_freq: progress print frequency forwarded to train_epoch.
    """
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(dl_train) * epochs
    # FIX: compare to None with `is not None` (identity), not `!= None`.
    evaluate = dl_val is not None
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    loss_fn = FocalLoss(gamma=4, class_num=2).to(device)
    best_correct_ratio = 0
    correct_ratio = 0
    if evaluate:
        # Baseline evaluation before any training step.
        _, correct_ratio = get_predictions(model, dl_val, device, n_examples=50,
                                           force_n_examples=True, use_targets=True)
        eval_model(0, model, dl_val, loss_fn, device)
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)
        train_acc, train_loss = train_epoch(epoch, model, dl_train, loss_fn,
                                            optimizer, device, scheduler,
                                            print_freq=print_freq)
        if evaluate:
            val_acc, val_loss, f1 = eval_model(epoch, model, dl_val, loss_fn, device)
            _, correct_ratio = get_predictions(model, dl_val, device, n_examples=50,
                                               force_n_examples=True, use_targets=True)
        # Save only on improvement, and only when a target file name was given.
        if correct_ratio > best_correct_ratio and file_name != '':
            print('Save. Epoch: ', epoch + 1)
            torch.save(model.state_dict(), path + '/' + file_name)
            best_correct_ratio = correct_ratio
def train_one_epoch(args, model, optimizer, train_loader, logger, model_saver):
    """Run one training epoch with focal loss over one-hot targets.

    Logs per-step loss through `logger` and periodically (every
    args.save_steps steps) checkpoints both model and optimizer state.
    """
    loss_fn = FocalLoss()
    device = get_device(args)
    model.train()
    total = len(train_loader)
    for step, (imgs, targets) in enumerate(train_loader, start=1):
        tic = time.perf_counter()
        optimizer.zero_grad()
        one_hot = label_to_one_hot(targets, n_class=args.n_class)
        imgs = imgs.to(device)
        one_hot = one_hot.to(device)
        # Sigmoid activations are fed to the loss (multi-label style head).
        outs = model(imgs).sigmoid()
        loss = loss_fn(input=outs, target=one_hot)
        loss.backward()
        optimizer.step()
        toc = time.perf_counter()
        print(
            f'step:{step} [{step}/{total}] '
            f'| loss:{loss.item():.8f} | lr:{get_lr(optimizer)} | time:{toc - tic}'
        )
        logger.log(key='train_loss', data=loss.item())
        # Periodic checkpoint of loss curve, model, and optimizer.
        if step % args.save_steps == 0:
            logger.visualize(key='train_loss', range=(-1000, -1))
            logger.save_log()
            model_saver.save(name=args.model_name, model=model)
            model_saver.save(name=args.optimizer, model=optimizer)
def __init__(self, num_classes=80):
    """Assemble RetinaNet: FPN-50 backbone plus shared box/class heads.

    Args:
        num_classes: number of object categories (default 80, COCO).
    """
    super().__init__()
    self.num_classes = num_classes
    self.fpn = FPN50()
    self.focal_loss = FocalLoss()
    # Every anchor predicts 4 box offsets and one score per class.
    self.loc_head = self._make_head(self.num_anchors * 4)
    self.cls_head = self._make_head(self.num_anchors * self.num_classes)
def __init__(self, picker: Picker, results_path: Path, train_set: SeisDataset, valid_set: SeisDataset,
             test_set: SeisDataset, device: torch.device, batch_size: int, lr: float, num_workers: int,
             freq_valid: int, visual: Dict[str, List[int]], dt_ms: float, height_model: int,
             width_model: int, stopper: Stopper, weights: torch.Tensor):
    """Set up training state for a seismic picker.

    Wires datasets, optimizer, focal-loss criterion (class-weighted via
    `weights`), TensorBoard writer, and output folders under `results_path`.
    """
    self._picker = picker
    self._results_path = results_path
    self._train_set = train_set
    self._valid_set = valid_set
    self._test_set = test_set
    self._device = device
    self._batch_size = batch_size
    self._lr = lr
    self._num_workers = num_workers
    self._freq_valid = freq_valid
    self._visual = visual
    self._dt_ms = dt_ms
    self._height_model = height_model
    self._width_model = width_model
    self._stopper = stopper
    self._weights = weights
    # Focal loss was preferred over plain weighted cross-entropy here.
    # self._criterion = nn.CrossEntropyLoss(weight=self._weights).to(self._device)
    self._criterion = FocalLoss(alpha=self._weights, gamma=2)
    self._optimizer = torch.optim.Adam(picker.parameters(), lr=self._lr)
    # Output layout: <results>/net for checkpoints, <results>/tensorboard for logs.
    self._net_path, self._tensorboard_path = self._results_path / 'net', self._results_path / 'tensorboard'
    for folder in [self._net_path, self._tensorboard_path]:
        folder.mkdir(exist_ok=True, parents=True)
    self._writer_tb = SummaryWriter(log_dir=str(self._tensorboard_path), flush_secs=20)
    self._picker.to(self._device)
    self._num_batch = 0
    self._correct_visual()
    # Clamp validation frequency to at most once per epoch's batch count.
    self._freq_valid = min((self._freq_valid, len(self._train_set) // self._batch_size + 1))
def __init__(self, cfg):
    """Initialize detection-head losses and geometry from config `cfg`.

    Args:
        cfg: config object providing loss_lambda, im_size, conv_npool,
            bbox_thres, head_oproi, loss_gamma, loss_alpha.
    """
    # FIX: super(type(self), self) recurses infinitely when this class is
    # subclassed; the zero-argument Python 3 form resolves correctly.
    super().__init__()
    # params and flags
    self.loss_lambda = cfg.loss_lambda
    self.im_size = cfg.im_size
    # Feature-map size after the final pooling stage of the conv backbone.
    self.map_size = (down2n(cfg.im_size[0], cfg.conv_npool[-1]),
                     down2n(cfg.im_size[1], cfg.conv_npool[-1]))
    self.bbox_thres = cfg.bbox_thres
    self.head_oproi = cfg.head_oproi
    # loss objects: focal for classification, CE for classification-fine,
    # BCE-with-logits for the objectness/op head.
    self.cl_loss = FocalLoss(gamma=cfg.loss_gamma, alpha=cfg.loss_alpha, size_average=True)
    self.cf_loss = nn.CrossEntropyLoss()
    self.op_loss = nn.BCEWithLogitsLoss()
def __init__(self, pp):
    """Build the mirDNN network from hyper-parameters `pp`.

    Stacks a Conv1d front-end, halving ResNet/MaxPool stages until the
    sequence length drops to <= 10, then a small fully-connected head with
    a sigmoid output; also wires loss, optimizer, and LR scheduler.
    """
    super(mirDNN, self).__init__()
    self.device = pp.device
    # Constant offset applied when correcting NLL values elsewhere;
    # NOTE(review): derivation not visible here — confirm against usage.
    self.nll_correction = -0.5 + mt.log(2 * mt.exp(0.5))
    self.embedding = NucleotideEmbedding()
    layers = []
    # NOTE(review): `in_channels` is not defined in this method — presumably
    # a module-level constant matching the embedding's channel count; verify.
    layers.append(
        nn.Conv1d(in_channels, pp.width, kernel_size=pp.kernel_size,
                  padding=int(pp.kernel_size / 2)))
    seq_len = pp.seq_len
    # Each stage: n_resnets residual blocks followed by 2x downsampling,
    # repeated until the sequence is at most 10 long.
    while seq_len > 10:
        for i in range(pp.n_resnets):
            layers.append(
                ResNet(pp.width, nfilters=[pp.width, pp.width],
                       ksizes=[pp.kernel_size, pp.kernel_size]))
        layers.append(nn.MaxPool1d(2))
        seq_len = int(seq_len / 2)
    layers.append(nn.ELU())
    layers.append(nn.BatchNorm1d(pp.width))
    self.conv_layers = nn.Sequential(*layers)
    self.conv_out_dim = pp.width * seq_len
    # Normalizes the single auxiliary scalar input feature.
    self.ivar_layers = nn.BatchNorm1d(1)
    in_dim = self.conv_out_dim + 1  # conv features + 1 auxiliary scalar
    layers = []
    layers.append(nn.Linear(in_dim, 32))
    layers.append(nn.ELU())
    layers.append(nn.BatchNorm1d(32))
    layers.append(nn.Linear(32, 1))
    layers.append(nn.Sigmoid())
    self.fcon_layers = nn.Sequential(*layers)
    # Focal loss is optional; plain BCE otherwise (output is a probability).
    if pp.focal_loss:
        self.loss_function = FocalLoss()
    else:
        self.loss_function = nn.BCELoss()
    self.to(device=self.device)
    self.optimizer = RAdam(self.parameters(), lr=5e-3, weight_decay=1e-5)
    # Scheduler maximizes a monitored metric (mode='max').
    self.lr_scheduler = ReduceLROnPlateau(self.optimizer, mode='max',
                                          factor=0.5, patience=100,
                                          min_lr=1e-6, eps=1e-9)
def transfer_model_train(data_builder, json_file_name, weights_file_name, metrics_file_name):
    """Fine-tune a pretrained multi-modal model restored from JSON + weights.

    Trains for 10 epochs with a class-proportion-weighted focal loss and
    writes updated weights/metrics back to the given file names.
    """
    net = MultiModal(data_builder)
    # Restore architecture and pretrained weights from disk.
    net.compile_json_model(json_model=json_file_name, weights=weights_file_name)
    net.get_label_ratios()
    # Weight the focal loss by observed label proportions to counter imbalance.
    loss = FocalLoss(alpha=net.label_ratios, class_proportions=True)
    net.train_model(epochs=10,
                    loss_function=loss,
                    learning_rate=0.00001,
                    metrics=['loss', 'F1'],
                    predict_after_epoch=True,
                    save_weights=True,
                    save_metrics=True,
                    assert_weight_update=True,
                    weights_file_name=weights_file_name,
                    metrics_file_name=metrics_file_name)
def __init__(self, args):
    """Configure the segmentation trainer from CLI-style `args`.

    Selects one of the FCN/PSPNet models, builds Adam + exponential-decay
    scheduler, train/val loaders, and a focal loss; optionally loads
    pretrained weights.
    """
    super(Trainer, self).__init__()
    self.epoch = args.epoch
    self.batch_size = args.batch_size
    self.data_dir = args.data_dir
    self.save_dir = args.save_dir
    self.result_dir = args.result_dir
    self.log_dir = args.log_dir
    self.gpu_mode = args.gpu_mode
    self.verbose = args.verbose
    # Model selection; unknown names abort the process.
    if args.model == 'fcn16s':
        self.model = FCN16s()
    elif args.model == 'fcn32s':
        self.model = FCN32s()
    elif args.model == 'fcn8s':
        self.model = FCN8s()
    elif args.model == 'pspnet':
        self.model = PSPnet()
    else:
        print("No this model type")
        exit(-1)
    if self.gpu_mode:
        self.model = self.model.cuda()
    self.parameter = self.model.parameters()
    self.optimizer = optim.Adam(self.parameter, lr=args.learning_rate)
    self.scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=0.5)
    self.train_dataloader = get_data_loader(self.data_dir, self.batch_size, split='train')
    # Validation runs with batch size 1.
    self.test_dataloader = get_data_loader(self.data_dir, 1, split='val')
    # experiment_id = args.model + time.strftime('%m%d%H%m')
    # self.writer = SummaryWriter(log_dir=self.log_dir + '/tboard_' + experiment_id)
    self.loss = FocalLoss(gamma=1.25)
    # Empty string means "no pretrained checkpoint".
    if args.pretrain != '':
        self._load_pretrain(args.pretrain)
def LossFunction(output, label, transform, regularization_weight=1e-3):
    """Point-cloud segmentation loss: focal classification loss plus a
    feature-transform orthogonality regularizer.

    Args:
        output: (B, N, C) per-point class scores.
        label: (B, N) integer class labels.
        transform: (B, K, K) learned feature-transform matrices.
        regularization_weight: weight of the ||T·Tᵀ − I||² penalty.

    Returns:
        Scalar loss tensor (on CUDA — inputs are assumed to be CUDA tensors).
    """
    b, n, c = output.shape
    output = output.view(-1, c).float()
    label = label.view(b * n).long()
    # Hand-tuned per-class weights (class 0 heavily down-weighted).
    weight = [0.06, 1, 1, 0.99, 1, 1, 1, 1]
    criterion1 = FocalLoss(gamma=2, alpha=weight)
    classify_loss = criterion1(output, label)
    batch_size, k, _ = transform.shape
    # T · Tᵀ should be the identity for an orthogonal transform.
    matrix_difference = torch.bmm(transform, transform.permute(0, 2, 1))
    # FIX: the original np.eye(k).repeat(batch_size, 1) produced shape
    # (batch_size*k, k), which does not match the (batch_size, k, k) bmm
    # result; build a proper batched identity instead.
    identity = torch.eye(k).unsqueeze(0).repeat(batch_size, 1, 1).cuda()
    criterion2 = nn.MSELoss()
    matrix_difference_loss = criterion2(matrix_difference, identity)
    return classify_loss + matrix_difference_loss * regularization_weight
def train_new_model(data_builder, json_file_name, weights_file_name, metrics_file_name):
    """Build, train, and evaluate a fresh multi-modal network.

    Compiles the architecture (saving its JSON description), trains for 10
    epochs with a class-proportion-weighted focal loss, persists weights and
    metrics, then runs prediction.
    """
    net = MultiModal(data_builder)
    net.compile_multi_modal_network(model_summary=False,
                                    save_img=False,
                                    save_json=True,
                                    json_file_name=json_file_name)
    net.get_label_ratios()
    # Weight the focal loss by observed label proportions to counter imbalance.
    loss = FocalLoss(alpha=net.label_ratios, class_proportions=True)
    net.train_model(epochs=10,
                    loss_function=loss,
                    learning_rate=0.0001,
                    metrics=['loss', 'F1'],
                    predict_after_epoch=True,
                    save_weights=True,
                    weights_file_name=weights_file_name,
                    save_metrics=True,
                    metrics_file_name=metrics_file_name,
                    assert_weight_update=True)
    net.predict_model()
def train_step(self, net, batch, optimizer, device):
    """Run one optimization step on a single NHWC image batch.

    Returns a dict with the scalar focal loss and the pixel accuracy
    computed over foreground pixels only (label 0 is treated as ignore).
    """
    net.train()  # ensure train-mode layers (dropout/BN) are active
    images, labels = batch               # batch arrives as NHWC
    images = images.permute(0, 3, 1, 2)  # NHWC -> NCHW for the conv net
    images = images.to(device).float()
    labels = labels.to(device).long()    # integer class ids, not one-hot
    # -----------------------------------------------------------
    net.zero_grad()  # single step per batch; no gradient accumulation
    logits = net(images)
    probs = F.softmax(logits, dim=1)
    criterion = FocalLoss(gamma=0.5)
    loss = criterion(logits, labels)     # loss consumes raw logits
    # Accuracy over non-background pixels only.
    preds = torch.argmax(probs.permute(0, 2, 3, 1), dim=-1)  # back to NHWC
    foreground = (labels > 0).type(torch.float32)
    hits = (preds == labels).type(torch.float32) * foreground
    accuracy = torch.sum(hits) / torch.sum(foreground)
    # gradient update
    loss.backward()
    optimizer.step()
    # -----------------------------------------------------------
    return dict(loss=loss.item(), acc=accuracy.item())
def train(dataLoader, netmodel, optimizer, epoch, logger, exp_args):
    """One training epoch for the portrait-segmentation network.

    Depending on `exp_args`, adds a boundary (edge) loss and/or a stability
    (consistency) loss between the original and augmented inputs, logs
    scalars and images through `logger`, and prints progress every
    `args.printfreq` iterations.

    NOTE(review): `loss_l2` is used in the `use_kl == False` branches below,
    but its definition is commented out (`# loss_l2 = nn.MSELoss()`), so that
    path raises NameError — confirm and restore the definition if L2 mode is
    ever enabled. Also note the mix of module-level `args` (printfreq) with
    the `exp_args` parameter.
    """
    batch_time = AverageMeter('batch_time')
    data_time = AverageMeter('data_time')
    losses = AverageMeter('losses')
    losses_mask = AverageMeter('losses_mask')
    if exp_args.addEdge == True:
        losses_edge_ori = AverageMeter('losses_edge_ori')
        losses_edge = AverageMeter('losses_edge')
    if exp_args.stability == True:
        losses_mask_ori = AverageMeter('losses_mask_ori')
        losses_stability_mask = AverageMeter('losses_stability_mask')
        losses_stability_edge = AverageMeter('losses_stability_edge')

    netmodel.train()  # switch to train mode
    loss_Softmax = nn.CrossEntropyLoss(ignore_index=255)  # mask loss
    # in our experiments, focalloss is better than l2 loss
    loss_Focalloss = FocalLoss(gamma=2)  # boundary loss
    # loss_l2 = nn.MSELoss() # boundary loss

    end = time.time()
    for i, (input_ori, input, edge, mask) in enumerate(dataLoader):
        data_time.update(time.time() - end)
        input_ori_var = Variable(input_ori.cuda())
        input_var = Variable(input.cuda())
        edge_var = Variable(edge.cuda())
        mask_var = Variable(mask.cuda())

        if exp_args.addEdge == True:
            # Model predicts both a mask and an edge map.
            output_mask, output_edge = netmodel(input_var)
            loss_mask = loss_Softmax(output_mask, mask_var)
            losses_mask.update(loss_mask.data.item(), input.size(0))
            # loss_edge = loss_l2(output_edge, edge_var) * exp_args.edgeRatio
            loss_edge = loss_Focalloss(output_edge, edge_var) * exp_args.edgeRatio
            losses_edge.update(loss_edge.data.item(), input.size(0))
            # total loss
            loss = loss_mask + loss_edge

            if exp_args.stability == True:
                # Second forward pass on the un-augmented input for consistency.
                output_mask_ori, output_edge_ori = netmodel(input_ori_var)
                loss_mask_ori = loss_Softmax(output_mask_ori, mask_var)
                losses_mask_ori.update(loss_mask_ori.data.item(), input.size(0))
                # loss_edge_ori = loss_l2(output_edge_ori, edge_var) * exp_args.edgeRatio
                loss_edge_ori = loss_Focalloss(output_edge_ori, edge_var) * exp_args.edgeRatio
                losses_edge_ori.update(loss_edge_ori.data.item(), input.size(0))
                # in our experiments, kl loss is better than l2 loss
                if exp_args.use_kl == False:
                    # consistency constraint loss: L2 distance
                    loss_stability_mask = loss_l2(
                        output_mask,
                        Variable(output_mask_ori.data, requires_grad=False)) * exp_args.alpha
                    loss_stability_edge = loss_l2(
                        output_edge,
                        Variable(output_edge_ori.data, requires_grad=False)
                    ) * exp_args.alpha * exp_args.edgeRatio
                else:
                    # consistency constraint loss: KL distance (better than L2 distance)
                    loss_stability_mask = loss_KL(
                        output_mask,
                        Variable(output_mask_ori.data, requires_grad=False),
                        exp_args.temperature) * exp_args.alpha
                    loss_stability_edge = loss_KL(
                        output_edge,
                        Variable(output_edge_ori.data, requires_grad=False),
                        exp_args.temperature
                    ) * exp_args.alpha * exp_args.edgeRatio
                losses_stability_mask.update(loss_stability_mask.data.item(), input.size(0))
                losses_stability_edge.update(loss_stability_edge.data.item(), input.size(0))
                # total loss
                # loss = loss_mask + loss_mask_ori + loss_edge + loss_edge_ori + loss_stability_mask + loss_stability_edge
                loss = loss_mask + loss_mask_ori + loss_stability_mask + loss_edge
        else:
            output_mask = netmodel(input_var)
            loss_mask = loss_Softmax(output_mask, mask_var)
            losses_mask.update(loss_mask.data.item(), input.size(0))
            # total loss: only include mask loss
            loss = loss_mask

            if exp_args.stability == True:
                output_mask_ori = netmodel(input_ori_var)
                loss_mask_ori = loss_Softmax(output_mask_ori, mask_var)
                losses_mask_ori.update(loss_mask_ori.data.item(), input.size(0))
                if exp_args.use_kl == False:
                    # consistency constraint loss: L2 distance
                    loss_stability_mask = loss_l2(
                        output_mask,
                        Variable(output_mask_ori.data, requires_grad=False)) * exp_args.alpha
                else:
                    # consistency constraint loss: KL distance (better than L2 distance)
                    loss_stability_mask = loss_KL(
                        output_mask,
                        Variable(output_mask_ori.data, requires_grad=False),
                        exp_args.temperature) * exp_args.alpha
                losses_stability_mask.update(loss_stability_mask.data.item(), input.size(0))
                # total loss
                loss = loss_mask + loss_mask_ori + loss_stability_mask

        losses.update(loss.data.item(), input.size(0))

        # compute gradient and do Adam step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.printfreq == 0:
            print(
                'Epoch: [{0}][{1}/{2}]\t'
                'Lr-deconv: [{3}]\t'
                'Lr-other: [{4}]\t'
                # 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                # 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                    epoch, i, len(dataLoader),
                    optimizer.param_groups[0]['lr'],
                    optimizer.param_groups[1]['lr'],
                    loss=losses))

            ## '===========> logger <==========='
            # (1) Log the scalar values — which scalars exist depends on the
            # addEdge/stability flag combination.
            if exp_args.addEdge == True and exp_args.stability == True:
                info = {
                    # batch_time.name: batch_time.val,
                    # data_time.name: data_time.val,
                    losses.name: losses.val,
                    losses_mask_ori.name: losses_mask_ori.val,
                    losses_mask.name: losses_mask.val,
                    losses_edge_ori.name: losses_edge_ori.val,
                    losses_edge.name: losses_edge.val,
                    losses_stability_mask.name: losses_stability_mask.val,
                    losses_stability_edge.name: losses_stability_edge.val
                }
            elif exp_args.addEdge == True and exp_args.stability == False:
                info = {
                    # batch_time.name: batch_time.val,
                    # data_time.name: data_time.val,
                    losses.name: losses.val,
                    losses_mask.name: losses_mask.val,
                    losses_edge.name: losses_edge.val,
                }
            elif exp_args.addEdge == False and exp_args.stability == True:
                info = {
                    # batch_time.name: batch_time.val,
                    # data_time.name: data_time.val,
                    losses.name: losses.val,
                    losses_mask_ori.name: losses_mask_ori.val,
                    losses_mask.name: losses_mask.val,
                    losses_stability_mask.name: losses_stability_mask.val,
                }
            elif exp_args.addEdge == False and exp_args.stability == False:
                info = {
                    # batch_time.name: batch_time.val,
                    # data_time.name: data_time.val,
                    losses.name: losses.val,
                    losses_mask.name: losses_mask.val,
                }
            for tag, value in info.items():
                logger.scalar_summary(tag, value, step=i)
            '''
            # (2) Log values and gradients of the parameters (histogram)
            for tag, value in netmodel.named_parameters():
                tag = tag.replace('.', '/')
                logger.histo_summary(tag, value.data.cpu().numpy(), step=i)
                if value.grad is None:
                    continue
                logger.histo_summary(tag+'/grad', value.grad.cpu().data.numpy(), step=i)
                break
            '''
            # (3) Log the images
            if i % (args.printfreq) == 0:
                num = 2  # number of samples to visualize
                # De-normalize and convert to BGR uint8 for display.
                input_img = np.uint8((Anti_Normalize_Img(
                    np.transpose(input.cpu().numpy()[0:num], (0, 2, 3, 1)),
                    scale=exp_args.img_scale,
                    mean=exp_args.img_mean,
                    val=exp_args.img_val)))[:, :, :, :3][:, :, :, ::-1]
                if exp_args.video == True:
                    # Channel 3 carries the prior mask in video mode.
                    input_prior = np.float32(
                        np.transpose(input.cpu().numpy()[0:num],
                                     (0, 2, 3, 1))[:, :, :, 3])
                input_mask = mask.cpu().numpy()[0:num]
                input_mask[input_mask == 255] = 0  # drop ignore-label pixels
                softmax = nn.Softmax(dim=1)
                prob = softmax(output_mask)
                # Foreground-probability channel as the predicted mask.
                masks_pred = np.transpose(prob.data.cpu().numpy()[0:num],
                                          (0, 2, 3, 1))[:, :, :, 1]
                info = {}
                info['input_img'] = input_img
                if exp_args.video == True:
                    info['input_prior'] = input_prior * 255
                info['input_mask'] = input_mask * 255
                info['output_mask'] = masks_pred * 255
                if exp_args.addEdge == True:
                    input_edge = edge.cpu().numpy()[0:num]
                    edge_pred = np.transpose(output_edge.data.cpu().numpy()[0:num],
                                             (0, 2, 3, 1))[:, :, :, 0]
                    if exp_args.stability == True:
                        input_img_ori = np.uint8((Anti_Normalize_Img(
                            np.transpose(input_ori.cpu().numpy()[0:num], (0, 2, 3, 1)),
                            scale=exp_args.img_scale,
                            mean=exp_args.img_mean,
                            val=exp_args.img_val)))[:, :, :, :3][:, :, :, ::-1]
                        prob_ori = softmax(output_mask_ori)
                        masks_pred_ori = np.transpose(
                            prob_ori.data.cpu().numpy()[0:num], (0, 2, 3, 1))[:, :, :, 1]
                        edge_pred_ori = np.transpose(
                            output_edge_ori.data.cpu().numpy()[0:num],
                            (0, 2, 3, 1))[:, :, :, 0]
                        info['input_img_ori'] = input_img_ori
                        info['output_mask_ori'] = masks_pred_ori * 255
                        info['input_edge'] = input_edge * 255
                        info['output_edge'] = edge_pred * 255
                        info['output_edge_ori'] = edge_pred_ori * 255
                    else:
                        info['input_edge'] = input_edge * 255
                        info['output_edge'] = edge_pred * 255
                else:
                    if exp_args.stability == True:
                        input_img_ori = np.uint8((Anti_Normalize_Img(
                            np.transpose(input_ori.cpu().numpy()[0:num], (0, 2, 3, 1)),
                            scale=exp_args.img_scale,
                            mean=exp_args.img_mean,
                            val=exp_args.img_val)))[:, :, :, :3][:, :, :, ::-1]
                        prob_ori = softmax(output_mask_ori)
                        masks_pred_ori = np.transpose(
                            prob_ori.data.cpu().numpy()[0:num], (0, 2, 3, 1))[:, :, :, 1]
                        info['input_img_ori'] = input_img_ori
                        info['output_mask_ori'] = masks_pred_ori * 255
                print(np.max(masks_pred), np.min(masks_pred))
                for tag, images in info.items():
                    logger.image_summary(tag, images, step=i)
    pass
def train_net(args):
    """Train a face-embedding network with an ArcFace margin head.

    Trains epoch-by-epoch, validates against LFW each epoch, and checkpoints
    on every epoch, marking the best-LFW-accuracy model.
    """
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = float('-inf')  # best LFW accuracy seen so far
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        elif args.network == 'mobile':
            model = MobileNetV2()
        else:
            raise TypeError('network {} is not supported.'.format(
                args.network))
        # print(model)
        model = nn.DataParallel(model)
        metric_fc = ArcMarginModel(args)
        metric_fc = nn.DataParallel(metric_fc)
        # Both backbone and margin head share one optimizer.
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                        lr=args.lr,
                                        momentum=args.mom,
                                        weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.Adam([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
    else:
        # Resume full training state from the checkpoint file.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=4)

    scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # One epoch's training
        train_loss, train_acc = train(train_loader=train_loader,
                                      model=model,
                                      metric_fc=metric_fc,
                                      criterion=criterion,
                                      optimizer=optimizer,
                                      epoch=epoch,
                                      logger=logger)
        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_acc', train_acc, epoch)

        # One epoch's validation
        lfw_acc, threshold = lfw_test(model)
        writer.add_scalar('model/valid_acc', lfw_acc, epoch)
        writer.add_scalar('model/valid_thres', threshold, epoch)

        # Check if there was an improvement
        is_best = lfw_acc > best_acc
        best_acc = max(lfw_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                        optimizer, best_acc, is_best)
        # NOTE(review): passing `epoch` to StepLR.step() is the deprecated
        # pre-1.4 calling convention — confirm the installed torch version.
        scheduler.step(epoch)
def train(model_name, outputDir):
    """Train a furniture classifier, validating each epoch.

    Saves the latest weights every epoch plus separate "best validation
    loss" and "best validation accuracy" snapshots into `outputDir`.
    Uses focal loss when `model_name` ends with "_focal".
    """
    train_dataset = FurnitureDataset('train', transform=preprocess_with_augmentation)
    val_dataset = FurnitureDataset('val', transform=preprocess)
    training_data_loader = DataLoader(dataset=train_dataset, num_workers=12,
                                      batch_size=BATCH_SIZE, shuffle=True)
    validation_data_loader = DataLoader(dataset=val_dataset, num_workers=1,
                                        batch_size=BATCH_SIZE, shuffle=False)
    model = get_model(model_name)
    nb_learnable_params = sum(p.numel() for p in model.fresh_params())
    print('Number of learnable params: %s' % str(nb_learnable_params))
    # NOTE(review): the comment below suggests training only fresh_params(),
    # but the optimizer is built over ALL parameters — confirm intent.
    # Use model.fresh_params() to train only the newly initialized weights
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)
    if model_name.endswith("_focal"):
        print("Using Focal loss instead of normal cross-entropy")
        criterion = FocalLoss(NB_CLASSES).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)
    min_loss = float("inf")
    max_acc = 0.0
    patience = 0
    for epoch in range(NUM_EPOCHS):
        print('Epoch: %d' % epoch)
        running_loss = RunningMean()
        running_error = RunningMean()
        running_accuracy = RunningMean()
        model.train()
        pbar = tqdm(training_data_loader, total=len(training_data_loader))
        for inputs, labels in pbar:
            batch_size = inputs.size(0)
            inputs = Variable(inputs)
            labels = Variable(labels)
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, dim=1)
            loss = criterion(outputs, labels)
            # FIX: loss.data[0] is the pre-0.4 API and raises on 0-dim
            # tensors in modern PyTorch; use .item() instead.
            running_loss.update(loss.item(), 1)
            running_error.update(torch.sum(preds != labels.data), batch_size)
            running_accuracy.update(torch.sum(preds == labels.data), batch_size)
            loss.backward()
            optimizer.step()
            pbar.set_description('%.5f %.3f %.3f' %
                                 (running_loss.value, running_accuracy.value,
                                  running_error.value))
        print('Epoch: %d | Running loss: %.5f | Running accuracy: %.3f | Running error: %.3f' %
              (epoch, running_loss.value, running_accuracy.value, running_error.value))
        lx, px = utils.predict(model, validation_data_loader, device)
        log_loss = criterion(Variable(px), Variable(lx))
        # FIX: same pre-0.4 indexing replaced with .item().
        log_loss = log_loss.item()
        _, preds = torch.max(px, dim=1)
        accuracy = torch.mean((preds == lx).float())
        error = torch.mean((preds != lx).float())
        print('Validation loss: %.5f | Accuracy: %.3f | Error: %.3f' %
              (log_loss, accuracy, error))
        scheduler.step(log_loss)
        # Save model after each epoch
        torch.save(model.state_dict(),
                   os.path.join(outputDir, 'weight_' + model_name + '.pth'))
        betterModelFound = False
        if log_loss < min_loss:
            torch.save(model.state_dict(),
                       os.path.join(outputDir, 'best_val_loss_weight_' + model_name + '.pth'))
            print('Validation score improved from %.5f to %.5f. Model snapshot saved!' %
                  (min_loss, log_loss))
            min_loss = log_loss
            patience = 0
            betterModelFound = True
        if accuracy > max_acc:
            torch.save(model.state_dict(),
                       os.path.join(outputDir, 'best_val_acc_weight_' + model_name + '.pth'))
            print('Validation accuracy improved from %.5f to %.5f. Model snapshot saved!' %
                  (max_acc, accuracy))
            max_acc = accuracy
            patience = 0
            betterModelFound = True
        if not betterModelFound:
            patience += 1
def train_net(args):
    """Train an ArcFace model, validating on LFW each epoch.

    Supports several ResNet variants plus MobileNet and a SE-ResNet face
    model; checkpoints every epoch, tracking the best LFW accuracy.
    """
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        elif args.network == 'mobile':
            model = MobileNet(1.0)
        elif args.network == 'mr18':
            print("mr18")
            model = myResnet18()
        else:
            # default: SE-ResNet-18 face model
            model = resnet_face18(args.use_se)
        model = nn.DataParallel(model)
        metric_fc = ArcMarginModel(args)
        metric_fc = nn.DataParallel(metric_fc)
        # One optimizer over both backbone and margin-head parameters.
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                        lr=args.lr,
                                        momentum=args.mom,
                                        weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.Adam([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
    else:
        # Resume full training state from the checkpoint file.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)

    scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # NOTE(review): scheduler.step() is called BEFORE the epoch's
        # training, which shifts the decay schedule by one epoch relative
        # to the usual step-after-train convention — confirm intent.
        scheduler.step()

        if args.full_log:
            # Extra pre-epoch LFW evaluation when full logging is on.
            lfw_acc, threshold = lfw_test(model)
            writer.add_scalar('LFW_Accuracy', lfw_acc, epoch)
            full_log(epoch)

        start = datetime.now()
        # One epoch's training
        train_loss, train_top5_accs = train(train_loader=train_loader,
                                            model=model,
                                            metric_fc=metric_fc,
                                            criterion=criterion,
                                            optimizer=optimizer,
                                            epoch=epoch,
                                            logger=logger,
                                            writer=writer)
        writer.add_scalar('Train_Loss', train_loss, epoch)
        writer.add_scalar('Train_Top5_Accuracy', train_top5_accs, epoch)

        end = datetime.now()
        delta = end - start
        print('{} seconds'.format(delta.seconds))

        # One epoch's validation
        lfw_acc, threshold = lfw_test(model)
        writer.add_scalar('LFW Accuracy', lfw_acc, epoch)

        # Check if there was an improvement
        is_best = lfw_acc > best_acc
        best_acc = max(lfw_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                        optimizer, best_acc, is_best)
def train_net(args):
    """Train a joint age + gender estimation network.

    Validates each epoch on the held-out split and checkpoints, tracking
    the best (lowest) validation loss.
    """
    # FIX: the seeds were set twice in the original; once is enough.
    torch.manual_seed(7)
    np.random.seed(7)
    best_loss = 100000  # lower validation loss is better
    # NOTE(review): checkpoint is hard-coded to None (args.checkpoint is
    # ignored), so the resume branch below is unreachable — confirm intent.
    checkpoint = None
    start_epoch = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        if args.network == 'r100':
            model = resnet101(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r18':
            model = resnet18(args)
        else:  # 'face'
            model = resnet50(args)
        # Only optimize parameters that require gradients (frozen layers skipped).
        optimizer = torch.optim.SGD(params=filter(lambda p: p.requires_grad,
                                                  model.parameters()),
                                    lr=args.lr,
                                    momentum=args.mom,
                                    weight_decay=args.weight_decay)
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)

    # Loss function: one criterion per task head (age, gender).
    if args.focal_loss:
        age_criterion = FocalLoss(gamma=args.gamma).to(device)
        gender_criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        age_criterion = nn.CrossEntropyLoss().to(device)
        gender_criterion = nn.CrossEntropyLoss().to(device)
    criterion_info = (age_criterion, gender_criterion, args.age_weight)

    # Custom dataloaders (batch_size / workers / epochs come from module level)
    train_dataset = AgeGenDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=workers,
                                               pin_memory=True)
    val_dataset = AgeGenDataset('valid')
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=workers,
                                             pin_memory=True)

    scheduler = StepLR(optimizer, step_size=args.lr_step, gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, epochs):
        scheduler.step()

        # One epoch's training
        train_loss, train_gen_accs, train_age_mae = train(train_loader=train_loader,
                                                          model=model,
                                                          criterion_info=criterion_info,
                                                          optimizer=optimizer,
                                                          epoch=epoch)
        writer.add_scalar('Train Loss', train_loss, epoch)
        writer.add_scalar('Train Gender Accuracy', train_gen_accs, epoch)
        writer.add_scalar('Train Age MAE', train_age_mae, epoch)

        # One epoch's validation
        valid_loss, valid_gen_accs, valid_age_mae = validate(val_loader=val_loader,
                                                             model=model,
                                                             criterion_info=criterion_info)
        writer.add_scalar('Valid Loss', valid_loss, epoch)
        writer.add_scalar('Valid Gender Accuracy', valid_gen_accs, epoch)
        writer.add_scalar('Valid Age MAE', valid_age_mae, epoch)

        # Check if there was an improvement
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, optimizer,
                        best_loss, is_best)
def train_net(args):
    """Train an ArcFace recognition model (backbone + ArcMargin head).

    Runs MegaFace accuracy as the per-epoch validation metric and saves a
    checkpoint after every epoch.  Uses the module-level ``device`` and the
    helpers ``train``/``megaface_test``/``save_checkpoint``/``get_logger``.
    """
    # Fixed seeds for reproducibility.
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = 0
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # Fresh start: select the backbone by name.
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        elif args.network == 'mobile':
            model = MobileNet(1.0)
        else:
            model = resnet_face18(args.use_se)
        model = nn.DataParallel(model)
        metric_fc = ArcMarginModel(args)
        metric_fc = nn.DataParallel(metric_fc)

        if args.optimizer == 'sgd':
            # SGD wrapped in InsightFaceOptimizer, which drives its own
            # LR schedule (exposed via .lr / .step_num below).
            optimizer = InsightFaceOptimizer(
                torch.optim.SGD([{
                    'params': model.parameters()
                }, {
                    'params': metric_fc.parameters()
                }],
                                lr=args.lr,
                                momentum=args.mom,
                                weight_decay=args.weight_decay))
        else:
            optimizer = torch.optim.Adam([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
    else:
        # Resume: the checkpoint stores whole objects, not state dicts.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    logger = get_logger()

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=8)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # One epoch's training
        train_loss, train_top1_accs = train(train_loader=train_loader,
                                            model=model,
                                            metric_fc=metric_fc,
                                            criterion=criterion,
                                            optimizer=optimizer,
                                            epoch=epoch,
                                            logger=logger)
        # NOTE(review): .lr and .step_num exist on the InsightFaceOptimizer
        # wrapper; on the Adam branch these attribute reads would fail —
        # confirm which optimizer is actually used in practice.
        print('\nCurrent effective learning rate: {}\n'.format(optimizer.lr))
        print('Step num: {}\n'.format(optimizer.step_num))

        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_accuracy', train_top1_accs, epoch)
        writer.add_scalar('model/learning_rate', optimizer.lr, epoch)

        # One epoch's validation: MegaFace accuracy is the selection metric
        # (higher is better).
        megaface_acc = megaface_test(model)
        writer.add_scalar('model/megaface_accuracy', megaface_acc, epoch)

        # Check if there was an improvement
        is_best = megaface_acc > best_acc
        best_acc = max(megaface_acc, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" %
                  (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                        optimizer, best_acc, is_best)
def train():
    """Train the EVPI question/answer relevance model.

    Reads configuration from the module-level ``args`` and data from
    ``train_ids``/``post_content``/``qa_dict``.  Validates every
    ``args.valid_iter`` steps; on improvement saves model + optimizer state,
    otherwise accumulates patience and, after ``args.max_num_trials`` LR
    decays without improvement, stops early.
    """
    device = args.device
    log_every = args.log_every
    valid_iter = args.valid_iter
    train_iter = 0
    cum_loss = 0
    avg_loss = 0
    # NOTE(review): avg_util_loss / avg_answer_loss are never updated below.
    avg_util_loss = 0
    avg_answer_loss = 0
    valid_num = 0
    patience = 0
    num_trial = 0
    hist_valid_scores = []
    begin_time = time.time()

    vocab = get_vocab(args.vocab_file)
    model = EVPI(args, vocab)
    if args.use_embed == 1:
        # Initialise embeddings from pretrained vectors.
        model.load_vector(args, vocab)

    print("Placing model on ", args.device)
    if args.device == 'cuda':
        model.cuda()

    lr = args.lr
    optim = torch.optim.Adam(list(model.parameters()), lr=lr)

    # The loss functions
    #criterion = torch.nn.CrossEntropyLoss().to(device=device)
    criterion = FocalLoss(gamma=5).to(device=device)

    print("Beginning Training")
    model.train()
    # NOTE(review): cosine_function and model_counter are unused in this
    # function; train_iter is also re-initialised here.
    cosine_function = torch.nn.functional.cosine_similarity
    model_counter = 0
    train_iter = 0

    for ep in range(args.max_epochs):
        # NOTE(review): val_iter, count and hello are never used.
        val_iter = 0
        count = 0
        hello = set()
        for ids, posts, questions, answers, labels in batch_iter(
                train_ids, post_content, qa_dict, vocab, args.batch_size,
                shuffle=False):
            train_iter += 1
            optim.zero_grad()

            # NOTE(review): these id->vector lookups appear unused; the model
            # consumes the padded id sequences below — confirm before removal.
            question_vectors = vocab.id2vector(questions)
            post_vectors = vocab.id2vector(posts)
            answer_vectors = vocab.id2vector(answers)

            # Pad each field to a rectangular batch (also returns pad positions).
            padded_posts, post_pad_idx = pad_sequence(args.device, posts)
            padded_questions, question_pad_idx = pad_sequence(args.device, questions)
            padded_answers, answer_pad_idx = pad_sequence(args.device, answers)

            pqa_probs = model(ids, (padded_posts, post_pad_idx),
                              (padded_questions, question_pad_idx),
                              (padded_answers, answer_pad_idx))
            labels = torch.tensor(labels).to(device=args.device)
            total_loss = criterion(pqa_probs, labels)
            #bp()
            avg_loss += total_loss.item()
            cum_loss += total_loss.item()

            total_loss.backward()
            # Clip gradients to stabilise training.
            torch.nn.utils.clip_grad_norm_(list(model.parameters()), args.clip_grad)
            optim.step()

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg.loss %.6f, time elapsed %.2f'
                      % (ep + 1, train_iter, avg_loss / log_every,
                         time.time() - begin_time), file=sys.stderr)
                begin_time = time.time()
                avg_loss = 0

            if train_iter % valid_iter == 0:
                print('epoch %d, iter %d, cum.loss %.2f, time elapsed %.2f'
                      % (ep + 1, train_iter, cum_loss / valid_iter,
                         time.time() - begin_time), file=sys.stderr)
                cum_loss = 0
                valid_num += 1

                print("Begin Validation ", file=sys.stderr)
                model.eval()
                val_loss = get_val_loss(vocab, args, model)
                model.train()
                print('validation: iter %d, loss %f' % (train_iter, val_loss),
                      file=sys.stderr)

                # Lower validation loss is better.
                is_better = (len(hist_valid_scores) == 0) or (
                    val_loss < min(hist_valid_scores))
                hist_valid_scores.append(val_loss)

                if is_better:
                    patience = 0
                    print("Save the current model and optimiser state")
                    torch.save(model, args.model_save_path)
                    #torch.save(model, args.model_save_path + '.' + str(val_loss) + '-' + str(model_counter))
                    #model_counter += 1
                    torch.save(optim.state_dict(), args.model_save_path + '.optim')
                elif patience < args.patience:
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)
                    if patience == args.patience:
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == args.max_num_trials:
                            print('early stop!', file=sys.stderr)
                            return

                        # Decay LR, then reload the best model and its
                        # optimizer state before continuing.
                        lr = lr * args.lr_decay
                        print('load previously best model and decay learning rate to %f' % lr,
                              file=sys.stderr)
                        model = load(args.model_save_path)
                        model.train()

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optim = torch.optim.Adam(list(model.parameters()), lr=lr)
                        optim.load_state_dict(torch.load(args.model_save_path + '.optim'))
                        # Move restored optimizer tensors onto the right device.
                        for state in optim.state.values():
                            for k, v in state.items():
                                if isinstance(v, torch.Tensor):
                                    state[k] = v.to(args.device)
                        # The loaded state overwrote lr; re-apply the decayed value.
                        for group in optim.param_groups:
                            group['lr'] = lr
                        patience = 0

    print("Training Finished", file=sys.stderr)
def train_net(net,
              epochs=20,
              batch_size=32,
              lr=0.01,
              save_cp=True,
              gpu=True):
    """Train a binary hand-segmentation network.

    Trains ``net`` with one of several selectable losses (focal loss by
    default), decays the LR 10x every 10 epochs while ``epoch < 50``,
    evaluates Jaccard/IoU on the test split each epoch and checkpoints
    whenever it exceeds 0.90.

    Args:
        net: segmentation network producing per-pixel scores.
        epochs: number of training epochs.
        batch_size: training mini-batch size.
        lr: initial SGD learning rate.
        save_cp: save a checkpoint on good validation IoU when True.
        gpu: move tensors/criteria to CUDA when True.
    """
    # paths
    root_data = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'data/hand/')
    dir_checkpoint = 'checkpoints/hand/'
    writer = SummaryWriter('log/hand_batch8')

    # data
    train_set = Hand(root_data, train=True)
    test_set = Hand(root_data, test=True)
    train_data = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                            num_workers=12)
    test_data = DataLoader(test_set, batch_size=1, shuffle=False,
                           num_workers=12)
    N_train = train_set.getLen()
    N_test = test_set.getLen()

    # optimizer
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=0.9,
                          weight_decay=0.0005)

    # Loss selection flags: exactly one should be True; plain BCE otherwise.
    use_focal = True
    use_CE = False
    use_dice = False
    use_iou = False
    if use_focal:
        criterion = FocalLoss(class_num=2, gamma=2)
    elif use_CE:
        # cross-entropy with per-class weights
        weight = torch.Tensor([2, 3])
        if gpu:
            weight = weight.cuda()
        criterion = torch.nn.CrossEntropyLoss(weight=weight)
    elif use_dice:
        criterion = soft_dice_loss
    elif use_iou:
        # combined BCE + mIoU loss
        criterion1 = nn.BCELoss()
        criterion2 = mIoULoss()
    else:
        criterion = nn.BCELoss()

    processed_batch = 0
    print('''
    Starting training:
        Epochs: {}
        Batch size: {}
        Use FocalLoss: {}
        Learning rate: {}
        Training size: {}
        Validation size: {}
        Checkpoints: {}
        CUDA: {}
    '''.format(epochs, batch_size, str(use_focal), lr, N_train, N_test,
               str(save_cp), str(gpu)))

    for epoch in range(epochs):
        print('Starting epoch {}/{}.'.format(epoch + 1, epochs))
        epoch_loss = 0
        num_i = 0

        # Decay LR by 10x every 10 epochs while epoch < 50.
        if (epoch + 1) % 10 == 0 and epoch < 50:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.1
                # BUG FIX: the original printed param_group['lr'] * 0.1 AFTER
                # already decaying, i.e. a value 100x smaller than the real LR.
                print('NOTE!!! Learn rate is changed to ' +
                      str(param_group['lr']))

        for ii, (imgs, true_masks) in enumerate(train_data):
            num_i += 1
            processed_batch += 1
            imgs = Variable(imgs)
            true_masks = Variable(true_masks)
            if use_iou:
                # mIoU loss needs one-hot (two-channel) targets.
                true_masks_miou = Variable(to_one_hot(true_masks.long(), 2))
            if use_focal or use_CE:
                # classification losses need integer class targets
                true_masks = true_masks.long()
            if gpu:
                imgs = imgs.cuda()
                true_masks = true_masks.cuda()
                if use_iou:
                    true_masks_miou = true_masks_miou.cuda()

            optimizer.zero_grad()
            masks_pred = net(imgs)

            if use_focal or use_CE:
                # classification-style loss on flattened per-pixel scores
                if use_CE:
                    masks_pred = masks_pred.contiguous().view(
                        masks_pred.size(0), masks_pred.size(1), -1)
                    masks_pred = masks_pred.transpose(1, 2)
                    masks_pred = masks_pred.contiguous().view(
                        -1, masks_pred.size(2)).squeeze()
                    true_masks = true_masks.contiguous().view(
                        true_masks.size(0), true_masks.size(1), -1)
                    true_masks = true_masks.transpose(1, 2)
                    true_masks = true_masks.contiguous().view(
                        -1, true_masks.size(2)).squeeze()
                loss = criterion(masks_pred, true_masks)
            elif use_dice:
                loss = criterion(masks_pred, true_masks)
            elif use_iou:
                # combine BCE and mIoU loss; the mIoU input must be
                # two-channel (background, foreground)
                channel0 = 1 - masks_pred
                masks_pred_iou = torch.cat((channel0, masks_pred), dim=1)
                masks_pred = torch.sigmoid(masks_pred)
                masks_probs_flat = masks_pred.view(-1)
                true_masks_flat = true_masks.view(-1)
                loss1 = criterion1(masks_probs_flat, true_masks_flat)
                loss2 = criterion2(masks_pred_iou, true_masks_miou)
                loss = loss1.div(2) + loss2.div(2)
            else:
                masks_pred = torch.sigmoid(masks_pred)
                masks_probs_flat = masks_pred.view(-1)
                # BUG FIX: the original computed
                # criterion(masks_probs_flat, masks_probs_flat) — BCE of the
                # prediction against itself — so this branch never trained
                # against the ground truth.
                loss = criterion(masks_probs_flat, true_masks.view(-1))

            # .item() replaces the long-deprecated loss.data[0]
            epoch_loss += loss.item()
            writer.add_scalar('loss', loss.item(), processed_batch)
            loss.backward()
            optimizer.step()

        print('Epoch finished ! Loss: {}'.format(epoch_loss / num_i))
        writer.add_scalar('train_loss_epoch', epoch_loss / num_i, epoch + 1)

        # evaluate with the Jaccard (IoU) index
        net.eval()
        val_score = calcul_iou_for_focal(net, test_data, gpu)
        print('Validation jaccard_similarity_score is : {}'.format(val_score))
        writer.add_scalar('val_iou', val_score, epoch + 1)
        net.train()

        if save_cp and val_score > 0.90:
            torch.save(
                net.state_dict(),
                dir_checkpoint + 'CP{}_deeper_SE_{:.4}.pth'.format(
                    epoch + 1, val_score))
            print('Checkpoint {} saved !'.format(epoch + 1))
def train_net(args):
    """Train an ArcFace model with LR-decay-on-plateau and early stop.

    Validates with MegaFace accuracy every 5th epoch; after 2 consecutive
    non-improving evaluations it reloads the best checkpoint and halves the
    LR, and it terminates after 10 evaluations without improvement.  Uses
    module-level ``device``/``num_workers`` and the helpers
    ``train``/``megaface_test``/``adjust_learning_rate``/``save_checkpoint``.
    """
    # Fixed seeds for reproducibility.
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = float('-inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # Fresh start: select the backbone by name.
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        elif args.network == 'mobile':
            from mobilenet_v2 import MobileNetV2
            model = MobileNetV2()
        else:
            raise TypeError('network {} is not supported.'.format(
                args.network))
        metric_fc = ArcMarginModel(args)

        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                        lr=args.lr,
                                        momentum=args.mom,
                                        weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.Adam([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)
    else:
        # Resume: the checkpoint stores whole objects, not state dicts.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    # Wrap for multi-GPU; applied on both fresh and resumed paths here.
    model = nn.DataParallel(model)
    metric_fc = nn.DataParallel(metric_fc)

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    # Custom dataloaders
    train_dataset = ArcFaceDataset('train')
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=num_workers)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        # Decay learning rate if there is no improvement for 2 consecutive
        # epochs, and terminate training after 10.
        if epochs_since_improvement == 10:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 2 == 0:
            # Reload the best checkpoint and halve the learning rate.
            checkpoint = 'BEST_checkpoint.tar'
            checkpoint = torch.load(checkpoint)
            model = checkpoint['model']
            metric_fc = checkpoint['metric_fc']
            optimizer = checkpoint['optimizer']
            adjust_learning_rate(optimizer, 0.5)

        # One epoch's training
        train_loss, train_top1_accs = train(train_loader=train_loader,
                                            model=model,
                                            metric_fc=metric_fc,
                                            criterion=criterion,
                                            optimizer=optimizer,
                                            epoch=epoch)
        lr = optimizer.param_groups[0]['lr']
        print('\nCurrent effective learning rate: {}\n'.format(lr))
        # print('Step num: {}\n'.format(optimizer.step_num))

        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_accuracy', train_top1_accs, epoch)
        writer.add_scalar('model/learning_rate', lr, epoch)

        # Validate only every 5th epoch; improvement bookkeeping and
        # checkpointing happen on those epochs only.
        if epoch % 5 == 0:
            # One epoch's validation (higher MegaFace accuracy is better).
            megaface_acc = megaface_test(model)
            writer.add_scalar('model/megaface_accuracy', megaface_acc, epoch)

            # Check if there was an improvement
            is_best = megaface_acc > best_acc
            best_acc = max(megaface_acc, best_acc)
            if not is_best:
                epochs_since_improvement += 1
                print("\nEpochs since last improvement: %d\n" %
                      (epochs_since_improvement, ))
            else:
                epochs_since_improvement = 0

            # Save checkpoint
            save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                            optimizer, best_acc, is_best)
def get_focal_loss(classifier): print("==> Using Focal Loss.....") classifier.writer.add_text('Info', "Using Focal Loss ") return FocalLoss(gamma)
def main(args, GAMMA, pretrain_model_path=None):
    """Train a PointNet / PointNet++ semantic-segmentation model.

    Args:
        args: parsed CLI namespace (consulted only when ``USE_CLI`` is set;
            otherwise the module-level constants are used).
        GAMMA: focal-loss gamma, also keyed into the experiment dir name.
        pretrain_model_path: optional state-dict path to warm-start from.

    Side effects: creates experiment/checkpoint/log directories, writes a
    training log, saves checkpoints on accuracy improvement, and writes a
    loss-vs-steps plot under the experiment's ``graphs`` directory.
    """
    # Load the numeric-label -> semantic-class mapping.
    class_dict_fname = F_CLASS_DICT_PKL
    print(class_dict_fname)
    with open(class_dict_fname, "rb") as f:
        class_dict, _ = pickle.load(f)
    print("CLASS DICT: {}".format(class_dict))

    # Use to get numeric classes --> semantic classes
    seg_classes = class_dict
    seg_label_to_cat = {}
    print(seg_label_to_cat)
    for i, cat in enumerate(seg_classes.values()):
        seg_label_to_cat[i] = cat
    print('SEG LABEL', seg_label_to_cat)

    # Inverse-frequency class weights, clipped at THRESHOLD.
    with open(F_CLASS_WEIGHTS_PKL, "rb") as f:
        class_weights = pickle.load(f)
    print('SEG CLASSES', seg_classes)
    COUNTS = np.array([class_weights[key] for key in class_weights])
    weight_normalizer = np.max(COUNTS)
    # Classes with zero count get weight 0 (they can never occur).
    weights = [weight_normalizer / count if count != 0 else 0
               for count in COUNTS]
    WEIGHTS_NP = np.array(weights)
    WEIGHTS_NP[WEIGHTS_NP > THRESHOLD] = THRESHOLD
    print("WEIGHTS ARE: {}".format(WEIGHTS_NP))
    weights = torch.from_numpy(WEIGHTS_NP.astype('float32'))

    # Resolve hyper-parameters from CLI args or module constants
    # (duplicate multi_gpu/batchsize assignments from the original removed).
    if USE_CLI:
        gpu = args.gpu
        multi_gpu = args.multi_gpu
        batch_size = args.batch_size
        model_name = args.model_name
        optimizer = args.optimizer
        learning_rate = args.learning_rate
        pretrain = args.pretrain
        batchsize = args.batchsize
        decay_rate = args.decay_rate
        epochs = args.epochs
    else:
        gpu = GPU
        multi_gpu = MULTI_GPU
        batch_size = BATCH_SIZE
        model_name = MODEL_NAME
        optimizer = OPTIMIZER
        learning_rate = LEARNING_RATE
        pretrain = PRETRAIN
        batchsize = BATCH_SIZE
        decay_rate = DECAY_RATE
        epochs = EPOCHS
    os.environ[
        "CUDA_VISIBLE_DEVICES"] = gpu if multi_gpu is None else '0,1,2,3'

    '''CREATE DIR'''
    experiment_dir = Path('./experiment/{}'.format(
        EXPERIMENT_HEADER.format(GAMMA)))
    experiment_dir.mkdir(exist_ok=True)
    file_dir = Path(
        str(experiment_dir) + '/%sSemSeg-' % model_name +
        str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')))
    file_dir.mkdir(exist_ok=True)
    checkpoints_dir = file_dir.joinpath('checkpoints/')
    checkpoints_dir.mkdir(exist_ok=True)
    log_dir = file_dir.joinpath('logs/')
    log_dir.mkdir(exist_ok=True)

    '''LOG'''
    if USE_CLI:
        args = parse_args()
        logger = logging.getLogger(model_name)
    else:
        logger = logging.getLogger(MODEL_NAME)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if USE_CLI:
        file_handler = logging.FileHandler(
            str(log_dir) + '/train_%s_semseg.txt' % args.model_name)
    else:
        file_handler = logging.FileHandler(
            str(log_dir) + '/train_%s_semseg.txt' % MODEL_NAME)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.info(
        '---------------------------------------------------TRANING---------------------------------------------------'
    )
    if USE_CLI:
        logger.info('PARAMETER ...')
        logger.info(args)

    print('Load data...')
    if USE_CLI:
        f_in = args.data_path
    else:
        f_in = DATA_PATH
    with open(f_in, "rb") as f:
        DATA = pickle.load(f)

    # Deterministic 80/20 train/test split over the sample indices.
    random_seed = 42
    indices = [i for i in range(len(list(DATA.keys())))]
    np.random.seed(random_seed)
    np.random.shuffle(indices)
    TEST_SPLIT = 0.2
    test_index = int(np.floor(TEST_SPLIT * len(list(DATA.keys()))))
    print("val index is: {}".format(test_index))
    train_indices, test_indices = indices[test_index:], indices[:test_index]
    if USE_CLI:
        print("LEN TRAIN: {}, LEN TEST: {}, EPOCHS: {}, OPTIMIZER: {}, DECAY_RATE: {}, LEARNING RATE: {}, DATA PATH: {}"
              .format(len(train_indices), len(test_indices), epochs,
                      args.optimizer, args.decay_rate, args.learning_rate,
                      args.data_path))
    else:
        print("LEN TRAIN: {}, LEN TEST: {}, EPOCHS: {}, OPTIMIZER: {}, DECAY_RATE: {}, LEARNING RATE: {}, DATA PATH: {}"
              .format(len(train_indices), len(test_indices), EPOCHS,
                      OPTIMIZER, DECAY_RATE, LEARNING_RATE, DATA_PATH))

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    test_sampler = SubsetRandomSampler(test_indices)
    print("INTERSECTION OF TRAIN/TEST (should be 0): {}".format(
        len(set(train_indices).intersection(set(test_indices)))))

    # Training dataset
    dataset = A2D2DataLoader(DATA)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batchsize,
                                             shuffle=False,
                                             sampler=train_sampler,
                                             collate_fn=collate_fn)
    # Test dataset
    test_dataset = A2D2DataLoader(DATA)
    testdataloader = torch.utils.data.DataLoader(test_dataset,
                                                 batch_size=batchsize,
                                                 shuffle=False,
                                                 sampler=test_sampler,
                                                 collate_fn=collate_fn)

    num_classes = NUM_CLASSES
    blue = lambda x: '\033[94m' + x + '\033[0m'
    model = PointNet2SemSeg(
        num_classes) if model_name == 'pointnet2' else PointNetSeg(
            num_classes, feature_transform=True, semseg=True)

    if pretrain_model_path is not None:
        model.load_state_dict(torch.load(pretrain_model_path))
        print('load model %s' % pretrain_model_path)
        logger.info('load model %s' % pretrain_model_path)
    else:
        print('Training from scratch')
        logger.info('Training from scratch')
    # BUG FIX: the original referenced the undefined name ``pretrain_var``
    # (the assignment was commented out), raising NameError whenever
    # pretrain was set; parse the epoch out of the pretrain path itself.
    init_epoch = int(pretrain[-14:-11]) if pretrain is not None else 0

    if optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    elif optimizer == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=decay_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=20,
                                                gamma=0.5)
    LEARNING_RATE_CLIP = 1e-5

    '''GPU selection and multi-GPU'''
    if multi_gpu is not None:
        device_ids = [int(x) for x in multi_gpu.split(',')]
        torch.backends.cudnn.benchmark = True
        model.cuda(device_ids[0])
        model = torch.nn.DataParallel(model, device_ids=device_ids)
    else:
        model.cuda()

    history = defaultdict(lambda: list())
    best_acc = 0
    best_meaniou = 0
    graph_losses = []
    steps = []
    step = 0

    # Hoisted out of the batch loop: the original moved ``weights`` to the
    # GPU and rebuilt the FocalLoss criterion on every iteration.
    weights = weights.cuda()
    criterion = FocalLoss(gamma=GAMMA)

    print("NUMBER OF EPOCHS IS: {}".format(epochs))
    for epoch in range(epochs):
        scheduler.step()
        # Clip the scheduled LR from below and write it back.
        lr = max(optimizer.param_groups[0]['lr'], LEARNING_RATE_CLIP)
        print('Learning rate:%f' % lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        counter = 0

        # Init confusion matrix
        if USE_CONMAT:
            conf_matrix = torch.zeros(NUM_CLASSES, NUM_CLASSES)

        # Re-enter train mode (test_semseg puts the model in eval mode at
        # the end of each epoch).
        model = model.train()
        for points, targets in tqdm(dataloader):
            points, targets = Variable(points.float()), Variable(
                targets.long())
            points = points.transpose(2, 1)
            points, targets = points.cuda(), targets.cuda()
            optimizer.zero_grad()

            if model_name == 'pointnet':
                pred, trans_feat = model(points)
            else:
                # Channels: xyz_norm (first 3) | rgb_norm (second three)
                pred = model(points[:, :3, :], points[:, 3:, :])
            if USE_CONMAT:
                conf_matrix = confusion_matrix(pred, targets, conf_matrix)

            pred = pred.contiguous().view(-1, num_classes)
            targets = targets.view(-1, 1)[:, 0]
            loss = criterion(pred, targets)
            #loss = F.nll_loss(pred, targets, weight=weights)  # class-weighted alternative
            if model_name == 'pointnet':
                loss += feature_transform_reguliarzer(trans_feat) * 0.001

            graph_losses.append(loss.cpu().data.numpy())
            steps.append(step)
            if counter % 100 == 0:
                print("LOSS IS: {}".format(loss.cpu().data.numpy()))
                history['loss'].append(loss.cpu().data.numpy())
            loss.backward()
            optimizer.step()
            counter += 1
            step += 1

        if USE_CONMAT:
            print("CONFUSION MATRIX: \n {}".format(conf_matrix))

        pointnet2 = model_name == 'pointnet2'
        test_metrics, test_hist_acc, cat_mean_iou = test_semseg(
            model.eval(), testdataloader, seg_label_to_cat,
            num_classes=num_classes, pointnet2=pointnet2)
        mean_iou = np.mean(cat_mean_iou)
        print('Epoch %d %s accuracy: %f meanIOU: %f' %
              (epoch, blue('test'), test_metrics['accuracy'], mean_iou))
        logger.info('Epoch %d %s accuracy: %f meanIOU: %f' %
                    (epoch, 'test', test_metrics['accuracy'], mean_iou))

        if test_metrics['accuracy'] > best_acc:
            best_acc = test_metrics['accuracy']
            print("HERE")
            save_path = '%s/%s_%.3d_%.4f.pth' % (checkpoints_dir, model_name,
                                                 epoch, best_acc)
            torch.save(model.state_dict(), save_path)
            logger.info(cat_mean_iou)
            logger.info('Save model..')
            print('Save model..')
            print(cat_mean_iou)
        if mean_iou > best_meaniou:
            best_meaniou = mean_iou
        print('Best accuracy is: %.5f' % best_acc)
        logger.info('Best accuracy is: %.5f' % best_acc)
        print('Best meanIOU is: %.5f' % best_meaniou)
        logger.info('Best meanIOU is: %.5f' % best_meaniou)
        if USE_CONMAT:
            logger.info('Confusion matrix is: \n {}'.format(conf_matrix))

    # Plot loss vs. steps
    plt.plot(steps, graph_losses)
    plt.xlabel("Batched Steps (Batch Size = {}".format(batch_size))
    plt.ylabel("Multiclass NLL Loss")
    plt.title("NLL Loss vs. Number of Batched Steps")

    # Make directory for loss and other plots
    graphs_dir = os.path.join(experiment_dir, "graphs")
    os.makedirs(graphs_dir, exist_ok=True)

    # Save and close figure
    plt.savefig(os.path.join(graphs_dir, "losses.png"))
    plt.clf()
total_epoch_loss = running_loss / dataset_sizes[phase] if phase == 'train': cls_loss_values.append(total_cls_loss) reg_loss_values.append(total_reg_loss) tot_loss_values.append(total_epoch_loss) if phase == 'valid': val_cls_loss_values.append(total_cls_loss) val_reg_loss_values.append(total_reg_loss) val_tot_loss_values.append(total_epoch_loss) print('{} rpn_cls Loss: {:.4f} {} rpn_reg Loss: {:.4f} {} Total Loss: {:.4f}'.format(phase, total_cls_loss,phase, total_reg_loss,phase, total_epoch_loss)) # deep copy the model # if phase == 'train' and total_epoch_loss < best_loss: # best_loss = total_epoch_loss # model_wts = copy.deepcopy(model.state_dict()) print() save_loss_graphs(cls_loss_values,reg_loss_values,tot_loss_values,val_cls_loss_values,val_reg_loss_values,val_tot_loss_values,start_epoch,num_epochs, exp) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) # print('Best Loss: {:4f}'.format(best_loss)) torch.save(model,'saved_models/'+str(exp)+'/rpn_'+str(start_epoch+num_epochs-1)+'.pth') fl = FocalLoss() train_model(rpn, fl, 0, parser.e, parser.exp, torch.cuda.is_available())
checkpoint = torch.load('%s/ckpt.t7' % ckpt_path) net = checkpoint['net'] best_acc = checkpoint['acc'] start_epoch = checkpoint['epoch'] else: net = VGG('VGG11', 4) #from torchvision.models.vgg import * #net = vgg11_bn(num_classes=4) tensorboard_logger.configure(log_path) if use_cuda: net.cuda() #net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count())) cudnn.benchmark = True #criterion = nn.CrossEntropyLoss() criterion = FocalLoss(4, use_cuda) optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.5, 0.999), weight_decay=1e-6) def train(epoch): global optimizer print('\nEpoch: %d' % epoch) net.train() train_loss = 0 correct = 0 total = 0 for batch_idx, (inputs, targets) in enumerate(trainloader): #if batch_idx > 10:
}, { 'params': param_groups[2], 'lr': 10 * args.lr, 'weight_decay': args.weight_decay }, { 'params': param_groups[3], 'lr': 20 * args.lr, 'weight_decay': 0 }], lr=args.lr, weight_decay=args.weight_decay, max_step=len(train_loader) * args.epochs) if args.fl: ## focal loss criterion = FocalLoss() else: ## frequency-based weighting class_weight = 45117 / torch.tensor([ 30160, 2, 9757, 1004, 4, 205, 833, 4, 252, 21, 7, 1366, 1323, 83, 56, 26, 1, 13 ]).cuda() criterion = nn.CrossEntropyLoss(weight=class_weight) for current_epoch in range(model.epochs): model.epoch = current_epoch print("Training epoch...") model.train_epoch(train_loader, optimizer, criterion)
def main():
    """Train a DenseNet image classifier with focal loss.

    Parses CLI args, builds train/val loaders from image folders, optionally
    resumes from a checkpoint, trains for ``args.epochs`` epochs and keeps
    the best top-1 precision.  Uses the module-level ``parser``, ``writer``,
    ``train_dirs``, ``val_dirs`` and ``best_prec1`` globals.
    """
    global args, best_prec1
    args = parser.parse_args()
    # if args.tensorboard: configure("runs/%s"%(args.name))

    # Per-channel normalization (CIFAR-style statistics).
    normalize = transforms.Normalize(
        mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
        std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

    # Data augmentation (training only).
    if args.augment:
        transform_train = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        transform_train = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    # Deterministic eval pipeline.  BUG FIX: validation previously reused
    # transform_train, so val accuracy was measured on randomly
    # cropped/flipped images; this restores the transform the original
    # left commented out.
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    # load Data
    train_dataset = datasets.ImageFolder(train_dirs, transform_train)
    val_dataset = datasets.ImageFolder(val_dirs, transform_test)
    kwargs = {'num_workers': 0, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)
    # shuffle=False: ordering is irrelevant to validation metrics and a
    # fixed order makes runs comparable.
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             **kwargs)

    # create model (3 output classes)
    model = dn.DenseNet3(args.layers,
                         3,
                         args.growth,
                         reduction=args.reduce,
                         bottleneck=args.bottleneck,
                         dropRate=args.droprate,
                         small_inputs=False)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    model = model.cuda()

    # for network visualization in tensorboard
    dummy_input = torch.rand(20, 3, 200, 200).cuda()
    writer.add_graph(model, (dummy_input, ))

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = FocalLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                nesterov=True,
                                weight_decay=args.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
            }, is_best)
    print('Best accuracy: ', best_prec1)
momentum=0.85, nesterov=True) optimizer = adam # selected optimizer # learning rate scheduler # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.4, patience=20, verbose=True,min_lr=1e-5) # weighted cross entropy loss function for training counter = Counter(train_fold_labels.numpy().T.reshape(1, -1)[0, :].tolist()) mw = max([counter[x] for x in range(NUM_CLASSES)]) weight = torch.tensor([mw / counter[x] for x in range(NUM_CLASSES)]).to(device) # print ("Weights: ", [mw/counter[x] for x in range(NUM_CLASSES)]) loss_fn = torch.nn.CrossEntropyLoss(weight=weight) loss_fn = FocalLoss(class_num=2, gamma=2, alpha=weight) # # weighted cross entropy loss for validation dataset # counter = Counter(valid_fold_labels.numpy().T.reshape(1,-1)[0,:].tolist()) # mw = max([counter[x] for x in range(NUM_CLASSES)]) # weight = torch.tensor([mw/counter[x] for x in range(NUM_CLASSES)]).to(device) # # valid_loss_fn = torch.nn.CrossEntropyLoss(weight=weight) # valid_loss_fn = FocalLoss(class_num=2, gamma=1, alpha=weight) # scale all samples according to training set scaler = preprocessing.MinMaxScaler().fit(train_fold_data.numpy()) train_fold_data_normalized = torch.from_numpy( scaler.transform(train_fold_data.numpy())).float().to(device) test_fold_data_normalized = torch.from_numpy( scaler.transform(test_fold_data.numpy())).float().to(device) # valid_fold_data_normalized = torch.from_numpy(scaler.transform(valid_fold_data.numpy())).float().to(device)
def train_net(args):
    """Train a face-recognition backbone with an ArcFace margin head.

    Builds (or resumes) a ResNet backbone plus ArcMarginModel, trains for
    args.end_epoch epochs with a MultiStepLR schedule, evaluates each epoch on
    the dataset named by args.eval_ds, and saves a checkpoint every epoch.

    Args:
        args: parsed CLI namespace; fields used here include network,
            pretrained, optimizer, lr, mom, weight_decay, checkpoint,
            end_epoch, eval_ds, batch_size, focal_loss, gamma.
    """
    # Fixed seeds for reproducibility.
    torch.manual_seed(7)
    np.random.seed(7)
    checkpoint = args.checkpoint
    start_epoch = 0
    best_acc = float('-inf')
    writer = SummaryWriter()
    epochs_since_improvement = 0

    # Initialize / load checkpoint
    if checkpoint is None:
        # Select the backbone architecture by name.
        if args.network == 'r18':
            model = resnet18(args)
        elif args.network == 'r34':
            model = resnet34(args)
        elif args.network == 'r50':
            model = resnet50(args)
        elif args.network == 'r101':
            model = resnet101(args)
        elif args.network == 'r152':
            model = resnet152(args)
        else:
            raise TypeError('network {} is not supported.'.format(
                args.network))

        # NOTE(review): pretrained weights are loaded from a hard-coded local
        # file before the DataParallel wrap — confirm the file exists on the
        # training host.
        if args.pretrained:
            model.load_state_dict(torch.load('insight-face-v3.pt'))

        model = nn.DataParallel(model)
        metric_fc = ArcMarginModel(args)
        metric_fc = nn.DataParallel(metric_fc)

        # One optimizer drives both the backbone and the margin head.
        if args.optimizer == 'sgd':
            optimizer = torch.optim.SGD([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                        lr=args.lr,
                                        momentum=args.mom,
                                        nesterov=True,
                                        weight_decay=args.weight_decay)
        else:
            optimizer = torch.optim.Adam([{
                'params': model.parameters()
            }, {
                'params': metric_fc.parameters()
            }],
                                         lr=args.lr,
                                         weight_decay=args.weight_decay)

    else:
        # Resume: the checkpoint stores whole objects (model/head/optimizer),
        # not just state dicts.
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        model = checkpoint['model']
        metric_fc = checkpoint['metric_fc']
        optimizer = checkpoint['optimizer']

    # Move to GPU, if available
    model = model.to(device)
    metric_fc = metric_fc.to(device)

    # Loss function
    if args.focal_loss:
        criterion = FocalLoss(gamma=args.gamma)
    else:
        criterion = nn.CrossEntropyLoss()

    # Custom dataloaders
    # train_dataset = ArcFaceDataset('train')
    # train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True,
    #                                            num_workers=num_workers)
    # Samples arrive pre-batched in groups of img_batch_size, so the loader
    # batch size is divided accordingly and a custom collate_fn reassembles.
    train_dataset = ArcFaceDatasetBatched('train', img_batch_size)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size // img_batch_size,
        shuffle=True,
        num_workers=num_workers,
        collate_fn=batched_collate_fn)

    # Step the LR down by 10x at epochs 8, 16, 24 and 32.
    scheduler = MultiStepLR(optimizer, milestones=[8, 16, 24, 32], gamma=0.1)

    # Epochs
    for epoch in range(start_epoch, args.end_epoch):
        lr = optimizer.param_groups[0]['lr']
        logger.info('\nCurrent effective learning rate: {}\n'.format(lr))
        # print('Step num: {}\n'.format(optimizer.step_num))
        writer.add_scalar('model/learning_rate', lr, epoch)

        # One epoch's training
        train_loss, train_top1_accs = train(train_loader=train_loader,
                                            model=model,
                                            metric_fc=metric_fc,
                                            criterion=criterion,
                                            optimizer=optimizer,
                                            epoch=epoch)
        writer.add_scalar('model/train_loss', train_loss, epoch)
        writer.add_scalar('model/train_accuracy', train_top1_accs, epoch)

        # NOTE(review): passing the epoch to scheduler.step() is deprecated in
        # recent PyTorch; a bare scheduler.step() per epoch is the modern form.
        scheduler.step(epoch)

        # Per-epoch evaluation; accuracy stays -1 when no eval set is chosen,
        # so "best" tracking degenerates to saving the first epoch only.
        if args.eval_ds == "LFW":
            from lfw_eval import lfw_test

            # One epochs's validata
            accuracy, threshold = lfw_test(model)
        elif args.eval_ds == "Megaface":
            from megaface_eval import megaface_test

            accuracy = megaface_test(model)
        else:
            accuracy = -1

        writer.add_scalar('model/evaluation_accuracy', accuracy, epoch)

        # Check if there was an improvement
        is_best = accuracy > best_acc
        best_acc = max(accuracy, best_acc)
        if not is_best:
            epochs_since_improvement += 1
            logger.info("\nEpochs since last improvement: %d\n" %
                        (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0

        # Save checkpoint
        save_checkpoint(epoch, epochs_since_improvement, model, metric_fc,
                        optimizer, best_acc, is_best, scheduler)
def run(train_sets, valid_sets, idx, save_dr):
    """Fine-tune InceptionResNetV2 on one image-classification task.

    Args:
        train_sets: directory of training images in ImageFolder layout.
        valid_sets: directory of validation images in ImageFolder layout.
        idx: task name; used for the saved ``.pth``/``.json`` file names.
        save_dr: directory where the best weights and the state JSON go.

    Side effects: trains for up to 40 epochs with focal loss, saving the model
    weights and a JSON snapshot of ``state`` whenever the mean of train and
    test accuracy improves; stops early once test accuracy reaches 1.0 after
    epoch 10.
    """
    batch_size = 8
    imagenet_data = ImageFolder(train_sets, transform=data_transforms['train'])
    test_data = ImageFolder(valid_sets, transform=data_transforms['val'])
    data_loader = DataLoader(imagenet_data, batch_size=batch_size,
                             shuffle=True)
    test_data_loader = DataLoader(test_data, batch_size=1, shuffle=True)
    cls_num = len(imagenet_data.class_to_idx)

    # Load ImageNet-pretrained weights, then swap in a task-sized classifier.
    model = inceptionresnetv2(num_classes=1001, pretrained=None)
    model.load_state_dict(
        torch.load('/home/dsl/all_check/inceptionresnetv2-520b38e4.pth'),
        strict=True)
    model.last_linear = nn.Linear(1536, cls_num)
    model.cuda()

    state = {'learning_rate': 0.01, 'momentum': 0.9, 'decay': 0.0005}
    # optimizer = torch.optim.SGD(model.parameters(), state['learning_rate'], momentum=state['momentum'],
    #                             weight_decay=state['decay'], nesterov=True)
    optimizer = torch.optim.Adam(model.parameters(),
                                 state['learning_rate'],
                                 weight_decay=state['decay'],
                                 amsgrad=True)
    state['label_ix'] = imagenet_data.class_to_idx
    state['cls_name'] = idx
    state['best_accuracy'] = 0

    # BUGFIX: the scheduler is stepped with an *accuracy* metric below, so it
    # needs mode='max'; the previous default mode='min' reduced the LR
    # whenever accuracy stopped *decreasing*, i.e. exactly when training went
    # well.
    sch = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                         mode='max',
                                         factor=0.9,
                                         patience=3)
    focal_loss = FocalLoss(gamma=2)
    focal_loss.cuda()

    def train():
        # One pass over the training set; records accuracy and an exponential
        # moving average of the focal loss in `state`.
        model.train()
        loss_avg = 0.0
        progress = ProgressBar()
        correct = 0
        for (data, target) in progress(data_loader):
            data, target = torch.autograd.Variable(
                data.cuda()), torch.autograd.Variable(target.cuda())
            output = model(data)
            pred = output.data.max(1)[1]
            correct += float(pred.eq(target.data).sum())
            optimizer.zero_grad()
            loss = focal_loss(output, target)
            loss.backward()
            optimizer.step()
            # Heavily-weighted EMA: 80% current batch, 20% history.
            loss_avg = loss_avg * 0.2 + float(loss) * 0.8
        print(correct, len(data_loader.dataset), loss_avg)
        state['train_accuracy'] = correct / len(data_loader.dataset)
        state['train_loss'] = loss_avg

    def test():
        # Evaluate on the validation set (batch size 1, so the summed loss
        # divided by the dataset length is a true per-sample mean).
        # NOTE(review): evaluation uses plain cross-entropy while training
        # uses focal loss — presumably intentional; confirm.
        with torch.no_grad():
            model.eval()
            loss_avg = 0.0
            correct = 0
            for (data, target) in test_data_loader:
                data, target = torch.autograd.Variable(
                    data.cuda()), torch.autograd.Variable(target.cuda())
                output = model(data)
                loss = F.cross_entropy(output, target)
                pred = output.data.max(1)[1]
                correct += float(pred.eq(target.data).sum())
                loss_avg += float(loss)
            state['test_loss'] = loss_avg / len(test_data_loader.dataset)
            state['test_accuracy'] = correct / len(
                test_data_loader.dataset)
        print(state['test_accuracy'])

    best_accuracy = 0.0
    for epoch in range(40):
        state['epoch'] = epoch
        train()
        test()
        sch.step(state['train_accuracy'])
        # An epoch's score is the mean of train and test accuracy.
        best_accuracy = (state['train_accuracy'] +
                         state['test_accuracy']) / 2
        if best_accuracy > state['best_accuracy']:
            state['best_accuracy'] = best_accuracy
            torch.save(model.state_dict(),
                       os.path.join(save_dr, idx + '.pth'))
            with open(os.path.join(save_dr, idx + '.json'), 'w') as f:
                f.write(json.dumps(state))
                f.flush()
            print(state)
        print("Best accuracy: %f" % state['best_accuracy'])
        # Early stop once the validation set is perfectly classified.
        if state['test_accuracy'] == 1 and epoch > 10:
            break