def train(images_folder,
          num_refinement_stages,
          base_lr,
          batch_size,
          batches_per_iter,
          num_workers,
          checkpoint_path,
          weights_only,
          from_mobilenet,
          checkpoints_folder,
          log_after,
          checkpoint_after,
          num_kps,
          finetune=False):
    """Train the single-person pose network on the LIP or the anime dataset.

    The dataset flavour is derived from the last path component of
    ``images_folder``: a directory named ``data_lip`` selects the LIP
    datasets ("real"), anything else selects the anime datasets.

    Each epoch runs gradient-accumulated training, writes a "last epoch"
    snapshot (plus a numbered one every ``checkpoint_after`` epochs), runs
    validation and feeds ``100 - avg_pckh`` to the plateau scheduler.

    Args:
        images_folder: Dataset root directory passed to the dataset classes.
        num_refinement_stages: Number of refinement stages of the network;
            one loss is computed per stage output (initial + refinements).
        base_lr: Base learning rate; several parameter groups use multiples
            of it.
        batch_size: Batch size of the training ``DataLoader``.
        batches_per_iter: Number of batches whose gradients are accumulated
            before a single optimizer step.
        num_workers: Worker count of the training ``DataLoader``.
        checkpoint_path: Checkpoint to load before training; falsy to skip.
        weights_only: Only consulted when ``from_mobilenet`` is false. When
            false, optimizer, scheduler, iteration counter and epoch are
            restored from the checkpoint as well.
        from_mobilenet: When true the checkpoint is loaded with
            ``load_from_mobilenet`` instead of ``load_state``.
        checkpoints_folder: Directory for the log, the snapshots and the
            validation predictions file.
        log_after: Number of optimizer steps between two loss log entries.
        checkpoint_after: Number of epochs between two numbered snapshots.
        num_kps: Number of keypoints; the network is built with
            ``num_kps + 1`` heatmaps.
        finetune: When true the backbone (``net.model``) parameter groups are
            left out of the optimizer.
    """
    # Function-scope import so the block does not depend on file-level imports.
    import os

    net = SinglePersonPoseEstimationWithMobileNet(
        num_refinement_stages=num_refinement_stages,
        num_heatmaps=num_kps + 1).cuda()
    stride = 8
    sigma = 7
    # num of kps is default 16, +bg=17
    # the img size is arbitrary, flip may not need

    # normpath drops trailing separators, so "path/data_lip/" is still
    # recognised as the LIP dataset instead of silently falling back to anime.
    dataset_dir_name = os.path.basename(os.path.normpath(images_folder))
    data_flag = "real" if dataset_dir_name == "data_lip" else "anime"

    train_log = get_logger(checkpoints_folder, cmd_stream=True)

    if data_flag == "real":
        dataset = LipTrainDataset(
            images_folder, stride, sigma,
            transform=transforms.Compose([
                SinglePersonBodyMasking(),
                ChannelPermutation(),
                SinglePersonRotate(pad=(128, 128, 128), max_rotate_degree=40),
                SinglePersonCropPad(pad=(128, 128, 128),
                                    crop_x=256, crop_y=256),
                SinglePersonFlip(),
            ]))
    else:
        # Same augmentations as the LIP pipeline, but without the flip.
        dataset = AnimeTrainDataset(
            images_folder, stride, sigma,
            transform=transforms.Compose([
                SinglePersonBodyMasking(),
                ChannelPermutation(),
                SinglePersonRotate(pad=(128, 128, 128), max_rotate_degree=40),
                SinglePersonCropPad(pad=(128, 128, 128),
                                    crop_x=256, crop_y=256),
            ]))

    # b=32 default
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers)

    # Per-module parameter groups; groups without an explicit 'lr' or
    # 'weight_decay' inherit the optimizer defaults given to Adam below.
    backbone_p = [
        {'params': get_parameters_conv(net.model, 'weight')},
        {'params': get_parameters_conv_depthwise(net.model, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.model, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.model, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
    ]
    cpm_p = [
        {'params': get_parameters_conv(net.cpm, 'weight'), 'lr': base_lr},
        {'params': get_parameters_conv(net.cpm, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
        {'params': get_parameters_conv_depthwise(net.cpm, 'weight'),
         'weight_decay': 0},
    ]
    initial_p = [
        {'params': get_parameters_conv(net.initial_stage, 'weight'),
         'lr': base_lr},
        {'params': get_parameters_conv(net.initial_stage, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
        {'params': get_parameters_bn(net.initial_stage, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.initial_stage, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
    ]
    refine_p = [
        {'params': get_parameters_conv(net.refinement_stages, 'weight'),
         'lr': base_lr * 4},
        {'params': get_parameters_conv(net.refinement_stages, 'bias'),
         'lr': base_lr * 8, 'weight_decay': 0},
        {'params': get_parameters_bn(net.refinement_stages, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.refinement_stages, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
    ]

    opt_p = []
    # TODO modify params needed update above and change the model structure.
    if not finetune:
        opt_p += backbone_p
    opt_p += cpm_p
    opt_p += initial_p
    opt_p += refine_p

    optimizer = optim.Adam(opt_p, lr=base_lr, weight_decay=5e-4)

    num_iter = 0
    current_epoch = 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=5, threshold=1e-2, verbose=True)

    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        if from_mobilenet:
            load_from_mobilenet(net, checkpoint)
        else:
            load_state(net, checkpoint)
            if not weights_only:
                optimizer.load_state_dict(checkpoint['optimizer'])
                scheduler.load_state_dict(checkpoint['scheduler'])
                num_iter = checkpoint['iter']
                # round iterations, to print proper loss when resuming
                num_iter = num_iter // log_after * log_after
                current_epoch = checkpoint['current_epoch'] + 1

    net = DataParallel(net, device_ids=[0])
    net.train()

    num_stages = num_refinement_stages + 1
    for epochId in range(current_epoch, 100):
        train_log.debug('Epoch: {}'.format(epochId))
        net.train()  # validation at the end of the epoch switches to eval
        total_losses = [0] * num_stages  # heatmaps loss per stage
        batch_per_iter_idx = 0
        for batch_data in train_loader:
            # Gradients are cleared only at the start of an accumulation
            # window of `batches_per_iter` batches.
            if batch_per_iter_idx == 0:
                optimizer.zero_grad()

            images = batch_data['image'].cuda()
            keypoint_maps = batch_data['keypoint_maps'].cuda()

            stages_output = net(images)

            # guess to update the init stage + refinement stages
            losses = []
            for loss_idx in range(num_stages):
                stage_loss = l2_loss(stages_output[loss_idx], keypoint_maps,
                                     images.shape[0])
                losses.append(stage_loss)
                total_losses[loss_idx] += stage_loss.item() / batches_per_iter

            # Out-of-place sum so the per-stage tensors are not mutated.
            loss = sum(losses[1:], losses[0]) / batches_per_iter
            loss.backward()

            batch_per_iter_idx += 1
            if batch_per_iter_idx != batches_per_iter:
                continue
            optimizer.step()
            batch_per_iter_idx = 0
            num_iter += 1

            # per `log_after` optimizer steps
            if num_iter % log_after == 0:
                train_log.debug('Iter: {}'.format(num_iter))
                for loss_idx, accumulated in enumerate(total_losses):
                    train_log.debug('stage{}_heatmaps_loss: {}'.format(
                        loss_idx + 1, accumulated / log_after))
                total_losses = [0] * num_stages

        # The same state is written to the rolling "last epoch" file and,
        # every `checkpoint_after` epochs, to a numbered file.
        state = {
            'state_dict': net.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'iter': num_iter,
            'current_epoch': epochId,
        }
        torch.save(state,
                   '{}/checkpoint_last_epoch.pth'.format(checkpoints_folder))
        if (epochId + 1) % checkpoint_after == 0:
            torch.save(state, '{}/checkpoint_epoch_{}.pth'.format(
                checkpoints_folder, epochId))

        train_log.debug('Validation...')
        net.eval()
        eval_num = 1000
        if data_flag == "real":
            val_dataset = LipValDataset(images_folder, eval_num)
        else:
            val_dataset = AnimeValDataset(images_folder, eval_num)
        predictions_name = '{}/val_results.csv'.format(checkpoints_folder)
        evaluate(val_dataset, predictions_name, net, num_kps=num_kps)
        pck = calc_pckh(val_dataset.labels_file_path, predictions_name,
                        eval_num=eval_num)

        val_loss = 100 - pck[-1][-1]  # 100 - avg_pckh
        train_log.debug('Val loss: {}'.format(val_loss))
        scheduler.step(val_loss, epochId)
def train(images_folder, num_refinement_stages, base_lr, batch_size,
          batches_per_iter, num_workers, checkpoint_path, weights_only,
          from_mobilenet, checkpoints_folder, log_after, checkpoint_after):
    """Train the single-person pose network on the CocoSingle dataset.

    Runs epochs ``current_epoch .. 209`` with gradient accumulation over
    ``batches_per_iter`` batches, Adam at ``base_lr`` and a ``MultiStepLR``
    schedule with milestones 170 and 200 (factor 0.1). After every epoch a
    "last epoch" snapshot is written, a numbered snapshot is written every
    ``checkpoint_after`` epochs, and validation is run through ``val``.

    When ``checkpoint_path`` is set the weights are loaded from it (via
    ``load_from_mobilenet`` if ``from_mobilenet`` is true, otherwise via
    ``load_state``); unless ``weights_only`` is true the optimizer,
    scheduler, iteration counter and epoch are restored too.
    """
    augmentations = transforms.Compose([
        HalfBodyTransform(),
        RandomScaleRotate(),
        SinglePersonFlip(
            left_keypoints_indice=CocoSingleTrainDataset.left_keypoints_indice,
            right_keypoints_indice=CocoSingleTrainDataset.right_keypoints_indice),
        SinglePersonRandomAffineTransform(),
        SinglePersonBodyMasking(),
        Normalization(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ChannelPermutation(),
    ])
    dataset = CocoSingleTrainDataset(images_folder, transform=augmentations)

    # The number of heatmaps follows the dataset's keypoint count.
    net = SinglePersonPoseEstimationWithMobileNet(
        num_refinement_stages,
        num_heatmaps=dataset._num_keypoints,
        mode='nearest',
    ).cuda()

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers)

    optimizer = optim.Adam(net.parameters(), lr=base_lr)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [170, 200], 0.1)

    num_iter, current_epoch = 0, 0
    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        if from_mobilenet:
            load_from_mobilenet(net, checkpoint)
        else:
            load_state(net, checkpoint)
            if not weights_only:
                optimizer.load_state_dict(checkpoint['optimizer'])
                scheduler.load_state_dict(checkpoint['scheduler'])
                num_iter = checkpoint['iter']
                current_epoch = checkpoint['current_epoch'] + 1

    net = DataParallel(net)
    net.train()

    stage_count = num_refinement_stages + 1
    for epochId in range(current_epoch, 210):
        print('Epoch: {}'.format(epochId))
        net.train()
        total_losses = [0] * stage_count  # heatmaps loss per stage
        accumulated_batches = 0
        for batch_data in train_loader:
            # Start of an accumulation window: drop the old gradients.
            if accumulated_batches == 0:
                optimizer.zero_grad()

            images = batch_data['image'].float().cuda()
            keypoint_maps = batch_data['keypoint_maps']
            keypoints = batch_data['keypoints']
            stages_output = net(images)

            losses = []
            for stage_idx in range(stage_count):
                # Every third value starting at index 2 of each keypoints row
                # (presumably the per-keypoint visibility flag; confirm
                # against the dataset).
                third_components = keypoints[:, 2::3].view(
                    keypoints.shape[0], -1, 1)
                stage_loss = mse_loss(stages_output[stage_idx], keypoint_maps,
                                      third_components)
                losses.append(stage_loss)
                total_losses[stage_idx] += stage_loss.item() / batches_per_iter

            loss = sum(losses)
            loss /= batches_per_iter
            loss.backward()

            accumulated_batches += 1
            if accumulated_batches != batches_per_iter:
                continue
            optimizer.step()
            accumulated_batches = 0
            num_iter += 1

            if num_iter % log_after == 0:
                print('Iter: {}'.format(num_iter))
                for stage_idx, running in enumerate(total_losses):
                    print('stage{}_heatmaps_loss: {}'.format(
                        stage_idx + 1, running / log_after))
                total_losses = [0] * stage_count

        # One state dict, written to the rolling file and optionally to a
        # numbered file.
        state = {
            'state_dict': net.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'iter': num_iter,
            'current_epoch': epochId,
        }
        torch.save(state,
                   '{}/checkpoint_last_epoch.pth'.format(checkpoints_folder))
        if (epochId + 1) % checkpoint_after == 0:
            torch.save(state, '{}/checkpoint_epoch_{}.pth'.format(
                checkpoints_folder, epochId))

        print('Validation...')
        net.eval()
        val_transform = transforms.Compose([
            SinglePersonRandomAffineTransform(mode='val'),
            Normalization(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225]),
        ])
        val_dataset = CocoSingleValDataset(images_folder,
                                           transform=val_transform)
        predictions_name = '{}/val_results2.json'.format(checkpoints_folder)
        val_loss = val(net, val_dataset, predictions_name, 'CocoSingle')
        print('Val loss: {}'.format(val_loss))
        scheduler.step()
def train(images_folder, num_refinement_stages, base_lr, batch_size,
          batches_per_iter, num_workers, checkpoint_path, weights_only,
          from_mobilenet, checkpoints_folder, log_after, checkpoint_after):
    """Train the 18-heatmap single-person pose network with periodic validation.

    Datasets come from the module-level ``dtst_train`` / ``dtst_val``
    factories (built with ``STRIDE`` and ``SIGMA``). Validation and scheduler
    stepping are delegated to ``validate2``. Snapshots are named after the
    module-level ``DATASET``.

    Args:
        images_folder: Dataset root passed to ``dtst_train`` and ``dtst_val``.
        num_refinement_stages: Number of refinement stages; one loss is
            computed per stage output (initial + refinements).
        base_lr: Base learning rate; several parameter groups use multiples
            of it.
        batch_size: Batch size of both the training and validation loaders.
        batches_per_iter: Accepted for interface compatibility; this variant
            performs an optimizer step on every batch and does not use it.
        num_workers: Worker count of both data loaders.
        checkpoint_path: Checkpoint to load before training; falsy to skip.
        weights_only: Only consulted when ``from_mobilenet`` is false. When
            false, optimizer, scheduler, iteration counter and epoch are
            restored from the checkpoint as well.
        from_mobilenet: When true the checkpoint is loaded with
            ``load_from_mobilenet`` instead of ``load_state``.
        checkpoints_folder: Directory the snapshots are written to.
        log_after: Number of iterations between two progress reports.
        checkpoint_after: A numbered snapshot is written whenever
            ``epochId % checkpoint_after == 0``.
    """
    net = SinglePersonPoseEstimationWithMobileNet(num_refinement_stages,
                                                  num_heatmaps=18).cuda()

    train_dataset = dtst_train(
        images_folder, STRIDE, SIGMA,
        transform=transforms.Compose([
            SinglePersonBodyMasking(),
            ChannelPermutation(),
            SinglePersonRotate(pad=(128, 128, 128), max_rotate_degree=40),
            SinglePersonCropPad(pad=(128, 128, 128), crop_x=256, crop_y=256),
            SinglePersonFlip(),
        ]))
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True, num_workers=num_workers)
    val_dataset = dtst_val(images_folder, STRIDE, SIGMA)
    val_loader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, num_workers=num_workers)

    # Groups without an explicit 'lr' or 'weight_decay' inherit the
    # optimizer defaults passed to Adam.
    optimizer = optim.Adam([
        {'params': get_parameters_conv(net.model, 'weight')},
        {'params': get_parameters_conv_depthwise(net.model, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.model, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.model, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
        {'params': get_parameters_conv(net.cpm, 'weight'), 'lr': base_lr},
        {'params': get_parameters_conv(net.cpm, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
        {'params': get_parameters_conv_depthwise(net.cpm, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_conv(net.initial_stage, 'weight'),
         'lr': base_lr},
        {'params': get_parameters_conv(net.initial_stage, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
        {'params': get_parameters_bn(net.initial_stage, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.initial_stage, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
        {'params': get_parameters_conv(net.refinement_stages, 'weight'),
         'lr': base_lr * 4},
        {'params': get_parameters_conv(net.refinement_stages, 'bias'),
         'lr': base_lr * 8, 'weight_decay': 0},
        {'params': get_parameters_bn(net.refinement_stages, 'weight'),
         'weight_decay': 0},
        {'params': get_parameters_bn(net.refinement_stages, 'bias'),
         'lr': base_lr * 2, 'weight_decay': 0},
    ], lr=base_lr, weight_decay=5e-4)

    num_iter = 0
    current_epoch = 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=5, threshold=1e-2, verbose=True)

    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        if from_mobilenet:
            load_from_mobilenet(net, checkpoint)
        else:
            load_state(net, checkpoint)
            if not weights_only:
                optimizer.load_state_dict(checkpoint['optimizer'])
                scheduler.load_state_dict(checkpoint['scheduler'])
                num_iter = checkpoint['iter']
                # round iterations, to print proper loss when resuming
                num_iter = num_iter // log_after * log_after
                current_epoch = checkpoint['current_epoch'] + 1

    net = DataParallel(net)
    net.train()

    N_losses = num_refinement_stages + 1
    for epochId in range(current_epoch, 100):
        print('Epoch: {}'.format(epochId))
        # validate2 is defined elsewhere; re-assert training mode in case it
        # leaves the network in eval mode.
        net.train()
        total_losses = [0] * N_losses  # heatmaps loss per stage
        for batch in train_loader:
            images = batch['image'].cuda()
            keypoint_maps = batch['keypoint_maps'].cuda()

            stages_output = net(images)

            losses = []
            for loss_idx in range(N_losses):
                stage_loss = l2_loss(stages_output[loss_idx], keypoint_maps,
                                     len(images))
                losses.append(stage_loss)
                total_losses[loss_idx] += stage_loss.item()

            optimizer.zero_grad()
            # Out-of-place sum so the per-stage tensors are not mutated.
            loss = sum(losses[1:], losses[0])
            loss.backward()
            optimizer.step()
            num_iter += 1

            if num_iter % log_after == 0:
                print('Iter: {}'.format(num_iter))
                # Per-stage loss printing is disabled in this variant; the
                # accumulators are still reset every `log_after` iterations.
                total_losses = [0] * N_losses
                # NOTE(review): the source's indentation was lost; this
                # mid-epoch validation is assumed to belong to the logging
                # branch - confirm against the original file.
                validate2(epochId, net, val_loader, scheduler)
                net.train()

        state = {
            'state_dict': net.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'iter': num_iter,
            'current_epoch': epochId,
        }
        torch.save(state, '{}/{}_epoch_last.pth'.format(checkpoints_folder,
                                                        DATASET))
        if epochId % checkpoint_after == 0:
            torch.save(state, '{}/{}_epoch_{}.pth'.format(
                checkpoints_folder, DATASET, epochId))

        # Was `validate2(epochID, ...)`: `epochID` is undefined (NameError);
        # the loop variable is `epochId`.
        validate2(epochId, net, val_loader, scheduler)
def train(
    images_folder,
    num_refinement_stages,
    base_lr,
    batch_size,
    batches_per_iter,
    num_workers,
    checkpoint_path,
    weights_only,
    from_mobilenet,
    checkpoints_folder,
    log_after,
    checkpoint_after,
):
    """Train the single-person pose network on the LIP dataset.

    Each epoch runs gradient-accumulated training, writes a "last epoch"
    snapshot (plus a numbered one every ``checkpoint_after`` epochs), then
    evaluates on ``LipValDataset`` and feeds ``100 - avg_pckh`` to the
    plateau scheduler.

    Args:
        images_folder: Dataset root passed to the LIP dataset classes.
        num_refinement_stages: Number of refinement stages; one loss is
            computed per stage output (initial + refinements).
        base_lr: Base learning rate; several parameter groups use multiples
            of it.
        batch_size: Batch size of the training ``DataLoader``.
        batches_per_iter: Number of batches whose gradients are accumulated
            before a single optimizer step.
        num_workers: Worker count of the training ``DataLoader``.
        checkpoint_path: Checkpoint to load before training; falsy to skip.
        weights_only: Only consulted when ``from_mobilenet`` is false. When
            false, optimizer, scheduler, iteration counter and epoch are
            restored from the checkpoint as well.
        from_mobilenet: When true the checkpoint is loaded with
            ``load_from_mobilenet`` instead of ``load_state``.
        checkpoints_folder: Directory for snapshots and the validation CSV.
        log_after: Number of optimizer steps between two loss reports.
        checkpoint_after: Number of epochs between two numbered snapshots.
    """
    net = SinglePersonPoseEstimationWithMobileNet(num_refinement_stages).cuda()
    stride = 8
    sigma = 7
    dataset = LipTrainDataset(
        images_folder,
        stride,
        sigma,
        transform=transforms.Compose([
            SinglePersonBodyMasking(),
            ChannelPermutation(),
            SinglePersonRotate(pad=(128, 128, 128), max_rotate_degree=40),
            SinglePersonCropPad(pad=(128, 128, 128), crop_x=256, crop_y=256),
            SinglePersonFlip(
                left_keypoints_indice=LipTrainDataset.left_keypoints_indice,
                right_keypoints_indice=LipTrainDataset.right_keypoints_indice,
            ),
        ]),
    )
    train_loader = DataLoader(dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)

    # Groups without an explicit "lr" or "weight_decay" inherit the
    # optimizer defaults passed to Adam.
    optimizer = optim.Adam(
        [
            {"params": get_parameters_conv(net.model, "weight")},
            {
                "params": get_parameters_conv_depthwise(net.model, "weight"),
                "weight_decay": 0,
            },
            {"params": get_parameters_bn(net.model, "weight"),
             "weight_decay": 0},
            {
                "params": get_parameters_bn(net.model, "bias"),
                "lr": base_lr * 2,
                "weight_decay": 0,
            },
            {"params": get_parameters_conv(net.cpm, "weight"), "lr": base_lr},
            {
                "params": get_parameters_conv(net.cpm, "bias"),
                "lr": base_lr * 2,
                "weight_decay": 0,
            },
            {
                "params": get_parameters_conv_depthwise(net.cpm, "weight"),
                "weight_decay": 0,
            },
            {
                "params": get_parameters_conv(net.initial_stage, "weight"),
                "lr": base_lr,
            },
            {
                "params": get_parameters_conv(net.initial_stage, "bias"),
                "lr": base_lr * 2,
                "weight_decay": 0,
            },
            {
                "params": get_parameters_bn(net.initial_stage, "weight"),
                "weight_decay": 0,
            },
            {
                "params": get_parameters_bn(net.initial_stage, "bias"),
                "lr": base_lr * 2,
                "weight_decay": 0,
            },
            {
                "params": get_parameters_conv(net.refinement_stages, "weight"),
                "lr": base_lr * 4,
            },
            {
                "params": get_parameters_conv(net.refinement_stages, "bias"),
                "lr": base_lr * 8,
                "weight_decay": 0,
            },
            {
                "params": get_parameters_bn(net.refinement_stages, "weight"),
                "weight_decay": 0,
            },
            {
                "params": get_parameters_bn(net.refinement_stages, "bias"),
                "lr": base_lr * 2,
                "weight_decay": 0,
            },
        ],
        lr=base_lr,
        weight_decay=5e-4,
    )

    num_iter = 0
    current_epoch = 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.1,
                                                     patience=5,
                                                     threshold=1e-2,
                                                     verbose=True)
    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        if from_mobilenet:
            load_from_mobilenet(net, checkpoint)
        else:
            load_state(net, checkpoint)
            if not weights_only:
                optimizer.load_state_dict(checkpoint["optimizer"])
                scheduler.load_state_dict(checkpoint["scheduler"])
                num_iter = checkpoint["iter"]
                # Round down to a multiple of `log_after`: the loss
                # accumulators restart at zero on resume, so without this the
                # first report would average over a partial window.
                num_iter = num_iter // log_after * log_after
                current_epoch = checkpoint["current_epoch"] + 1

    net = DataParallel(net)
    net.train()

    num_stages = num_refinement_stages + 1
    for epochId in range(current_epoch, 100):
        print("Epoch: {}".format(epochId))
        net.train()  # validation at the end of the epoch switches to eval
        total_losses = [0] * num_stages  # heatmaps loss per stage
        batch_per_iter_idx = 0
        for batch_data in train_loader:
            # Gradients are cleared only at the start of an accumulation
            # window of `batches_per_iter` batches.
            if batch_per_iter_idx == 0:
                optimizer.zero_grad()

            images = batch_data["image"].cuda()
            keypoint_maps = batch_data["keypoint_maps"].cuda()

            stages_output = net(images)

            losses = []
            for loss_idx in range(num_stages):
                stage_loss = l2_loss(stages_output[loss_idx], keypoint_maps,
                                     images.shape[0])
                losses.append(stage_loss)
                total_losses[loss_idx] += stage_loss.item() / batches_per_iter

            # Out-of-place sum so the per-stage tensors are not mutated.
            loss = sum(losses[1:], losses[0]) / batches_per_iter
            loss.backward()

            batch_per_iter_idx += 1
            if batch_per_iter_idx != batches_per_iter:
                continue
            optimizer.step()
            batch_per_iter_idx = 0
            num_iter += 1

            if num_iter % log_after == 0:
                print("Iter: {}".format(num_iter))
                for loss_idx, accumulated in enumerate(total_losses):
                    print("stage{}_heatmaps_loss: {}".format(
                        loss_idx + 1, accumulated / log_after))
                total_losses = [0] * num_stages

        # The same state goes to the rolling "last epoch" file and, every
        # `checkpoint_after` epochs, to a numbered file.
        state = {
            "state_dict": net.module.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "iter": num_iter,
            "current_epoch": epochId,
        }
        torch.save(state,
                   "{}/checkpoint_last_epoch.pth".format(checkpoints_folder))
        if (epochId + 1) % checkpoint_after == 0:
            torch.save(
                state,
                "{}/checkpoint_epoch_{}.pth".format(checkpoints_folder,
                                                    epochId),
            )

        print("Validation...")
        net.eval()
        eval_num = 1000
        val_dataset = LipValDataset(images_folder, eval_num)
        predictions_name = "{}/val_results.csv".format(checkpoints_folder)
        evaluate(val_dataset, predictions_name, net)
        pck = calc_pckh(val_dataset.labels_file_path,
                        predictions_name,
                        eval_num=eval_num)

        val_loss = 100 - pck[-1][-1]  # 100 - avg_pckh
        print("Val loss: {}".format(val_loss))
        scheduler.step(val_loss, epochId)
def train(images_folder, num_refinement_stages, base_lr, batch_size,
          batches_per_iter, num_workers, checkpoint_path, weights_only,
          from_mobilenet, checkpoints_folder, log_after, checkpoint_after):
    """Train the single-person pose network on the LIP dataset.

    Runs epochs ``current_epoch .. 99``. Gradients are accumulated over
    ``batches_per_iter`` batches per optimizer step, and per-stage losses are
    printed every ``log_after`` steps. After each epoch a "last epoch"
    snapshot is saved, a numbered snapshot is saved every
    ``checkpoint_after`` epochs, and ``100 - avg_pckh`` on ``LipValDataset``
    drives the ``ReduceLROnPlateau`` scheduler.

    When ``checkpoint_path`` is set the weights are loaded from it (via
    ``load_from_mobilenet`` if ``from_mobilenet`` is true, otherwise via
    ``load_state``); unless ``weights_only`` is true the optimizer,
    scheduler, iteration counter and epoch are restored too.
    """

    def group(params, **overrides):
        # One optimizer parameter group; keys keep the order given here.
        return dict(params=params, **overrides)

    net = SinglePersonPoseEstimationWithMobileNet(num_refinement_stages).cuda()

    stride, sigma = 8, 7
    augmentations = transforms.Compose([
        SinglePersonBodyMasking(),
        ChannelPermutation(),
        SinglePersonRotate(pad=(128, 128, 128), max_rotate_degree=40),
        SinglePersonCropPad(pad=(128, 128, 128), crop_x=256, crop_y=256),
        SinglePersonFlip(),
    ])
    dataset = LipTrainDataset(images_folder, stride, sigma,
                              transform=augmentations)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers)

    # Groups that omit 'lr' or 'weight_decay' fall back to the Adam defaults
    # supplied below.
    param_groups = [
        group(get_parameters_conv(net.model, 'weight')),
        group(get_parameters_conv_depthwise(net.model, 'weight'),
              weight_decay=0),
        group(get_parameters_bn(net.model, 'weight'), weight_decay=0),
        group(get_parameters_bn(net.model, 'bias'),
              lr=base_lr * 2, weight_decay=0),
        group(get_parameters_conv(net.cpm, 'weight'), lr=base_lr),
        group(get_parameters_conv(net.cpm, 'bias'),
              lr=base_lr * 2, weight_decay=0),
        group(get_parameters_conv_depthwise(net.cpm, 'weight'),
              weight_decay=0),
        group(get_parameters_conv(net.initial_stage, 'weight'), lr=base_lr),
        group(get_parameters_conv(net.initial_stage, 'bias'),
              lr=base_lr * 2, weight_decay=0),
        group(get_parameters_bn(net.initial_stage, 'weight'), weight_decay=0),
        group(get_parameters_bn(net.initial_stage, 'bias'),
              lr=base_lr * 2, weight_decay=0),
        group(get_parameters_conv(net.refinement_stages, 'weight'),
              lr=base_lr * 4),
        group(get_parameters_conv(net.refinement_stages, 'bias'),
              lr=base_lr * 8, weight_decay=0),
        group(get_parameters_bn(net.refinement_stages, 'weight'),
              weight_decay=0),
        group(get_parameters_bn(net.refinement_stages, 'bias'),
              lr=base_lr * 2, weight_decay=0),
    ]
    optimizer = optim.Adam(param_groups, lr=base_lr, weight_decay=5e-4)

    num_iter, current_epoch = 0, 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=5, threshold=1e-2, verbose=True)

    if checkpoint_path:
        checkpoint = torch.load(checkpoint_path)
        if from_mobilenet:
            load_from_mobilenet(net, checkpoint)
        else:
            load_state(net, checkpoint)
            if not weights_only:
                optimizer.load_state_dict(checkpoint['optimizer'])
                scheduler.load_state_dict(checkpoint['scheduler'])
                # round iterations, to print proper loss when resuming
                num_iter = (checkpoint['iter'] // log_after) * log_after
                current_epoch = checkpoint['current_epoch'] + 1

    net = DataParallel(net)
    net.train()

    stage_count = num_refinement_stages + 1
    for epochId in range(current_epoch, 100):
        print('Epoch: {}'.format(epochId))
        net.train()
        total_losses = [0] * stage_count  # heatmaps loss per stage
        accumulated_batches = 0
        for batch_data in train_loader:
            # Start of an accumulation window: drop the old gradients.
            if accumulated_batches == 0:
                optimizer.zero_grad()

            images = batch_data['image'].cuda()
            targets = batch_data['keypoint_maps'].cuda()
            stages_output = net(images)

            losses = []
            for stage_idx in range(stage_count):
                stage_loss = l2_loss(stages_output[stage_idx], targets,
                                     images.shape[0])
                losses.append(stage_loss)
                total_losses[stage_idx] += stage_loss.item() / batches_per_iter

            loss = losses[0]
            for extra in losses[1:]:
                loss = loss + extra
            loss = loss / batches_per_iter
            loss.backward()

            accumulated_batches += 1
            if accumulated_batches != batches_per_iter:
                continue
            optimizer.step()
            accumulated_batches = 0
            num_iter += 1

            if num_iter % log_after == 0:
                print('Iter: {}'.format(num_iter))
                for stage_idx, running in enumerate(total_losses):
                    print('stage{}_heatmaps_loss: {}'.format(
                        stage_idx + 1, running / log_after))
                total_losses = [0] * stage_count

        # One state dict, written to the rolling file and optionally to a
        # numbered file (numbered from 1, i.e. epochId + 1).
        state = {
            'state_dict': net.module.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'iter': num_iter,
            'current_epoch': epochId,
        }
        torch.save(state,
                   '{}/checkpoint_last_epoch.pth'.format(checkpoints_folder))
        if (epochId + 1) % checkpoint_after == 0:
            torch.save(state, '{}/checkpoint_epoch_{}.pth'.format(
                checkpoints_folder, epochId + 1))

        print('Validation...')
        net.eval()
        eval_num = 1000
        val_dataset = LipValDataset(images_folder, eval_num)
        predictions_name = '{}/val_results.csv'.format(checkpoints_folder)
        evaluate(val_dataset, predictions_name, net)
        pck = calc_pckh(val_dataset.labels_file_path, predictions_name,
                        eval_num=eval_num)

        val_loss = 100 - pck[-1][-1]  # 100 - avg_pckh
        print('Val loss: {}'.format(val_loss))
        scheduler.step(val_loss, epochId)