def main():
    opt = TrainOptions().parse()
    if opt.sr_dir == '':
        print('sr directory is null.')
        exit()
    sr_pretrain_dir = os.path.join(
        opt.exp_dir, opt.exp_id,
        opt.sr_dir + '-' + opt.load_prefix_pose[0:-1])
    if not os.path.isdir(sr_pretrain_dir):
        os.makedirs(sr_pretrain_dir)
    train_history = ASNTrainHistory()
    checkpoint_hg = Checkpoint()
    train_distri_path = sr_pretrain_dir + '/' + 'train_rotations.txt'
    train_distri_path_2 = sr_pretrain_dir + '/' + 'train_rotations_copy.txt'
    val_distri_path = sr_pretrain_dir + '/' + 'val_rotations.txt'
    val_distri_path_2 = sr_pretrain_dir + '/' + 'val_rotations_copy.txt'
    if opt.dataset == 'mpii':
        num_classes = 16
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    hg = model.create_hg(num_stacks=2, num_modules=1,
                         num_classes=num_classes, chan=256)
    hg = torch.nn.DataParallel(hg).cuda()
    if opt.load_prefix_pose == '':
        print('please input the checkpoint name of the pose model')
        exit()
    checkpoint_hg.load_prefix = os.path.join(
        opt.exp_dir, opt.exp_id, opt.load_prefix_pose)[0:-1]
    checkpoint_hg.load_checkpoint(hg)
    print('collecting training distributions ...\n')
    train_distri_list = collect_train_valid_data(
        train_distri_path, train_distri_path_2, hg, opt, is_train=True)
    print('collecting validation distributions ...\n')
    val_distri_list = collect_train_valid_data(
        val_distri_path, val_distri_path_2, hg, opt, is_train=False)
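# collect_train_valid_data() is defined elsewhere in this repo. A minimal
# sketch of the distribution-file round trip it appears to rely on: the file
# format (one whitespace-separated probability vector per sample, mirrored
# into a *_copy.txt backup) and both function names below are assumptions for
# illustration, not the project's confirmed API.
import numpy as np

def write_distri_to_txt(distri_list, path, path_copy):
    """Write one probability vector per line, plus a backup copy."""
    for p in (path, path_copy):
        with open(p, 'w') as f:
            for distri in distri_list:
                f.write(' '.join('%.6f' % v for v in distri) + '\n')

def read_distri_from_txt(path):
    """Inverse of the writer: one numpy vector per non-empty line."""
    with open(path) as f:
        return [np.array([float(v) for v in line.split()])
                for line in f if line.strip()]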
def main():
    opt = TrainOptions().parse()
    train_history = PoseTrainHistory()
    checkpoint = Checkpoint()
    visualizer = Visualizer(opt)
    exp_dir = os.path.join(opt.exp_dir, opt.exp_id)
    log_name = opt.vis_env + 'log.txt'
    visualizer.log_path = os.path.join(exp_dir, log_name)
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    # if opt.dataset == 'mpii':
    num_classes = 16
    net = create_hg(num_stacks=2, num_modules=1,
                    num_classes=num_classes, chan=256)
    # num1 = get_n_params(net)
    # num2 = get_n_trainable_params(net)
    # num3 = get_n_conv_params(net)
    # print('number of params: ', num1)
    # print('number of trainable params: ', num2)
    # print('number of conv params: ', num3)
    net = torch.nn.DataParallel(net).cuda()
    """optimizer"""
    optimizer = torch.optim.RMSprop(net.parameters(), lr=opt.lr, alpha=0.99,
                                    eps=1e-8, momentum=0, weight_decay=0)
    """optionally resume from a checkpoint"""
    if opt.load_prefix_pose != '':
        checkpoint.save_prefix = os.path.join(exp_dir, opt.load_prefix_pose)
        checkpoint.load_prefix = os.path.join(exp_dir,
                                              opt.load_prefix_pose)[0:-1]
        checkpoint.load_checkpoint(net, optimizer, train_history)
    else:
        checkpoint.save_prefix = exp_dir + '/'
    print('save prefix: ', checkpoint.save_prefix)
    """load data"""
    train_loader = torch.utils.data.DataLoader(
        MPII('dataset/mpii-hr-lsp-normalizer.json',
             '/bigdata1/zt53/data', is_train=True),
        batch_size=opt.bs, shuffle=True,
        num_workers=opt.nThreads, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        MPII('dataset/mpii-hr-lsp-normalizer.json',
             '/bigdata1/zt53/data', is_train=False),
        batch_size=opt.bs, shuffle=False,
        num_workers=opt.nThreads, pin_memory=True)
    print(type(optimizer), optimizer.param_groups[0]['lr'])
    # PCKh over the MPII limb joints only: indices 6-9 and 12-13 (pelvis,
    # thorax, upper neck, head top, shoulders) are excluded.
    idx = [0, 1, 2, 3, 4, 5, 10, 11, 14, 15]
    # criterion = torch.nn.MSELoss(size_average=True).cuda()
    if not opt.is_train:
        visualizer.log_path = os.path.join(opt.exp_dir, opt.exp_id,
                                           'val_log.txt')
        val_loss, val_pckh, predictions = validate(
            val_loader, net, train_history.epoch[-1]['epoch'],
            visualizer, idx, num_classes)
        checkpoint.save_preds(predictions)
        return
    """training and validation"""
    start_epoch = 0
    if opt.load_prefix_pose != '':
        start_epoch = train_history.epoch[-1]['epoch'] + 1
    for epoch in range(start_epoch, opt.nEpochs):
        adjust_lr(opt, optimizer, epoch)
        # train for one epoch
        train_loss, train_pckh = train(train_loader, net, optimizer, epoch,
                                       visualizer, idx, opt)
        # evaluate on the validation set
        val_loss, val_pckh, predictions = validate(val_loader, net, epoch,
                                                   visualizer, idx,
                                                   num_classes)
        # update training history
        e = OrderedDict([('epoch', epoch)])
        lr = OrderedDict([('lr', optimizer.param_groups[0]['lr'])])
        loss = OrderedDict([('train_loss', train_loss),
                            ('val_loss', val_loss)])
        pckh = OrderedDict([('train_pckh', train_pckh),
                            ('val_pckh', val_pckh)])
        train_history.update(e, lr, loss, pckh)
        checkpoint.save_checkpoint(net, optimizer, train_history, predictions)
        visualizer.plot_train_history(train_history)
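# adjust_lr() is imported from elsewhere in the repo. A minimal sketch of a
# step-decay schedule consistent with how it is called above (opt, optimizer,
# epoch); the milestone epochs and decay factor here are placeholder
# assumptions, not the project's actual values.
def adjust_lr(opt, optimizer, epoch, decay_epochs=(60, 90), gamma=0.1):
    """Scale the base lr by gamma once for each milestone already passed."""
    lr = opt.lr * (gamma ** sum(epoch >= e for e in decay_epochs))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr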
def main():
    opt = TrainOptions().parse()
    if opt.sr_dir == '':
        print('sr directory is null.')
        exit()
    sr_pretrain_dir = os.path.join(
        opt.exp_dir, opt.exp_id,
        opt.sr_dir + '-' + opt.load_prefix_pose[0:-1])
    if not os.path.isdir(sr_pretrain_dir):
        os.makedirs(sr_pretrain_dir)
    train_history = ASNTrainHistory()
    checkpoint_agent = Checkpoint()
    visualizer = Visualizer(opt)
    visualizer.log_path = sr_pretrain_dir + '/' + 'log.txt'
    train_scale_path = sr_pretrain_dir + '/' + 'train_scales.txt'
    train_rotation_path = sr_pretrain_dir + '/' + 'train_rotations.txt'
    val_scale_path = sr_pretrain_dir + '/' + 'val_scales.txt'
    val_rotation_path = sr_pretrain_dir + '/' + 'val_rotations.txt'
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    print('collecting training scale and rotation distributions ...\n')
    train_scale_distri = read_grnd_distri_from_txt(train_scale_path)
    train_rotation_distri = read_grnd_distri_from_txt(train_rotation_path)
    dataset = MPII('dataset/mpii-hr-lsp-normalizer.json',
                   '/bigdata1/zt53/data', is_train=True,
                   grnd_scale_distri=train_scale_distri,
                   grnd_rotation_distri=train_rotation_distri)
    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=opt.bs, shuffle=True,
        num_workers=opt.nThreads, pin_memory=True)
    print('collecting validation scale and rotation distributions ...\n')
    val_scale_distri = read_grnd_distri_from_txt(val_scale_path)
    val_rotation_distri = read_grnd_distri_from_txt(val_rotation_path)
    dataset = MPII('dataset/mpii-hr-lsp-normalizer.json',
                   '/bigdata1/zt53/data', is_train=False,
                   grnd_scale_distri=val_scale_distri,
                   grnd_rotation_distri=val_rotation_distri)
    val_loader = torch.utils.data.DataLoader(
        dataset, batch_size=opt.bs, shuffle=False,
        num_workers=opt.nThreads, pin_memory=True)
    agent = model.create_asn(chan_in=256, chan_out=256,
                             scale_num=len(dataset.scale_means),
                             rotation_num=len(dataset.rotation_means),
                             is_aug=True)
    agent = torch.nn.DataParallel(agent).cuda()
    optimizer = torch.optim.RMSprop(agent.parameters(), lr=opt.lr, alpha=0.99,
                                    eps=1e-8, momentum=0, weight_decay=0)
    # optimizer = torch.optim.Adam(agent.parameters(), lr=opt.agent_lr)
    if opt.load_prefix_sr == '':
        checkpoint_agent.save_prefix = sr_pretrain_dir + '/'
    else:
        checkpoint_agent.save_prefix = sr_pretrain_dir + '/' + \
            opt.load_prefix_sr
        checkpoint_agent.load_prefix = checkpoint_agent.save_prefix[0:-1]
        checkpoint_agent.load_checkpoint(agent, optimizer, train_history)
    print('agent: ', type(optimizer), optimizer.param_groups[0]['lr'])
    if opt.dataset == 'mpii':
        num_classes = 16
    hg = model.create_hg(num_stacks=2, num_modules=1,
                         num_classes=num_classes, chan=256)
    hg = torch.nn.DataParallel(hg).cuda()
    if opt.load_prefix_pose == '':
        print('please input the checkpoint name of the pose model')
        exit()
    checkpoint_hg = Checkpoint()
    checkpoint_hg.load_prefix = os.path.join(
        opt.exp_dir, opt.exp_id, opt.load_prefix_pose)[0:-1]
    checkpoint_hg.load_checkpoint(hg)
    logger = Logger(sr_pretrain_dir + '/' + 'training-summary.txt',
                    title='training-summary')
    logger.set_names(['Epoch', 'LR', 'Train Loss', 'Val Loss'])
    """training and validation"""
    start_epoch = 0
    if opt.load_prefix_sr != '':
        start_epoch = train_history.epoch[-1]['epoch'] + 1
    for epoch in range(start_epoch, opt.nEpochs):
        # train for one epoch
        train_loss = train(train_loader, hg, agent, optimizer, epoch,
                           visualizer, opt)
        # evaluate on the validation set
        val_loss = validate(val_loader, hg, agent, epoch, visualizer, opt)
        # update training history
        e = OrderedDict([('epoch', epoch)])
        lr = OrderedDict([('lr', optimizer.param_groups[0]['lr'])])
        loss = OrderedDict([('train_loss', train_loss),
                            ('val_loss', val_loss)])
        train_history.update(e, lr, loss)
        checkpoint_agent.save_checkpoint(agent, optimizer, train_history,
                                         is_asn=True)
        visualizer.plot_train_history(train_history, 'sr')
        logger.append([epoch, optimizer.param_groups[0]['lr'],
                       train_loss, val_loss])
    logger.close()
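# Logger is used above through a set_names()/append()/close() interface (as in
# bearpaw's pytorch-classification utilities). A minimal sketch of a
# compatible tab-separated implementation; the real class in this repo may
# differ, so treat this as an assumption for reference only.
class Logger(object):
    def __init__(self, fpath, title=None):
        self.file = open(fpath, 'w')
        self.title = title

    def set_names(self, names):
        """Write the header row once."""
        self.names = names
        self.file.write('\t'.join(names) + '\n')

    def append(self, values):
        """Write one row of metrics, flushing so progress survives a crash."""
        self.file.write('\t'.join('%s' % v for v in values) + '\n')
        self.file.flush()

    def close(self):
        self.file.close()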
def main():
    opt = TrainOptions().parse()
    if opt.joint_dir == '':
        print('joint directory is null.')
        exit()
    joint_dir = os.path.join(opt.exp_dir, opt.exp_id,
                             opt.joint_dir + '-' + opt.load_prefix_pose[0:-1])
    if not os.path.isdir(joint_dir):
        os.makedirs(joint_dir)
    visualizer = Visualizer(opt)
    visualizer.log_path = joint_dir + '/' + 'train-log.txt'
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
    if opt.dataset == 'mpii':
        num_classes = 16
    hg = model.create_hg(num_stacks=2, num_modules=1,
                         num_classes=num_classes, chan=256)
    hg = torch.nn.DataParallel(hg).cuda()
    """optimizer"""
    optimizer_hg = torch.optim.RMSprop(hg.parameters(), lr=opt.lr, alpha=0.99,
                                       eps=1e-8, momentum=0, weight_decay=0)
    if opt.load_prefix_pose == '':
        print('please input the checkpoint name of the pose model')
        exit()
    train_history_pose = PoseTrainHistory()
    checkpoint_hg = Checkpoint()
    if opt.load_checkpoint:
        checkpoint_hg.load_prefix = joint_dir + '/' + \
            opt.load_prefix_pose[0:-1]
        checkpoint_hg.load_checkpoint(hg, optimizer_hg, train_history_pose)
    else:
        checkpoint_hg.load_prefix = os.path.join(opt.exp_dir, opt.exp_id) + \
            '/' + opt.load_prefix_pose[0:-1]
        checkpoint_hg.load_checkpoint(hg, optimizer_hg, train_history_pose)
        for param_group in optimizer_hg.param_groups:
            param_group['lr'] = opt.lr
    checkpoint_hg.save_prefix = joint_dir + '/pose-'
    print('hg optimizer: ', type(optimizer_hg),
          optimizer_hg.param_groups[0]['lr'])
    # The agent dataset is built before the agent itself so that its scale and
    # rotation bins can size the network; the original referenced an undefined
    # `dataset` here.
    train_dataset_agent = AGENT('dataset/mpii-hr-lsp-normalizer.json',
                                '/bigdata1/zt53/data', separate_s_r=True)
    agent_sr = model.create_asn(
        chan_in=256, chan_out=256,
        scale_num=len(train_dataset_agent.scale_means),
        rotation_num=len(train_dataset_agent.rotation_means),
        is_aug=True)
    agent_sr = torch.nn.DataParallel(agent_sr).cuda()
    optimizer_sr = torch.optim.RMSprop(agent_sr.parameters(), lr=opt.agent_lr,
                                       alpha=0.99, eps=1e-8, momentum=0,
                                       weight_decay=0)
    if opt.load_prefix_sr == '':
        print('please input the checkpoint name of the sr agent.')
        exit()
    train_history_sr = ASNTrainHistory()
    checkpoint_sr = Checkpoint()
    if opt.load_checkpoint:
        checkpoint_sr.load_prefix = joint_dir + '/' + opt.load_prefix_sr[0:-1]
        checkpoint_sr.load_checkpoint(agent_sr, optimizer_sr, train_history_sr)
    else:
        sr_pretrain_dir = os.path.join(
            opt.exp_dir, opt.exp_id,
            opt.sr_dir + '-' + opt.load_prefix_pose[0:-1])
        checkpoint_sr.load_prefix = sr_pretrain_dir + '/' + \
            opt.load_prefix_sr[0:-1]
        checkpoint_sr.load_checkpoint(agent_sr, optimizer_sr, train_history_sr)
        for param_group in optimizer_sr.param_groups:
            param_group['lr'] = opt.agent_lr
    checkpoint_sr.save_prefix = joint_dir + '/agent-'
    print('agent optimizer: ', type(optimizer_sr),
          optimizer_sr.param_groups[0]['lr'])
    train_dataset_hg = MPII('dataset/mpii-hr-lsp-normalizer.json',
                            '/bigdata1/zt53/data', is_train=True)
    train_loader_hg = torch.utils.data.DataLoader(
        train_dataset_hg, batch_size=opt.bs, shuffle=True,
        num_workers=opt.nThreads, pin_memory=True)
    val_dataset_hg = MPII('dataset/mpii-hr-lsp-normalizer.json',
                          '/bigdata1/zt53/data', is_train=False)
    val_loader_hg = torch.utils.data.DataLoader(
        val_dataset_hg, batch_size=opt.bs, shuffle=False,
        num_workers=opt.nThreads,
        pin_memory=True)
    train_loader_agent = torch.utils.data.DataLoader(
        train_dataset_agent, batch_size=opt.bs, shuffle=True,
        num_workers=opt.nThreads, pin_memory=True)
    if not opt.is_train:
        visualizer.log_path = joint_dir + '/' + 'val-log.txt'
        val_loss, val_pckh, predictions = validate(
            val_loader_hg, hg, train_history_pose.epoch[-1]['epoch'],
            visualizer, num_classes)
        checkpoint_hg.save_preds(predictions)
        return
    logger = Logger(joint_dir + '/' + 'pose-training-summary.txt',
                    title='pose-training-summary')
    logger.set_names(['Epoch', 'LR', 'Train Loss', 'Val Loss',
                      'Train PCKh', 'Val PCKh'])
    """training and validation"""
    start_epoch_pose = train_history_pose.epoch[-1]['epoch'] + 1
    epoch_sr = train_history_sr.epoch[-1]['epoch'] + 1
    for epoch in range(start_epoch_pose, opt.nEpochs):
        adjust_lr(opt, optimizer_hg, epoch)
        # train the hourglass for one epoch
        train_loss_pose, train_pckh = train_hg(train_loader_hg, hg,
                                               optimizer_hg, agent_sr, epoch,
                                               visualizer, opt)
        # evaluate on the validation set
        val_loss, val_pckh, predictions = validate(val_loader_hg, hg, epoch,
                                                   visualizer, num_classes)
        # update the pose training history
        e_pose = OrderedDict([('epoch', epoch)])
        lr_pose = OrderedDict([('lr', optimizer_hg.param_groups[0]['lr'])])
        loss_pose = OrderedDict([('train_loss', train_loss_pose),
                                 ('val_loss', val_loss)])
        pckh = OrderedDict([('train_pckh', train_pckh),
                            ('val_pckh', val_pckh)])
        train_history_pose.update(e_pose, lr_pose, loss_pose, pckh)
        checkpoint_hg.save_checkpoint(hg, optimizer_hg, train_history_pose,
                                      predictions)
        visualizer.plot_train_history(train_history_pose)
        logger.append([epoch, optimizer_hg.param_groups[0]['lr'],
                       train_loss_pose, val_loss, train_pckh, val_pckh])
        # train agent_sr for one epoch
        train_loss_sr = train_agent_sr(train_loader_agent, hg, agent_sr,
                                       optimizer_sr, epoch_sr, visualizer,
                                       opt)
        e_sr = OrderedDict([('epoch', epoch_sr)])
        lr_sr = OrderedDict([('lr', optimizer_sr.param_groups[0]['lr'])])
        loss_sr = OrderedDict([('train_loss', train_loss_sr),
                               ('val_loss', 0)])
        train_history_sr.update(e_sr, lr_sr, loss_sr)
        checkpoint_sr.save_checkpoint(agent_sr, optimizer_sr, train_history_sr,
                                      is_asn=True)
        visualizer.plot_train_history(train_history_sr, 'sr')
        epoch_sr += 1
    logger.close()
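# Each of these training scripts presumably ends with the standard
# entry-point guard so main() runs only when the file is executed directly:
if __name__ == '__main__':
    main()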