rotation_action = gt.select_orientation([state[-1]]) action = gt.select_action_active(frame, pos[0:2]) # print('action : ', action, 'rotation : ', rotation_action) action = np.append(action, rotation_action) # print('action : ', action) reward, next_state, done, next_frame, next_pos, save_orientation = env.step(action) # print('pos : ', pos.shape, next_pos.shape) # print(frame.shape) # print(episode_reward, reward) episode_reward[0] += reward[0] episode_reward[1] += reward[1] # print(reward) # time.sleep(1) gt.buffer.add((state, action[0:2], rotation_action, next_state, reward, done, frame, next_frame, pos, next_pos)) if k > 32 or done == 1: gt.learn_active() k = 0 if done == 1: break state = next_state frame = next_frame pos = next_pos gt.buffer.clear() if t % 10 == 0: print('in epoch ' + str(t) + ' episode reward : ', episode_reward) if t % 200 == 199: gt.save_active_model('active_policy_' + str(t) + '.para', 'active_value_' + str(t) + '.para') # writer.add_scalar('episode_reward', episode_reward, steps) writer.add_scalars('episode_reward', {'distance reward': episode_reward[0]}, steps)
class VPoserTrainer:
    """Trainer for the VPoser body-pose VAE.

    Owns the model, the train/vald/test DataLoaders, the optimizer and a
    tensorboardX SummaryWriter; `perform_training` drives the epoch loop.
    Relies on module-level imports (torch, np, os, shutil, datetime, nn,
    optim, DataLoader, VPoser, BodyModel, makepath, log2file, c2c) that are
    outside this view.
    """

    def __init__(self, work_dir, ps):
        """Set up logging, data, model and optimizer.

        work_dir -- directory for logs, summaries and snapshots.
        ps -- experiment configuration object (hyper-parameters, paths);
              mutated here: `ps.work_dir` and `ps.data_shape` are set.
        """
        from tensorboardX import SummaryWriter
        from human_body_prior.data.dataloader import VPoserDS

        self.pt_dtype = torch.float64 if ps.fp_precision == '64' else torch.float32
        torch.manual_seed(ps.seed)
        ps.work_dir = makepath(work_dir, isfile=False)
        logger = log2file(os.path.join(work_dir, '%s.log' % ps.expr_code))
        summary_logdir = os.path.join(work_dir, 'summaries')
        self.swriter = SummaryWriter(log_dir=summary_logdir)
        logger('tensorboard --logdir=%s' % summary_logdir)
        logger('Torch Version: %s\n' % torch.__version__)
        # Keep a copy of the training script next to its results for reproducibility.
        shutil.copy2(os.path.realpath(__file__), work_dir)

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            torch.cuda.empty_cache()
        self.comp_device = torch.device("cuda:%d" % ps.cuda_id if torch.cuda.is_available() else "cpu")
        logger('%d CUDAs available!' % torch.cuda.device_count())
        gpu_brand = torch.cuda.get_device_name(ps.cuda_id) if use_cuda else None
        logger('Training with %s [%s]' % (self.comp_device, gpu_brand) if use_cuda else 'Training on CPU!!!')
        logger('Base dataset_dir is %s' % ps.dataset_dir)

        kwargs = {'num_workers': ps.n_workers}
        # train/vald/test splits live in sibling sub-directories of dataset_dir.
        ds_train = VPoserDS(dataset_dir=os.path.join(ps.dataset_dir, 'train'))
        self.ds_train = DataLoader(ds_train, batch_size=ps.batch_size, shuffle=True, drop_last=True, **kwargs)
        ds_val = VPoserDS(dataset_dir=os.path.join(ps.dataset_dir, 'vald'))
        self.ds_val = DataLoader(ds_val, batch_size=ps.batch_size, shuffle=True, drop_last=True, **kwargs)
        ds_test = VPoserDS(dataset_dir=os.path.join(ps.dataset_dir, 'test'))
        self.ds_test = DataLoader(ds_test, batch_size=ps.batch_size, shuffle=True, drop_last=True, **kwargs)
        logger('Train dataset size %.2f M' % (len(self.ds_train.dataset) * 1e-6))
        logger('Validation dataset size %d' % len(self.ds_val.dataset))
        logger('Test dataset size %d' % len(self.ds_test.dataset))

        # Infer the per-sample pose shape from the first validation sample.
        ps.data_shape = list(ds_val[0]['pose_aa'].shape)
        self.vposer_model = VPoser(num_neurons=ps.num_neurons, latentD=ps.latentD, data_shape=ps.data_shape, use_cont_repr=ps.use_cont_repr)
        if ps.use_multigpu:
            self.vposer_model = nn.DataParallel(self.vposer_model)
        self.vposer_model.to(self.comp_device)

        varlist = [var[1] for var in self.vposer_model.named_parameters()]
        params_count = sum(p.numel() for p in varlist if p.requires_grad)
        logger('Total Trainable Parameters Count is %2.2f M.' % ((params_count) * 1e-6))
        self.optimizer = optim.Adam(varlist, lr=ps.base_lr, weight_decay=ps.reg_coef)

        self.logger = logger
        self.best_loss_total = np.inf
        self.try_num = ps.try_num
        self.epochs_completed = 0
        self.ps = ps

        # Optionally warm-start from a previous snapshot (DataParallel needs .module).
        if ps.best_model_fname is not None:
            if isinstance(self.vposer_model, torch.nn.DataParallel):
                self.vposer_model.module.load_state_dict(
                    torch.load(ps.best_model_fname, map_location=self.comp_device))
            else:
                self.vposer_model.load_state_dict(torch.load(ps.best_model_fname, map_location=self.comp_device))
            logger('Restored model from %s' % ps.best_model_fname)

        # Fixed random validation samples used for visual monitoring in vis_results.
        chose_ids = np.random.choice(list(range(len(ds_val))), size=ps.num_bodies_to_display, replace=False, p=None)
        data_all = {}
        for id in chose_ids:  # NOTE: `id` shadows the builtin; kept as-is.
            for k, v in ds_val[id].items():
                if k in data_all.keys():
                    data_all[k] = torch.cat([data_all[k], v[np.newaxis]], dim=0)
                else:
                    data_all[k] = v[np.newaxis]
        self.vis_dorig = {k: data_all[k].to(self.comp_device) for k in data_all.keys()}

        # Body model used in compute_loss for the mesh reconstruction term.
        self.bm = BodyModel(self.ps.bm_path, 'smplh', batch_size=self.ps.batch_size, use_posedirs=True).to(self.comp_device)

    def train(self):
        """Run one training epoch; return the epoch-averaged loss dict."""
        self.vposer_model.train()
        save_every_it = len(self.ds_train) / self.ps.log_every_epoch
        train_loss_dict = {}
        for it, dorig in enumerate(self.ds_train):
            dorig = {k: dorig[k].to(self.comp_device) for k in dorig.keys()}
            self.optimizer.zero_grad()
            drec = self.vposer_model(dorig['pose_aa'], output_type='aa')
            loss_total, cur_loss_dict = self.compute_loss(dorig, drec)
            loss_total.backward()
            self.optimizer.step()
            # Accumulate per-term losses across the epoch.
            train_loss_dict = {k: train_loss_dict.get(k, 0.0) + v.item() for k, v in cur_loss_dict.items()}
            if it % (save_every_it + 1) == 0:
                # Log running means so far this epoch.
                cur_train_loss_dict = {k: v / (it + 1) for k, v in train_loss_dict.items()}
                train_msg = VPoserTrainer.creat_loss_message(cur_train_loss_dict, expr_code=self.ps.expr_code,
                                                             epoch_num=self.epochs_completed, it=it,
                                                             try_num=self.try_num, mode='train')
                self.logger(train_msg)
                self.swriter.add_histogram('q_z_sample', c2c(drec['mean']), it)
        # Average over the number of batches.
        train_loss_dict = {k: v / len(self.ds_train) for k, v in train_loss_dict.items()}
        return train_loss_dict

    def evaluate(self, split_name='vald'):
        """Evaluate on the validation ('vald') or test split; return mean losses."""
        self.vposer_model.eval()
        eval_loss_dict = {}
        data = self.ds_val if split_name == 'vald' else self.ds_test
        with torch.no_grad():
            for dorig in data:
                dorig = {k: dorig[k].to(self.comp_device) for k in dorig.keys()}
                drec = self.vposer_model(dorig['pose_aa'], output_type='aa')
                _, cur_loss_dict = self.compute_loss(dorig, drec)
                eval_loss_dict = {k: eval_loss_dict.get(k, 0.0) + v.item() for k, v in cur_loss_dict.items()}
        # len(data) is the number of batches in the DataLoader.
        eval_loss_dict = {k: v / len(data) for k, v in eval_loss_dict.items()}
        return eval_loss_dict

    def compute_loss(self, dorig, drec):
        """Compute the VAE loss terms.

        dorig -- batch dict with original 'pose_aa'.
        drec -- model output dict; expected keys 'mean', 'std', 'pose_aa'
                (assumed from usage here — confirm against VPoser.forward).
        Returns (loss_total, loss_dict) where loss_dict also holds 'loss_total'.
        """
        q_z = torch.distributions.normal.Normal(drec['mean'], drec['std'])
        prec = drec['pose_aa']
        porig = dorig['pose_aa']
        device = dorig['pose_aa'].device
        dtype = dorig['pose_aa'].dtype
        # Express mesh error in millimeters (vertices presumably in meters — verify BodyModel units).
        MESH_SCALER = 1000

        # Reconstruction loss - L1 on the output mesh
        mesh_orig = self.bm(pose_body=porig.view(self.ps.batch_size, -1)).v * MESH_SCALER
        mesh_rec = self.bm(pose_body=prec.view(self.ps.batch_size, -1)).v * MESH_SCALER
        loss_mesh_rec = (1. - self.ps.kl_coef) * torch.mean(torch.abs(mesh_orig - mesh_rec))

        # KL loss against a standard-normal prior over the latent code.
        p_z = torch.distributions.normal.Normal(
            loc=torch.tensor(np.zeros([self.ps.batch_size, self.ps.latentD]), requires_grad=False).to(device).type(dtype),
            scale=torch.tensor(np.ones([self.ps.batch_size, self.ps.latentD]), requires_grad=False).to(device).type(dtype))
        loss_kl = self.ps.kl_coef * torch.mean(torch.sum(torch.distributions.kl.kl_divergence(q_z, p_z), dim=[1]))

        ## Archive of losses
        # loss_rec = (1. - self.ps.kl_coef) * torch.mean(torch.sum(torch.pow(dorig - prec, 2), dim=[1, 2, 3]))
        # R = prec.view([batch_size, n_joints, 3, 3])
        # R_T = torch.transpose(R, 2, 3)
        # R_eye = torch.tensor(np.tile(np.eye(3,3).reshape(1,1,3,3), [batch_size, n_joints, 1, 1]), dtype=dtype, requires_grad = False).to(device)
        # loss_ortho = self.ps.ortho_coef * torch.mean(torch.sum(torch.pow(torch.matmul(R, R_T) - R_eye,2),dim=[1,2,3]))
        #
        # det_R = torch.transpose(torch.stack([determinant_3d(R[:,jIdx,...]) for jIdx in range(n_joints)]),0,1)
        #
        # one = torch.tensor(np.ones([batch_size, n_joints]), dtype = dtype, requires_grad = False).to(device)
        # loss_det1 = self.ps.det1_coef * torch.mean(torch.sum(torch.abs(det_R - one), dim=[1]))

        loss_dict = {'loss_kl': loss_kl,
                     'loss_mesh_rec': loss_mesh_rec, }
        # Extra direct pose reconstruction term only for the first 10 training epochs.
        if self.vposer_model.training and self.epochs_completed < 10:
            loss_dict['loss_pose_rec'] = (1. - self.ps.kl_coef) * torch.mean(torch.sum(torch.pow(porig - prec, 2), dim=[1, 2, 3]))

        loss_total = torch.stack(list(loss_dict.values())).sum()
        loss_dict['loss_total'] = loss_total
        return loss_total, loss_dict

    def perform_training(self, num_epochs=None, message=None):
        """Full training driver: epoch loop, LR schedule, eval, snapshotting.

        num_epochs -- override for ps.num_epochs; message is currently unused.
        Saves a snapshot (and a visualization image) whenever validation
        loss_total improves.
        """
        starttime = datetime.now().replace(microsecond=0)
        if num_epochs is None:
            num_epochs = self.ps.num_epochs
        self.logger('Started Training at %s for %d epochs' % (datetime.strftime(starttime, '%Y-%m-%d_%H:%M:%S'), num_epochs))

        # Separate body model (16 betas) used only for rendering monitoring images.
        vis_bm = BodyModel(self.ps.bm_path, 'smplh', num_betas=16).to(self.comp_device)

        prev_lr = np.inf
        # Halve the LR three times over the run.
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=int(num_epochs // 3), gamma=0.5)
        for epoch_num in range(1, num_epochs + 1):
            scheduler.step()
            cur_lr = self.optimizer.param_groups[0]['lr']
            if cur_lr != prev_lr:
                self.logger('--- Optimizer learning rate changed from %.2e to %.2e ---' % (prev_lr, cur_lr))
                prev_lr = cur_lr
            self.epochs_completed += 1
            train_loss_dict = self.train()
            eval_loss_dict = self.evaluate()

            with torch.no_grad():
                eval_msg = VPoserTrainer.creat_loss_message(eval_loss_dict, expr_code=self.ps.expr_code,
                                                            epoch_num=self.epochs_completed, it=len(self.ds_val),
                                                            try_num=self.try_num, mode='evald')
                if eval_loss_dict['loss_total'] < self.best_loss_total:
                    # New best: snapshot weights (unwrap DataParallel) and render images.
                    self.ps.best_model_fname = makepath(os.path.join(self.ps.work_dir, 'snapshots', 'TR%02d_E%03d.pt' % (
                        self.try_num, self.epochs_completed)), isfile=True)
                    self.logger(eval_msg + ' ** ')
                    self.best_loss_total = eval_loss_dict['loss_total']
                    torch.save(self.vposer_model.module.state_dict() if isinstance(self.vposer_model, torch.nn.DataParallel)
                               else self.vposer_model.state_dict(), self.ps.best_model_fname)

                    imgname = '[%s]_TR%02d_E%03d.png' % (self.ps.expr_code, self.try_num, self.epochs_completed)
                    imgpath = os.path.join(self.ps.work_dir, 'images', imgname)
                    try:
                        # Best-effort: visualization failure must not stop training.
                        VPoserTrainer.vis_results(self.vis_dorig, self.vposer_model, bm=vis_bm, imgpath=imgpath)
                    except:
                        print('The visualization failed.')
                else:
                    self.logger(eval_msg)

                self.swriter.add_scalars('total_loss/scalars', {'train_loss_total': train_loss_dict['loss_total'],
                                                                'evald_loss_total': eval_loss_dict['loss_total'], },
                                         self.epochs_completed)

            # if early_stopping(eval_loss_dict['loss_total']):
            #     self.logger("Early stopping at epoch %d"%self.epochs_completed)
            #     break

        endtime = datetime.now().replace(microsecond=0)
        self.logger('Finished Training at %s\n' % (datetime.strftime(endtime, '%Y-%m-%d_%H:%M:%S')))
        self.logger('Training done in %s! Best val total loss achieved: %.2e\n' % (endtime - starttime, self.best_loss_total))
        self.logger('Best model path: %s\n' % self.ps.best_model_fname)

    @staticmethod
    def creat_loss_message(loss_dict, expr_code='XX', epoch_num=0, it=0, try_num=0, mode='evald'):
        """Format a loss dict into a single log line (loss_total first, others after)."""
        ext_msg = ' | '.join(['%s = %.2e' % (k, v) for k, v in loss_dict.items() if k != 'loss_total'])
        return '[%s]_TR%02d_E%03d - It %05d - %s: [T:%.2e] - [%s]' % (
            expr_code, try_num, epoch_num, it, mode, loss_dict['loss_total'], ext_msg)

    @staticmethod
    def vis_results(dorig, vposer_model, bm, imgpath):
        """Render original / reconstructed / sampled bodies from several views.

        Writes two image grids: `imgpath` (orig+rec) and `*_gen.png` (samples).
        """
        from human_body_prior.mesh import MeshViewer
        from human_body_prior.tools.omni_tools import copy2cpu as c2c
        import trimesh
        from human_body_prior.tools.omni_tools import colors
        from human_body_prior.tools.omni_tools import apply_mesh_tranfsormations_
        from human_body_prior.tools.visualization_tools import imagearray2file
        from human_body_prior.train.vposer_smpl import VPoser

        view_angles = [0, 180, 90, -90]
        imw, imh = 800, 800
        batch_size = len(dorig['pose_aa'])

        mv = MeshViewer(width=imw, height=imh, use_offscreen=True)
        mv.render_wireframe = True

        dorig_aa = dorig['pose_aa']
        prec_aa = vposer_model(dorig_aa, output_type='aa')['pose_aa'].view(batch_size, -1)
        # DataParallel hides sample_poses behind .module.
        if hasattr(vposer_model, 'module'):
            pgen_aa = vposer_model.module.sample_poses(num_poses=batch_size, output_type='aa')
        else:
            pgen_aa = vposer_model.sample_poses(num_poses=batch_size, output_type='aa')
        pgen_aa = pgen_aa.view(batch_size, -1)
        dorig_aa = dorig_aa.view(batch_size, -1)

        # [view, body, 1, H, W, RGB] buffers expected by imagearray2file.
        images = np.zeros([len(view_angles), batch_size, 1, imw, imh, 3])
        images_gen = np.zeros([len(view_angles), batch_size, 1, imw, imh, 3])
        for cId in range(0, batch_size):
            # Re-pose the shared body model for each of the three pose sources.
            bm.pose_body.data[:] = bm.pose_body.new(dorig_aa[cId])
            orig_body_mesh = trimesh.Trimesh(vertices=c2c(bm().v[0]), faces=c2c(bm.f),
                                             vertex_colors=np.tile(colors['grey'], (6890, 1)))
            bm.pose_body.data[:] = bm.pose_body.new(prec_aa[cId])
            rec_body_mesh = trimesh.Trimesh(vertices=c2c(bm().v[0]), faces=c2c(bm.f),
                                            vertex_colors=np.tile(colors['blue'], (6890, 1)))
            bm.pose_body.data[:] = bm.pose_body.new(pgen_aa[cId])
            gen_body_mesh = trimesh.Trimesh(vertices=c2c(bm().v[0]), faces=c2c(bm.f),
                                            vertex_colors=np.tile(colors['blue'], (6890, 1)))
            all_meshes = [orig_body_mesh, rec_body_mesh, gen_body_mesh]
            for rId, angle in enumerate(view_angles):
                # Rotate in place for this view, render, then rotate back.
                if angle != 0:
                    apply_mesh_tranfsormations_(all_meshes, trimesh.transformations.rotation_matrix(np.radians(angle), (0, 1, 0)))
                mv.set_meshes([orig_body_mesh, rec_body_mesh], group_name='static')
                images[rId, cId, 0] = mv.render()
                mv.set_meshes([gen_body_mesh], group_name='static')
                images_gen[rId, cId, 0] = mv.render()
                if angle != 0:
                    apply_mesh_tranfsormations_(all_meshes, trimesh.transformations.rotation_matrix(np.radians(-angle), (0, 1, 0)))

        imagearray2file(images, imgpath)
        imagearray2file(images_gen, imgpath.replace('.png', '_gen.png'))
import numpy as np
from tensorboardX import SummaryWriter

# Small tensorboardX demo: log one random scalar and a pair of analytic
# curves for 100 steps under the "scalar" log directory.
writer = SummaryWriter(comment="base_scalar", log_dir="scalar")
for step in range(100):
    writer.add_scalar("scalar/test", np.random.rand(), step)
    curves = {
        'xsinx': step * np.sin(step),
        'xcosx': step * np.cos(step),
    }
    writer.add_scalars("scalar/scalars_test", curves, step)
writer.close()
def train(season_id, dm_train_set, dm_test_set):
    """Train the end-to-end CNN danmaku classifier for one season.

    season_id -- season directory name under ./tmp holding 'unigram_weights.txt'.
    dm_train_set / dm_test_set -- project Dataset objects (must expose
        vocab_size() and yield dicts with 'sentence' and 'label').
    Side effects: prints progress, optionally logs to tensorboard, and
    prints the best test accuracy at the end. Returns None.
    """
    EMBEDDING_DIM = 200
    feature_dim = 50
    max_len = 49
    windows_size = [1, 2, 3, 4]
    batch_size = 128
    epoch_num = 100
    max_acc = 0
    max_v_acc = 0  # only updated by the (commented-out) validation block below
    model_save_path = '.tmp/model_save/straight_CNN.model'  # NOTE(review): currently unused

    dm_dataloader = data.DataLoader(dataset=dm_train_set,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    drop_last=True,
                                    num_workers=8)
    dm_test_dataloader = data.DataLoader(dataset=dm_test_set,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         drop_last=False,
                                         num_workers=8)

    model = E2ECNNModeler(dm_train_set.vocab_size(), EMBEDDING_DIM, feature_dim, windows_size, max_len)
    print(model)
    # Warm-start the embedding table from precomputed unigram weights.
    init_weight = np.loadtxt(os.path.join('./tmp', season_id, 'unigram_weights.txt'))
    model.init_emb(init_weight)

    if torch.cuda.is_available():
        print("CUDA : On")
        model.cuda()
    else:
        print("CUDA : Off")

    # Give the embedding table its own parameter group (same LR here, but
    # kept separate so it can be tuned independently).
    embedding_params = list(map(id, model.dynamic_embedding.parameters()))
    other_params = filter(lambda p: id(p) not in embedding_params, model.parameters())
    optimizer = optim.Adam([
        {'params': other_params},
        {'params': model.dynamic_embedding.parameters(), 'lr': 1e-3}
    ], lr=1e-3, betas=(0.9, 0.99))

    # Fix: build the criterion once, not on every batch inside the loop.
    cross_entropy = nn.NLLLoss()

    logging = True
    if logging:
        writer = SummaryWriter()
        log_name = 'Direct_CNN'

    for epoch in range(epoch_num):
        if epoch > 0:
            # Exponential LR decay: multiply every group's LR by 0.8 each epoch.
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.8
        model.train(mode=True)
        for batch_idx, sample_dict in enumerate(dm_dataloader):
            # Variable() is a deprecated no-op since torch 0.4; plain tensors suffice.
            sentence = torch.LongTensor(sample_dict['sentence'])
            label = torch.LongTensor(sample_dict['label'])
            if torch.cuda.is_available():
                sentence = sentence.cuda()
                label = label.cuda()
            optimizer.zero_grad()
            pred = model(sentence)
            loss = cross_entropy(F.log_softmax(pred, dim=1), label)
            if batch_idx % 10 == 0:
                accuracy = valid_util.running_accuracy(pred, label)
                print('epoch: %d batch %d : loss: %4.6f accuracy: %4.6f'
                      % (epoch, batch_idx, loss.item(), accuracy))
                if logging:
                    writer.add_scalar(log_name + '_data/loss', loss.item(),
                                      epoch * 10 + batch_idx // 10)
            loss.backward()
            optimizer.step()

        model.eval()
        if logging:
            # Per-class precision/recall/F1 and accuracy on the test split.
            result_dict = valid_util.validate(model, dm_test_set, dm_test_dataloader, mode='report')
            writer.add_scalars(
                log_name + '_data/0-PRF', {
                    '0-Precision': result_dict['0']['precision'],
                    '0-Recall': result_dict['0']['recall'],
                    '0-F1-score': result_dict['0']['f1-score']
                }, epoch)
            writer.add_scalars(
                log_name + '_data/1-PRF', {
                    '1-Precision': result_dict['1']['precision'],
                    '1-Recall': result_dict['1']['recall'],
                    '1-F1-score': result_dict['1']['f1-score']
                }, epoch)
            writer.add_scalar(log_name + '_data/accuracy', result_dict['accuracy'], epoch)

        accuracy = valid_util.validate(model, dm_test_set, dm_test_dataloader, mode='output')
        if accuracy > max_acc:
            max_acc = accuracy
        # dm_valid_set = pickle.load(open(os.path.join('./tmp', season_id, 'unigram_valid_dataset.pkl'), 'rb'))
        # v_acc = valid_util.validate(model, dm_valid_set, mode='output')
        # if v_acc > max_v_acc:
        #     max_v_acc = v_acc

    if logging:
        writer.close()
    print("Max Accuracy: %4.6f" % max_acc)
    print("Max Validation Accuracy: %4.6f" % max_v_acc)
    return
print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end - start)) print( "\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f dloss s: %.4f dloss t: %.4f dloss s pixel: %.4f dloss t pixel: %.4f eta: %.4f" \ % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box, dloss_s, dloss_t, dloss_s_p, dloss_t_p, args.eta)) if args.use_tfboard: info = { 'loss': loss_temp, 'loss_rpn_cls': loss_rpn_cls, 'loss_rpn_box': loss_rpn_box, 'loss_rcnn_cls': loss_rcnn_cls, 'loss_rcnn_box': loss_rcnn_box } logger.add_scalars("logs_s_{}/losses".format(args.session), info, (epoch - 1) * iters_per_epoch + step) loss_temp = 0 start = time.time() save_name = os.path.join( output_dir, 'globallocal_target_{}_eta_{}_local_context_{}_global_context_{}_gamma_{}_session_{}_epoch_{}_step_{}.pth' .format(args.dataset_t, args.eta, args.lc, args.gc, args.gamma, args.session, epoch, step)) save_checkpoint( { 'session': args.session, 'epoch': epoch + 1,
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file.

    config_path -- path to a protobuf text TrainEvalPipelineConfig.
    model_dir -- directory for checkpoints, logs and summaries (created if
        missing; a fresh sibling is created when create_folder is set and
        model_dir already exists).
    result_path -- where eval detections go; defaults to model_dir/'results'.
    display_step -- log metrics every N global steps.
    summary_step -- unused in this body.
    pickle_result -- if True, keep detections in memory and pickle them;
        otherwise write KITTI-format label files to disk.

    Alternates train phases of `train_cfg.steps_per_eval` steps with full
    evaluations, checkpointing on a wall-clock timer and on any exception.
    Relies on module-level imports (torchplus, builders, kitti utilities,
    SummaryWriter, ...) outside this view.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    # Keep a backup of the exact config used for this run.
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config
    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    # Bird's-eye-view range: x/y mins and maxs of the point-cloud range.
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        # fp16 weights; metrics and norm layers stay in fp32 for stability.
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################
    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        # Give each DataLoader worker a distinct, time-based RNG seed.
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)
    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))

    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t
    # One outer loop iteration == one train phase followed by one eval.
    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch
    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        print(total_loop)
        for _ in range(total_loop):
            # Last phase may be shorter than steps_per_eval.
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            print(steps)
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    # Dataset exhausted: optionally reset metrics, then restart.
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)

                batch_size = example["anchors"].shape[0]

                ret_dict = net(example_torch)

                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    # Scale loss up before fp16 backward; the wrapper unscales grads.
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)

                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                # Pos/neg anchor counts for the first sample in the batch.
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    # Per-regression-target localization loss, batch-averaged.
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    # if unlabeled_training:
                    #     metrics["loss"]["diff_rt"] = float(
                    #         diff_loc_loss_reduced.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example['image_idx'][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    # Scalars go to tensorboard; lists become indexed scalar groups.
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                # Wall-clock-based checkpointing, independent of step count.
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            # ---- evaluation phase ----
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.batch_size + 1)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.lidar_input)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)
                prog_bar.print_bar()

            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(
                f"avg postprocess time per example: {net.avg_postprocess_time:.3f}"
            )
            net.clear_time_metrics()
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            result = get_official_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            # global_step here is the value from the last train step of this phase.
            writer.add_text('eval_result', result, global_step)
            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        # Checkpoint before propagating so the crash loses no progress.
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
class MtcnnTrainer(object):
    """ Train Templet

    Generic trainer for MTCNN-style face networks: wires datasets,
    optimizer/scheduler and TensorBoard logging together and keeps a
    rolling window of the newest ``num_to_keep`` checkpoints on disk.
    """

    def __init__(self, configer, net, params, trainset, validset, testset,
                 criterion, optimizer, lr_scheduler, num_to_keep=5, valid_freq=1):
        """
        Params:
            configer:     project config (batchsize, lrbase, gamma, cuda, n_epoch, ...)
            net:          network to train
            params:       parameters handed to the optimizer factory
            trainset/validset/testset: datasets
            criterion:    callable -> (loss, loss_cls, loss_offset, loss_landmark)
            optimizer:    optimizer class, instantiated here
            lr_scheduler: scheduler class, instantiated here (ExponentialLR-style)
            num_to_keep:  number of checkpoint files kept on disk
            valid_freq:   validate every N epochs; 0 disables validation
        """
        self.configer = configer
        self.valid_freq = valid_freq
        self.net = net

        ## directory for log and checkpoints
        self.logdir = os.path.join(configer.logdir, self.net._get_name())
        if not os.path.exists(self.logdir):
            os.makedirs(self.logdir)
        self.ckptdir = configer.ckptdir
        if not os.path.exists(self.ckptdir):
            os.makedirs(self.ckptdir)

        ## datasets
        self.trainset = trainset
        self.validset = validset
        self.testset = testset
        self.trainloader = DataLoader(trainset, configer.batchsize, True, collate_fn=collate_fn)
        self.validloader = DataLoader(validset, configer.batchsize, True, collate_fn=collate_fn)
        self.testloader = DataLoader(testset, configer.batchsize, False, collate_fn=collate_fn)

        ## for optimization
        self.criterion = criterion
        self.optimizer = optimizer(params, configer.lrbase, weight_decay=4e-5)
        # self.lr_scheduler = lr_scheduler(self.optimizer, configer.adjstep, configer.gamma)  # MultiStepLR
        self.lr_scheduler = lr_scheduler(self.optimizer, configer.gamma)  # ExponentialLR

        self.writer = SummaryWriter(configer.logdir)
        self.writer.add_graph(self.net, (torch.rand([1] + trainset.image_size), ))

        ## initialize
        self.valid_loss = float('inf')  # best (lowest) validation loss so far
        self.elapsed_time = 0
        self.cur_epoch = 0
        self.cur_batch = 0
        self.save_times = 0             # monotonically increasing checkpoint index
        self.num_to_keep = num_to_keep

        ## print information
        # stat(self.net, trainset.image_size)
        if configer.cuda and cuda.is_available():
            self.net.cuda()

        print("==============================================================================================")
        print("model: {}".format(self.net._get_name()))
        print("logdir: {}".format(self.logdir))
        print("ckptdir: {}".format(self.ckptdir))
        print("train samples: {}k".format(len(trainset)/1000))
        print("valid samples: {}k".format(len(validset)/1000))
        print("batch size: {}".format(configer.batchsize))
        print("batch per epoch: {}".format(len(trainset)/configer.batchsize))
        print("epoch: [{:4d}]/[{:4d}]".format(self.cur_epoch, configer.n_epoch))
        print("val frequency: {}".format(self.valid_freq))
        print("learing rate: {}".format(configer.lrbase))
        print("==============================================================================================")

    def train(self):
        """Run the remaining epochs; checkpoint on validation improvement
        (or every epoch when validation is disabled)."""
        n_epoch = self.configer.n_epoch - self.cur_epoch
        print("Start training! current epoch: {}, remain epoch: {}".format(self.cur_epoch, n_epoch))
        bar = ProcessBar(n_epoch)
        loss_train = 0.
        loss_valid = 0.

        for i_epoch in range(n_epoch):
            if self.configer.cuda and cuda.is_available():
                cuda.empty_cache()

            self.cur_epoch += 1
            bar.step()
            self.lr_scheduler.step(self.cur_epoch)
            cur_lr = self.lr_scheduler.get_lr()[-1]
            self.writer.add_scalar('{}/lr'.format(self.net._get_name()), cur_lr, self.cur_epoch)

            loss_train = self.train_epoch()
            if self.valid_freq != 0 and self.cur_epoch % self.valid_freq == 0:
                loss_valid = self.valid_epoch()
            else:
                # carry the best-so-far value so the joint plot stays continuous
                loss_valid = self.valid_loss

            self.writer.add_scalars('loss', {'train': loss_train, 'valid': loss_valid}, self.cur_epoch)

            if self.valid_freq == 0:
                # no validation signal: just checkpoint every epoch
                self.save_checkpoint()
            else:
                if loss_valid < self.valid_loss:
                    self.valid_loss = loss_valid
                    self.save_checkpoint()

    def train_epoch(self):
        """One pass over the training set; returns the mean batch loss."""
        self.net.train()
        avg_loss = []
        n_batch = len(self.trainset) // self.configer.batchsize
        bar = ProcessBar(n_batch, title=' [Train|Epoch %d] ' % self.cur_epoch)

        for i_batch, (images, labels, offsets, landmarks) in enumerate(self.trainloader):
            bar.step(i_batch)
            self.cur_batch += 1

            if self.configer.cuda and cuda.is_available():
                images = images.cuda()
                labels = labels.cuda()
                offsets = offsets.cuda()
                landmarks = landmarks.cuda()

            pred = self.net(images)
            loss_i, loss_cls, loss_offset, loss_landmark = self.criterion(pred, labels, offsets, landmarks)

            # Face/non-face accuracy: channel 0 holds the face logit; labels
            # 1 and -2 count as positives (presumably full / part faces —
            # TODO confirm label semantics), samples with label < 0 are
            # excluded from the accuracy via `mask`.
            cls_pred = torch.where(torch.sigmoid(pred[:, 0].squeeze()) > 0.5,
                                   torch.ones_like(labels), torch.zeros_like(labels))
            cls_gt = torch.where((labels == 1)^(labels == -2),
                                 torch.ones_like(labels), torch.zeros_like(labels))
            mask = labels >= 0
            cls_pred = torch.masked_select(cls_pred, mask)
            cls_gt = torch.masked_select(cls_gt, mask)
            acc_i = torch.mean((cls_pred == cls_gt).float())

            self.optimizer.zero_grad()
            loss_i.backward()
            self.optimizer.step()

            global_step = self.cur_epoch*n_batch + i_batch
            self.writer.add_scalar('{}/train/loss_i'.format(self.net._get_name()), loss_i, global_step=global_step)
            self.writer.add_scalar('{}/train/loss_cls'.format(self.net._get_name()), loss_cls, global_step=global_step)
            self.writer.add_scalar('{}/train/loss_offset'.format(self.net._get_name()), loss_offset, global_step=global_step)
            self.writer.add_scalar('{}/train/loss_landmark'.format(self.net._get_name()), loss_landmark, global_step=global_step)
            self.writer.add_scalar('{}/train/acc_i'.format(self.net._get_name()), acc_i, global_step=global_step)

            avg_loss += [loss_i.detach().cpu().numpy()]

        avg_loss = np.mean(np.array(avg_loss))
        return avg_loss

    def valid_epoch(self):
        """One gradient-free pass over the validation set; returns mean loss."""
        self.net.eval()
        avg_loss = []
        n_batch = len(self.validset) // self.configer.batchsize
        bar = ProcessBar(n_batch, title=' [Valid|Epoch %d] ' % self.cur_epoch)

        with torch.no_grad():
            for i_batch, (images, labels, offsets, landmarks) in enumerate(self.validloader):
                bar.step(i_batch)

                if self.configer.cuda and cuda.is_available():
                    images = images.cuda()
                    labels = labels.cuda()
                    offsets = offsets.cuda()
                    landmarks = landmarks.cuda()

                pred = self.net(images)
                loss_i, loss_cls, loss_offset, loss_landmark = self.criterion(pred, labels, offsets, landmarks)

                # Same face/non-face accuracy bookkeeping as in train_epoch.
                cls_pred = torch.where(torch.sigmoid(pred[:, 0].squeeze()) > 0.5,
                                       torch.ones_like(labels), torch.zeros_like(labels))
                cls_gt = torch.where((labels == 1)^(labels == -2),
                                     torch.ones_like(labels), torch.zeros_like(labels))
                mask = labels >= 0
                cls_pred = torch.masked_select(cls_pred, mask)
                cls_gt = torch.masked_select(cls_gt, mask)
                acc_i = torch.mean((cls_pred == cls_gt).float())

                global_step = self.cur_epoch*n_batch + i_batch
                self.writer.add_scalar('{}/valid/loss_i'.format(self.net._get_name()), loss_i, global_step=global_step)
                self.writer.add_scalar('{}/valid/loss_cls'.format(self.net._get_name()), loss_cls, global_step=global_step)
                self.writer.add_scalar('{}/valid/loss_offset'.format(self.net._get_name()), loss_offset, global_step=global_step)
                self.writer.add_scalar('{}/valid/loss_landmark'.format(self.net._get_name()), loss_landmark, global_step=global_step)
                self.writer.add_scalar('{}/valid/acc_i'.format(self.net._get_name()), acc_i, global_step=global_step)

                avg_loss += [loss_i.detach().cpu().numpy()]

        avg_loss = np.mean(np.array(avg_loss))
        return avg_loss

    def test(self):
        pass

    def save_checkpoint(self):
        """Dump a checkpoint and delete the one falling out of the
        ``num_to_keep`` window.

        FIX: optimizer and lr-scheduler states are now saved as well —
        ``load_checkpoint`` reads 'optimizer_state' and 'lr_scheduler_state'
        and raised ``KeyError`` on checkpoints that lacked them.
        """
        checkpoint_state = {
            'save_time': getTime(),
            'cur_epoch': self.cur_epoch,
            'cur_batch': self.cur_batch,
            'elapsed_time': self.elapsed_time,
            'valid_loss': self.valid_loss,
            'save_times': self.save_times,
            'net_state': self.net.state_dict(),
            'optimizer_state': self.optimizer.state_dict(),
            'lr_scheduler_state': self.lr_scheduler.state_dict(),
        }
        checkpoint_path = os.path.join(self.ckptdir, "{}_{:04d}.pkl".\
                        format(self.net._get_name(), self.save_times))
        torch.save(checkpoint_state, checkpoint_path)

        # evict the checkpoint that falls out of the rolling window
        checkpoint_path = os.path.join(self.ckptdir, "{}_{:04d}.pkl".\
                        format(self.net._get_name(), self.save_times-self.num_to_keep))
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)
        self.save_times += 1
        # print("checkpoint saved at {}".format(checkpoint_path))

    def load_checkpoint(self, index):
        """Restore trainer state from checkpoint number ``index``."""
        checkpoint_path = os.path.join(self.ckptdir, "{}_{:04d}.pkl".\
                        format(self.net._get_name(), index))
        checkpoint_state = torch.load(checkpoint_path, map_location='cuda' if cuda.is_available() else 'cpu')
        self.cur_epoch = checkpoint_state['cur_epoch']
        self.cur_batch = checkpoint_state['cur_batch']
        self.elapsed_time = checkpoint_state['elapsed_time']
        self.valid_loss = checkpoint_state['valid_loss']
        self.save_times = checkpoint_state['save_times']
        self.net.load_state_dict(checkpoint_state['net_state'])
        self.optimizer.load_state_dict(checkpoint_state['optimizer_state'])
        self.lr_scheduler.load_state_dict(checkpoint_state['lr_scheduler_state'])
bg_cnt = rois_label.data.numel() - fg_cnt
# Console log: averaged loss, lr, fg/bg sample counts and per-head losses.
print("[session %d][epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e" \
      % (args.session, epoch, step, iters_per_epoch, loss_temp, lr))
print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end-start))
print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" \
      % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))
if args.use_tfboard:
    # Mirror the four component losses to TensorBoard under one tag.
    info = {
        'loss': loss_temp,
        'loss_rpn_cls': loss_rpn_cls,
        'loss_rpn_box': loss_rpn_box,
        'loss_rcnn_cls': loss_rcnn_cls,
        'loss_rcnn_box': loss_rcnn_box
    }
    logger.add_scalars("logs_s_{}/losses".format(args.session),
                       info, (epoch - 1) * iters_per_epoch + step)

# reset the running-loss window and the interval timer
loss_temp = 0
start = time.time()

# Checkpoint; unwrap the DataParallel `.module` when training multi-GPU.
save_name = os.path.join(output_dir,
                         'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
save_checkpoint({
    'session': args.session,
    'epoch': epoch + 1,
    'model': fasterRCNN.module.state_dict() if args.mGPUs else fasterRCNN.state_dict(),
    'optimizer': optimizer.state_dict(),
    'pooling_mode': cfg.POOLING_MODE,
    'class_agnostic': args.class_agnostic,
}, save_name)
print('save model: {}'.format(save_name))
# Hard-coded confusion counts at five thresholds (tensorboardX
# add_pr_curve_raw-style demo data); precision/recall are pre-computed
# from these counts.
true_positive_counts = [75, 64, 21, 5, 0]
false_positive_counts = [150, 105, 18, 0, 0]
true_negative_counts = [0, 45, 132, 150, 150]
false_negative_counts = [0, 11, 54, 70, 75]
precision = [0.3333333, 0.3786982, 0.5384616, 1.0, 0.0]
recall = [1.0, 0.8533334, 0.28, 0.0666667, 0.0]

for n_iter in range(100):
    s1 = torch.rand(1)  # value to keep
    s2 = torch.rand(1)  # NOTE(review): never used below
    writer.add_scalar('data/scalar1', s1[0], n_iter)  # data grouping by `slash`
    writer.add_scalars(
        'data/scalar_group', {
            "xsinx": n_iter * np.sin(n_iter),
            "xcosx": n_iter * np.cos(n_iter),
            "arctanx": np.arctan(n_iter)
        }, n_iter)
    x = torch.rand(32, 3, 64, 64)  # output from network
    if n_iter % 10 == 0:
        # every 10th step: log an image grid, an audio clip and a text entry
        x = vutils.make_grid(x, normalize=True, scale_each=True)
        writer.add_image('Image', x, n_iter)  # Tensor
        #writer.add_image('astronaut', skimage.data.astronaut(), n_iter) # numpy
        #writer.add_image('imread', skimage.io.imread('screenshots/audio.png'), n_iter) # numpy
        # 2-second cosine tone; sound amplitude should in [-1, 1]
        x = torch.zeros(sample_rate * 2)
        for i in range(x.size(0)):
            x[i] = np.cos(
                freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
        writer.add_audio('myAudio', x, n_iter)
        writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
cnn, train_loader, phase='training')
# (the line above is the tail of a fit(...) call whose head lies outside
#  this chunk, presumably: epoch_loss, epoch_accuracy = fit(epoch, ...))
val_epoch_loss, val_epoch_accuracy = fit(epoch,
                                         cnn,
                                         val_loader,
                                         phase='validation')
# accumulate per-epoch history for plotting/histograms
train_losses.append(epoch_loss)
train_accuracy.append(epoch_accuracy)
val_losses.append(val_epoch_loss)
val_accuracy.append(val_epoch_accuracy)
# shared tags so train/val curves overlay in TensorBoard
writer_train.add_scalars("losses", {
    'train_bm': epoch_loss,
    'val_bm': val_epoch_loss
}, int(epoch))
writer_train.add_scalars("accuracies", {
    'train_bm': epoch_accuracy,
    'val_bm': val_epoch_accuracy
}, int(epoch))

# Learning rate scheduler update
scheduler.step(val_epoch_loss)
writer_train.add_histogram("error_bm", np.array(train_losses))

# NOTE(review): `clock` — if this is time.clock it was removed in
# Python 3.8; confirm where `clock`/`start` come from.
elapsed = clock() - start
print(elapsed)
iou_test += env.iou()

# Average accumulated test metrics over the evaluation iterations.
reward_test_total = reward_test_total / N_iteration_test
IOU_test_total = iou_test / N_iteration_test
secs = int(time.time() - start_time)
mins = secs / 60  # NOTE(review): true division gives float minutes; %d truncates
secs = secs % 60
print('Epodise: ', episode, '| Ep_reward_test:', reward_test_total,
      '| Ep_IOU_test: ', IOU_test_total)
print(" | time in %d minutes, %d seconds\n" % (mins, secs))

# Linear epsilon decay down to FINAL_EPSILON.
if agent.greedy_epsilon > FINAL_EPSILON:
    agent.greedy_epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / N_iteration

# Snapshot both networks whenever test reward ties or beats the best so far.
if reward_test_total >= best_reward:
    torch.save(agent.Eval_net.state_dict(),
               log_path + 'Eval_net_episode_%d.pth' % (episode))
    torch.save(agent.Target_net.state_dict(),
               log_path + 'Target_net_episode_%d.pth' % (episode))
    best_reward = reward_test_total

writer.add_scalars(
    OUT_FILE_NAME, {
        'train_loss': train_loss,
        'train_reward': reward_train,
        'train_iou': train_iou,
        'test_reward': reward_test_total,
        'test_iou': IOU_test_total,
    }, episode)

# Export all scalars to JSON as well (writer.export_scalars_to_json is
# tensorboardX-specific).
JSON_log_PATH = "./JSON/"
if os.path.exists(JSON_log_PATH) == False:
    os.makedirs(JSON_log_PATH)
writer.export_scalars_to_json(JSON_log_PATH + OUT_FILE_NAME + ".json")
writer.close()
def train(season_id, dm_train_set, dm_test_set, features, edges):
    """Train a GCN classifier on the danmaku graph and report test accuracy.

    Builds the graph from (features, edges), optimizes with Adam for
    ``epoch_num`` epochs, validates after every epoch, and prints the best
    accuracy seen.  TensorBoard logging is gated by the local ``logging``
    flag (off by default).
    """
    EMBEDDING_DIM = 200
    batch_size = 128
    epoch_num = 300
    max_acc = 0
    max_v_acc = 0  # NOTE(review): never updated or read below
    model_save_path = './tmp/model_save/gcn.model'  # NOTE(review): unused — the model is never saved
    dm_dataloader = data.DataLoader(dataset=dm_train_set, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=8)
    dm_test_dataloader = data.DataLoader(dataset=dm_test_set, batch_size=batch_size, shuffle=False, drop_last=False, num_workers=8)

    graph = build_graph(features, edges)
    # only needed by the commented-out init_emb call below
    features = torch.FloatTensor(features)
    graph = graph.to(device)
    model = GCN(graph, EMBEDDING_DIM, 256, dropout=0.5)
    # model.init_emb(features)
    print(model)
    model.to(device)
    if torch.cuda.is_available():
        print("CUDA : On")
    else:
        print("CUDA : Off")

    optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.99))
    # scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

    # flip to True to enable TensorBoard logging
    logging = False
    if logging:
        writer = SummaryWriter()
        log_name = 'gcn'

    for epoch in tqdm(range(epoch_num)):
        model.train(mode=True)
        # scheduler.step()
        for batch_idx, sample_dict in enumerate(dm_dataloader):
            sentence = torch.LongTensor(sample_dict['sentence'])
            label = torch.LongTensor(sample_dict['label'])
            sentence = sentence.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            pred = model.forward(sentence)
            cross_entropy = nn.CrossEntropyLoss()
            loss = cross_entropy(pred, label)
            if batch_idx % 10 == 0:
                # running (in-batch) accuracy every 10 batches
                accuracy = valid_util.running_accuracy(pred, label)
                print('epoch: %d batch %d : loss: %4.6f accuracy: %4.6f' % (epoch, batch_idx, loss.item(), accuracy))
                if logging:
                    writer.add_scalar(log_name + '_data/loss', loss.item(), epoch * 10 + batch_idx // 10)
            loss.backward()
            optimizer.step()

        # end-of-epoch evaluation on the held-out set
        model.eval()
        accuracy = valid_util.validate(model, dm_test_set, dm_test_dataloader, mode='output', type='normal')
        if accuracy > max_acc:
            max_acc = accuracy
        if logging:
            # per-class precision/recall/F1 plus accuracy curves
            result_dict = valid_util.validate(model, dm_test_set, dm_test_dataloader, mode='report', type='normal')
            writer.add_scalars(
                log_name + '_data/0-PRF', {
                    '0-Precision': result_dict['0']['precision'],
                    '0-Recall': result_dict['0']['recall'],
                    '0-F1-score': result_dict['0']['f1-score']
                }, epoch)
            writer.add_scalars(
                log_name + '_data/1-PRF', {
                    '1-Precision': result_dict['1']['precision'],
                    '1-Recall': result_dict['1']['recall'],
                    '1-F1-score': result_dict['1']['f1-score']
                }, epoch)
            writer.add_scalars(log_name + '_data/accuracy', {
                'accuracy': result_dict['accuracy'],
                'max_accuracy': max_acc
            }, epoch)

    if logging:
        writer.close()
    print("Max Accuracy: %4.6f" % max_acc)
    return
def train(self):
    """Main training loop.

    Optimizes the network, periodically evaluates on train/val data,
    streams losses and figures to TensorBoard, keeps the best snapshot
    in ``self.best_result`` and finally evaluates on the test set.

    NOTE(review): kappa is tracked like a loss (initialized to +inf and
    "best" means lower).  Cohen's kappa is normally higher-is-better —
    confirm the intended selection direction before trusting it.
    """
    img_size = Constants.IMAGE_SIZE
    # Pretrained backbones expect 3-channel input, scratch models 1-channel.
    if not Constants.PRETRAINED:
        summary(self.network, (1, img_size, img_size))
    else:
        summary(self.network, (3, img_size, img_size))
    writer = SummaryWriter(Path(self.base_out_dir) / "tensorboard")
    print(f"Run ID : {self.config.run_id}")
    print("Training started at: {}".format(
        time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))
    self.best_result["train_kappa"] = np.inf
    self.best_result["val_kappa"] = np.inf
    start = self.start_epoch
    end = start + self.model_config.epoch
    self.network = self.network.train()
    for epoch in range(start, end):
        for iteration, batch_set in enumerate(self.train_data):
            # Each batch carries a positive and a negative half; stack them.
            inputs = torch.cat((batch_set["positive_x"].to(self.device),
                                batch_set["negative_x"].to(self.device)))
            target = torch.cat((batch_set["positive_y"].to(self.device),
                                batch_set["negative_y"].to(self.device)))
            self.optimizer.zero_grad()
            predictions = self.network(inputs)
            loss = self.loss_function(predictions, target)
            loss.backward()
            self.optimizer.step()

            # Periodic evaluation / logging / checkpointing.
            if iteration % self.model_config.model_dump_gap == 0 \
                    and iteration != 0:
                train_loss, train_kappa = self.test(self.train_data,
                                                    ds_type="train_set")
                # fractional epoch index for smooth x-axis in the plots
                self.results["epochs"].append(epoch +
                                              iteration / len(self.train_data))
                self.results["train_loss"].append(train_loss)
                self.results["train_kappa"].append(train_kappa)
                print("lr: {:.2E}".format(
                    self.optimizer.param_groups[0]['lr']))
                print(
                    "{} Epoch: {}, Iteration: {}, Train loss: {:.4f}, Train Cohen Kappa Score: {:.4f}"
                    .format(
                        time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()),
                        epoch, iteration, train_loss, train_kappa))
                writer.add_scalars("Loss", {"train_loss": train_loss}, epoch)
                if self.val_data:
                    val_loss, val_kappa = self.test(self.val_data)
                    self.results["val_loss"].append(val_loss)
                    self.results["val_kappa"].append(val_kappa)
                    print(
                        "{} Epoch: {}, Iteration: {}, Val loss: {:.4f}, Val Cohen Kappa Score: {:.4f}"
                        .format(
                            time.strftime("%Y-%m-%d-%H:%M:%S",
                                          time.localtime()), epoch,
                            iteration, val_loss, val_kappa))
                    writer.add_scalars("Loss", {"val_loss": val_loss}, epoch)

                    # Now plotting
                    loss_fig = generate_plot(self.results["epochs"],
                                             self.results["train_loss"],
                                             self.results["val_loss"],
                                             title="Loss Progression",
                                             y_label="loss")
                    writer.add_figure("Loss", loss_fig)
                    loss_fig.savefig(Path(self.base_out_dir) / "loss.png")
                    plt.show()
                    plt.close(loss_fig)
                    kappa_score_fig = generate_plot(
                        self.results["epochs"],
                        self.results["train_kappa"],
                        self.results["val_kappa"],
                        title="Cohen Kappa Score",
                        y_label="CKS")
                    writer.add_figure("Cohen Kappa Score", kappa_score_fig)
                    # BUG FIX: previously saved `loss_fig` here, so
                    # kappa_score.png contained the loss plot.
                    kappa_score_fig.savefig(
                        Path(self.base_out_dir) / "kappa_score.png")
                    plt.show()
                    plt.close(kappa_score_fig)
                    if (self.results["train_kappa"][-1] <=
                            self.best_result["train_kappa"]
                            and self.results["val_kappa"][-1] <
                            self.best_result["val_kappa"]):
                        self.best_result["train_kappa"] = self.results[
                            "train_kappa"][-1]
                        self.best_result["val_kappa"] = self.results[
                            "val_kappa"][-1]
                        # FIX: record the epoch on best_result itself — the
                        # final summary print reads best_result["epoch"],
                        # which was never written in this method and would
                        # raise KeyError.
                        self.best_result["epoch"] = epoch
                        self.best_result["state"] = {
                            "epoch": epoch,
                            "network_dict":
                            deepcopy(self.network.state_dict()),
                            "optimizer_dict":
                            deepcopy(self.optimizer.state_dict()),
                            "results": deepcopy(self.results)
                        }
                    self.save_model(epoch, tag=iteration)
        self.scheduler.step()
    print(
        "Best results: Epoch{}, Train Cohen Kappa Score: {}, Val Cohen Kappa Score: {}"
        .format(self.best_result["epoch"], self.best_result["train_kappa"],
                self.best_result["val_kappa"]))
    model_save_dir = "best_model_{}.pth".format(
        str(self.best_result["state"]["epoch"]))
    torch.save(self.best_result["state"],
               Path(self.model_out_dir) / model_save_dir)
    if self.test_data:
        test_loss, test_kappa = self.test(self.test_data)
        print(
            "{} Epoch: {}, Test loss: {:.4f}, Test Cohen Kappa Score: {:.4f}"
            .format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()),
                    epoch, test_loss, test_kappa))
def train(self, protocol_name, subset='development', n_calls=1):
    """Tune the pipeline's hyper-parameters on `subset` of `protocol_name`.

    Runs optimisation trials from ``pipeline_.tune_iter``, logging the
    latest and best loss/params to TensorBoard, dumping every new best to
    ``params.yml`` (with a lock file), and finally printing the overall
    best configuration found in the tune database.
    """
    train_dir = self.TRAIN_DIR.format(
        experiment_dir=self.experiment_dir,
        protocol=protocol_name,
        subset=subset)
    mkdir_p(train_dir)

    protocol = get_protocol(protocol_name, progress=False,
                            preprocessors=self.preprocessors_)

    tune_db = f'{train_dir}/tune.db'
    params_yml = f'{train_dir}/params.yml'
    params_yml_lock = f'{train_dir}/params.yml.lock'

    # one TensorBoard run per process, so concurrent tuners don't collide
    pid = os.getpid()
    writer = SummaryWriter(log_dir=f"{train_dir}/{pid}")

    progress_bar = tqdm(unit='trial')
    progress_bar.set_description('Trial #1 : ...')
    progress_bar.update(0)

    iterations = self.pipeline_.tune_iter(
        tune_db, protocol, subset=subset,
        sampler=self.sampler_)

    for s, status in enumerate(iterations):

        # NOTE(review): breaks *before* processing the n_calls-th status,
        # so n_calls=1 logs nothing at all — confirm the off-by-one is
        # intended.
        if s+1 == n_calls:
            break

        loss = status['latest']['loss']
        writer.add_scalar(
            f'train/{protocol_name}.{subset}/loss/latest', loss,
            global_step=s + 1)
        writer.add_scalars(
            f'train/{protocol_name}.{subset}/params/latest',
            status['latest']['params'], global_step=s + 1)

        if 'new_best' in status:
            # persist the new best configuration to params.yml
            _ = self.dump(status['new_best'], params_yml, params_yml_lock)
            n_trials = status['new_best']['n_trials']
            best_loss = status['new_best']['loss']
            writer.add_scalar(
                f'train/{protocol_name}.{subset}/loss/best', best_loss,
                global_step=n_trials)
            writer.add_scalars(
                f'train/{protocol_name}.{subset}/params/best',
                status['new_best']['params'], global_step=n_trials)

        # progress bar
        # NOTE(review): best_loss / n_trials are only bound once a
        # 'new_best' has been seen; if the first status lacks one this
        # raises NameError — confirm tune_iter always reports an initial
        # best.
        desc = f"Trial #{s+1}"
        loss = status['latest']['loss']
        # percentages for sub-unit losses, raw values otherwise
        if abs(loss) < 1:
            desc += f" = {100 * loss:.3f}%"
            desc += f" : Best = {100 * best_loss:.3f}% after {n_trials} trials"
        else:
            desc += f" = {loss:.3f}"
            desc += f" : Best = {best_loss:.3f} after {n_trials} trials"
        progress_bar.set_description(desc=desc)
        progress_bar.update(1)

    # final report: best configuration over the whole tune database
    best = self.pipeline_.best(tune_db)
    content = self.dump(best, params_yml, params_yml_lock)

    sep = "=" * max(len(params_yml),
                    max(len(l) for l in content.split('\n')))
    print(f"\n{sep}\n{params_yml}\n{sep}\n{content}{sep}")
    print(f"Loss = {best['loss']:g} | {best['n_trials']} trials")
    print(f"{sep}")
info_loss.backward()
optimizerInfo.step()

# Accumulate per-batch statistics for the next TensorBoard flush.
total_g_loss += g_loss.data / batch_size
total_d_loss += d_loss.data / batch_size
total_info_loss += info_loss.data / batch_size
total_real_prob += real.mean(0).data
total_fake_prob += fake.mean(0).data
total_fake2_prob += fake2.mean(0).data
t2.update()

# Every `plot_every` batches: log window averages, then reset the window.
if i % plot_every == 0 and i != 0:
    writer.add_scalars(
        'infogan/loss', {
            'g_loss': total_g_loss / plot_every,
            'd_loss': total_d_loss / plot_every,
            'info_loss': total_info_loss / plot_every
        }, plot)
    writer.add_scalars(
        'infogan/prob', {
            'real_data': total_real_prob / plot_every,
            'fake_data_before': total_fake_prob / plot_every,
            'fake_data_after': total_fake2_prob / plot_every
        }, plot)
    plot += 1
    total_g_loss = 0.0
    total_d_loss = 0.0
    total_info_loss = 0.0
    total_real_prob = 0.0
    total_fake_prob = 0.0
def main():
  """Deep-mutual-learning training entry point (AlignedReID style).

  Trains ``cfg.num_models`` models in lock-step, one worker thread each.
  Every step has two phases: phase 1 computes each model's forward pass
  and its single-model losses and publishes probs / distance matrices;
  phase 2 (entered only after *all* models finished phase 1) adds the
  mutual losses computed against the other models' phase-1 outputs and
  runs backward + optimizer step.  The main thread drives the phases via
  ``run_event1`` / ``run_event2`` and busy-waits on the done flags.
  """
  cfg = Config()

  # Redirect logs to both console and file.
  if cfg.log_to_file:
    ReDirectSTD(cfg.stdout_file, 'stdout', False)
    ReDirectSTD(cfg.stderr_file, 'stderr', False)

  # Lazily create SummaryWriter
  writer = None

  TVTs, TMOs, relative_device_ids = set_devices_for_ml(cfg.sys_device_ids)

  if cfg.seed is not None:
    set_seed(cfg.seed)

  # Dump the configurations to log.
  import pprint
  print('-' * 60)
  print('cfg.__dict__')
  pprint.pprint(cfg.__dict__)
  print('-' * 60)

  ###########
  # Dataset #
  ###########

  train_set = create_dataset(**cfg.train_set_kwargs)

  test_sets = []
  test_set_names = []
  # 'combined' evaluates on all three benchmarks; otherwise just one.
  if cfg.dataset == 'combined':
    for name in ['market1501', 'cuhk03', 'duke']:
      cfg.test_set_kwargs['name'] = name
      test_sets.append(create_dataset(**cfg.test_set_kwargs))
      test_set_names.append(name)
  else:
    test_sets.append(create_dataset(**cfg.test_set_kwargs))
    test_set_names.append(cfg.dataset)

  ###########
  # Models  #
  ###########

  models = [Model(local_conv_out_channels=cfg.local_conv_out_channels,
                  num_classes=len(train_set.ids2labels))
            for _ in range(cfg.num_models)]
  # Model wrappers; each model gets its own set of devices.
  model_ws = [DataParallel(models[i], device_ids=relative_device_ids[i])
              for i in range(cfg.num_models)]

  #############################
  # Criteria and Optimizers   #
  #############################

  id_criterion = nn.CrossEntropyLoss()
  g_tri_loss = TripletLoss(margin=cfg.global_margin)
  l_tri_loss = TripletLoss(margin=cfg.local_margin)

  optimizers = [optim.Adam(m.parameters(),
                           lr=cfg.base_lr,
                           weight_decay=cfg.weight_decay)
                for m in models]

  # Bind them together just to save some codes in the following usage.
  modules_optims = models + optimizers

  ################################
  # May Resume Models and Optims #
  ################################

  if cfg.resume:
    resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file)

  # May Transfer Models and Optims to Specified Device. Transferring optimizers
  # is to cope with the case when you load the checkpoint to a new device.
  for TMO, model, optimizer in zip(TMOs, models, optimizers):
    TMO([model, optimizer])

  ########
  # Test #
  ########

  # Test each model using different distance settings.
  def test(load_model_weight=False):
    if load_model_weight:
      load_ckpt(modules_optims, cfg.ckpt_file)

    use_local_distance = (cfg.l_loss_weight > 0) \
                         and cfg.local_dist_own_hard_sample

    for i, (model_w, TVT) in enumerate(zip(model_ws, TVTs)):
      for test_set, name in zip(test_sets, test_set_names):
        test_set.set_feat_func(ExtractFeature(model_w, TVT))
        print('\n=========> Test Model #{} on dataset: {} <=========\n'
              .format(i + 1, name))
        test_set.eval(
          normalize_feat=cfg.normalize_feature,
          use_local_distance=use_local_distance)

  if cfg.only_test:
    test(load_model_weight=True)
    return

  ############
  # Training #
  ############

  # Storing things that can be accessed cross threads.
  ims_list = [None for _ in range(cfg.num_models)]
  labels_list = [None for _ in range(cfg.num_models)]
  done_list1 = [False for _ in range(cfg.num_models)]
  done_list2 = [False for _ in range(cfg.num_models)]
  probs_list = [None for _ in range(cfg.num_models)]
  g_dist_mat_list = [None for _ in range(cfg.num_models)]
  l_dist_mat_list = [None for _ in range(cfg.num_models)]

  # Two phases for each model:
  # 1) forward and single-model loss;
  # 2) further add mutual loss and backward.
  # The 2nd phase is only ready to start when the 1st is finished for
  # all models.
  run_event1 = threading.Event()
  run_event2 = threading.Event()

  # This event is meant to be set to stop threads. However, as I found, with
  # `daemon` set to true when creating threads, manually stopping is
  # unnecessary. I guess some main-thread variables required by sub-threads
  # are destroyed when the main thread ends, thus the sub-threads throw errors
  # and exit too.
  # Real reason should be further explored.
  exit_event = threading.Event()

  # The function to be called by threads; `i` is the model index.
  def thread_target(i):
    while not exit_event.isSet():
      # If the run event is not set, the thread just waits.
      if not run_event1.wait(0.001):
        continue

      ######################################
      # Phase 1: Forward and Separate Loss #
      ######################################

      TVT = TVTs[i]
      model_w = model_ws[i]
      ims = ims_list[i]
      labels = labels_list[i]
      optimizer = optimizers[i]

      ims_var = Variable(TVT(torch.from_numpy(ims).float()))
      labels_t = TVT(torch.from_numpy(labels).long())
      labels_var = Variable(labels_t)

      global_feat, local_feat, logits = model_w(ims_var)
      # NOTE(review): F.softmax/log_softmax without an explicit `dim` —
      # relies on the old default behaviour; confirm on the pinned torch
      # version.
      probs = F.softmax(logits)
      log_probs = F.log_softmax(logits)

      g_loss, p_inds, n_inds, g_dist_ap, g_dist_an, g_dist_mat = global_loss(
        g_tri_loss, global_feat, labels_t,
        normalize_feature=cfg.normalize_feature)

      if cfg.l_loss_weight == 0:
        l_loss, l_dist_mat = 0, 0
      elif cfg.local_dist_own_hard_sample:
        # Let local distance find its own hard samples.
        l_loss, l_dist_ap, l_dist_an, l_dist_mat = local_loss(
          l_tri_loss, local_feat, None, None, labels_t,
          normalize_feature=cfg.normalize_feature)
      else:
        # Reuse the hard-sample indices found by the global distance.
        l_loss, l_dist_ap, l_dist_an = local_loss(
          l_tri_loss, local_feat, p_inds, n_inds, labels_t,
          normalize_feature=cfg.normalize_feature)
        l_dist_mat = 0

      id_loss = 0
      if cfg.id_loss_weight > 0:
        id_loss = id_criterion(logits, labels_var)

      # Publish phase-1 outputs so the other threads can compute their
      # mutual losses against them in phase 2.
      probs_list[i] = probs
      g_dist_mat_list[i] = g_dist_mat
      l_dist_mat_list[i] = l_dist_mat

      done_list1[i] = True

      # Wait for event to be set, meanwhile checking if need to exit.
      while True:
        phase2_ready = run_event2.wait(0.001)
        if exit_event.isSet():
          return
        if phase2_ready:
          break

      #####################################
      # Phase 2: Mutual Loss and Backward #
      #####################################

      # Probability Mutual Loss (KL Loss)
      pm_loss = 0
      if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
        for j in range(cfg.num_models):
          if j != i:
            # third positional arg False => size_average=False
            pm_loss += F.kl_div(log_probs, TVT(probs_list[j]).detach(), False)
        pm_loss /= 1. * (cfg.num_models - 1) * len(ims)

      # Global Distance Mutual Loss (L2 Loss)
      gdm_loss = 0
      if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
        for j in range(cfg.num_models):
          if j != i:
            gdm_loss += torch.sum(torch.pow(
              g_dist_mat - TVT(g_dist_mat_list[j]).detach(), 2))
        gdm_loss /= 1. * (cfg.num_models - 1) * len(ims) * len(ims)

      # Local Distance Mutual Loss (L2 Loss)
      ldm_loss = 0
      if (cfg.num_models > 1) \
          and cfg.local_dist_own_hard_sample \
          and (cfg.ldm_loss_weight > 0):
        for j in range(cfg.num_models):
          if j != i:
            ldm_loss += torch.sum(torch.pow(
              l_dist_mat - TVT(l_dist_mat_list[j]).detach(), 2))
        ldm_loss /= 1. * (cfg.num_models - 1) * len(ims) * len(ims)

      # Weighted sum of single-model and mutual losses.
      loss = g_loss * cfg.g_loss_weight \
             + l_loss * cfg.l_loss_weight \
             + id_loss * cfg.id_loss_weight \
             + pm_loss * cfg.pm_loss_weight \
             + gdm_loss * cfg.gdm_loss_weight \
             + ldm_loss * cfg.ldm_loss_weight

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      ##################################
      # Step Log For One of the Models #
      ##################################

      # These meters are outer-scope variables
      # Just record for the first model
      if i == 0:
        # precision
        g_prec = (g_dist_an > g_dist_ap).data.float().mean()
        # the proportion of triplets that satisfy margin
        g_m = (g_dist_an > g_dist_ap + cfg.global_margin).data.float().mean()
        g_d_ap = g_dist_ap.data.mean()
        g_d_an = g_dist_an.data.mean()

        g_prec_meter.update(g_prec)
        g_m_meter.update(g_m)
        g_dist_ap_meter.update(g_d_ap)
        g_dist_an_meter.update(g_d_an)
        g_loss_meter.update(to_scalar(g_loss))

        if cfg.l_loss_weight > 0:
          # precision
          l_prec = (l_dist_an > l_dist_ap).data.float().mean()
          # the proportion of triplets that satisfy margin
          l_m = (l_dist_an > l_dist_ap + cfg.local_margin).data.float().mean()
          l_d_ap = l_dist_ap.data.mean()
          l_d_an = l_dist_an.data.mean()

          l_prec_meter.update(l_prec)
          l_m_meter.update(l_m)
          l_dist_ap_meter.update(l_d_ap)
          l_dist_an_meter.update(l_d_an)
          l_loss_meter.update(to_scalar(l_loss))

        if cfg.id_loss_weight > 0:
          id_loss_meter.update(to_scalar(id_loss))

        if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
          pm_loss_meter.update(to_scalar(pm_loss))
        if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
          gdm_loss_meter.update(to_scalar(gdm_loss))
        if (cfg.num_models > 1) \
            and cfg.local_dist_own_hard_sample \
            and (cfg.ldm_loss_weight > 0):
          ldm_loss_meter.update(to_scalar(ldm_loss))

        loss_meter.update(to_scalar(loss))

      ###################
      # End Up One Step #
      ###################

      run_event1.clear()
      run_event2.clear()
      done_list2[i] = True

  threads = []
  for i in range(cfg.num_models):
    thread = threading.Thread(target=thread_target, args=(i,))
    # Set the thread in daemon mode, so that the main program ends normally.
    thread.daemon = True
    thread.start()
    threads.append(thread)

  start_ep = resume_ep if cfg.resume else 0
  for ep in range(start_ep, cfg.total_epochs):
    # Adjust Learning Rate
    for optimizer in optimizers:
      if cfg.lr_decay_type == 'exp':
        adjust_lr_exp(
          optimizer,
          cfg.base_lr,
          ep + 1,
          cfg.total_epochs,
          cfg.exp_decay_at_epoch)
      else:
        adjust_lr_staircase(
          optimizer,
          cfg.base_lr,
          ep + 1,
          cfg.staircase_decay_at_epochs,
          cfg.staircase_decay_multiply_factor)

    may_set_mode(modules_optims, 'train')

    epoch_done = False

    # Fresh per-epoch running averages for every logged quantity.
    g_prec_meter = AverageMeter()
    g_m_meter = AverageMeter()
    g_dist_ap_meter = AverageMeter()
    g_dist_an_meter = AverageMeter()
    g_loss_meter = AverageMeter()

    l_prec_meter = AverageMeter()
    l_m_meter = AverageMeter()
    l_dist_ap_meter = AverageMeter()
    l_dist_an_meter = AverageMeter()
    l_loss_meter = AverageMeter()

    id_loss_meter = AverageMeter()

    # Global Distance Mutual Loss
    gdm_loss_meter = AverageMeter()
    # Local Distance Mutual Loss
    ldm_loss_meter = AverageMeter()
    # Probability Mutual Loss
    pm_loss_meter = AverageMeter()

    loss_meter = AverageMeter()

    ep_st = time.time()
    step = 0
    while not epoch_done:
      step += 1
      step_st = time.time()

      ims, im_names, labels, mirrored, epoch_done = train_set.next_batch()

      # Hand the same batch to every worker and reset the done flags.
      for i in range(cfg.num_models):
        ims_list[i] = ims
        labels_list[i] = labels
        done_list1[i] = False
        done_list2[i] = False

      run_event1.set()
      # Waiting for phase 1 done
      # NOTE(review): busy-wait without sleep — burns a CPU core while the
      # workers run.
      while not all(done_list1):
        continue
      run_event2.set()
      # Waiting for phase 2 done
      while not all(done_list2):
        continue

      ############
      # Step Log #
      ############

      if step % cfg.log_steps == 0:
        time_log = '\tStep {}/Ep {}, {:.2f}s'.format(
          step, ep + 1, time.time() - step_st, )

        if cfg.g_loss_weight > 0:
          g_log = (', gp {:.2%}, gm {:.2%}, '
                   'gd_ap {:.4f}, gd_an {:.4f}, '
                   'gL {:.4f}'.format(
            g_prec_meter.val, g_m_meter.val,
            g_dist_ap_meter.val, g_dist_an_meter.val,
            g_loss_meter.val, ))
        else:
          g_log = ''

        if cfg.l_loss_weight > 0:
          l_log = (', lp {:.2%}, lm {:.2%}, '
                   'ld_ap {:.4f}, ld_an {:.4f}, '
                   'lL {:.4f}'.format(
            l_prec_meter.val, l_m_meter.val,
            l_dist_ap_meter.val, l_dist_an_meter.val,
            l_loss_meter.val, ))
        else:
          l_log = ''

        if cfg.id_loss_weight > 0:
          id_log = (', idL {:.4f}'.format(id_loss_meter.val))
        else:
          id_log = ''

        if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
          pm_log = (', pmL {:.4f}'.format(pm_loss_meter.val))
        else:
          pm_log = ''
        if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
          gdm_log = (', gdmL {:.4f}'.format(gdm_loss_meter.val))
        else:
          gdm_log = ''
        if (cfg.num_models > 1) \
            and cfg.local_dist_own_hard_sample \
            and (cfg.ldm_loss_weight > 0):
          ldm_log = (', ldmL {:.4f}'.format(ldm_loss_meter.val))
        else:
          ldm_log = ''

        total_loss_log = ', loss {:.4f}'.format(loss_meter.val)

        log = time_log + \
              g_log + l_log + id_log + \
              pm_log + gdm_log + ldm_log + \
              total_loss_log
        print(log)

    #############
    # Epoch Log #
    #############

    time_log = 'Ep {}, {:.2f}s'.format(ep + 1, time.time() - ep_st, )

    if cfg.g_loss_weight > 0:
      g_log = (', gp {:.2%}, gm {:.2%}, '
               'gd_ap {:.4f}, gd_an {:.4f}, '
               'gL {:.4f}'.format(
        g_prec_meter.avg, g_m_meter.avg,
        g_dist_ap_meter.avg, g_dist_an_meter.avg,
        g_loss_meter.avg, ))
    else:
      g_log = ''

    if cfg.l_loss_weight > 0:
      l_log = (', lp {:.2%}, lm {:.2%}, '
               'ld_ap {:.4f}, ld_an {:.4f}, '
               'lL {:.4f}'.format(
        l_prec_meter.avg, l_m_meter.avg,
        l_dist_ap_meter.avg, l_dist_an_meter.avg,
        l_loss_meter.avg, ))
    else:
      l_log = ''

    if cfg.id_loss_weight > 0:
      id_log = (', idL {:.4f}'.format(id_loss_meter.avg))
    else:
      id_log = ''

    if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
      pm_log = (', pmL {:.4f}'.format(pm_loss_meter.avg))
    else:
      pm_log = ''
    if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
      gdm_log = (', gdmL {:.4f}'.format(gdm_loss_meter.avg))
    else:
      gdm_log = ''
    if (cfg.num_models > 1) \
        and cfg.local_dist_own_hard_sample \
        and (cfg.ldm_loss_weight > 0):
      ldm_log = (', ldmL {:.4f}'.format(ldm_loss_meter.avg))
    else:
      ldm_log = ''

    total_loss_log = ', loss {:.4f}'.format(loss_meter.avg)

    log = time_log + \
          g_log + l_log + id_log + \
          pm_log + gdm_log + ldm_log + \
          total_loss_log
    print(log)

    # Log to TensorBoard
    if cfg.log_to_file:
      if writer is None:
        writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard'))
      writer.add_scalars(
        'loss',
        dict(global_loss=g_loss_meter.avg,
             local_loss=l_loss_meter.avg,
             id_loss=id_loss_meter.avg,
             pm_loss=pm_loss_meter.avg,
             gdm_loss=gdm_loss_meter.avg,
             ldm_loss=ldm_loss_meter.avg,
             loss=loss_meter.avg, ),
        ep)
      writer.add_scalars(
        'tri_precision',
        dict(global_precision=g_prec_meter.avg,
             local_precision=l_prec_meter.avg, ),
        ep)
      writer.add_scalars(
        'satisfy_margin',
        dict(global_satisfy_margin=g_m_meter.avg,
             local_satisfy_margin=l_m_meter.avg, ),
        ep)
      writer.add_scalars(
        'global_dist',
        dict(global_dist_ap=g_dist_ap_meter.avg,
             global_dist_an=g_dist_an_meter.avg, ),
        ep)
      writer.add_scalars(
        'local_dist',
        dict(local_dist_ap=l_dist_ap_meter.avg,
             local_dist_an=l_dist_an_meter.avg, ),
        ep)

    # save ckpt
    if cfg.log_to_file:
      save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file)

  ########
  # Test #
  ########

  test(load_model_weight=False)
class Monitor(Thread):
    """Background thread that logs CPU / RAM / GPU usage to TensorBoard.

    Each sampling iteration writes scalars under ``device/cpu``,
    ``device/mem`` and (when a GPU is monitored) ``device/GPU``, and also
    accumulates the raw samples in lists for later averaging.
    """

    def __init__(self, log_dir, delay=1, gpu_id=0, verbose=False):
        """Initialize monitor, log_dir and gpu_id are needed.

        Args:
            log_dir: directory for the TensorBoard event files.
            delay: seconds between two sampling iterations of :meth:`run`.
            gpu_id: index of the GPU to monitor; ``None`` disables GPU logging.
            verbose: if True, print the device status once at start-up.
        """
        super(Monitor, self).__init__()
        # Probe whether any GPU is usable at all.
        DEVICE_ID_LIST = GPUtil.getAvailable(
            order="memory", limit=1)  # get the fist gpu with the lowest load
        if len(DEVICE_ID_LIST) < 1 or gpu_id is None:
            self.hasgpu = False
        else:
            self.hasgpu = True
            self.gpu_id = gpu_id
        self.start_time = time.time()  # Start time
        self.verbose = verbose  # if update the usage status during the process
        self.stopped = False  # flag for stop the monitor
        self.delay = delay  # Time between calls to GPUtil
        self.pid = os.getpid()
        self.writer = SummaryWriter(log_dir=log_dir)  # tensorboard writer
        self.writer.add_text(
            "device/CPU",
            "cpu count: {:d} \t brand: {:s}".format(
                os.cpu_count(), cpuinfo.get_cpu_info()["brand"]),
            0,
        )
        self.writer.add_text(
            "device/RAM",
            "Current RAM - total:\t {:.3f}GB;".format(
                psutil.virtual_memory().total / 2.0**30),
            0,
        )
        self.count = 0  # Count for calculate the average usage
        self.GPU_memoryUsed = []
        self.GPU_memoryFree = []
        self.CPU_load = []
        self.memoryUsed = []
        if self.hasgpu:
            self.GPU = GPUtil.getGPUs()[self.gpu_id]
            self.GPU_memoryTotal = (self.GPU.memoryTotal / 2.0**10
                                    )  # Total gpu memory amount in GB
            self.writer.add_text(
                "device/GPU",
                "Current GPU (ID:{:d}) name:{:s} ".format(
                    self.gpu_id, self.GPU.name) +
                "Total_GPU_memory: {:.3f}GB;".format(self.GPU_memoryTotal),
                0,
            )
        if verbose:
            devices_status()
        self.start()

    def write_cpu_status(self):
        """Write CPU status."""
        # interval=1 makes psutil block for one second while it measures.
        CPU_load = psutil.Process(self.pid).cpu_percent(interval=1)
        self.writer.add_scalars(
            "device/cpu",
            {"CPU_load (%)": CPU_load},
            self.count,
        )
        self.CPU_load.append(CPU_load)

    def write_mem_status(self):
        """Write memory usage status."""
        memoryUsed = (psutil.Process(self.pid).memory_info()[0] / 2.0**30
                      )  # current app memory use in GB
        self.writer.add_scalars(
            "device/mem",
            {"memory_used (GB)": memoryUsed},
            self.count,
        )
        self.memoryUsed.append(memoryUsed)

    def write_gpu_status(self):
        """Write gpu usage status."""
        self.GPU = GPUtil.getGPUs()[self.gpu_id]
        GPU_load = self.GPU.load * 100
        # BUGFIX: the original divided memoryUsed/memoryFree (reported by
        # GPUtil in MiB) by self.GPU_memoryTotal (converted to GB above),
        # inflating the percentages by ~1024x.  Dividing by the raw
        # memoryTotal keeps both sides in the same unit.
        GPU_memoryUsed = self.GPU.memoryUsed / self.GPU.memoryTotal * 100
        GPU_memoryFree = self.GPU.memoryFree / self.GPU.memoryTotal * 100
        self.writer.add_scalars(
            "device/GPU",
            {
                "GPU_load (%)": GPU_load,
                "GPU_memory_used (%)": GPU_memoryUsed,
                "GPU_memory_free (%)": GPU_memoryFree,
            },
            self.count,
        )
        self.GPU_memoryUsed.append(GPU_memoryUsed)
        self.GPU_memoryFree.append(GPU_memoryFree)

    def run(self):
        """Run the monitor."""
        while not self.stopped:
            self.count += 1
            self.write_cpu_status()
            self.write_mem_status()
            if self.hasgpu:
                self.write_gpu_status()
            # BUGFIX: honour the documented ``delay`` — the original loop
            # never slept, so GPU/memory were polled as fast as possible
            # (only cpu_percent's 1s interval throttled it).
            time.sleep(self.delay)

    def stop(self):
        """Stop the monitor and return the total running time in seconds."""
        self.run_time = time.time() - self.start_time
        print("Program running time:%d seconds" % self.run_time)
        self.stopped = True
        return self.run_time
def main():
    """Train and evaluate a CelebA attribute classifier.

    Reads configuration from the module-level ``parser`` and shares
    ``args``/``best_prec1`` with the worker functions via globals.
    """
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1
    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # Use CUDA
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    use_cuda = torch.cuda.is_available()

    # Random seed — make runs reproducible across python/torch RNGs.
    if args.manual_seed is None:
        args.manual_seed = random.randint(1, 10000)
    random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.manual_seed)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            baseWidth=args.base_width,
            cardinality=args.cardinality,
        )
    elif args.arch.startswith('shufflenet'):
        model = models.__dict__[args.arch](groups=args.groups)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        # AlexNet/VGG: only the convolutional features are data-parallel;
        # the classifier stays on one device.
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = FocalLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    title = 'CelebA-' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            args.checkpoint = os.path.dirname(args.resume)
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title,
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            # BUGFIX: previously ``logger`` stayed undefined on this path,
            # so training crashed later with a NameError at logger.append().
            # Fall back to starting a fresh log instead.
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title)
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
                'Valid Acc.'
            ])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
            'Valid Acc.'
        ])

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = CelebA(
        args.data, 'training.txt',
        transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    val_dataset = CelebA(
        args.data, 'validation.txt',
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    # NOTE(review): the test split also reads 'validation.txt' — presumably
    # intentional (validation used as held-out set), but worth confirming.
    test_dataset = CelebA(
        args.data, 'validation.txt',
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.train_batch,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.test_batch,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    if args.evaluate:
        validate(test_loader, model, criterion)
        return

    # visualization
    writer = SummaryWriter(os.path.join(args.checkpoint, 'logs'))

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        lr = adjust_learning_rate(optimizer, epoch)
        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, lr))

        # train for one epoch
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch)
        # evaluate on validation set
        val_loss, prec1, _ = validate(val_loader, model, criterion)

        # append logger file
        logger.append([lr, train_loss, val_loss, train_acc, prec1])

        # tensorboardX
        writer.add_scalar('learning rate', lr, epoch + 1)
        writer.add_scalars('loss', {
            'train loss': train_loss,
            'validation loss': val_loss
        }, epoch + 1)
        writer.add_scalars('accuracy', {
            'train accuracy': train_acc,
            'validation accuracy': prec1
        }, epoch + 1)

        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.checkpoint)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))
    writer.close()

    print('Best accuracy:')
    print(best_prec1)

    # Reload the best checkpoint and report per-split attribute accuracy.
    checkpoint = torch.load(
        os.path.join(args.checkpoint, 'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print("Printing training set attribute accuracy")
    _, _, train_top1 = validate(train_loader, model, criterion)
    print(train_top1)
    print("Printing validation set attribute accuracy")
    _, _, val_top1 = validate(val_loader, model, criterion)
    print(val_top1)
def train():
    """Train a HyperECUST face model with per-epoch validation.

    Configuration comes from the module-level ``configer``; the best model
    (by validation loss, when early stopping is enabled) is saved to the
    path returned by ``init_model()``.
    """
    learning_rate = configer.learningrate
    batch_size = configer.batchsize
    n_epoch = configer.n_epoch
    early_stopping = configer.earlystopping
    modelname = configer.modelname

    logger = init_logger()
    log_dir = os.path.join(configer.logspath, modelname)
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)
    writer = SummaryWriter(log_dir)

    trainsets = HyperECUST(configer.splitmode, configer.facesize, 'train')
    trainloader = DataLoader(trainsets, batch_size, shuffle=True)
    validsets = HyperECUST(configer.splitmode, configer.facesize, 'valid')
    validloader = DataLoader(validsets, batch_size)

    model, modelpath = init_model()
    print_log = 'load model: {}'.format(modelpath)
    print(print_log)
    logger.debug(print_log)

    loss = init_loss()
    optimizor = optim.Adam(model.parameters(), learning_rate,
                           betas=(0.9, 0.95), weight_decay=0.0005)
    scheduler = lr_scheduler.StepLR(optimizor, configer.stepsize,
                                    configer.gamma)

    # "last" values implement early stopping: save only when validation
    # loss improves over the previous saved epoch.
    acc_train_epoch = 0.
    acc_valid_epoch = 0.
    loss_train_epoch = float('inf')
    loss_valid_epoch = float('inf')
    acc_train_epoch_last = acc_train_epoch
    acc_valid_epoch_last = acc_valid_epoch
    loss_train_epoch_last = loss_train_epoch
    loss_valid_epoch_last = loss_valid_epoch

    for i_epoch in range(n_epoch):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        scheduler.step(i_epoch)
        # Reused as per-batch accumulators, then collapsed to epoch means.
        acc_train_epoch = []
        acc_valid_epoch = []
        loss_train_epoch = []
        loss_valid_epoch = []

        model.train()
        for i_batch, (X, y) in enumerate(trainloader):
            X = Variable(X.float())
            if torch.cuda.is_available():
                X = X.cuda()
                y = y.cuda()
            y_pred_prob = model(X)
            loss_train_batch = loss(y_pred_prob, y)
            optimizor.zero_grad()
            loss_train_batch.backward()
            optimizor.step()
            acc_train_batch = accuracy(y_pred_prob, y, multi=False)
            print_log = 'training... epoch [{:3d}]/[{:3d}] | batch [{:2d}]/[{:2d}] || accuracy: {:2.2%}, loss: {:4.4f}'.\
                format(i_epoch+1, n_epoch, i_batch+1, len(trainsets)//batch_size, acc_train_batch, loss_train_batch)
            print(print_log)
            logger.debug(print_log)
            acc_train_epoch.append(acc_train_batch.cpu().numpy())
            loss_train_epoch.append(loss_train_batch.detach().cpu().numpy())
        acc_train_epoch = np.mean(np.array(acc_train_epoch))
        loss_train_epoch = np.mean(np.array(loss_train_epoch))

        model.eval()
        # IMPROVEMENT: no gradients are needed for validation; no_grad()
        # avoids building the autograd graph (identical metric values).
        with torch.no_grad():
            for i_batch, (X, y) in enumerate(validloader):
                X = Variable(X.float())
                if torch.cuda.is_available():
                    X = X.cuda()
                    y = y.cuda()
                y_pred_prob = model(X)
                loss_valid_batch = loss(y_pred_prob, y)
                acc_valid_batch = accuracy(y_pred_prob, y, multi=False)
                print_log = 'validating... epoch [{:3d}]/[{:3d}] | batch [{:2d}]/[{:2d}] || accuracy: {:2.2%}, loss: {:4.4f}'.\
                    format(i_epoch+1, n_epoch, i_batch+1, len(validsets)//batch_size, acc_valid_batch, loss_valid_batch)
                print(print_log)
                logger.debug(print_log)
                acc_valid_epoch.append(acc_valid_batch.cpu().numpy())
                loss_valid_epoch.append(loss_valid_batch.detach().cpu().numpy())
        acc_valid_epoch = np.mean(np.array(acc_valid_epoch))
        loss_valid_epoch = np.mean(np.array(loss_valid_epoch))

        writer.add_scalars('accuracy', {'train': acc_train_epoch,
                                        'valid': acc_valid_epoch}, i_epoch)
        writer.add_scalars('logloss', {'train': loss_train_epoch,
                                       'valid': loss_valid_epoch}, i_epoch)
        writer.add_scalar('lr', scheduler.get_lr()[-1], i_epoch)

        print_log = '--------------------------------------------------------------------'
        print(print_log)
        logger.debug(print_log)
        # BUGFIX: the epoch summary printed 0-based ``i_epoch`` while every
        # batch log printed ``i_epoch + 1``; report 1-based consistently.
        print_log = 'epoch [{:3d}]/[{:3d}] || training: accuracy: {:2.2%}, loss: {:4.4f} | validing: accuracy: {:2.2%}, loss: {:4.4f}'.\
            format(i_epoch + 1, n_epoch, acc_train_epoch, loss_train_epoch, acc_valid_epoch, loss_valid_epoch)
        print(print_log)
        logger.debug(print_log)

        if early_stopping:
            if loss_valid_epoch_last > loss_valid_epoch:
                torch.save(model, modelpath)
                acc_train_epoch_last = acc_train_epoch
                acc_valid_epoch_last = acc_valid_epoch
                loss_train_epoch_last = loss_train_epoch
                loss_valid_epoch_last = loss_valid_epoch
                print_log = 'model saved!'
                print(print_log)
                logger.debug(print_log)
        else:
            torch.save(model, modelpath)
            acc_train_epoch_last = acc_train_epoch
            acc_valid_epoch_last = acc_valid_epoch
            loss_train_epoch_last = loss_train_epoch
            loss_valid_epoch_last = loss_valid_epoch
            print_log = 'model saved!'
            print(print_log)
            logger.debug(print_log)

        print_log = '===================================================================='
        print(print_log)
        logger.debug(print_log)
def main():
    """Train the occlusion-robust re-id model described by the CLI config."""
    cfg = parse_args()
    exp_dir = 'exp/{}_train_{}'.format(cfg.train_set, cfg.task)

    # Mirror stdout/stderr into the experiment directory.
    ReDirectSTD(osp.join(exp_dir, 'stdout_{}.txt'.format(time_str())),
                'stdout', False)
    ReDirectSTD(osp.join(exp_dir, 'stderr_{}.txt'.format(time_str())),
                'stderr', False)

    ckpt_file = osp.join(exp_dir, 'ckpt.pth')
    model_weight_file = osp.join(exp_dir, 'model_weight.pth')
    writer = SummaryWriter(log_dir=osp.join(exp_dir, 'tensorboard'))

    # Dump the full configuration into the log for reproducibility.
    import pprint
    print('-' * 60)
    print('cfg.__dict__')
    pprint.pprint(cfg.__dict__)
    print('-' * 60)

    ###########
    # Dataset #
    ###########

    im_mean = [0.486, 0.459, 0.408]
    im_std = [0.229, 0.224, 0.225]
    shared_ds_kwargs = dict(
        resize_h_w=cfg.resize_h_w,
        scale=True,
        im_mean=im_mean,
        im_std=im_std,
        batch_dims='NCHW',
        num_prefetch_threads=cfg.num_prefetch_threads,
        prefetch_size=cfg.prefetch_size,
    )

    train_kwargs = dict(
        name=cfg.train_set,
        part='trainval',
        batch_size=cfg.train_batch_size,
        final_batch=False,
        shuffle=True,
        crop_prob=cfg.crop_prob,
        crop_ratio=cfg.crop_ratio,
        mirror_type='random',
        prng=np.random,
    )
    train_kwargs.update(shared_ds_kwargs)
    train_set = create_dataset(**train_kwargs)

    test_kwargs = dict(
        part='test',
        batch_size=cfg.test_batch_size,
        final_batch=True,
        shuffle=False,
        mirror_type=None,
        prng=np.random,
    )
    test_kwargs.update(shared_ds_kwargs)
    test_sets = []
    for ds_name in cfg.test_sets:
        test_kwargs['name'] = ds_name
        test_sets.append(create_dataset(**test_kwargs))

    ###########
    # Models  #
    ###########

    TVT, TMO = set_devices(cfg.sys_device_ids)

    # Dropout: explicit config wins, otherwise a per-dataset default.
    # (dropout=0 is also reported OK under the current lr settings.)
    if cfg.dropout_rate is not None:
        dropout_rate = cfg.dropout_rate
    elif cfg.train_set == 'market1501':
        dropout_rate = 0.6
    else:
        dropout_rate = 0.5

    model = Model(
        last_conv_stride=cfg.last_conv_stride,
        max_or_avg=cfg.max_or_avg,
        dropout_rate=dropout_rate,
        num_classes=len(set(train_set.labels)),
    )
    # Model wrapper for multi-GPU forward passes.
    model_w = DataParallel(model)

    ###########################
    # Criterion and Optimizer #
    ###########################

    criterion = torch.nn.CrossEntropyLoss()

    # Backbone params are finetuned from ImageNet; the rest train from
    # scratch with their own learning rate.
    finetuned_params = list(model.base.parameters())
    new_params = [
        p for n, p in model.named_parameters() if not n.startswith('base.')
    ]
    param_groups = [
        {'params': finetuned_params, 'lr': cfg.finetuned_params_lr},
        {'params': new_params, 'lr': cfg.new_params_lr},
    ]
    optimizer = optim.SGD(
        param_groups,
        momentum=0.9,
        weight_decay=5e-4,
    )

    # Bundle model + optimizer for checkpoint helpers.
    modules_optims = [model, optimizer]

    ################################
    # May Resume Models and Optims #
    ################################

    if cfg.resume:
        resume_ep, scores = load_ckpt(modules_optims, ckpt_file)

    # Move everything (including optimizer state) to the target device.
    TMO(modules_optims)

    ########
    # Test #
    ########

    def extract_feat(ims):
        # Feature extractor handed to the test sets.
        model.eval()
        ims = Variable(TVT(torch.from_numpy(ims).float()))
        feat, logits = model_w(ims)
        return feat.data.cpu().numpy()

    def test(load_model_weight=False):
        if load_model_weight:
            # NOTE: model_weight_file is built with osp.join above, so the
            # first branch is always taken.
            if model_weight_file != '':
                sd = torch.load(model_weight_file,
                                map_location=(lambda storage, loc: storage))
                load_state_dict(model, sd)
                print('Loaded model weights from {}'.format(model_weight_file))
            else:
                load_ckpt(modules_optims, ckpt_file)
        for test_set, name in zip(test_sets, cfg.test_sets):
            if test_set.extract_feat_func is None:
                test_set.set_feat_func(extract_feat)
            print('\n=========> Test on dataset: {} <=========\n'.format(name))
            test_set.eval(
                normalize_feat=True,
                to_re_rank=False,
                verbose=False,
            )

    if cfg.only_test:
        test(load_model_weight=True)
        return

    ############
    # Training #
    ############

    # Occlusion statistics are only needed for the adversarial-style tasks.
    prob_diff, all_masks = None, None
    if cfg.task in ['No-Adversary', 'Hard-1', 'Sampling']:
        prob_diff = load_pickle(
            'exp/{}_sw_occlusion/prob_diff.pkl'.format(cfg.train_set))
        prob_diff = blur_prob_diff(prob_diff)
        all_masks = load_pickle(
            'exp/{}_sw_occlusion/all_masks.pkl'.format(cfg.train_set))

    start_ep = resume_ep if cfg.resume else 0
    for ep in range(start_ep, cfg.total_epochs):
        # Staircase learning-rate decay.
        adjust_lr_staircase(
            optimizer.param_groups,
            [cfg.finetuned_params_lr, cfg.new_params_lr],
            ep + 1,
            cfg.staircase_decay_at_epochs,
            cfg.staircase_decay_multiply_factor,
        )
        model.train()

        loss_meter = AverageMeter(name='cls loss')
        ep_st = time.time()
        step = 0
        epoch_done = False
        while not epoch_done:
            step += 1
            step_st = time.time()

            ims, im_names, labels, mirrored, epoch_done = \
                train_set.next_batch()

            # Occlude images before feeding to network.
            if cfg.task != 'Baseline':
                masks = get_masks(im_names,
                                  mirrored,
                                  cfg,
                                  all_masks=all_masks,
                                  prob_diff=prob_diff)
                ims = ims * np.expand_dims(masks, 1)

            ims_var = Variable(TVT(torch.from_numpy(ims).float()))
            labels_var = Variable(TVT(torch.from_numpy(labels).long()))

            _, logits = model_w(ims_var)
            loss = criterion(logits, labels_var)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_meter.update(to_scalar(loss))

            if step % cfg.steps_per_log == 0:
                time_log = '\tStep {}/Ep {}, {:.2f}s'.format(
                    step,
                    ep + 1,
                    time.time() - step_st,
                )
                print(join([time_log, loss_meter.val_str], ', '))

        #############
        # Epoch Log #
        #############

        time_log = 'Ep {}, {:.2f}s'.format(
            ep + 1,
            time.time() - ep_st,
        )
        print(join([time_log, loss_meter.avg_str], ', '))

        writer.add_scalars(
            loss_meter.name,
            {loss_meter.name: loss_meter.avg},
            ep,
        )

        ########
        # Test #
        ########

        if ((ep + 1) % cfg.epochs_per_val == 0) \
                or ((ep + 1) == cfg.total_epochs):
            test(load_model_weight=False)

        #############
        # Save CKPT #
        #############

        save_ckpt(modules_optims, ep + 1, 0, ckpt_file)
def trainAgent(net):
    """Train a DQN-style agent on Pong with experience replay.

    Args:
        net: network class; instantiated here and moved to CUDA.

    The loop runs until 1M steps, periodically checkpointing to
    ``./params.pkl`` and logging hit rates to TensorBoard.
    """
    # initialize our game
    game = pong.PongGame()

    # experience-replay buffer; deque drops the oldest transitions once
    # REPLAY_MEMORY is reached
    D = deque(maxlen=REPLAY_MEMORY)

    # initial frame: grayscale, 84x84, binarized to black/white
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # stack 4 identical frames as the initial state tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    Network = net().cuda()
    optimizer = optim.Adam(Network.parameters(), lr=1e-5)
    criterion = nn.MSELoss()
    writer = SummaryWriter(log_dir='./logs')

    if os.path.exists('./params.pkl'):
        print('Restore from exists model')
        Network.load_state_dict(torch.load('./params.pkl'))
        # TODO: persist and restore the true step counter alongside weights
        steps = 0
    else:
        steps = 0

    # linearly annealed epsilon, clamped at FINAL_EPSILON
    expected_epsilon = INITIAL_EPSILON - steps * (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
    if expected_epsilon > FINAL_EPSILON:
        epsilon = expected_epsilon
    else:
        epsilon = FINAL_EPSILON
    total_observe = steps + ADDITIONAL_OB

    # training time
    while (1):
        out_t = Network(to_tensor(inp_t))

        # epsilon-greedy action selection (one-hot in argmax_t)
        argmax_t = np.zeros([ACTIONS])
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            _, maxIndex = torch.max(out_t, 1)
            maxIndex = maxIndex.cpu().numpy()[0]
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # reward tensor if score is positive
        reward_t, frame, hit_rate, hit_rate_100 = game.getNextFrame(argmax_t)

        # preprocess the new frame and roll it into the frame stack
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # store transition (s, a, r, s') in the replay buffer
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # training iteration (after the observation warm-up)
        if steps > total_observe:
            minibatch = random.sample(D, BATCH)
            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            out_prev_batch = Network(to_tensor(inp_batch))
            out_batch = Network(to_tensor(inp_t1_batch))

            # Q(s, a) for the actions actually taken
            action = torch.sum(
                out_prev_batch.mul(torch.FloatTensor(argmax_batch).cuda()),
                dim=1)
            # BUGFIX: the original also built this target element-by-element
            # in a dead Python loop (result discarded, one GPU sync per
            # element) and wrapped the tensor in
            # Variable(..., requires_grad=False), which does NOT cut the
            # autograd history — gradients leaked into the bootstrapped
            # Q(s', a') term. detach() gives the standard fixed target.
            gt_batch = (torch.FloatTensor(reward_batch).cuda() +
                        GAMMA * out_batch.max(1)[0]).detach()

            loss = criterion(action, gt_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # update our input tensor to the next frame
        inp_t = inp_t1
        steps += 1

        # record the agent's performance every 100 steps
        if steps % 100 == 0:
            writer.add_scalars('', {'hit_rate': hit_rate,
                                    'hit_rate_100': hit_rate_100}, steps)

        # print where we are after saving where we are
        if steps % 10000 == 0:
            torch.save(Network.state_dict(), './params.pkl')
            print("TIMESTEP", steps, "/ EPSILON %7.5f" % epsilon,
                  "/ ACTION", maxIndex, "/ REWARD", reward_t,
                  "/ Q_MAX %e" % torch.max(out_t))

        # stop traing after 1M steps
        if steps > 1000000:
            break
def pruning_DDPG(model,test_loader,criterion,pruning_rate,num_episode,warmup,lbound=0,rbound=0.8,output='./',nb_states=8,nb_actions=1,hidden1=300,hidden2=300,lr_a=1e-4,lr_c=1e-4,device='cuda'):
    """Search per-layer pruning ratios with a DDPG agent and prune the model.

    Args:
        model: network to prune.
        test_loader: dataloader used by the pruning environment to score
            candidate pruned models.
        criterion: loss used by the environment's evaluation.
        pruning_rate: overall target pruning rate.
        num_episode: number of search episodes.
        warmup: episodes of random exploration before policy updates.
        lbound / rbound: per-layer action bounds (lbound currently unused
            here; left bounds come from ``get_left_bound``).
        output: directory for intermediate agent checkpoints.
        nb_states / nb_actions / hidden1 / hidden2 / lr_a / lr_c: DDPG
            architecture and learning-rate hyper-parameters.
        device: device string passed to the environment.

    Returns:
        The model pruned with the best action list found by the search.
    """
    writer = SummaryWriter("./DDPG")

    # Per-layer lower bounds derived from the model; upper bound is uniform.
    left_bound = get_left_bound(model, pruning_rate / 3.0)
    right_bound = []
    for i in range(len(left_bound)):
        left_bound[i] = float(left_bound[i].cpu().numpy())
        right_bound.append(rbound)
    print("left_bound:", left_bound)
    print("right_bound:", right_bound)

    agent = DDPG(warmup, nb_states, nb_actions, left_bound, right_bound,
                 hidden1, hidden2, lr_a, lr_c)
    agent.is_training = True
    env = Pruning_Env(model, test_loader, criterion, pruning_rate,
                      left_bound, right_bound, device)

    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    T = []  # trajectory
    # BUGFIX: the original used ``episode % int(num_episode / 3)`` directly,
    # which raises ZeroDivisionError when num_episode < 3.  Identical save
    # cadence for num_episode >= 3.
    save_every = max(1, int(num_episode / 3))

    while episode < num_episode:  # counting based on episode
        # reset if it is the start of episode
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)

        # agent picks an action: random during warmup, policy afterwards
        if episode <= warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, episode=episode)

        # env responds with next_observation, reward, terminate_info
        observation2, reward, done, info, action = env.step(action)
        if (episode > warmup):
            # env reward is (accuracy - 100), so 100 + reward restores it
            writer.add_scalars('Acc_DDPG', {'Acc': 100 + reward},
                               episode - warmup)
        observation2 = deepcopy(observation2)

        T.append([reward, deepcopy(observation), deepcopy(observation2),
                  action, done])

        # [optional] save intermediate model
        if episode % save_every == 0:
            agent.save_model(output)

        # update counters and move on
        step += 1
        episode_steps += 1
        episode_reward += reward
        observation = deepcopy(observation2)

        if done:  # end of episode
            print('#{}: episode_reward:{:.4f} acc: {:.4f}, ratio: {:.4f}'.format(
                episode, episode_reward, info['accuracy'],
                info['pruning_ratio']))
            # every transition of the episode is credited with the final
            # reward before being stored
            final_reward = T[-1][0]
            for i in range(len(T)):
                r_t, s_t, s_t1, a_t, done = T[i]
                agent.observe(final_reward, s_t, s_t1, a_t, done)
                if episode > warmup:
                    agent.update_policy()

            # reset per-episode state
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
            T = []

    print("best_action_list:")
    print(env.best_action_list)
    model_new = pruning_by_action_list(model, env.best_action_list)
    return model_new
lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) writer = SummaryWriter(log_dir=args.logdir) for epoch in range(5): # train for one epoch train_loss = train(train_data, model, optimizer, epoch) # evaluate on validation set val_loss = validate(valid_data, model, epoch) writer.add_scalars('data/scalar_group', { 'train loss': train_loss, 'val loss': val_loss }, epoch) # Release all weights for param in model.module.parameters(): param.requires_grad = True trainable_vars = [param for param in model.parameters() if param.requires_grad] optimizer = torch.optim.SGD(trainable_vars, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.8,
class Solver():
    """Evaluation harness for a UNet depth-estimation baseline on KITTI.

    Loads saved checkpoints one by one (:meth:`Validate`) and reports the
    standard monocular-depth metrics (abs_rel, sq_rel, rmse, rmse_log,
    delta thresholds), logging each to TensorBoard under the experiment name.
    """

    def __init__(self, exp):
        # NOTE: hard-coded cluster path; results/models are read from here.
        self.root_dir = '/vulcan/scratch/koutilya/projects/Domain_Adaptation/Common_Domain_Adaptation-Lighting/UNet_Baseline'
        # Seed
        self.seed = 1729
        random.seed(self.seed)
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)
        # Initialize networks
        self.netT = all_networks.define_G(3, 1, 64, 4, 'batch', 'PReLU',
                                          'UNet', 'kaiming', 0, False, [0],
                                          0.1)
        self.netT.cuda()
        # Initialize Loss
        self.netT_loss_fn = nn.L1Loss()
        self.netT_loss_fn = self.netT_loss_fn.cuda()
        # Training Configuration details
        self.batch_size = 16
        # No-op augmentation at test time; only resizing to 192x640.
        joint_transform_list = [
            RandomImgAugment(no_flip=True,
                             no_rotation=True,
                             no_augment=True,
                             size=(192, 640))
        ]
        img_transform_list = [
            tr.ToTensor(),
            tr.Normalize([.5, .5, .5], [.5, .5, .5])
        ]
        self.joint_transform = tr.Compose(joint_transform_list)
        self.img_transform = tr.Compose(img_transform_list)
        self.depth_transform = tr.Compose([DepthToTensor()])
        self.exp = exp
        # Experiment name selects which checkpoint-file suffix is loaded.
        if self.exp == 'UNet_Baseline_NEW':
            self.model_string = ''
        elif self.exp == 'UNet_Baseline_bicubic_NEW':
            self.model_string = '_bicubic'
        self.writer = SummaryWriter(
            os.path.join(self.root_dir,
                         '../tensorboard_logs/Vkitti-kitti/test/' + self.exp))
        # Initialize Data
        self.get_validation_data()
        # Evaluation crop selection: Garg crop is used by default.
        self.garg_crop = True
        self.eigen_crop = False
        self.kitti = KITTI()

    def compute_errors(self, ground_truth, predication):
        """Return (abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3).

        Both inputs are 1-D numpy arrays of depths (same length); ``a1``-``a3``
        are the fractions of pixels with max-ratio below 1.25, 1.25^2, 1.25^3.
        (Parameter name 'predication' kept as-is: it is part of the interface.)
        """
        # accuracy
        threshold = np.maximum((ground_truth / predication),
                               (predication / ground_truth))
        a1 = (threshold < 1.25).mean()
        a2 = (threshold < 1.25**2).mean()
        a3 = (threshold < 1.25**3).mean()
        # MSE
        rmse = (ground_truth - predication)**2
        rmse = np.sqrt(rmse.mean())
        # MSE(log)
        rmse_log = (np.log(ground_truth) - np.log(predication))**2
        rmse_log = np.sqrt(rmse_log.mean())
        # Abs Relative difference
        abs_rel = np.mean(np.abs(ground_truth - predication) / ground_truth)
        # Squared Relative difference
        sq_rel = np.mean(((ground_truth - predication)**2) / ground_truth)
        return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3

    def get_validation_data(self):
        """Build the KITTI test-split dataset/dataloader (no shuffling)."""
        self.real_val_dataset = real_dataset(
            data_file='test.txt',
            phase='test',
            img_transform=self.img_transform,
            joint_transform=self.joint_transform,
            depth_transform=self.depth_transform)
        self.real_val_dataloader = DataLoader(self.real_val_dataset,
                                              shuffle=False,
                                              batch_size=self.batch_size,
                                              num_workers=4)

    def load_prev_model(self):
        """Load the checkpoint for ``self.iteration``; True on success."""
        saved_models = glob.glob(
            os.path.join(
                self.root_dir, 'saved_models_all_iters', 'UNet_baseline-' +
                str(self.iteration) + self.model_string + '.pth.tar'))
        if len(saved_models) > 0:
            model_state = torch.load(saved_models[0])
            self.netT.load_state_dict(model_state['netT_state_dict'])
            # self.iteration = model_state['iteration']
            return True
        return False

    def tensor2im(self, depth):
        """Convert a NCHW depth tensor in [-1, 1] to a NHWC numpy array in
        metres (0-80 m)."""
        depth_numpy = depth.cpu().data.float().numpy().transpose(0, 2, 3, 1)
        depth_numpy = (depth_numpy + 1.0) / 2.0  # Unnormalize between 0 and 1
        return depth_numpy * 80.0

    def get_depth_manually(self, depth_file):
        """Load the ground-truth velodyne depth map matching ``depth_file``.

        NOTE(review): PNG values are divided by 255.0 — presumably the maps
        are stored as 8-bit scaled depths; confirm against the dataset dump.
        """
        root_dir = '/vulcan/scratch/koutilya/kitti/Depth_from_velodyne/'
        depth_split = depth_file.split('/')
        main_file = osp.join(root_dir, 'test', depth_split[0], depth_split[1],
                             depth_split[-1].split('.')[0] + '.png')
        depth = Image.open(main_file)
        depth = np.array(depth, dtype=np.float32) / 255.0
        return depth

    def Validate(self):
        """Evaluate every saved checkpoint (iterations 999, 1999, ...)."""
        self.netT.eval()
        saved_models_list = glob.glob(
            os.path.join(self.root_dir, 'saved_models_all_iters',
                         'UNet_baseline-*999' + self.model_string +
                         '.pth.tar'))
        for self.iteration in range(999, 1000 * len(saved_models_list), 1000):
            self.load_prev_model()
            self.Validation()

    def Validation(self):
        """Run one full pass over the validation set and log the metrics.

        Predictions are clamped to [1, 50] m and scored only inside the
        depth-range mask intersected with the Garg (or Eigen) crop.
        """
        num_samples = len(self.real_val_dataset)
        # Per-sample metric buffers, filled by global sample index.
        abs_rel = np.zeros(num_samples, np.float32)
        sq_rel = np.zeros(num_samples, np.float32)
        rmse = np.zeros(num_samples, np.float32)
        rmse_log = np.zeros(num_samples, np.float32)
        a1 = np.zeros(num_samples, np.float32)
        a2 = np.zeros(num_samples, np.float32)
        a3 = np.zeros(num_samples, np.float32)
        with torch.no_grad():
            for i, (data, depth_filenames) in tqdm(
                    enumerate(self.real_val_dataloader)):
                self.real_val_image = data[
                    'left_img']  #, data['depth'] # self.real_depth is a numpy array
                self.real_val_image = Variable(self.real_val_image.cuda())
                depth = self.netT(self.real_val_image)
                # network returns intermediate outputs; last one is final
                depth = depth[-1]
                depth_numpy = self.tensor2im(depth)  # 0-80m
                for t_id in range(depth_numpy.shape[0]):
                    t_id_global = (i * self.batch_size) + t_id
                    h, w = self.real_val_image.shape[
                        2], self.real_val_image.shape[3]
                    datafiles1 = self.real_val_dataset.files[t_id_global]
                    ground_depth = self.get_depth_manually(
                        datafiles1['depth'])
                    height, width = ground_depth.shape
                    # resize prediction to ground-truth resolution
                    predicted_depth = cv2.resize(
                        depth_numpy[t_id], (width, height),
                        interpolation=cv2.INTER_LINEAR)
                    # clamp predictions to the evaluated depth range
                    predicted_depth[predicted_depth < 1.0] = 1.0
                    predicted_depth[predicted_depth > 50.0] = 50.0
                    mask = np.logical_and(ground_depth > 1.0,
                                          ground_depth < 50.0)
                    # crop used by Garg ECCV16
                    if self.garg_crop:
                        self.crop = np.array([
                            0.40810811 * height, 0.99189189 * height,
                            0.03594771 * width, 0.96405229 * width
                        ]).astype(np.int32)
                    # crop we found by trail and error to reproduce Eigen NIPS14 results
                    elif self.eigen_crop:
                        self.crop = np.array([
                            0.3324324 * height, 0.91351351 * height,
                            0.0359477 * width, 0.96405229 * width
                        ]).astype(np.int32)
                    crop_mask = np.zeros(mask.shape)
                    crop_mask[self.crop[0]:self.crop[1],
                              self.crop[2]:self.crop[3]] = 1
                    mask = np.logical_and(mask, crop_mask)
                    abs_rel[t_id_global], sq_rel[t_id_global], rmse[
                        t_id_global], rmse_log[t_id_global], a1[
                            t_id_global], a2[t_id_global], a3[
                                t_id_global] = self.compute_errors(
                                    ground_depth[mask],
                                    predicted_depth[mask])
        print('{:>10},{:>10},{:>10},{:>10},{:>10},{:>10},{:>10}'.format(
            'abs_rel', 'sq_rel', 'rmse', 'rmse_log', 'a1', 'a2', 'a3'))
        print(
            '{:10.4f},{:10.4f},{:10.4f},{:10.4f},{:10.4f},{:10.4f},{:10.4f}'
            .format(abs_rel.mean(), sq_rel.mean(), rmse.mean(),
                    rmse_log.mean(), a1.mean(), a2.mean(), a3.mean()))
        # NOTE(review): 'Validatoin' typo is kept — changing the tag would
        # break continuity with previously written TensorBoard logs.
        self.writer.add_scalars('Kitti_Validatoin_metrics/Abs_Rel',
                                {self.exp: abs_rel.mean()}, self.iteration)
        self.writer.add_scalars('Kitti_Validatoin_metrics/Sq_Rel',
                                {self.exp: sq_rel.mean()}, self.iteration)
        self.writer.add_scalars('Kitti_Validatoin_metrics/RMSE',
                                {self.exp: rmse.mean()}, self.iteration)
        self.writer.add_scalars('Kitti_Validatoin_metrics/RMSE_log',
                                {self.exp: rmse_log.mean()}, self.iteration)
        self.writer.add_scalars('Kitti_Validatoin_metrics/del<1.25',
                                {self.exp: a1.mean()}, self.iteration)
        self.writer.add_scalars('Kitti_Validatoin_metrics/del<1.25^2',
                                {self.exp: a2.mean()}, self.iteration)
        self.writer.add_scalars('Kitti_Validatoin_metrics/del<1.25^3',
                                {self.exp: a3.mean()}, self.iteration)
        self.writer.close()
class SemanticSeg(object):
    '''
    Control the training, evaluation, and inference process.

    Args:
    - net_name: string, key used by `_get_net` to build the network
    - lr: float, learning rate.
    - n_epoch: integer, the epoch number
    - channels: integer, the channel number of the input
    - num_classes: integer, the number of class
    - roi_number: integer or None; when set, binary segmentation is assumed
    - scale: passed to Trunc_and_Normalize (intensity truncation/normalization)
    - seq_len: integer, temporal length fed to the recurrent UNet
    - input_shape: tuple of integer, input dim
    - crop: integer, cropping size
    - batch_size: integer
    - num_workers: integer, how many subprocesses to use for data loading.
    - device: string, use the specified device (value of CUDA_VISIBLE_DEVICES)
    - pre_trained: True or False, default False
    - ckpt_point: True or False, also restore optimizer/epoch from checkpoint
    - weight_path: weight path of pre-trained model
    - weight_decay, momentum, gamma, milestones, T_max: optimizer/scheduler knobs
    - mode: string __all__ = ['cls','seg','cls_and_seg','cls_or_seg']
    - topk: integer, k used by the Top-k style losses
    - freeze: None | 'backbone' | 'classifier' (deeplab models only)
    '''

    def __init__(self,
                 net_name=None,
                 lr=1e-3,
                 n_epoch=1,
                 channels=1,
                 num_classes=2,
                 roi_number=1,
                 scale=None,
                 seq_len=3,
                 input_shape=None,
                 crop=0,
                 batch_size=6,
                 num_workers=0,
                 device=None,
                 pre_trained=False,
                 ckpt_point=True,
                 weight_path=None,
                 weight_decay=0.,
                 momentum=0.95,
                 gamma=0.1,
                 milestones=None,
                 T_max=5,
                 mode='cls',
                 topk=10,
                 freeze=None):
        super(SemanticSeg, self).__init__()
        self.net_name = net_name
        self.lr = lr
        self.n_epoch = n_epoch
        self.channels = channels
        self.num_classes = num_classes
        self.roi_number = roi_number
        self.scale = scale
        self.seq_len = seq_len
        self.input_shape = input_shape
        self.crop = crop
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device

        self.net = self._get_net(self.net_name)
        self.pre_trained = pre_trained
        self.ckpt_point = ckpt_point
        self.weight_path = weight_path

        self.start_epoch = 0
        self.global_step = 0
        # Initial ceiling for checkpoint saving; lowered to the best val loss seen.
        self.loss_threshold = 2.0

        self.weight_decay = weight_decay
        self.momentum = momentum
        self.gamma = gamma
        # Avoid a mutable default argument: [40, 80] was the previous default.
        self.milestones = [40, 80] if milestones is None else milestones
        self.T_max = T_max
        self.mode = mode
        self.topk = topk
        self.freeze = freeze

        os.environ['CUDA_VISIBLE_DEVICES'] = self.device

        if self.pre_trained:
            self._get_pre_trained(self.weight_path, ckpt_point)

        if self.roi_number is not None:
            assert self.num_classes == 2, "num_classes must be set to 2 for binary segmentation"

    def trainer(self,
                train_path,
                val_path,
                cur_fold,
                output_dir=None,
                log_dir=None,
                optimizer='Adam',
                loss_fun='Cross_Entropy',
                class_weight=None,
                lr_scheduler=None):
        """Run the full training loop for one cross-validation fold.

        `optimizer`, `loss_fun` and `lr_scheduler` are factory keys resolved by
        `_get_optimizer`, `_get_loss` and `_get_lr_scheduler` respectively.
        """
        torch.manual_seed(1000)
        np.random.seed(1000)
        torch.cuda.manual_seed_all(1000)
        print('Device:{}'.format(self.device))
        # NOTE(review): deterministic=True together with benchmark=True is
        # contradictory (benchmark wins); kept as-is to preserve behavior.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        output_dir = os.path.join(output_dir, "fold" + str(cur_fold))
        log_dir = os.path.join(log_dir, "fold" + str(cur_fold))

        # Fresh run: wipe stale logs/weights; resumed run: keep them.
        if os.path.exists(log_dir):
            if not self.pre_trained:
                shutil.rmtree(log_dir)
                os.makedirs(log_dir)
        else:
            os.makedirs(log_dir)

        if os.path.exists(output_dir):
            if not self.pre_trained:
                shutil.rmtree(output_dir)
                os.makedirs(output_dir)
        else:
            os.makedirs(output_dir)

        self.step_pre_epoch = len(train_path) // self.batch_size
        self.writer = SummaryWriter(log_dir)
        # NOTE(review): uses len(train_path[0]) while step_pre_epoch uses
        # len(train_path) — looks inconsistent; confirm the train_path layout.
        self.global_step = self.start_epoch * math.ceil(
            len(train_path[0]) / self.batch_size)

        net = self.net

        # only for deeplab
        if self.freeze is not None and 'deeplab' in self.net_name:
            if self.freeze == 'backbone':
                net.freeze_backbone()
            elif self.freeze == 'classifier':
                net.freeze_classifier()

        lr = self.lr
        loss = self._get_loss(loss_fun, class_weight)

        if len(self.device.split(',')) > 1:
            net = DataParallel(net)

        # dataloader setting: the 'cls' pipeline keeps flip/adjust augments,
        # the segmentation pipeline swaps them for random noise.
        if self.mode == 'cls':
            train_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape, num_class=self.num_classes, crop=self.crop),
                RandomEraseHalf(scale_flag=False),
                RandomDistortHalf(),
                RandomTranslationRotationZoomHalf(num_class=self.num_classes),
                RandomFlipHalf(mode='hv'),
                RandomAdjustHalf(),
                To_Tensor(num_class=self.num_classes)
            ])
        else:
            train_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape, num_class=self.num_classes, crop=self.crop),
                RandomEraseHalf(scale_flag=False),
                RandomDistortHalf(),
                RandomTranslationRotationZoomHalf(num_class=self.num_classes),
                RandomNoiseHalf(),
                To_Tensor(num_class=self.num_classes)
            ])

        train_dataset = DataGenerator(train_path,
                                      roi_number=self.roi_number,
                                      num_class=self.num_classes,
                                      transform=train_transformer,
                                      seq_len=self.seq_len)

        train_loader = DataLoader(train_dataset,
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  num_workers=self.num_workers,
                                  pin_memory=True)

        # copy to gpu
        net = net.cuda()
        loss = loss.cuda()

        # optimizer setting
        optimizer = self._get_optimizer(optimizer, net, lr)
        if self.pre_trained and self.ckpt_point:
            checkpoint = torch.load(self.weight_path)
            optimizer.load_state_dict(checkpoint['optimizer'])

        if lr_scheduler is not None:
            lr_scheduler = self._get_lr_scheduler(lr_scheduler, optimizer)

        early_stopping = EarlyStopping(patience=20,
                                       verbose=True,
                                       monitor='val_loss',
                                       op_type='min')

        for epoch in range(self.start_epoch, self.n_epoch):
            train_loss, train_dice, train_acc = self._train_on_epoch(
                epoch, net, loss, optimizer, train_loader)
            val_loss, val_dice, val_acc = self._val_on_epoch(
                epoch, net, loss, val_path)

            if lr_scheduler is not None:
                lr_scheduler.step(val_loss)

            torch.cuda.empty_cache()
            print('epoch:{},train_loss:{:.5f},val_loss:{:.5f}'.format(
                epoch, train_loss, val_loss))
            print('epoch:{},train_dice:{:.5f},val_dice:{:.5f}'.format(
                epoch, train_dice, val_dice))

            self.writer.add_scalars('data/loss', {
                'train': train_loss,
                'val': val_loss
            }, epoch)
            self.writer.add_scalars('data/dice', {
                'train': train_dice,
                'val': val_dice
            }, epoch)
            self.writer.add_scalars('data/acc', {
                'train': train_acc,
                'val': val_acc
            }, epoch)
            self.writer.add_scalar('data/lr', optimizer.param_groups[0]['lr'],
                                   epoch)

            early_stopping(val_loss)

            # save a checkpoint whenever the validation loss improves
            if val_loss <= self.loss_threshold:
                self.loss_threshold = val_loss

                if len(self.device.split(',')) > 1:
                    state_dict = net.module.state_dict()
                else:
                    state_dict = net.state_dict()

                saver = {
                    'epoch': epoch,
                    'save_dir': output_dir,
                    'state_dict': state_dict,
                    'optimizer': optimizer.state_dict()
                }

                file_name = 'epoch:{}-train_loss:{:.5f}-train_dice:{:.5f}-train_acc:{:.5f}-val_loss:{:.5f}-val_dice:{:.5f}-val_acc:{:.5f}.pth'.format(
                    epoch, train_loss, train_dice, train_acc, val_loss,
                    val_dice, val_acc)
                save_path = os.path.join(output_dir, file_name)
                print("Save as %s" % file_name)

                torch.save(saver, save_path)

            if early_stopping.early_stop:
                print('Early Stopping!')
                break

        self.writer.close()

    def _train_on_epoch(self, epoch, net, criterion, optimizer, train_loader):
        """One training epoch; returns (avg loss, avg dice, avg acc)."""
        net.train()

        train_loss = AverageMeter()
        train_dice = AverageMeter()
        train_acc = AverageMeter()

        from metrics import RunningDice
        run_dice = RunningDice(labels=[0, 1], ignore_label=-1)

        for step, sample in enumerate(train_loader):
            data = sample['image']    # img:(N,cin,seq_len,H,W)
            target = sample['mask']   # mask:(N,num_class,seq_len,H,W)
            label = sample['label']   # label:(N,seq_len,num_class-1)

            data = data.cuda()
            target = target.cuda()
            label = label.cuda()

            # output[0]:(N,seq_len,num_class,H,W) seg, output[1]:(N,seq_len,num_class-1) cls
            output = net(data)

            # average the per-timestep loss over seq_len
            loss = 0.
            if self.mode == 'cls':
                for i in range(data.size(2)):
                    loss += criterion(output[1][:, i], label[:, i])
            elif self.mode == 'seg':
                for i in range(data.size(2)):
                    loss += criterion(output[0][:, i], target[:, :, i])
            else:
                for i in range(data.size(2)):
                    loss += criterion([output[0][:, i], output[1][:, i]],
                                      [target[:, :, i], label[:, i]])
            loss /= data.size(2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss = loss.float()
            train_loss.update(loss.item(), data.size(0))

            total_dice = 0.
            total_acc = 0.
            for i in range(data.size(2)):
                cls_output = output[1][:, i]                      # (N,num_class-1)
                cls_output = torch.sigmoid(cls_output).float()    # F.sigmoid is deprecated
                seg_output = output[0][:, i].float()              # (N,num_class,H,W)
                seg_output = F.softmax(seg_output, dim=1)

                # measure acc
                acc = accuracy(cls_output.detach(), label[:, i])
                train_acc.update(acc.item(), data.size(0))
                total_acc += acc.item()

                # measure dice and record loss
                dice = compute_dice(seg_output.detach(), target[:, :, i])
                train_dice.update(dice.item(), data.size(0))
                total_dice += dice.item()

                # measure run dice (confusion-matrix based)
                seg_output = torch.argmax(seg_output, 1).detach().cpu().numpy()  # N*H*W
                tmp_target = torch.argmax(target[:, :, i], 1).detach().cpu().numpy()
                run_dice.update_matrix(tmp_target, seg_output)

            torch.cuda.empty_cache()

            if self.global_step % 10 == 0:
                if self.mode == 'cls':
                    print('epoch:{},step:{},train_loss:{:.5f},train_acc:{:.5f},lr:{}'.format(
                        epoch, step, loss.item(), total_acc / data.size(2),
                        optimizer.param_groups[0]['lr']))
                elif self.mode == 'seg':
                    rundice, dice_list = run_dice.compute_dice()
                    print("Category Dice: ", dice_list)
                    print('epoch:{},step:{},train_loss:{:.5f},train_dice:{:.5f},run_dice:{:.5f},lr:{}'.format(
                        epoch, step, loss.item(), total_dice / data.size(2),
                        rundice, optimizer.param_groups[0]['lr']))
                    run_dice.init_op()
                else:
                    print('epoch:{},step:{},train_loss:{:.5f},train_dice:{:.5f},train_acc:{:.5f},lr:{}'.format(
                        epoch, step, loss.item(), total_dice / data.size(2),
                        total_acc / data.size(2),
                        optimizer.param_groups[0]['lr']))

                self.writer.add_scalars('data/train_loss_dice', {
                    'train_loss': loss.item(),
                    'train_dice': total_dice / data.size(2),
                    'train_acc': total_acc / data.size(2)
                }, self.global_step)

            self.global_step += 1

        return train_loss.avg, train_dice.avg, train_acc.avg

    def _val_on_epoch(self, epoch, net, criterion, val_path, val_transformer=None):
        """One validation pass; returns (avg loss, avg dice, avg acc)."""
        net.eval()

        # Both branches are currently identical; kept split to mirror trainer().
        if self.mode == 'cls':
            val_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape, num_class=self.num_classes, crop=self.crop),
                To_Tensor(num_class=self.num_classes)
            ])
        else:
            val_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape, num_class=self.num_classes, crop=self.crop),
                To_Tensor(num_class=self.num_classes)
            ])

        val_dataset = DataGenerator(val_path,
                                    roi_number=self.roi_number,
                                    num_class=self.num_classes,
                                    transform=val_transformer,
                                    seq_len=-1)

        val_loader = DataLoader(val_dataset,
                                batch_size=1,
                                shuffle=False,
                                num_workers=self.num_workers,
                                pin_memory=True)

        val_loss = AverageMeter()
        val_dice = AverageMeter()
        val_acc = AverageMeter()

        from metrics import RunningDice
        run_dice = RunningDice(labels=[0, 1], ignore_label=-1)

        with torch.no_grad():
            for step, sample in enumerate(val_loader):
                data = sample['image']
                target = sample['mask']
                label = sample['label']

                data = data.cuda()
                target = target.cuda()
                label = label.cuda()

                output = net(data)

                loss = 0.
                if self.mode == 'cls':
                    for i in range(data.size(2)):
                        loss += criterion(output[1][:, i], label[:, i])
                elif self.mode == 'seg':
                    for i in range(data.size(2)):
                        loss += criterion(output[0][:, i], target[:, :, i])
                else:
                    for i in range(data.size(2)):
                        loss += criterion([output[0][:, i], output[1][:, i]],
                                          [target[:, :, i], label[:, i]])
                loss /= data.size(2)

                loss = loss.float()
                val_loss.update(loss.item(), data.size(0))

                total_dice = 0.
                total_acc = 0.
                for i in range(data.size(2)):
                    cls_output = output[1][:, i]                      # (N,num_class-1)
                    cls_output = torch.sigmoid(cls_output).float()    # F.sigmoid is deprecated
                    seg_output = output[0][:, i].float()              # (N,num_class,H,W)
                    seg_output = F.softmax(seg_output, dim=1)

                    # measure acc
                    acc = accuracy(cls_output.detach(), label[:, i])
                    val_acc.update(acc.item(), data.size(0))
                    total_acc += acc.item()

                    # measure dice and record loss
                    dice = compute_dice(seg_output.detach(), target[:, :, i])
                    val_dice.update(dice.item(), data.size(0))
                    total_dice += dice.item()

                    # measure run dice
                    seg_output = torch.argmax(seg_output, 1).detach().cpu().numpy()  # N*H*W
                    tmp_target = torch.argmax(target[:, :, i], 1).detach().cpu().numpy()
                    run_dice.update_matrix(tmp_target, seg_output)

                torch.cuda.empty_cache()

                if step % 10 == 0:
                    if self.mode == 'cls':
                        print('epoch:{},step:{},val_loss:{:.5f},val_acc:{:.5f}'.format(
                            epoch, step, loss.item(), total_acc / data.size(2)))
                    elif self.mode == 'seg':
                        rundice, dice_list = run_dice.compute_dice()
                        print("Category Dice: ", dice_list)
                        print('epoch:{},step:{},val_loss:{:.5f},val_dice:{:.5f},rundice:{:.5f}'.format(
                            epoch, step, loss.item(), total_dice / data.size(2),
                            rundice))
                        run_dice.init_op()
                    else:
                        print('epoch:{},step:{},val_loss:{:.5f},val_dice:{:.5f},val_acc:{:.5f}'.format(
                            epoch, step, loss.item(), total_dice / data.size(2),
                            total_acc / data.size(2)))

        return val_loss.avg, val_dice.avg, val_acc.avg

    def test(self, test_path, save_path, net=None, mode='seg', save_flag=False):
        """Evaluate on `test_path`; optionally save predicted masks as PNGs.

        Returns a dict with per-sample classification 'true'/'pred'/'prob' lists.
        """
        if net is None:
            net = self.net
        net = net.cuda()
        net.eval()

        if self.mode == 'cls':
            test_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape, num_class=self.num_classes, crop=self.crop),
                RandomEraseHalf(scale_flag=False),
                RandomTranslationRotationZoomHalf(num_class=self.num_classes),
                RandomFlipHalf(mode='hv'),
                RandomAdjustHalf(),
                To_Tensor(num_class=self.num_classes)
            ])
        else:
            test_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape, num_class=self.num_classes, crop=self.crop),
                To_Tensor(num_class=self.num_classes)
            ])

        test_dataset = DataGenerator(test_path,
                                     roi_number=self.roi_number,
                                     num_class=self.num_classes,
                                     transform=test_transformer,
                                     seq_len=-1)

        test_loader = DataLoader(test_dataset,
                                 batch_size=1,
                                 shuffle=False,
                                 num_workers=self.num_workers,
                                 pin_memory=True)

        test_dice = AverageMeter()
        test_acc = AverageMeter()

        from PIL import Image
        from metrics import RunningDice
        run_dice = RunningDice(labels=[0, 1], ignore_label=-1)

        cls_result = {'true': [], 'pred': [], 'prob': []}

        with torch.no_grad():
            for step, sample in enumerate(test_loader):
                data = sample['image']
                target = sample['mask']
                label = sample['label']

                data = data.cuda()
                target = target.cuda()
                label = label.cuda()

                output = net(data)

                total_dice = 0.
                total_acc = 0.
                for i in range(data.size(2)):
                    cls_output = output[1][:, i]
                    cls_output = torch.sigmoid(cls_output).float()  # F.sigmoid is deprecated
                    seg_output = output[0][:, i].float()
                    seg_output = F.softmax(seg_output, dim=1)

                    # measure acc
                    acc = accuracy(cls_output.detach(), label[:, i])
                    test_acc.update(acc.item(), data.size(0))
                    total_acc += acc.item()

                    # measure dice and iou for evaluation (float)
                    dice = compute_dice(seg_output.detach(), target[:, :, i],
                                        ignore_index=0)
                    test_dice.update(dice.item(), data.size(0))
                    total_dice += dice.item()

                    cls_result['prob'].extend(
                        cls_output.detach().squeeze().cpu().numpy().tolist())
                    cls_output = (cls_output > 0.5).float()  # N*C
                    cls_result['pred'].extend(
                        cls_output.detach().squeeze().cpu().numpy().tolist())
                    cls_result['true'].extend(
                        label.detach().squeeze().cpu().numpy().tolist())

                    # multi-task mode: gate segmentation maps by the classifier
                    if mode == 'mtl':
                        b, c, _, _ = seg_output.size()
                        seg_output[:, 1:, ...] = seg_output[:, 1:, ...] * cls_output.view(
                            b, c - 1, 1, 1).expand_as(seg_output[:, 1:, ...])

                    seg_output = torch.argmax(seg_output, 1).detach().cpu().numpy()  # N*H*W N=1
                    tmp_target = torch.argmax(target[:, :, i], 1).detach().cpu().numpy()
                    run_dice.update_matrix(tmp_target, seg_output)

                    # save predicted mask slices as 8-bit grayscale PNGs
                    if mode != 'cls' and save_flag:
                        seg_output = np.squeeze(seg_output).astype(np.uint8)
                        seg_output = Image.fromarray(seg_output, mode='L')
                        seg_output.save(
                            os.path.join(
                                save_path,
                                test_path[step].split('.')[0] + '_' + str(i) + '.png'))

                torch.cuda.empty_cache()
                print('step:{},test_dice:{:.5f},test_acc:{:.5f}'.format(
                    step, total_dice / data.size(2), total_acc / data.size(2)))

        rundice, dice_list = run_dice.compute_dice()
        print("Category Dice: ", dice_list)
        print('avg_dice:{:.5f},avg_acc:{:.5f},rundice:{:.5f}'.format(
            test_dice.avg, test_acc.avg, rundice))

        return cls_result

    def _get_net(self, net_name):
        """Build the network for `net_name`; raises on unknown names instead of
        silently returning None (previous behavior)."""
        if net_name == 'rcnn_unet':
            from rcnn.model.unet import rcnn_unet
            return rcnn_unet(n_channels=self.channels,
                             n_classes=self.num_classes,
                             seq_len=self.seq_len)
        raise ValueError('unknown net_name: {!r}'.format(net_name))

    def _get_loss(self, loss_fun, class_weight=None):
        """Resolve a loss-function key to an instantiated criterion.

        Raises ValueError for unknown keys (previously `loss` was left
        unbound, producing an UnboundLocalError at return).
        """
        if class_weight is not None:
            class_weight = torch.tensor(class_weight)

        if loss_fun == 'Cross_Entropy':
            from rcnn.loss.cross_entropy import CrossentropyLoss
            loss = CrossentropyLoss(weight=class_weight)
        elif loss_fun == 'DynamicTopKLoss':
            from rcnn.loss.cross_entropy import DynamicTopKLoss
            loss = DynamicTopKLoss(weight=class_weight,
                                   step_threshold=self.step_pre_epoch)
        elif loss_fun == 'DynamicTopkCEPlusDice':
            from rcnn.loss.combine_loss import DynamicTopkCEPlusDice
            loss = DynamicTopkCEPlusDice(weight=class_weight,
                                         ignore_index=0,
                                         step_threshold=self.step_pre_epoch)
        elif loss_fun == 'TopKLoss':
            from rcnn.loss.cross_entropy import TopKLoss
            loss = TopKLoss(weight=class_weight, k=self.topk)
        elif loss_fun == 'DiceLoss':
            from rcnn.loss.dice_loss import DiceLoss
            loss = DiceLoss(weight=class_weight, ignore_index=0, p=1)
        elif loss_fun == 'ShiftDiceLoss':
            from rcnn.loss.dice_loss import ShiftDiceLoss
            loss = ShiftDiceLoss(weight=class_weight,
                                 ignore_index=0,
                                 reduction='topk',
                                 shift=0.5,
                                 p=1,
                                 k=self.topk)
        elif loss_fun == 'TopkDiceLoss':
            from rcnn.loss.dice_loss import DiceLoss
            loss = DiceLoss(weight=class_weight,
                            ignore_index=0,
                            reduction='topk',
                            k=self.topk)
        elif loss_fun == 'PowDiceLoss':
            from rcnn.loss.dice_loss import DiceLoss
            loss = DiceLoss(weight=class_weight, ignore_index=0, p=2)
        elif loss_fun == 'TverskyLoss':
            from rcnn.loss.tversky_loss import TverskyLoss
            loss = TverskyLoss(weight=class_weight, ignore_index=0, alpha=0.7)
        elif loss_fun == 'FocalTverskyLoss':
            from rcnn.loss.tversky_loss import TverskyLoss
            loss = TverskyLoss(weight=class_weight,
                               ignore_index=0,
                               alpha=0.7,
                               gamma=0.75)
        elif loss_fun == 'BCEWithLogitsLoss':
            loss = nn.BCEWithLogitsLoss(class_weight)
        elif loss_fun == 'BCEPlusDice':
            from rcnn.loss.combine_loss import BCEPlusDice
            loss = BCEPlusDice(weight=class_weight, ignore_index=0, p=1)
        elif loss_fun == 'CEPlusDice':
            from rcnn.loss.combine_loss import CEPlusDice
            loss = CEPlusDice(weight=class_weight, ignore_index=0)
        elif loss_fun == 'CEPlusTopkDice':
            from rcnn.loss.combine_loss import CEPlusTopkDice
            loss = CEPlusTopkDice(weight=class_weight,
                                  ignore_index=0,
                                  reduction='topk',
                                  k=self.topk)
        elif loss_fun == 'TopkCEPlusTopkDice':
            from rcnn.loss.combine_loss import TopkCEPlusTopkDice
            loss = TopkCEPlusTopkDice(weight=class_weight,
                                      ignore_index=0,
                                      reduction='topk',
                                      k=self.topk)
        elif loss_fun == 'TopkCEPlusDice':
            from rcnn.loss.combine_loss import TopkCEPlusDice
            loss = TopkCEPlusDice(weight=class_weight,
                                  ignore_index=0,
                                  k=self.topk)
        elif loss_fun == 'TopkCEPlusShiftDice':
            from rcnn.loss.combine_loss import TopkCEPlusShiftDice
            loss = TopkCEPlusShiftDice(weight=class_weight,
                                       ignore_index=0,
                                       shift=0.5,
                                       k=self.topk)
        elif loss_fun == 'TopkCEPlusTopkShiftDice':
            from rcnn.loss.combine_loss import TopkCEPlusTopkShiftDice
            loss = TopkCEPlusTopkShiftDice(weight=class_weight,
                                           ignore_index=0,
                                           reduction='topk',
                                           shift=0.5,
                                           k=self.topk)
        else:
            raise ValueError('unknown loss_fun: {!r}'.format(loss_fun))
        return loss

    def _get_optimizer(self, optimizer, net, lr):
        """Resolve an optimizer key ('Adam' | 'SGD') to a torch optimizer."""
        if optimizer == 'Adam':
            optimizer = torch.optim.Adam(
                filter(lambda p: p.requires_grad, net.parameters()),
                lr=lr,
                weight_decay=self.weight_decay)
        elif optimizer == 'SGD':
            optimizer = torch.optim.SGD(
                filter(lambda p: p.requires_grad, net.parameters()),
                lr=lr,
                weight_decay=self.weight_decay,
                momentum=self.momentum)
        return optimizer

    def _get_lr_scheduler(self, lr_scheduler, optimizer):
        """Resolve a scheduler key to a torch lr_scheduler instance."""
        if lr_scheduler == 'ReduceLROnPlateau':
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', patience=5, verbose=True)
        elif lr_scheduler == 'MultiStepLR':
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, self.milestones, gamma=self.gamma)
        elif lr_scheduler == 'CosineAnnealingLR':
            lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=self.T_max)
        return lr_scheduler

    def _get_pre_trained(self, weight_path, ckpt_point=True):
        """Load model weights; when ckpt_point, also resume the epoch counter."""
        checkpoint = torch.load(weight_path)
        self.net.load_state_dict(checkpoint['state_dict'])
        if ckpt_point:
            self.start_epoch = checkpoint['epoch'] + 1
def train_det(opt, cfg):
    """Train an EfficientDet detector described by `cfg`, with CLI options `opt`.

    Builds train/val dataloaders, restores or initializes weights, optionally
    freezes the backbone and applies synchronized BatchNorm for multi-GPU
    small-batch setups, then runs the epoch loop with per-step TensorBoard
    logging, validation, best-checkpoint saving and early stopping.
    """
    training_params = {
        'batch_size': cfg.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    val_params = {
        'batch_size': cfg.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    # input resolution per EfficientDet compound coefficient d0..d7
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    training_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Train'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Augmenter(),
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])
        ]),
        pre_augments=['', *[f'{aug}_' for aug in cfg.augment_list]]
        if cfg.augment_list else None)
    training_generator = DataLoader(training_set, **training_params)

    val_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Validation'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])
        ]))
    val_generator = DataLoader(val_set, **val_params)

    model = EfficientDetBackbone(num_classes=len(cfg.dictionary_class_name),
                                 compound_coef=cfg.compound_coef,
                                 ratios=eval(cfg.anchor_ratios),
                                 scales=eval(cfg.anchor_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        # filenames look like ..._<epoch>_<step>.pth; fall back to step 0
        try:
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except (ValueError, IndexError):
            last_step = 0

        try:
            ret = model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, '
                'this might be because you load a pretrained weights with different number of classes. '
                'The rest of the weights should be loaded already.')

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if cfg.training_layer.lower() == 'heads':

        def freeze_backbone(m):
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] freezed backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4
    # useful when gpu memory is limited.
    # because when bn is disable, the training will be very unstable or slow to converge,
    # apply sync_bn can solve it,
    # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus.
    # but it would also slow down the training by a little bit.
    if cfg.num_gpus > 1 and cfg.batch_size // cfg.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # warp the model with loss function, to reduce the memory usage on gpu0 and speedup
    model = ModelWithLoss(model, debug=opt.debug)

    if cfg.num_gpus > 0:
        model = model.cuda()
        if cfg.num_gpus > 1:
            model = CustomDataParallel(model, cfg.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    # BUG FIX: the original used `if adamw: ... if srsgd: ... else: sgd`, so an
    # 'adamw' choice was always overwritten by SGD via the second if's else.
    if cfg.optimizer.lower() == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), cfg.learning_rate)
    elif cfg.optimizer.lower() == 'srsgd':
        optimizer = SRSGD(model.parameters(),
                          lr=cfg.learning_rate,
                          weight_decay=5e-4,
                          iter_count=100)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    cfg.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    # Setup complete, then start training
    now = datetime.datetime.now()
    opt.saved_path = opt.saved_path + f'/trainlogs_{now.strftime("%Y%m%d_%H%M%S")}'
    if opt.log_path is None:
        opt.log_path = opt.saved_path
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # Write history
    if 'backlog' not in opt.config:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.backlog.json'),
                'w') as f:
            backlog = dict(cfg.to_pascal_case())
            backlog['__metadata__'] = 'Backlog at ' + now.strftime(
                "%Y/%m/%d %H:%M:%S")
            json.dump(backlog, f)
    else:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.history.json'),
                'w') as f:
            history = dict(cfg.to_pascal_case())
            history['__metadata__'] = now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(history, f)

    writer = SummaryWriter(opt.log_path + f'/tensorboard')

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(cfg.no_epochs):
            # skip epochs already covered by the resumed step counter
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for iter, data in enumerate(progress_bar):
                # fast-forward within the resumed epoch
                if iter < step - last_epoch * num_iter_per_epoch:
                    progress_bar.set_description(
                        f'Skip {iter} < {step} - {last_epoch} * {num_iter_per_epoch}'
                    )
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(
                        imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. '
                        'Total loss: {:.5f}'.format(step, epoch, cfg.no_epochs,
                                                    iter + 1,
                                                    num_iter_per_epoch,
                                                    cls_loss.item(),
                                                    reg_loss.item(),
                                                    loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss},
                                       step)
                    writer.add_scalars('Classification_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1
                except Exception as e:
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue

            scheduler.step(np.mean(epoch_loss))

            # validation pass
            model.eval()
            loss_regression_ls = []
            loss_classification_ls = []
            for iter, data in enumerate(val_generator):
                with torch.no_grad():
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    cls_loss, reg_loss = model(
                        imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss_classification_ls.append(cls_loss.item())
                    loss_regression_ls.append(reg_loss.item())

            cls_loss = np.mean(loss_classification_ls)
            reg_loss = np.mean(loss_regression_ls)
            loss = cls_loss + reg_loss

            progress_bar.set_description(
                'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}.'
                ' Total loss: {:1.5f}'.format(epoch, cfg.no_epochs, cls_loss,
                                              reg_loss, loss))
            writer.add_scalars('Loss', {'val': loss}, step)
            writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
            writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

            # NOTE(review): both branches currently save only on improvement;
            # the `else` branch was presumably meant to save every epoch —
            # behavior kept as-is, confirm intent before changing.
            if cfg.only_best_weights:
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(
                        model,
                        f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                    )
            else:
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(
                        model,
                        f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                    )

            model.train()

            # Early stopping
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                    .format(epoch, best_loss))
                break
        print(
            f'[Info] Finished training. Best loss achieved {best_loss} at epoch {best_epoch}.'
        )
    except KeyboardInterrupt:
        save_checkpoint(
            model, f"{opt.saved_path}/d{cfg.compound_coef}_{epoch}_{step}.pth")
    finally:
        # single close (the original closed the writer twice on interrupt)
        writer.close()
t_rmse = rmse_loss(output, targets) rmse.update(t_rmse.item()) output_np = np.clip(output.detach().cpu().numpy(), 0, 1) target_np = np.clip(targets.detach().cpu().numpy(), 0, 1) logging.info('[{0}][{1}][{2}]\t' 'lr: {lr:.5f}\t' 'loss: {loss.val:.6f} ({loss.avg:.6f})\t' 'RMSE: {rmse.val:.6f} ({rmse.avg:.6f})'.format( epoch, headid, ind, lr=optimizer.param_groups[-1]['lr'], loss=losses, rmse=rmse)) writer.add_scalars("trainloss", {"train": losses.val}, step) step += 1 ###############################################tenosrboard太麻烦###### lossx.append(losses.val) rmsex.append(rmse.val) x = range(len(lossx)) plt.figure(1) plt.title("this is loss and rmse") plt.plot(x, lossx, label='loss') plt.plot(x, rmsex, label='rmse') plt.legend() #changepoint 方便查看tensorboard太麻烦 plt.savefig( '/media/workdir/hujh/hujh-new/huaweirader_baseline/log/demolog/unetloss.png' )
def main(argv):
    """Train a single SPIN agent inside a Pommerman 'PommeTeam-v0' team.

    Command line:
        -c / --checkpoint <file>  resume training from a saved checkpoint
        -a / --agent <version>    version of agent to train (default "0")
        -g / --cuda <value>       cuda flag (parsed but not used below)
        -r                        always render
        -h                        print help and exit
    """
    checkpointFilePath = ''
    alwaysRender = False
    useCuda = True
    # BUGFIX: agentName was only assigned when -a/--agent was given, but it is
    # read unconditionally below -> NameError without the flag.  The help text
    # documents "default=0", so default it here.
    agentName = "0"
    try:
        opts, args = getopt.getopt(argv, "hrc:a:g:",
                                   ["checkpoint=", "agent=", "cuda="])
    except getopt.GetoptError:
        print(
            'Error in command arguments. Run this for help:\n\ttrain_singleAgent.py -h'
        )
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-r':
            alwaysRender = True
        elif opt == '-h':
            print(
                'train_singleAgent.py\n-c <checkpointfile> => Resume training from a saved checkpoint\n-a(--agent) <agent version> => Version of agent to train (default=0)\n-r => Always render'
            )
            sys.exit()
        elif opt in ("-c", "--checkpoint"):
            checkpointFilePath = arg
        elif opt in ("-a", "--agent"):
            agentName = arg
        elif opt in ("-g", "--cuda"):
            # NOTE(review): this stores the raw string; useCuda is never read
            # in this function - confirm before relying on it.
            useCuda = arg

    # Create a set of agents (exactly four); index 3 is the learning agent.
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    if agentName == "2":
        agent_list.append(SPINAgents.SPIN_2())
    elif agentName == "1":
        agent_list.append(SPINAgents.SPIN_1())
    else:
        agent_list.append(SPINAgents.SPIN_0())

    # Make the "Team" environment using the agent list
    env = pommerman.make('PommeTeam-v0', agent_list)

    memory = ReplayMemory(100000)
    batch_size = 128
    epsilon = 1
    rewards = []
    start_epoch = 0

    # Writer to log data to tensorboard
    writer = SummaryWriter()

    if checkpointFilePath != '':
        start_epoch = load_checkpoint(agent_list[3], checkpointFilePath)

    # Run the episodes just like OpenAI Gym
    for i in range(start_epoch, 5750):
        state = env.reset()
        done = False
        total_reward = [0] * len(agent_list)
        epsilon *= 0.995  # exponential epsilon decay, floored at 0.1 below
        while not done and agent_list[3]._character.is_alive:
            if i > 4990 or alwaysRender:
                env.render()
            # Set epsilon for our learning agent
            agent_list[3].epsilon = max(epsilon, 0.1)
            actions = env.act(state)
            # The learning agent returns a tensor; env.step needs a plain int.
            actions[3] = actions[3].data.numpy()[0]
            # NOTE(review): prepInput appears to cache the observation on the
            # agent (memory.push reads agent_list[3].Input below), so this call
            # is kept even though its return value is unused - confirm.
            obs_input = Variable(
                torch.from_numpy(agent_list[3].prepInput(state[3])).type(
                    torch.FloatTensor))
            next_obs, reward, done, _ = env.step(actions)
            state = next_obs
            if not agent_list[3]._character.is_alive:
                reward[3] = -1
            # Fill replay memory for our learning agent
            memory.push(
                agent_list[3].Input, actions[3],
                torch.from_numpy(agent_list[3].prepInput(state[3])).type(
                    torch.FloatTensor), torch.Tensor([reward[3]]),
                torch.Tensor([done]))
            total_reward = [x + y for x, y in zip(total_reward, reward)]
        rewards.append(total_reward)

        # Creates a dictionary with agent name and rewards to be displayed on tensorboard
        total_reward_list = []
        for j in range(len(total_reward)):
            total_reward_list.append(
                (type(agent_list[j]).__name__ + '(' + str(j) + ')',
                 total_reward[j]))
        writer.add_scalars('data/rewards', dict(total_reward_list), i)
        writer.add_scalar('data/epsilon', agent_list[3].epsilon, i)
        writer.add_scalar('data/memory', memory.__len__(), i)
        print("Episode : ", i)

        # Learn only once the replay buffer is warm.
        if memory.__len__() > 10000:
            batch = memory.sample(batch_size)
            agent_list[3].backward(batch)

        # Periodic checkpoint every 750 episodes.
        if i > 0 and i % 750 == 0:
            save_checkpoint(
                {
                    'epoch': i + 1,
                    'arch': 0,
                    'state_dict_Q': agent_list[3].Q.state_dict(),
                    'state_dict_target_Q':
                    agent_list[3].target_Q.state_dict(),
                    'best_prec1': 0,
                    'optimizer': agent_list[3].optimizer.state_dict(),
                }, agent_list[3].__class__.__name__)
    env.close()
    # Final checkpoint after the training loop.
    save_checkpoint(
        {
            'epoch': 5000 + 1,
            'arch': 0,
            'state_dict_Q': agent_list[3].Q.state_dict(),
            'state_dict_target_Q': agent_list[3].target_Q.state_dict(),
            'best_prec1': 0,
            'optimizer': agent_list[3].optimizer.state_dict(),
        }, agent_list[3].__class__.__name__)
    writer.close()
dim=1) viz_window = viz.line( X=x_axis, Y=y_axis, opts=opts, ) if args.tensorboard and \ package[ 'loss_results'] is not None and start_epoch > 0: # Previous scores to tensorboard logs for i in range(start_epoch): values = { 'Avg Train Loss': loss_results[i], 'Avg WER': wer_results[i], 'Avg CER': cer_results[i] } tensorboard_writer.add_scalars(args.id, values, i + 1) else: with open(args.labels_path) as label_file: labels = str(''.join(json.load(label_file))) audio_conf = dict(sample_rate=args.sample_rate, window_size=args.window_size, window_stride=args.window_stride, window=args.window, noise_dir=args.noise_dir, noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size,
dim=1) viz_window = viz.line( X=x_axis, Y=y_axis, opts=opts, ) if main_proc and args.tensorboard and \ package[ 'loss_results'] is not None and start_epoch > 0: # Previous scores to tensorboard logs for i in range(start_epoch): values = { 'Avg Train Loss': loss_results[i], 'Avg WER': wer_results[i], 'Avg CER': cer_results[i] } tensorboard_writer.add_scalars(args.id, values, i + 1) else: with open(args.labels_path) as label_file: labels = str(''.join(json.load(label_file))) audio_conf = dict(sample_rate=args.sample_rate, window_size=args.window_size, window_stride=args.window_stride, window=args.window, noise_dir=args.noise_dir, noise_prob=args.noise_prob, noise_levels=(args.noise_min, args.noise_max)) rnn_type = args.rnn_type.lower() assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru" model = DeepSpeech(rnn_hidden_size=args.hidden_size,
class WGAN_GP(object):
    """WGAN with gradient penalty (Gulrajani et al., 2017).

    Trains a generator/discriminator pair on the loader returned by
    ``testToGAN`` and logs losses to tensorboardX.  ``train`` resumes from the
    epoch-100 checkpoint (hard requirement of this script: the loop starts at
    epoch 100 and reloads full pickled models on the first iteration).
    """

    def __init__(self, args):
        # hyper-parameters / bookkeeping
        self.epoch = args.epoch
        self.sample_num = 100
        self.batch_size = args.batch_size
        self.save_dir = args.save_dir
        self.result_dir = args.result_dir
        self.dataset = args.dataset
        self.log_dir = args.log_dir
        self.gpu_mode = args.gpu_mode
        self.model_name = args.gan_type
        self.input_size = args.input_size
        self.z_dim = 62       # latent dimension
        self.lambda_ = 10     # gradient-penalty weight
        self.n_critic = 5     # critic iterations per generator iteration
        print('run at WGAN_GP')

        # load dataset; one batch is peeked to size the networks
        self.data_loader = testToGAN(self.dataset, 'train')
        self.dataset = 'trainAgain'  # NOTE: checkpoints/results go under this name
        data = next(iter(self.data_loader))[0]

        # networks init
        self.G = generator(input_dim=self.z_dim,
                           output_dim=data.shape[1],
                           input_size=self.input_size)
        self.D = discriminator(input_dim=data.shape[1],
                               output_dim=1,
                               input_size=self.input_size)
        self.G_optimizer = optim.Adam(self.G.parameters(),
                                      lr=args.lrG,
                                      betas=(args.beta1, args.beta2))
        self.D_optimizer = optim.Adam(self.D.parameters(),
                                      lr=args.lrD,
                                      betas=(args.beta1, args.beta2))
        if self.gpu_mode:
            self.G.cuda()
            self.D.cuda()

        print('---------- Networks architecture -------------')
        utils.print_network(self.G)
        utils.print_network(self.D)
        print('-----------------------------------------------')

        self.writer = SummaryWriter()  #log_dir=log_dir,
        self.X = 0  # global tensorboard step counter

        # fixed noise used by visualize_results(fix=True)
        self.sample_z_ = torch.rand((self.batch_size, self.z_dim))
        if self.gpu_mode:
            self.sample_z_ = self.sample_z_.cuda()

    def train(self):
        """Run the WGAN-GP training loop, resuming from epoch 100."""
        self.train_hist = {}
        self.train_hist['D_loss'] = []
        self.train_hist['G_loss'] = []
        self.train_hist['per_epoch_time'] = []
        self.train_hist['total_time'] = []

        # kept for compatibility; not used by the Wasserstein losses below
        self.y_real_, self.y_fake_ = torch.zeros(self.batch_size,
                                                 1), torch.ones(
                                                     self.batch_size, 1)
        if self.gpu_mode:
            self.y_real_, self.y_fake_ = self.y_real_.cuda(
            ), self.y_fake_.cuda()

        self.D.train()
        print('WGAN_GP training start!!,epoch:{},module stored at:{}'.format(
            self.epoch, self.dataset))
        start_time = time.time()
        url = os.path.join(self.save_dir, self.dataset, self.model_name)

        for epoch in range(100, self.epoch):
            if epoch == 100:
                # resume from the full pickled models saved at epoch 100
                self.G = torch.load(os.path.join(url, 'WGAN_GP_100_G.pkl'))
                self.D = torch.load(os.path.join(url, 'WGAN_GP_100_D.pkl'))
                print('reload success!', '*' * 40)
            self.G.train()
            epoch_start_time = time.time()
            batches_per_epoch = len(self.data_loader.dataset) // self.batch_size
            # loader yields a sequence whose first element is the batch tensor
            for batch_idx, x_ in enumerate(self.data_loader):
                x_ = x_[0]
                if batch_idx == batches_per_epoch:
                    break  # drop the last partial batch
                z_ = torch.rand((self.batch_size, self.z_dim))
                if self.gpu_mode:
                    x_, z_ = x_.cuda(), z_.cuda()

                # ---- update D network ----
                self.D_optimizer.zero_grad()
                D_real = self.D(x_)
                D_real_loss = -torch.mean(D_real)
                G_ = self.G(z_)
                D_fake = self.D(G_)
                D_fake_loss = torch.mean(D_fake)

                # gradient penalty on interpolates between real and fake
                alpha = torch.rand((self.batch_size, 1, 1, 1))
                if self.gpu_mode:
                    alpha = alpha.cuda()
                x_hat = alpha * x_.data + (1 - alpha) * G_.data
                x_hat.requires_grad = True
                pred_hat = self.D(x_hat)
                if self.gpu_mode:
                    gradients = grad(outputs=pred_hat,
                                     inputs=x_hat,
                                     grad_outputs=torch.ones(
                                         pred_hat.size()).cuda(),
                                     create_graph=True,
                                     retain_graph=True,
                                     only_inputs=True)[0]
                else:
                    gradients = grad(outputs=pred_hat,
                                     inputs=x_hat,
                                     grad_outputs=torch.ones(pred_hat.size()),
                                     create_graph=True,
                                     retain_graph=True,
                                     only_inputs=True)[0]
                gradient_penalty = self.lambda_ * (
                    (gradients.view(gradients.size()[0], -1).norm(2, 1) - 1)**
                    2).mean()

                D_loss = D_real_loss + D_fake_loss + gradient_penalty
                D_loss.backward()
                self.D_optimizer.step()

                if ((batch_idx + 1) % self.n_critic) == 0:
                    # ---- update G network every n_critic steps ----
                    self.G_optimizer.zero_grad()
                    G_ = self.G(z_)
                    D_fake = self.D(G_)
                    G_loss = -torch.mean(D_fake)
                    self.train_hist['G_loss'].append(G_loss.item())
                    G_loss.backward()
                    self.G_optimizer.step()
                    self.train_hist['D_loss'].append(D_loss.item())

                if ((batch_idx + 1) % 100) == 0:
                    # 100 is a multiple of n_critic, so G_loss is defined here
                    print("Epoch: [%2d] [%4d/%4d] D_loss: %.8f, G_loss: %.8f" %
                          ((epoch + 1), (batch_idx + 1), batches_per_epoch,
                           D_loss.item(), G_loss.item()))
                    self.writer.add_scalar('G_loss', G_loss.item(), self.X)
                    self.writer.add_scalar('D_loss', D_loss.item(), self.X)
                    # BUGFIX: the combined chart previously logged D_loss under
                    # the 'G_loss' key, plotting the same curve twice.
                    self.writer.add_scalars('cross loss', {
                        'G_loss': G_loss.item(),
                        'D_loss': D_loss.item()
                    }, self.X)
                    self.X += 1

            self.train_hist['per_epoch_time'].append(time.time() -
                                                     epoch_start_time)
            if epoch % 5 == 0:
                self.load_interval(epoch)  # periodic full-model checkpoint

        self.train_hist['total_time'].append(time.time() - start_time)
        print("Avg one epoch time: %.2f, total %d epochs time: %.2f" %
              (np.mean(self.train_hist['per_epoch_time']), self.epoch,
               self.train_hist['total_time'][0]))
        print("Training finish!... save training results")

        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)
        # NOTE(review): opened in append mode, so repeated runs accumulate
        # JSON documents in the same file - confirm this is intended.
        with open(os.path.join(save_dir, self.model_name + '_train_hist.json'),
                  "a") as f:
            json.dump(self.train_hist, f)
        self.writer.export_scalars_to_json(
            os.path.join(save_dir, self.model_name + '.json'))
        self.writer.close()
        self.load_interval(epoch)
        utils.loss_plot(
            self.train_hist,
            os.path.join(self.save_dir, self.dataset, self.model_name),
            self.model_name)

    def visualize_results(self, epoch, fix=True):
        """Save a grid of generated samples for the given epoch.

        With ``fix=True`` the fixed noise from __init__ is reused so images
        are comparable across epochs; otherwise fresh noise is drawn.
        """
        self.G.eval()

        if not os.path.exists(self.result_dir + '/' + self.dataset + '/' +
                              self.model_name):
            os.makedirs(self.result_dir + '/' + self.dataset + '/' +
                        self.model_name)

        tot_num_samples = min(self.sample_num, self.batch_size)
        image_frame_dim = int(np.floor(np.sqrt(tot_num_samples)))

        if fix:
            """ fixed noise """
            samples = self.G(self.sample_z_)
        else:
            """ random noise """
            sample_z_ = torch.rand((self.batch_size, self.z_dim))
            if self.gpu_mode:
                sample_z_ = sample_z_.cuda()
            samples = self.G(sample_z_)

        if self.gpu_mode:
            samples = samples.cpu().data.numpy().transpose(0, 2, 3, 1)
        else:
            samples = samples.data.numpy().transpose(0, 2, 3, 1)

        samples = (samples + 1) / 2  # [-1, 1] -> [0, 1]
        utils.save_images(
            samples[:image_frame_dim * image_frame_dim, :, :, :],
            [image_frame_dim, image_frame_dim],
            self.result_dir + '/' + self.dataset + '/' + self.model_name +
            '/' + self.model_name + '_epoch%03d' % epoch + '.png')

    def save(self):
        """Persist G/D state_dicts and the training history."""
        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        torch.save(self.G.state_dict(),
                   os.path.join(save_dir, self.model_name + '_G.pkl'))
        torch.save(self.D.state_dict(),
                   os.path.join(save_dir, self.model_name + '_D.pkl'))
        with open(os.path.join(save_dir, self.model_name + '_history.pkl'),
                  'wb') as f:
            pickle.dump(self.train_hist, f)

    def load(self):
        """Restore G/D state_dicts saved by :meth:`save`."""
        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)
        self.G.load_state_dict(
            torch.load(os.path.join(save_dir, self.model_name + '_G.pkl')))
        self.D.load_state_dict(
            torch.load(os.path.join(save_dir, self.model_name + '_D.pkl')))

    def load_interval(self, epoch):
        """Checkpoint the FULL pickled models (not state_dicts) for `epoch`."""
        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        torch.save(
            self.G,
            os.path.join(save_dir,
                         self.model_name + '_{}_G.pkl'.format(epoch)))
        torch.save(
            self.D,
            os.path.join(save_dir,
                         self.model_name + '_{}_D.pkl'.format(epoch)))
def main():
    """Train/validate/test a part-based (PCB-style) person re-id model.

    Reads all settings from a ``Config`` object, builds train/val/test
    datasets, trains with staircase LR decay, periodically validates, logs
    scores to TensorBoard, and checkpoints after every epoch.
    """
    cfg = Config()

    # Redirect logs to both console and file.
    if cfg.log_to_file:
        ReDirectSTD(cfg.stdout_file, 'stdout', False)
        ReDirectSTD(cfg.stderr_file, 'stderr', False)

    # Lazily create SummaryWriter (only on first validation, see below)
    writer = None

    # TVT: tensor->device transfer fn; TMO: module/optimizer transfer fn
    TVT, TMO = set_devices(cfg.sys_device_ids)

    if cfg.seed is not None:
        set_seed(cfg.seed)

    # Dump the configurations to log.
    import pprint
    print('-' * 60)
    print('cfg.__dict__')
    pprint.pprint(cfg.__dict__)
    print('-' * 60)

    ###########
    # Dataset #
    ###########

    train_set = create_dataset(**cfg.train_set_kwargs)
    num_classes = len(train_set.ids2labels)
    # The combined dataset does not provide val set currently.
    val_set = None if cfg.dataset == 'combined' else create_dataset(**cfg.val_set_kwargs)

    test_sets = []
    test_set_names = []
    # 'combined' evaluates on the three constituent benchmarks separately.
    if cfg.dataset == 'combined':
        for name in ['market1501', 'cuhk03', 'duke']:
            cfg.test_set_kwargs['name'] = name
            test_sets.append(create_dataset(**cfg.test_set_kwargs))
            test_set_names.append(name)
    else:
        test_sets.append(create_dataset(**cfg.test_set_kwargs))
        test_set_names.append(cfg.dataset)

    ###########
    # Models  #
    ###########

    model = Model(
        last_conv_stride=cfg.last_conv_stride,
        num_stripes=cfg.num_stripes,
        local_conv_out_channels=cfg.local_conv_out_channels,
        num_classes=num_classes
    )
    # Model wrapper
    model_w = DataParallel(model)

    #############################
    # Criteria and Optimizers   #
    #############################

    criterion = torch.nn.CrossEntropyLoss()

    # Two LR groups: pretrained backbone vs newly-initialized heads.
    # To finetune from ImageNet weights
    finetuned_params = list(model.base.parameters())
    # To train from scratch
    new_params = [p for n, p in model.named_parameters()
                  if not n.startswith('base.')]
    param_groups = [{'params': finetuned_params, 'lr': cfg.finetuned_params_lr},
                    {'params': new_params, 'lr': cfg.new_params_lr}]
    optimizer = optim.SGD(
        param_groups,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay)

    # Bind them together just to save some codes in the following usage.
    modules_optims = [model, optimizer]

    ################################
    # May Resume Models and Optims #
    ################################

    if cfg.resume:
        resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file)

    # May Transfer Models and Optims to Specified Device. Transferring optimizer
    # is to cope with the case when you load the checkpoint to a new device.
    TMO(modules_optims)

    ########
    # Test #
    ########

    def test(load_model_weight=False):
        # Evaluate on every configured test set, optionally reloading weights
        # either from an explicit weight file or from the training checkpoint.
        if load_model_weight:
            if cfg.model_weight_file != '':
                map_location = (lambda storage, loc: storage)
                sd = torch.load(cfg.model_weight_file, map_location=map_location)
                load_state_dict(model, sd)
                print('Loaded model weights from {}'.format(cfg.model_weight_file))
            else:
                load_ckpt(modules_optims, cfg.ckpt_file)

        for test_set, name in zip(test_sets, test_set_names):
            test_set.set_feat_func(ExtractFeature(model_w, TVT))
            print('\n=========> Test on dataset: {} <=========\n'.format(name))
            test_set.eval(
                normalize_feat=True,
                verbose=True)

    def validate():
        # Returns (mAP, rank-1) on the validation set without re-ranking.
        if val_set.extract_feat_func is None:
            val_set.set_feat_func(ExtractFeature(model_w, TVT))
        print('\n===== Test on validation set =====\n')
        mAP, cmc_scores, _, _ = val_set.eval(
            normalize_feat=True,
            to_re_rank=False,
            verbose=True)
        print()
        return mAP, cmc_scores[0]

    if cfg.only_test:
        test(load_model_weight=True)
        return

    ############
    # Training #
    ############

    start_ep = resume_ep if cfg.resume else 0
    for ep in range(start_ep, cfg.total_epochs):

        # Adjust Learning Rate (staircase decay on both param groups)
        adjust_lr_staircase(
            optimizer.param_groups,
            [cfg.finetuned_params_lr, cfg.new_params_lr],
            ep + 1,
            cfg.staircase_decay_at_epochs,
            cfg.staircase_decay_multiply_factor)

        may_set_mode(modules_optims, 'train')

        # For recording loss
        loss_meter = AverageMeter()

        ep_st = time.time()
        step = 0
        epoch_done = False
        while not epoch_done:

            step += 1
            step_st = time.time()

            ims, im_names, labels, mirrored, epoch_done = train_set.next_batch()

            ims_var = Variable(TVT(torch.from_numpy(ims).float()))
            labels_var = Variable(TVT(torch.from_numpy(labels).long()))

            # One classification logit per body stripe; losses are summed.
            _, logits_list = model_w(ims_var)
            loss = torch.sum(
                torch.cat([criterion(logits, labels_var) for logits in logits_list]))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ############
            # Step Log #
            ############

            loss_meter.update(to_scalar(loss))

            if step % cfg.steps_per_log == 0:
                log = '\tStep {}/Ep {}, {:.2f}s, loss {:.4f}'.format(
                    step, ep + 1, time.time() - step_st, loss_meter.val)
                print(log)

        #############
        # Epoch Log #
        #############

        log = 'Ep {}, {:.2f}s, loss {:.4f}'.format(
            ep + 1, time.time() - ep_st, loss_meter.avg)
        print(log)

        ##########################
        # Test on Validation Set #
        ##########################

        mAP, Rank1 = 0, 0
        if ((ep + 1) % cfg.epochs_per_val == 0) and (val_set is not None):
            mAP, Rank1 = validate()

        # Log to TensorBoard (writer is created lazily on first use)
        if cfg.log_to_file:
            if writer is None:
                writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard'))
            writer.add_scalars(
                'val scores',
                dict(mAP=mAP,
                     Rank1=Rank1),
                ep)
            writer.add_scalars(
                'loss',
                dict(loss=loss_meter.avg, ),
                ep)

        # save ckpt
        if cfg.log_to_file:
            save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file)

    ########
    # Test #
    ########

    test(load_model_weight=False)
def main():
    """Train a semantic-segmentation model on PASCAL VOC.

    Builds the model/optimizer/scheduler from a YAML config, optionally
    resumes from a checkpoint, trains for ``CONFIG.max_epoch`` epochs, tracks
    mean IoU, and saves periodic / best / final model weights.
    """
    args = get_arguments()

    # configuration
    CONFIG = Dict(yaml.safe_load(open(args.config)))

    # writer (optional, controlled by config)
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    # DataLoaders
    train_data = PASCALVOC(
        CONFIG,
        mode="train",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            RandomFlip(),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )

    # NOTE(review): val pipeline also applies RandomCrop - confirm that is
    # intended for validation rather than a deterministic crop.
    val_data = PASCALVOC(
        CONFIG,
        mode="val",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )

    train_loader = DataLoader(
        train_data,
        batch_size=CONFIG.batch_size,
        shuffle=True,
        num_workers=CONFIG.num_workers,
        drop_last=True
    )

    val_loader = DataLoader(
        val_data,
        batch_size=CONFIG.batch_size,
        shuffle=False,
        num_workers=CONFIG.num_workers
    )

    # load model: plain dilated ResNet, or with dual/channel attention heads
    print('\n------------------------Loading Model------------------------\n')

    if CONFIG.attention == 'dual':
        model = DANet(CONFIG)
        print('Dual Attintion modules will be added to this base model')
    elif CONFIG.attention == 'channel':
        model = CANet(CONFIG)
        print('Channel Attintion modules will be added to this base model')
    else:
        if CONFIG.model == 'drn_d_22':
            print(
                'Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)
        elif CONFIG.model == 'drn_d_38':
            print(
                'Dilated ResNet D 28 w/o Dual Attention modules will be used as a model.')
            model = drn_d_38(pretrained=True, num_classes=CONFIG.n_classes)
        else:
            # unknown model name -> fall back to drn_d_22
            print('There is no option you chose as a model.')
            print(
                'Therefore, Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)

    # set optimizer, lr_scheduler
    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)
    elif CONFIG.optimizer == 'AdaBound':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = adabound.AdaBound(
            model.parameters(),
            lr=CONFIG.learning_rate,
            final_lr=CONFIG.final_lr,
            weight_decay=CONFIG.weight_decay)
    else:
        # unknown optimizer name -> fall back to SGD
        print('There is no optimizer which suits to your option. \
            Instead, SGD will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)

    # learning rate scheduler: only used with SGD (plateau on val loss)
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # send the model to cuda/cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True

    # resume if you want
    begin_epoch = 0
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, scheduler = \
                resume(CONFIG, model, optimizer, scheduler)
            print('training will start from {} epoch'.format(begin_epoch))

    # criterion for loss (255 = VOC "ignore" label)
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(
            weight=get_class_weight().to(device),
            ignore_index=255
        )
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=255)

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    losses_train = []
    losses_val = []
    val_ious = []
    mean_ious = []
    mean_ious_without_bg = []
    best_mean_iou = 0.0

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        # training
        loss_train = train(
            model, train_loader, criterion, optimizer, CONFIG, device)
        losses_train.append(loss_train)

        # validation
        val_iou, loss_val = validation(
            model, val_loader, criterion, CONFIG, device)
        val_ious.append(val_iou)
        losses_val.append(loss_val)
        if CONFIG.optimizer == 'SGD':
            scheduler.step(loss_val)

        # per-class IoU -> mean IoU, with and without the background class
        mean_ious.append(val_ious[-1].mean().item())
        mean_ious_without_bg.append(val_ious[-1][1:].mean().item())

        # save checkpoint every 5 epoch
        if epoch % 5 == 0 and epoch != 0:
            save_checkpoint(CONFIG, epoch, model, optimizer, scheduler)

        # save a model every 50 epoch
        if epoch % 50 == 0 and epoch != 0:
            torch.save(
                model.state_dict(), os.path.join(CONFIG.result_path, 'epoch_{}_model.prm'.format(epoch)))

        # keep the weights with the best mean IoU seen so far
        if best_mean_iou < mean_ious[-1]:
            best_mean_iou = mean_ious[-1]
            torch.save(
                model.state_dict(), os.path.join(CONFIG.result_path, 'best_mean_iou_model.prm'))

        # tensorboardx
        if writer:
            writer.add_scalars(
                "loss", {
                    'loss_train': losses_train[-1],
                    'loss_val': losses_val[-1]}, epoch)
            writer.add_scalar(
                "mean_iou", mean_ious[-1], epoch)
            writer.add_scalar(
                "mean_iou_w/o_bg", mean_ious_without_bg[-1], epoch)

        print(
            'epoch: {}\tloss_train: {:.5f}\tloss_val: {:.5f}\tmean IOU: {:.3f}\tmean IOU w/o bg: {:.3f}'.format(
                epoch, losses_train[-1], losses_val[-1], mean_ious[-1], mean_ious_without_bg[-1])
        )

    torch.save(
        model.state_dict(), os.path.join(CONFIG.result_path, 'final_model.prm'))
class SummaryWorker(multiprocessing.Process):
    """Background process that writes TensorBoard summaries.

    The training process calls the instance (``__call__``) with a summary
    name ('scalar' | 'image' | 'histogram'); if that summary's rate-limit
    timer fires, a CPU copy of the tensors is made (``copy_*``) and queued.
    The worker process drains the queue and renders each item with the
    matching ``summary_*`` method.
    """

    def __init__(self, env):
        super(SummaryWorker, self).__init__()
        self.env = env
        self.config = env.config
        self.queue = multiprocessing.Queue()
        # Each summary type has an optional rate-limit timer; a missing
        # config option disables that summary (timer always returns False).
        try:
            self.timer_scalar = utils.train.Timer(env.config.getfloat('summary', 'scalar'))
        except configparser.NoOptionError:
            self.timer_scalar = lambda: False
        try:
            self.timer_image = utils.train.Timer(env.config.getfloat('summary', 'image'))
        except configparser.NoOptionError:
            self.timer_image = lambda: False
        try:
            self.timer_histogram = utils.train.Timer(env.config.getfloat('summary', 'histogram'))
        except configparser.NoOptionError:
            self.timer_histogram = lambda: False
        # Regex list selecting which parameters get histogram summaries.
        with open(os.path.expanduser(os.path.expandvars(env.config.get('summary_histogram', 'parameters'))), 'r') as f:
            self.histogram_parameters = utils.RegexList([line.rstrip() for line in f])
        self.draw_bbox = utils.visualize.DrawBBox(env.config, env.category)
        self.draw_iou = utils.visualize.DrawIou(env.config)

    def __call__(self, name, **kwargs):
        # Producer side: rate-limit, snapshot tensors to numpy, enqueue.
        if getattr(self, 'timer_' + name)():
            kwargs = getattr(self, 'copy_' + name)(**kwargs)
            self.queue.put((name, kwargs))

    def stop(self):
        # Sentinel that makes run() exit its loop.
        self.queue.put((None, {}))

    def run(self):
        # Consumer side: runs in the child process; owns the SummaryWriter.
        self.writer = SummaryWriter(os.path.join(self.env.model_dir, self.env.args.run))
        while True:
            name, kwargs = self.queue.get()
            if name is None:
                break
            func = getattr(self, 'summary_' + name)
            try:
                func(**kwargs)
            except:
                # keep the worker alive even if one summary fails
                traceback.print_exc()

    def copy_scalar(self, **kwargs):
        # Detach losses from the graph and move to CPU numpy for the queue.
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        loss_total = loss_total.data.clone().cpu().numpy()
        loss = {key: loss[key].data.clone().cpu().numpy() for key in loss}
        loss_hparam = {key: loss_hparam[key].data.clone().cpu().numpy() for key in loss_hparam}
        return dict(
            step=step,
            loss_total=loss_total,
            loss=loss, loss_hparam=loss_hparam,
        )

    def summary_scalar(self, **kwargs):
        # Write per-component losses, optional hyperparameter losses, total.
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        for key in loss:
            self.writer.add_scalar('loss/' + key, loss[key][0], step)
        if self.config.getboolean('summary_scalar', 'loss_hparam'):
            self.writer.add_scalars('loss_hparam', {key: loss_hparam[key][0] for key in loss_hparam}, step)
        self.writer.add_scalar('loss_total', loss_total[0], step)

    def copy_image(self, **kwargs):
        # Snapshot images, ground truth, predictions and the anchor matching
        # mask (positive=1, negative=0, ignore=0.5) as numpy arrays.
        step, height, width, rows, cols, data, pred, debug = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, debug'.split(', '))
        data = {key: data[key].clone().cpu().numpy() for key in 'image, yx_min, yx_max, cls'.split(', ')}
        pred = {key: pred[key].data.clone().cpu().numpy() for key in 'yx_min, yx_max, iou, logits'.split(', ') if key in pred}
        matching = (debug['positive'].float() - debug['negative'].float() + 1) / 2
        matching = matching.data.clone().cpu().numpy()
        return dict(
            step=step, height=height, width=width, rows=rows, cols=cols,
            data=data, pred=pred,
            matching=matching,
        )

    def summary_image(self, **kwargs):
        # Render bbox / IoU visualizations into image grids.
        step, height, width, rows, cols, data, pred, matching = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, matching'.split(', '))
        image = data['image']
        limit = min(self.config.getint('summary_image', 'limit'), image.shape[0])
        image = image[:limit, :, :, :]
        yx_min, yx_max, iou = (pred[key] for key in 'yx_min, yx_max, iou'.split(', '))
        # scale predicted boxes from cell units up to pixel units
        scale = [height / rows, width / cols]
        yx_min, yx_max = (a * scale for a in (yx_min, yx_max))
        if 'logits' in pred:
            cls = np.argmax(F.softmax(torch.autograd.Variable(torch.from_numpy(pred['logits'])), -1).data.cpu().numpy(), -1)
        else:
            cls = np.zeros(iou.shape, np.int)
        if self.config.getboolean('summary_image', 'bbox'):
            # data
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/data')(self.draw_bbox_data)(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')))
            self.writer.add_image('bbox/data', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
            # pred
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/pred')(self.draw_bbox_pred)(canvas, yx_min, yx_max, cls, iou, nms=True)
            self.writer.add_image('bbox/pred', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'iou'):
            # bbox
            canvas = np.copy(image)
            canvas_data = self.draw_bbox_data(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')), colors=['g'])
            # data: one grid per anchor, overlaying the matching heatmap
            for i, canvas in enumerate(pybenchmark.profile('iou/data')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, matching, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/data%d' % i, canvas, step)
            # pred: same, overlaying the predicted IoU heatmap
            for i, canvas in enumerate(pybenchmark.profile('iou/pred')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, iou, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/pred%d' % i, canvas, step)

    def draw_bbox_data(self, canvas, yx_min, yx_max, cls, colors=None):
        # Draw ground-truth boxes per batch element; cls may be one-hot.
        batch_size = len(canvas)
        if len(cls.shape) == len(yx_min.shape):
            cls = np.argmax(cls, -1)
        yx_min, yx_max, cls = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls))
        return [self.draw_bbox(canvas, yx_min.astype(np.int), yx_max.astype(np.int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_pred(self, canvas, yx_min, yx_max, cls, iou, colors=None, nms=False):
        # Draw predicted boxes: threshold by IoU score, optionally apply NMS.
        batch_size = len(canvas)
        mask = iou > self.config.getfloat('detect', 'threshold')
        yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
        cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
        # split the batch dimension into per-image lists, then filter by mask
        yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
        yx_min, yx_max, cls, iou = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls, iou))
        if nms:
            overlap = self.config.getfloat('detect', 'overlap')
            keep = [pybenchmark.profile('nms')(utils.postprocess.nms)(torch.Tensor(iou), torch.Tensor(yx_min), torch.Tensor(yx_max), overlap) if iou.shape[0] > 0 else [] for yx_min, yx_max, iou in zip(yx_min, yx_max, iou)]
            keep = [np.array(k, np.int) for k in keep]
            yx_min, yx_max, cls = ([a[k] for a, k in zip(l, keep)] for l in (yx_min, yx_max, cls))
        return [self.draw_bbox(canvas, yx_min.astype(np.int), yx_max.astype(np.int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_iou(self, canvas_share, yx_min, yx_max, cls, iou, rows, cols, colors=None):
        # For each anchor slot, draw thresholded boxes plus an IoU heatmap on
        # a fresh copy of the shared canvases; returns one canvas list per
        # anchor.
        batch_size = len(canvas_share)
        yx_min, yx_max = ([np.squeeze(a, -2) for a in np.split(a, a.shape[-2], -2)] for a in (yx_min, yx_max))
        cls, iou = ([np.squeeze(a, -1) for a in np.split(a, a.shape[-1], -1)] for a in (cls, iou))
        results = []
        for i, (yx_min, yx_max, cls, iou) in enumerate(zip(yx_min, yx_max, cls, iou)):
            mask = iou > self.config.getfloat('detect', 'threshold')
            yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
            cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
            yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
            yx_min, yx_max, cls = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls))
            canvas = [self.draw_bbox(canvas, yx_min.astype(np.int), yx_max.astype(np.int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(np.copy(canvas_share), yx_min, yx_max, cls)]
            iou = [np.reshape(a, [rows, cols]) for a in iou]
            canvas = [self.draw_iou(_canvas, iou) for _canvas, iou in zip(canvas, iou)]
            results.append(canvas)
        return results

    def copy_histogram(self, **kwargs):
        # Tensors are copied to numpy; non-tensors (e.g. the dnn module
        # reference) are passed through untouched.
        return {key: kwargs[key].data.clone().cpu().numpy() if torch.is_tensor(kwargs[key]) else kwargs[key] for key in 'step, dnn'.split(', ')}

    def summary_histogram(self, **kwargs):
        # Histogram only the parameters whitelisted by the regex list.
        step, dnn = (kwargs[key] for key in 'step, dnn'.split(', '))
        for name, param in dnn.named_parameters():
            if self.histogram_parameters(name):
                self.writer.add_histogram(name, param, step)
def epoch_train(model, dataloader, dataset, criterion, optimizer, scheduler, device, data_const): print('epoch training...') # set visualization and create folder to save checkpoints writer = SummaryWriter(log_dir=args.log_dir + '/' + args.exp_ver + '/' + 'epoch_train') io.mkdir_if_not_exists(os.path.join(args.save_dir, args.exp_ver, 'epoch_train'), recursive=True) for epoch in range(args.start_epoch, args.epoch): # each epoch has a training and validation step epoch_loss = 0 for phase in ['train', 'val']: start_time = time.time() running_loss = 0.0 idx = 0 HicoDataset.data_sample_count=0 for data in tqdm(dataloader[phase]): train_data = data img_name = train_data['img_name'] det_boxes = train_data['det_boxes'] roi_labels = train_data['roi_labels'] roi_scores = train_data['roi_scores'] node_num = train_data['node_num'] edge_labels = train_data['edge_labels'] edge_num = train_data['edge_num'] features = train_data['features'] spatial_feat = train_data['spatial_feat'] word2vec = train_data['word2vec'] features, spatial_feat, word2vec, edge_labels = features.to(device), spatial_feat.to(device), word2vec.to(device), edge_labels.to(device) if idx == 10: break if phase == 'train': model.train() model.zero_grad() outputs = model(node_num, features, spatial_feat, word2vec, roi_labels) loss = criterion(outputs, edge_labels.float()) # import ipdb; ipdb.set_trace() loss.backward() optimizer.step() else: model.eval() # turn off the gradients for validation, save memory and computations with torch.no_grad(): outputs = model(node_num, features, spatial_feat, word2vec, roi_labels, validation=True) loss = criterion(outputs, edge_labels.float()) # print result every 1000 iteration during validation if idx==0 or idx % round(1000/args.batch_size)==round(1000/args.batch_size)-1: # ipdb.set_trace() image = Image.open(os.path.join(args.img_data, img_name[0])).convert('RGB') image_temp = image.copy() raw_outputs = nn.Sigmoid()(outputs[0:int(edge_num[0])]) raw_outputs = 
raw_outputs.cpu().detach().numpy() # class_img = vis_img(image, det_boxes, roi_labels, roi_scores) class_img = vis_img(image, det_boxes[0], roi_labels[0], roi_scores[0], edge_labels[0:int(edge_num[0])].cpu().numpy(), score_thresh=0.7) action_img = vis_img(image_temp, det_boxes[0], roi_labels[0], roi_scores[0], raw_outputs, score_thresh=0.7) writer.add_image('gt_detection', np.array(class_img).transpose(2,0,1)) writer.add_image('action_detection', np.array(action_img).transpose(2,0,1)) writer.add_text('img_name', img_name[0], epoch) idx+=1 # accumulate loss of each batch running_loss += loss.item() * edge_labels.shape[0] # calculate the loss and accuracy of each epoch epoch_loss = running_loss / len(dataset[phase]) # import ipdb; ipdb.set_trace() # log trainval datas, and visualize them in the same graph if phase == 'train': train_loss = epoch_loss HicoDataset.displaycount() else: writer.add_scalars('trainval_loss_epoch', {'train': train_loss, 'val': epoch_loss}, epoch) # print data if (epoch % args.print_every) == 0: end_time = time.time() print("[{}] Epoch: {}/{} Loss: {} Execution time: {}".format(\ phase, epoch+1, args.epoch, epoch_loss, (end_time-start_time))) # scheduler.step() # save model if epoch_loss<0.0405 or epoch % args.save_every == (args.save_every - 1) and epoch >= (200-1): checkpoint = { 'lr': args.lr, 'b_s': args.batch_size, 'bias': args.bias, 'bn': args.bn, 'dropout': args.drop_prob, 'layers': args.layers, 'feat_type': args.feat_type, 'multi_head': args.multi_attn, 'diff_edge': args.diff_edge, 'state_dict': model.state_dict() } save_name = "checkpoint_" + str(epoch+1) + '_epoch.pth' torch.save(checkpoint, os.path.join(args.save_dir, args.exp_ver, 'epoch_train', save_name)) writer.close() print('Finishing training!')
def main():
    """Entry point: build datasets, model and losses from Config, then train and test.

    Flow: parse Config -> optionally redirect stdout/stderr to files ->
    create train/test datasets -> build model + triplet/id losses + Adam ->
    optionally resume -> either test-only, or train for cfg.total_epochs
    (logging per-step and per-epoch metrics, TensorBoard scalars, and
    checkpoints) and finally evaluate on the test sets.
    """
    cfg = Config()

    # Redirect logs to both console and file.
    if cfg.log_to_file:
        ReDirectSTD(cfg.stdout_file, 'stdout', False)
        ReDirectSTD(cfg.stderr_file, 'stderr', False)

    # Lazily create SummaryWriter
    writer = None

    # TVT: transfer tensors to the chosen device; TMO: transfer modules/optims.
    TVT, TMO = set_devices(cfg.sys_device_ids)

    if cfg.seed is not None:
        set_seed(cfg.seed)

    # Dump the configurations to log.
    import pprint
    print('-' * 60)
    print('cfg.__dict__')
    pprint.pprint(cfg.__dict__)
    print('-' * 60)

    ###########
    # Dataset #
    ###########

    train_set = create_dataset(**cfg.train_set_kwargs)

    test_sets = []
    test_set_names = []
    # 'combined' evaluates on all three re-id benchmarks; otherwise just one.
    if cfg.dataset == 'combined':
        for name in ['market1501', 'cuhk03', 'duke']:
            cfg.test_set_kwargs['name'] = name
            test_sets.append(create_dataset(**cfg.test_set_kwargs))
            test_set_names.append(name)
    else:
        test_sets.append(create_dataset(**cfg.test_set_kwargs))
        test_set_names.append(cfg.dataset)

    ###########
    # Models  #
    ###########

    model = Model(local_conv_out_channels=cfg.local_conv_out_channels,
                  num_classes=len(train_set.ids2labels))
    # Model wrapper
    model_w = DataParallel(model)

    #############################
    # Criteria and Optimizers   #
    #############################

    id_criterion = nn.CrossEntropyLoss()
    g_tri_loss = TripletLoss(margin=cfg.global_margin)
    l_tri_loss = TripletLoss(margin=cfg.local_margin)

    optimizer = optim.Adam(model.parameters(),
                           lr=cfg.base_lr,
                           weight_decay=cfg.weight_decay)

    # Bind them together just to save some codes in the following usage.
    modules_optims = [model, optimizer]

    ################################
    # May Resume Models and Optims #
    ################################

    if cfg.resume:
        resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file)

    # May Transfer Models and Optims to Specified Device. Transferring optimizer
    # is to cope with the case when you load the checkpoint to a new device.
    TMO(modules_optims)

    ########
    # Test #
    ########

    def test(load_model_weight=False):
        # Optionally reload weights: a standalone weight file takes precedence
        # over the training checkpoint.
        if load_model_weight:
            if cfg.model_weight_file != '':
                map_location = (lambda storage, loc: storage)
                sd = torch.load(cfg.model_weight_file, map_location=map_location)
                load_state_dict(model, sd)
                print('Loaded model weights from {}'.format(cfg.model_weight_file))
            else:
                load_ckpt(modules_optims, cfg.ckpt_file)

        use_local_distance = (cfg.l_loss_weight > 0) \
                             and cfg.local_dist_own_hard_sample

        for test_set, name in zip(test_sets, test_set_names):
            test_set.set_feat_func(ExtractFeature(model_w, TVT))
            print('\n=========> Test on dataset: {} <=========\n'.format(name))
            test_set.eval(
                normalize_feat=cfg.normalize_feature,
                use_local_distance=use_local_distance)

    if cfg.only_test:
        test(load_model_weight=True)
        return

    ############
    # Training #
    ############

    start_ep = resume_ep if cfg.resume else 0
    for ep in range(start_ep, cfg.total_epochs):

        # Adjust Learning Rate
        if cfg.lr_decay_type == 'exp':
            adjust_lr_exp(
                optimizer,
                cfg.base_lr,
                ep + 1,
                cfg.total_epochs,
                cfg.exp_decay_at_epoch)
        else:
            adjust_lr_staircase(
                optimizer,
                cfg.base_lr,
                ep + 1,
                cfg.staircase_decay_at_epochs,
                cfg.staircase_decay_multiply_factor)

        may_set_mode(modules_optims, 'train')

        # Running averages for this epoch: g_* global triplet, l_* local
        # triplet, id_* identification loss, plus the combined loss.
        g_prec_meter = AverageMeter()
        g_m_meter = AverageMeter()
        g_dist_ap_meter = AverageMeter()
        g_dist_an_meter = AverageMeter()
        g_loss_meter = AverageMeter()

        l_prec_meter = AverageMeter()
        l_m_meter = AverageMeter()
        l_dist_ap_meter = AverageMeter()
        l_dist_an_meter = AverageMeter()
        l_loss_meter = AverageMeter()

        id_loss_meter = AverageMeter()

        loss_meter = AverageMeter()

        ep_st = time.time()
        step = 0
        epoch_done = False
        while not epoch_done:

            step += 1
            step_st = time.time()

            ims, im_names, labels, mirrored, epoch_done = train_set.next_batch()

            ims_var = Variable(TVT(torch.from_numpy(ims).float()))
            labels_t = TVT(torch.from_numpy(labels).long())
            labels_var = Variable(labels_t)

            global_feat, local_feat, logits = model_w(ims_var)

            # Global triplet loss also yields the hard positive/negative
            # indices (p_inds, n_inds) reused by the local loss below.
            g_loss, p_inds, n_inds, g_dist_ap, g_dist_an, g_dist_mat = global_loss(
                g_tri_loss, global_feat, labels_t,
                normalize_feature=cfg.normalize_feature)

            if cfg.l_loss_weight == 0:
                l_loss = 0
            elif cfg.local_dist_own_hard_sample:
                # Let local distance find its own hard samples.
                l_loss, l_dist_ap, l_dist_an, _ = local_loss(
                    l_tri_loss, local_feat, None, None,
                    labels_t,
                    normalize_feature=cfg.normalize_feature)
            else:
                l_loss, l_dist_ap, l_dist_an = local_loss(
                    l_tri_loss, local_feat, p_inds, n_inds,
                    labels_t,
                    normalize_feature=cfg.normalize_feature)

            id_loss = 0
            if cfg.id_loss_weight > 0:
                id_loss = id_criterion(logits, labels_var)

            # Weighted sum of the three objectives.
            loss = g_loss * cfg.g_loss_weight \
                   + l_loss * cfg.l_loss_weight \
                   + id_loss * cfg.id_loss_weight

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ############
            # Step Log #
            ############

            # precision
            g_prec = (g_dist_an > g_dist_ap).data.float().mean()
            # the proportion of triplets that satisfy margin
            g_m = (g_dist_an > g_dist_ap + cfg.global_margin).data.float().mean()
            g_d_ap = g_dist_ap.data.mean()
            g_d_an = g_dist_an.data.mean()

            g_prec_meter.update(g_prec)
            g_m_meter.update(g_m)
            g_dist_ap_meter.update(g_d_ap)
            g_dist_an_meter.update(g_d_an)
            g_loss_meter.update(to_scalar(g_loss))

            if cfg.l_loss_weight > 0:
                # precision
                l_prec = (l_dist_an > l_dist_ap).data.float().mean()
                # the proportion of triplets that satisfy margin
                l_m = (l_dist_an > l_dist_ap + cfg.local_margin).data.float().mean()
                l_d_ap = l_dist_ap.data.mean()
                l_d_an = l_dist_an.data.mean()

                l_prec_meter.update(l_prec)
                l_m_meter.update(l_m)
                l_dist_ap_meter.update(l_d_ap)
                l_dist_an_meter.update(l_d_an)
                l_loss_meter.update(to_scalar(l_loss))

            if cfg.id_loss_weight > 0:
                id_loss_meter.update(to_scalar(id_loss))

            loss_meter.update(to_scalar(loss))

            if step % cfg.log_steps == 0:
                time_log = '\tStep {}/Ep {}, {:.2f}s'.format(
                    step, ep + 1, time.time() - step_st, )

                if cfg.g_loss_weight > 0:
                    g_log = (', gp {:.2%}, gm {:.2%}, '
                             'gd_ap {:.4f}, gd_an {:.4f}, '
                             'gL {:.4f}'.format(
                        g_prec_meter.val, g_m_meter.val,
                        g_dist_ap_meter.val, g_dist_an_meter.val,
                        g_loss_meter.val, ))
                else:
                    g_log = ''

                if cfg.l_loss_weight > 0:
                    l_log = (', lp {:.2%}, lm {:.2%}, '
                             'ld_ap {:.4f}, ld_an {:.4f}, '
                             'lL {:.4f}'.format(
                        l_prec_meter.val, l_m_meter.val,
                        l_dist_ap_meter.val, l_dist_an_meter.val,
                        l_loss_meter.val, ))
                else:
                    l_log = ''

                if cfg.id_loss_weight > 0:
                    id_log = (', idL {:.4f}'.format(id_loss_meter.val))
                else:
                    id_log = ''

                total_loss_log = ', loss {:.4f}'.format(loss_meter.val)

                log = time_log + \
                      g_log + l_log + id_log + \
                      total_loss_log
                print(log)

        #############
        # Epoch Log #
        #############

        time_log = 'Ep {}, {:.2f}s'.format(ep + 1, time.time() - ep_st, )

        if cfg.g_loss_weight > 0:
            g_log = (', gp {:.2%}, gm {:.2%}, '
                     'gd_ap {:.4f}, gd_an {:.4f}, '
                     'gL {:.4f}'.format(
                g_prec_meter.avg, g_m_meter.avg,
                g_dist_ap_meter.avg, g_dist_an_meter.avg,
                g_loss_meter.avg, ))
        else:
            g_log = ''

        if cfg.l_loss_weight > 0:
            l_log = (', lp {:.2%}, lm {:.2%}, '
                     'ld_ap {:.4f}, ld_an {:.4f}, '
                     'lL {:.4f}'.format(
                l_prec_meter.avg, l_m_meter.avg,
                l_dist_ap_meter.avg, l_dist_an_meter.avg,
                l_loss_meter.avg, ))
        else:
            l_log = ''

        if cfg.id_loss_weight > 0:
            id_log = (', idL {:.4f}'.format(id_loss_meter.avg))
        else:
            id_log = ''

        total_loss_log = ', loss {:.4f}'.format(loss_meter.avg)

        log = time_log + \
              g_log + l_log + id_log + \
              total_loss_log
        print(log)

        # Log to TensorBoard
        if cfg.log_to_file:
            if writer is None:
                writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard'))
            writer.add_scalars(
                'loss',
                dict(global_loss=g_loss_meter.avg,
                     local_loss=l_loss_meter.avg,
                     id_loss=id_loss_meter.avg,
                     loss=loss_meter.avg, ),
                ep)
            writer.add_scalars(
                'tri_precision',
                dict(global_precision=g_prec_meter.avg,
                     local_precision=l_prec_meter.avg, ),
                ep)
            writer.add_scalars(
                'satisfy_margin',
                dict(global_satisfy_margin=g_m_meter.avg,
                     local_satisfy_margin=l_m_meter.avg, ),
                ep)
            writer.add_scalars(
                'global_dist',
                dict(global_dist_ap=g_dist_ap_meter.avg,
                     global_dist_an=g_dist_an_meter.avg, ),
                ep)
            writer.add_scalars(
                'local_dist',
                dict(local_dist_ap=l_dist_ap_meter.avg,
                     local_dist_an=l_dist_an_meter.avg, ),
                ep)

        # save ckpt
        if cfg.log_to_file:
            save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file)

    ########
    # Test #
    ########
    test(load_model_weight=False)
class BaseTrainer:
    """Base class for all trainers.

    Handles device selection, optimizer construction, per-epoch logging,
    checkpoint saving/resuming and TensorBoard summaries. Subclasses must
    implement `_train_epoch` and return a result dict containing at least
    'loss', 'hmean' and the configured monitor metric.
    """

    def __init__(self, model, loss, resume, config, train_logger=None):
        """
        :param model: model exposing parallelize()/optimize()/to()/state_dict()
        :param loss: loss object, kept for subclasses
        :param resume: checkpoint path to resume from, or falsy for a fresh run
        :param config: dict-like configuration ('name', 'cuda', 'gpus',
            'optimizer_type', 'optimizer', 'trainer': {...})
        :param train_logger: optional structured logger receiving one entry per epoch
        """
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.model = model
        self.loss = loss
        self.name = config['name']
        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.summary_writer = SummaryWriter()
        # check cuda available: use GPU only when present AND enabled in config
        if torch.cuda.is_available():
            if config['cuda']:
                self.with_cuda = True
                self.gpus = {
                    i: item for i, item in enumerate(self.config['gpus'])
                }
                device = 'cuda'
                # Parallelize only when several GPUs are both present and configured.
                if torch.cuda.device_count() > 1 and len(self.gpus) > 1:
                    self.model.parallelize()
                torch.cuda.empty_cache()
            else:
                self.with_cuda = False
                device = 'cpu'
        else:
            self.logger.warning(
                'Warning: There\'s no CUDA support on this machine, training is performed on CPU.'
            )
            self.with_cuda = False
            device = 'cpu'

        self.device = torch.device(device)
        self.model.to(self.device)

        # log
        self.logger.debug('Model is initialized.')
        self._log_memory_useage()

        self.train_logger = train_logger

        # optimizer: the model builds it from the config section
        self.optimizer = self.model.optimize(config['optimizer_type'],
                                             config['optimizer'])

        # train monitor: which metric decides the "best" checkpoint, and how
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf

        # checkpoint path
        self.start_epoch = 1
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           self.name)
        make_dir(self.checkpoint_dir)
        if resume:
            self._resume_checkpoint(resume)

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def train(self):
        """
        Full training logic: run every epoch, log results, save the best model
        per the configured monitor metric plus periodic checkpoints.
        """
        print('Total epochs: {}'.format(self.epochs))
        for epoch in range(self.start_epoch, self.epochs + 1):
            try:
                result = self._train_epoch(epoch)
            except torch.cuda.CudaError:
                # BUGFIX: this handler used to swallow the error and fall
                # through to `result.items()` with `result` unbound, masking
                # the real failure behind a NameError. Log GPU memory for
                # diagnosis, then let the CUDA error propagate.
                self._log_memory_useage()
                raise

            log = {'epoch': epoch}
            for key, value in result.items():
                log[key] = value

            # log info
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    for key, value in log.items():
                        self.logger.info(' {:15s}: {}'.format(
                            str(key), value))

            # save checkpoints: new best per the monitored metric ...
            if (self.monitor_mode == 'min' and log[self.monitor] < self.monitor_best) \
                    or (self.monitor_mode == 'max' and log[self.monitor] > self.monitor_best):
                self.monitor_best = log[self.monitor]
                self._save_checkpoint(epoch, log, save_best=True)
            # ... and a periodic full snapshot every save_freq epochs
            if epoch % self.save_freq == 0:
                self._save_checkpoint(epoch, log)

            self.summary_writer.add_scalars('HMEAN', {'hmean': result['hmean']}, epoch)
            self.summary_writer.add_scalars('LOSS', {'train_loss': result['loss']}, epoch)
        self.summary_writer.close()

    def _log_memory_useage(self):
        """Debug-log allocated/cached memory per configured GPU; no-op on CPU runs."""
        if not self.with_cuda:
            return

        template = """Memory Usage: \n{}"""
        usage = []
        for deviceID, device in self.gpus.items():
            deviceID = int(deviceID)
            allocated = torch.cuda.memory_allocated(deviceID) / (1024 * 1024)
            # NOTE(review): memory_cached is deprecated in newer PyTorch in
            # favor of memory_reserved; kept for compatibility with the
            # version this project pins.
            cached = torch.cuda.memory_cached(deviceID) / (1024 * 1024)
            usage.append(
                ' CUDA: {} Allocated: {} MB Cached: {} MB \n'.format(
                    device, allocated, cached))

        content = ''.join(usage)
        content = template.format(content)
        self.logger.debug(content)

    def _save_checkpoint(self, epoch, log, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param log: logging information of the epoch
        :param save_best: if True, rename the saved checkpoint to 'model_best.pth.tar'
        """
        arch = type(self.model).__name__
        if save_best:
            # Best snapshot keeps only weights; overwritten whenever a new best appears.
            state = {
                'arch': arch,
                'epoch': epoch,
                'state_dict': self.model.state_dict()
            }
            filename = os.path.join(self.checkpoint_dir, 'model_best.pth.tar')
            torch.save(state, filename)
            self.logger.info(
                "Saving current best: {} ...".format('model_best.pth.tar'))
        else:
            # Full snapshot (weights + optimizer + logger) enables exact resume.
            state = {
                'arch': arch,
                'epoch': epoch,
                'logger': self.train_logger,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'monitor_best': self.monitor_best
            }
            filename = os.path.join(
                self.checkpoint_dir,
                'checkpoint-epoch{:03d}-loss-{:.4f}.pth.tar'.format(
                    epoch, log['loss']))
            torch.save(state, filename)
            self.logger.info("Saving checkpoint: {} ...".format(filename))

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        self.start_epoch = checkpoint['epoch'] + 1
        self.monitor_best = checkpoint['monitor_best']
        self.model.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        if self.with_cuda:
            # Optimizer state tensors were saved from whatever device was in
            # use; move them back onto the GPU after loading.
            for state in self.optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda(torch.device('cuda'))
        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(
            resume_path, self.start_epoch))