Exemplo n.º 1
0
        rotation_action = gt.select_orientation([state[-1]])
        action = gt.select_action_active(frame, pos[0:2])
        # print('action : ', action, 'rotation : ', rotation_action)
        action = np.append(action, rotation_action)
        # print('action : ', action)
        reward, next_state, done, next_frame, next_pos, save_orientation = env.step(action)
        # print('pos : ', pos.shape, next_pos.shape)
        # print(frame.shape)
        # print(episode_reward, reward)
        episode_reward[0] += reward[0]
        episode_reward[1] += reward[1]
        # print(reward)
        # time.sleep(1)
        gt.buffer.add((state, action[0:2], rotation_action, next_state, reward, done, frame, next_frame, pos, next_pos))
        if k > 32 or done == 1:
            gt.learn_active()
            k = 0
        if done == 1:
            break
        state = next_state
        frame = next_frame
        pos = next_pos
    gt.buffer.clear()

    if t % 10 == 0:
        print('in epoch ' + str(t) + '  episode reward : ', episode_reward)
    if t % 200 == 199:
        gt.save_active_model('active_policy_' + str(t) + '.para', 'active_value_' + str(t) + '.para')
    # writer.add_scalar('episode_reward', episode_reward, steps)
    writer.add_scalars('episode_reward', {'distance reward': episode_reward[0]}, steps)
class VPoserTrainer:
    """Trainer for the VPoser variational pose prior.

    Wires up train/validation/test dataloaders, the VPoser model, an Adam
    optimizer and tensorboard logging, then drives the epoch loop through
    :meth:`perform_training`.
    """

    def __init__(self, work_dir, ps):
        """Set up logging, data, model, optimizer and visualization fixtures.

        :param work_dir: directory where logs, summaries and snapshots go.
        :param ps: configuration object (expr_code, dataset_dir, batch_size,
            base_lr, kl_coef, ...). Mutated in place: ``work_dir``,
            ``data_shape`` and later ``best_model_fname`` are written onto it.
        """
        from tensorboardX import SummaryWriter

        from human_body_prior.data.dataloader import VPoserDS

        self.pt_dtype = torch.float64 if ps.fp_precision == '64' else torch.float32

        torch.manual_seed(ps.seed)

        ps.work_dir = makepath(work_dir, isfile=False)

        logger = log2file(os.path.join(work_dir, '%s.log' % ps.expr_code))

        summary_logdir = os.path.join(work_dir, 'summaries')
        self.swriter = SummaryWriter(log_dir=summary_logdir)
        logger('tensorboard --logdir=%s' % summary_logdir)
        logger('Torch Version: %s\n' % torch.__version__)

        # Archive this training script next to its outputs for reproducibility.
        shutil.copy2(os.path.realpath(__file__), work_dir)

        use_cuda = torch.cuda.is_available()
        if use_cuda: torch.cuda.empty_cache()
        self.comp_device = torch.device("cuda:%d" % ps.cuda_id if torch.cuda.is_available() else "cpu")

        logger('%d CUDAs available!' % torch.cuda.device_count())

        gpu_brand = torch.cuda.get_device_name(ps.cuda_id) if use_cuda else None
        logger('Training with %s [%s]' % (self.comp_device, gpu_brand) if use_cuda else 'Training on CPU!!!')
        logger('Base dataset_dir is %s' % ps.dataset_dir)

        kwargs = {'num_workers': ps.n_workers}
        ds_train = VPoserDS(dataset_dir=os.path.join(ps.dataset_dir, 'train'))
        self.ds_train = DataLoader(ds_train, batch_size=ps.batch_size, shuffle=True, drop_last=True, **kwargs)
        ds_val = VPoserDS(dataset_dir=os.path.join(ps.dataset_dir, 'vald'))
        self.ds_val = DataLoader(ds_val, batch_size=ps.batch_size, shuffle=True, drop_last=True, **kwargs)
        ds_test = VPoserDS(dataset_dir=os.path.join(ps.dataset_dir, 'test'))
        self.ds_test = DataLoader(ds_test, batch_size=ps.batch_size, shuffle=True, drop_last=True, **kwargs)
        logger('Train dataset size %.2f M' % (len(self.ds_train.dataset) * 1e-6))
        logger('Validation dataset size %d' % len(self.ds_val.dataset))
        logger('Test dataset size %d' % len(self.ds_test.dataset))

        # Input shape is taken from one validation sample's axis-angle pose.
        ps.data_shape = list(ds_val[0]['pose_aa'].shape)
        self.vposer_model = VPoser(num_neurons=ps.num_neurons, latentD=ps.latentD, data_shape=ps.data_shape,
                                   use_cont_repr=ps.use_cont_repr)

        if ps.use_multigpu:
            self.vposer_model = nn.DataParallel(self.vposer_model)

        self.vposer_model.to(self.comp_device)

        varlist = [var[1] for var in self.vposer_model.named_parameters()]

        params_count = sum(p.numel() for p in varlist if p.requires_grad)
        logger('Total Trainable Parameters Count is %2.2f M.' % ((params_count) * 1e-6))

        self.optimizer = optim.Adam(varlist, lr=ps.base_lr, weight_decay=ps.reg_coef)

        self.logger = logger
        self.best_loss_total = np.inf
        self.try_num = ps.try_num
        self.epochs_completed = 0
        self.ps = ps

        if ps.best_model_fname is not None:
            # Unwrap DataParallel before loading so the state-dict keys match.
            if isinstance(self.vposer_model, torch.nn.DataParallel):
                self.vposer_model.module.load_state_dict(
                    torch.load(ps.best_model_fname, map_location=self.comp_device))
            else:
                self.vposer_model.load_state_dict(torch.load(ps.best_model_fname, map_location=self.comp_device))

            logger('Restored model from %s' % ps.best_model_fname)

        # Pick a fixed random subset of validation bodies to visualize each
        # time a new best model is saved.
        chose_ids = np.random.choice(list(range(len(ds_val))), size=ps.num_bodies_to_display, replace=False, p=None)
        data_all = {}
        for sid in chose_ids:  # renamed from `id` to avoid shadowing the builtin
            for k, v in ds_val[sid].items():
                if k in data_all.keys():
                    data_all[k] = torch.cat([data_all[k], v[np.newaxis]], dim=0)
                else:
                    data_all[k] = v[np.newaxis]

        self.vis_dorig = {k: data_all[k].to(self.comp_device) for k in data_all.keys()}

        self.bm = BodyModel(self.ps.bm_path, 'smplh', batch_size=self.ps.batch_size, use_posedirs=True).to(self.comp_device)

    def train(self):
        """Run one training epoch; return losses averaged over the epoch's batches."""
        self.vposer_model.train()
        save_every_it = len(self.ds_train) / self.ps.log_every_epoch
        train_loss_dict = {}
        for it, dorig in enumerate(self.ds_train):
            dorig = {k: dorig[k].to(self.comp_device) for k in dorig.keys()}

            self.optimizer.zero_grad()
            drec = self.vposer_model(dorig['pose_aa'], output_type='aa')
            loss_total, cur_loss_dict = self.compute_loss(dorig, drec)
            loss_total.backward()
            self.optimizer.step()

            # Running sums of every loss term across the epoch.
            train_loss_dict = {k: train_loss_dict.get(k, 0.0) + v.item() for k, v in cur_loss_dict.items()}
            # NOTE(review): save_every_it is a float, so this is a float modulus;
            # logging only fires when the remainder is exactly 0 (e.g. it == 0).
            if it % (save_every_it + 1) == 0:
                cur_train_loss_dict = {k: v / (it + 1) for k, v in train_loss_dict.items()}
                train_msg = VPoserTrainer.creat_loss_message(cur_train_loss_dict, expr_code=self.ps.expr_code,
                                                             epoch_num=self.epochs_completed, it=it,
                                                             try_num=self.try_num, mode='train')

                self.logger(train_msg)
                self.swriter.add_histogram('q_z_sample', c2c(drec['mean']), it)

        train_loss_dict = {k: v / len(self.ds_train) for k, v in train_loss_dict.items()}
        return train_loss_dict

    def evaluate(self, split_name='vald'):
        """Evaluate on the 'vald' or 'test' split; return batch-averaged losses."""
        self.vposer_model.eval()
        eval_loss_dict = {}
        data = self.ds_val if split_name == 'vald' else self.ds_test
        with torch.no_grad():
            for dorig in data:
                dorig = {k: dorig[k].to(self.comp_device) for k in dorig.keys()}
                drec = self.vposer_model(dorig['pose_aa'], output_type='aa')
                _, cur_loss_dict = self.compute_loss(dorig, drec)
                eval_loss_dict = {k: eval_loss_dict.get(k, 0.0) + v.item() for k, v in cur_loss_dict.items()}

        eval_loss_dict = {k: v / len(data) for k, v in eval_loss_dict.items()}
        return eval_loss_dict

    def compute_loss(self, dorig, drec):
        """Compute the VAE loss terms for one batch.

        :param dorig: batch dict with the original 'pose_aa' poses.
        :param drec: model output dict with 'pose_aa', 'mean' and 'std'.
        :return: (total loss tensor, dict of individual loss terms incl. 'loss_total').
        """
        q_z = torch.distributions.normal.Normal(drec['mean'], drec['std'])

        prec = drec['pose_aa']
        porig = dorig['pose_aa']

        device = dorig['pose_aa'].device
        dtype = dorig['pose_aa'].dtype

        # Scale vertices from meters to millimeters so the L1 term is well-ranged.
        MESH_SCALER = 1000

        # Reconstruction loss - L1 on the output mesh
        mesh_orig = self.bm(pose_body=porig.view(self.ps.batch_size, -1)).v * MESH_SCALER
        mesh_rec = self.bm(pose_body=prec.view(self.ps.batch_size, -1)).v * MESH_SCALER
        loss_mesh_rec = (1. - self.ps.kl_coef) * torch.mean(torch.abs(mesh_orig - mesh_rec))

        # KL divergence between the posterior q(z|x) and a standard normal prior.
        p_z = torch.distributions.normal.Normal(
            loc=torch.tensor(np.zeros([self.ps.batch_size, self.ps.latentD]), requires_grad=False).to(device).type(dtype),
            scale=torch.tensor(np.ones([self.ps.batch_size, self.ps.latentD]), requires_grad=False).to(device).type(dtype))
        loss_kl = self.ps.kl_coef * torch.mean(torch.sum(torch.distributions.kl.kl_divergence(q_z, p_z), dim=[1]))

        loss_dict = {'loss_kl': loss_kl,
                     'loss_mesh_rec': loss_mesh_rec,
                     }

        # Extra pose-space L2 term only early in training (first 10 epochs)
        # to bootstrap the reconstruction.
        if self.vposer_model.training and self.epochs_completed < 10:
            loss_dict['loss_pose_rec'] = (1. - self.ps.kl_coef) * torch.mean(torch.sum(torch.pow(porig - prec, 2), dim=[1, 2, 3]))

        loss_total = torch.stack(list(loss_dict.values())).sum()
        loss_dict['loss_total'] = loss_total

        return loss_total, loss_dict

    def perform_training(self, num_epochs=None, message=None):
        """Main training loop: train, evaluate, snapshot best model, log.

        :param num_epochs: override for ps.num_epochs; defaults to it.
        :param message: unused; kept for interface compatibility.
        """
        starttime = datetime.now().replace(microsecond=0)
        if num_epochs is None: num_epochs = self.ps.num_epochs

        self.logger(
            'Started Training at %s for %d epochs' % (datetime.strftime(starttime, '%Y-%m-%d_%H:%M:%S'), num_epochs))

        vis_bm = BodyModel(self.ps.bm_path, 'smplh', num_betas=16).to(self.comp_device)
        prev_lr = np.inf
        # Halve the LR three times over the run.
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=int(num_epochs // 3), gamma=0.5)
        for epoch_num in range(1, num_epochs + 1):
            # NOTE(review): stepping the scheduler before optimizer.step() is
            # deprecated in PyTorch >= 1.1; kept as-is to preserve the original
            # LR schedule -- confirm before reordering.
            scheduler.step()
            cur_lr = self.optimizer.param_groups[0]['lr']
            if cur_lr != prev_lr:
                self.logger('--- Optimizer learning rate changed from %.2e to %.2e ---' % (prev_lr, cur_lr))
                prev_lr = cur_lr
            self.epochs_completed += 1
            train_loss_dict = self.train()
            eval_loss_dict = self.evaluate()

            with torch.no_grad():
                eval_msg = VPoserTrainer.creat_loss_message(eval_loss_dict, expr_code=self.ps.expr_code,
                                                            epoch_num=self.epochs_completed, it=len(self.ds_val),
                                                            try_num=self.try_num, mode='evald')
                if eval_loss_dict['loss_total'] < self.best_loss_total:
                    # New best model: snapshot weights and render comparison images.
                    self.ps.best_model_fname = makepath(os.path.join(self.ps.work_dir, 'snapshots', 'TR%02d_E%03d.pt' % (
                    self.try_num, self.epochs_completed)), isfile=True)
                    self.logger(eval_msg + ' ** ')
                    self.best_loss_total = eval_loss_dict['loss_total']
                    torch.save(self.vposer_model.module.state_dict() if isinstance(self.vposer_model, torch.nn.DataParallel) else self.vposer_model.state_dict(), self.ps.best_model_fname)

                    imgname = '[%s]_TR%02d_E%03d.png' % (self.ps.expr_code, self.try_num, self.epochs_completed)
                    imgpath = os.path.join(self.ps.work_dir, 'images', imgname)
                    # Visualization is best-effort; never abort training over it.
                    # (Narrowed from a bare `except:` so Ctrl-C still works.)
                    try:
                        VPoserTrainer.vis_results(self.vis_dorig, self.vposer_model, bm=vis_bm, imgpath=imgpath)
                    except Exception:
                        print('The visualization failed.')
                else:
                    self.logger(eval_msg)

                self.swriter.add_scalars('total_loss/scalars', {'train_loss_total': train_loss_dict['loss_total'],
                                                                'evald_loss_total': eval_loss_dict['loss_total'], },
                                         self.epochs_completed)

        endtime = datetime.now().replace(microsecond=0)

        self.logger('Finished Training at %s\n' % (datetime.strftime(endtime, '%Y-%m-%d_%H:%M:%S')))
        self.logger(
            'Training done in %s! Best val total loss achieved: %.2e\n' % (endtime - starttime, self.best_loss_total))
        self.logger('Best model path: %s\n' % self.ps.best_model_fname)

    @staticmethod
    def creat_loss_message(loss_dict, expr_code='XX', epoch_num=0, it=0, try_num=0, mode='evald'):
        """Format a one-line progress message from a loss dict.

        (Name keeps the original misspelling for caller compatibility.)
        """
        ext_msg = ' | '.join(['%s = %.2e' % (k, v) for k, v in loss_dict.items() if k != 'loss_total'])
        return '[%s]_TR%02d_E%03d - It %05d - %s: [T:%.2e] - [%s]' % (
        expr_code, try_num, epoch_num, it, mode, loss_dict['loss_total'], ext_msg)

    @staticmethod
    def vis_results(dorig, vposer_model, bm, imgpath):
        """Render original / reconstructed / sampled bodies from several angles.

        Writes a grid image to ``imgpath`` (orig vs. reconstruction) and a
        companion ``*_gen.png`` with bodies sampled from the prior.
        """
        from human_body_prior.mesh import MeshViewer
        from human_body_prior.tools.omni_tools import copy2cpu as c2c
        import trimesh
        from human_body_prior.tools.omni_tools import colors
        from human_body_prior.tools.omni_tools import apply_mesh_tranfsormations_

        from human_body_prior.tools.visualization_tools import imagearray2file
        from human_body_prior.train.vposer_smpl import VPoser

        view_angles = [0, 180, 90, -90]
        imw, imh = 800, 800
        batch_size = len(dorig['pose_aa'])

        mv = MeshViewer(width=imw, height=imh, use_offscreen=True)
        mv.render_wireframe = True

        dorig_aa = dorig['pose_aa']

        prec_aa = vposer_model(dorig_aa, output_type='aa')['pose_aa'].view(batch_size, -1)
        # DataParallel wraps the model; sample_poses lives on .module then.
        if hasattr(vposer_model, 'module'):
            pgen_aa = vposer_model.module.sample_poses(num_poses=batch_size, output_type='aa')
        else:
            pgen_aa = vposer_model.sample_poses(num_poses=batch_size, output_type='aa')

        pgen_aa = pgen_aa.view(batch_size, -1)
        dorig_aa = dorig_aa.view(batch_size, -1)

        images = np.zeros([len(view_angles), batch_size, 1, imw, imh, 3])
        images_gen = np.zeros([len(view_angles), batch_size, 1, imw, imh, 3])
        for cId in range(0, batch_size):

            bm.pose_body.data[:] = bm.pose_body.new(dorig_aa[cId])
            orig_body_mesh = trimesh.Trimesh(vertices=c2c(bm().v[0]), faces=c2c(bm.f), vertex_colors=np.tile(colors['grey'], (6890, 1)))

            bm.pose_body.data[:] = bm.pose_body.new(prec_aa[cId])
            rec_body_mesh = trimesh.Trimesh(vertices=c2c(bm().v[0]), faces=c2c(bm.f), vertex_colors=np.tile(colors['blue'], (6890, 1)))

            bm.pose_body.data[:] = bm.pose_body.new(pgen_aa[cId])
            gen_body_mesh = trimesh.Trimesh(vertices=c2c(bm().v[0]), faces=c2c(bm.f), vertex_colors=np.tile(colors['blue'], (6890, 1)))

            all_meshes = [orig_body_mesh, rec_body_mesh, gen_body_mesh]

            for rId, angle in enumerate(view_angles):
                # Rotate into the view, render, then rotate back so the next
                # angle starts from the canonical orientation.
                if angle != 0: apply_mesh_tranfsormations_(all_meshes, trimesh.transformations.rotation_matrix(np.radians(angle), (0, 1, 0)))
                mv.set_meshes([orig_body_mesh, rec_body_mesh], group_name='static')
                images[rId, cId, 0] = mv.render()
                mv.set_meshes([gen_body_mesh], group_name='static')
                images_gen[rId, cId, 0] = mv.render()

                if angle != 0: apply_mesh_tranfsormations_(all_meshes, trimesh.transformations.rotation_matrix(np.radians(-angle), (0, 1, 0)))

        imagearray2file(images, imgpath)
        imagearray2file(images_gen, imgpath.replace('.png', '_gen.png'))
Exemplo n.º 3
0
import numpy as np
from tensorboardX import SummaryWriter

# Minimal tensorboardX demo: log one random scalar and two derived
# curves per step for 100 steps, then flush and close the writer.
writer = SummaryWriter(comment="base_scalar", log_dir="scalar")

for step in range(100):
    # A single random value under its own tag.
    writer.add_scalar("scalar/test", np.random.rand(), step)
    # Two related curves grouped under one parent tag.
    curves = {
        'xsinx': step * np.sin(step),
        'xcosx': step * np.cos(step),
    }
    writer.add_scalars("scalar/scalars_test", curves, step)

writer.close()
def train(season_id, dm_train_set, dm_test_set):
    """Train the end-to-end CNN text classifier and report accuracies.

    :param season_id: directory name under ./tmp holding the pretrained
        unigram embedding weights for warm-starting the embedding table.
    :param dm_train_set: training dataset (torch ``Dataset`` with
        ``vocab_size()`` and 'sentence'/'label' samples).
    :param dm_test_set: held-out test dataset of the same shape.
    :return: None; prints max test accuracy at the end.
    """
    EMBEDDING_DIM = 200
    feature_dim = 50
    max_len = 49
    windows_size = [1, 2, 3, 4]
    batch_size = 128
    epoch_num = 100
    max_acc = 0
    max_v_acc = 0

    dm_dataloader = data.DataLoader(dataset=dm_train_set,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    drop_last=True,
                                    num_workers=8)

    dm_test_dataloader = data.DataLoader(dataset=dm_test_set,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         drop_last=False,
                                         num_workers=8)

    model = E2ECNNModeler(dm_train_set.vocab_size(), EMBEDDING_DIM,
                          feature_dim, windows_size, max_len)
    print(model)
    # Warm-start the embedding table from pretrained unigram weights.
    init_weight = np.loadtxt(
        os.path.join('./tmp', season_id, 'unigram_weights.txt'))
    model.init_emb(init_weight)
    if torch.cuda.is_available():
        print("CUDA : On")
        model.cuda()
    else:
        print("CUDA : Off")

    # Split parameters so the embedding table gets its own optimizer group.
    embedding_params = list(map(id, model.dynamic_embedding.parameters()))
    other_params = filter(lambda p: id(p) not in embedding_params,
                          model.parameters())

    optimizer = optim.Adam([{
        'params': other_params
    }, {
        'params': model.dynamic_embedding.parameters(),
        'lr': 1e-3
    }],
                           lr=1e-3,
                           betas=(0.9, 0.99))

    # Renamed from `logging` to avoid shadowing the stdlib module.
    log_enabled = True
    if log_enabled:
        writer = SummaryWriter()
        log_name = 'Direct_CNN'

    # Loss module is stateless here; build it once outside the batch loop.
    cross_entropy = nn.NLLLoss()

    for epoch in range(epoch_num):

        # Exponential LR decay: every group's LR is multiplied by 0.8 per epoch.
        if epoch > 0:
            for param_group in optimizer.param_groups:
                param_group['lr'] = param_group['lr'] * 0.8

        model.train(mode=True)

        for batch_idx, sample_dict in enumerate(dm_dataloader):
            sentence = Variable(torch.LongTensor(sample_dict['sentence']))
            label = Variable(torch.LongTensor(sample_dict['label']))
            if torch.cuda.is_available():
                sentence = sentence.cuda()
                label = label.cuda()

            optimizer.zero_grad()
            pred = model.forward(sentence)
            loss = cross_entropy(F.log_softmax(pred, dim=1), label)
            if batch_idx % 10 == 0:
                accuracy = valid_util.running_accuracy(pred, label)
                print('epoch: %d batch %d : loss: %4.6f accuracy: %4.6f' %
                      (epoch, batch_idx, loss.item(), accuracy))
                if log_enabled:
                    writer.add_scalar(log_name + '_data/loss', loss.item(),
                                      epoch * 10 + batch_idx // 10)
            loss.backward()
            optimizer.step()

        model.eval()
        if log_enabled:
            # Per-class precision/recall/F1 plus overall accuracy to tensorboard.
            result_dict = valid_util.validate(model,
                                              dm_test_set,
                                              dm_test_dataloader,
                                              mode='report')
            writer.add_scalars(
                log_name + '_data/0-PRF', {
                    '0-Precision': result_dict['0']['precision'],
                    '0-Recall': result_dict['0']['recall'],
                    '0-F1-score': result_dict['0']['f1-score']
                }, epoch)
            writer.add_scalars(
                log_name + '_data/1-PRF', {
                    '1-Precision': result_dict['1']['precision'],
                    '1-Recall': result_dict['1']['recall'],
                    '1-F1-score': result_dict['1']['f1-score']
                }, epoch)
            writer.add_scalar(log_name + '_data/accuracy',
                              result_dict['accuracy'], epoch)
        accuracy = valid_util.validate(model,
                                       dm_test_set,
                                       dm_test_dataloader,
                                       mode='output')
        if accuracy > max_acc:
            max_acc = accuracy

        # NOTE(review): validation-set tracking is disabled, so max_v_acc
        # stays 0 and the final printout reflects that.
        # dm_valid_set = pickle.load(open(os.path.join('./tmp', season_id, 'unigram_valid_dataset.pkl'), 'rb'))
        # v_acc = valid_util.validate(model, dm_valid_set, mode='output')
        # if v_acc > max_v_acc:
        #     max_v_acc = v_acc

    if log_enabled:
        writer.close()
    print("Max Accuracy: %4.6f" % max_acc)
    print("Max Validation Accuracy: %4.6f" % max_v_acc)
    return
Exemplo n.º 5
0
                print("\t\t\tfg/bg=(%d/%d), time cost: %f" %
                      (fg_cnt, bg_cnt, end - start))
                print(
                    "\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f dloss s: %.4f dloss t: %.4f dloss s pixel: %.4f dloss t pixel: %.4f eta: %.4f" \
                    % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box, dloss_s, dloss_t, dloss_s_p, dloss_t_p,
                       args.eta))
                if args.use_tfboard:
                    info = {
                        'loss': loss_temp,
                        'loss_rpn_cls': loss_rpn_cls,
                        'loss_rpn_box': loss_rpn_box,
                        'loss_rcnn_cls': loss_rcnn_cls,
                        'loss_rcnn_box': loss_rcnn_box
                    }
                    logger.add_scalars("logs_s_{}/losses".format(args.session),
                                       info,
                                       (epoch - 1) * iters_per_epoch + step)

                loss_temp = 0
                start = time.time()
        save_name = os.path.join(
            output_dir,
            'globallocal_target_{}_eta_{}_local_context_{}_global_context_{}_gamma_{}_session_{}_epoch_{}_step_{}.pth'
            .format(args.dataset_t, args.eta, args.lc, args.gc, args.gamma,
                    args.session, epoch, step))
        save_checkpoint(
            {
                'session':
                args.session,
                'epoch':
                epoch + 1,
Exemplo n.º 6
0
def train(config_path,
          model_dir,
          result_path=None,
          create_folder=False,
          display_step=50,
          summary_step=5,
          pickle_result=True):
    """train a VoxelNet model specified by a config file.
    """
    if create_folder:
        if pathlib.Path(model_dir).exists():
            model_dir = torchplus.train.create_folder(model_dir)

    model_dir = pathlib.Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    if result_path is None:
        result_path = model_dir / 'results'
    config_file_bkp = "pipeline.config"
    config = pipeline_pb2.TrainEvalPipelineConfig()
    with open(config_path, "r") as f:
        proto_str = f.read()
        text_format.Merge(proto_str, config)
    shutil.copyfile(config_path, str(model_dir / config_file_bkp))
    input_cfg = config.train_input_reader
    eval_input_cfg = config.eval_input_reader
    model_cfg = config.model.second
    train_cfg = config.train_config

    class_names = list(input_cfg.class_names)
    ######################
    # BUILD VOXEL GENERATOR
    ######################
    voxel_generator = voxel_builder.build(model_cfg.voxel_generator)
    ######################
    # BUILD TARGET ASSIGNER
    ######################
    bv_range = voxel_generator.point_cloud_range[[0, 1, 3, 4]]
    box_coder = box_coder_builder.build(model_cfg.box_coder)
    target_assigner_cfg = model_cfg.target_assigner
    target_assigner = target_assigner_builder.build(target_assigner_cfg,
                                                    bv_range, box_coder)
    ######################
    # BUILD NET
    ######################
    center_limit_range = model_cfg.post_center_limit_range
    net = second_builder.build(model_cfg, voxel_generator, target_assigner)
    net.cuda()
    # net_train = torch.nn.DataParallel(net).cuda()
    print("num_trainable parameters:", len(list(net.parameters())))
    # for n, p in net.named_parameters():
    #     print(n, p.shape)
    ######################
    # BUILD OPTIMIZER
    ######################
    # we need global_step to create lr_scheduler, so restore net first.
    torchplus.train.try_restore_latest_checkpoints(model_dir, [net])
    gstep = net.get_global_step() - 1
    optimizer_cfg = train_cfg.optimizer
    if train_cfg.enable_mixed_precision:
        net.half()
        net.metrics_to_float()
        net.convert_norm_to_float(net)
    optimizer = optimizer_builder.build(optimizer_cfg, net.parameters())
    if train_cfg.enable_mixed_precision:
        loss_scale = train_cfg.loss_scale_factor
        mixed_optimizer = torchplus.train.MixedPrecisionWrapper(
            optimizer, loss_scale)
    else:
        mixed_optimizer = optimizer
    # must restore optimizer AFTER using MixedPrecisionWrapper
    torchplus.train.try_restore_latest_checkpoints(model_dir,
                                                   [mixed_optimizer])
    lr_scheduler = lr_scheduler_builder.build(optimizer_cfg, optimizer, gstep)
    if train_cfg.enable_mixed_precision:
        float_dtype = torch.float16
    else:
        float_dtype = torch.float32
    ######################
    # PREPARE INPUT
    ######################

    dataset = input_reader_builder.build(input_cfg,
                                         model_cfg,
                                         training=True,
                                         voxel_generator=voxel_generator,
                                         target_assigner=target_assigner)
    eval_dataset = input_reader_builder.build(eval_input_cfg,
                                              model_cfg,
                                              training=False,
                                              voxel_generator=voxel_generator,
                                              target_assigner=target_assigner)

    def _worker_init_fn(worker_id):
        time_seed = np.array(time.time(), dtype=np.int32)
        np.random.seed(time_seed + worker_id)
        print(f"WORKER {worker_id} seed:", np.random.get_state()[1][0])

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=input_cfg.batch_size,
                                             shuffle=True,
                                             num_workers=input_cfg.num_workers,
                                             pin_memory=False,
                                             collate_fn=merge_second_batch,
                                             worker_init_fn=_worker_init_fn)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=eval_input_cfg.batch_size,
        shuffle=False,
        num_workers=eval_input_cfg.num_workers,
        pin_memory=False,
        collate_fn=merge_second_batch)
    data_iter = iter(dataloader)

    ######################
    # TRAINING
    ######################
    log_path = model_dir / 'log.txt'
    logf = open(log_path, 'a')
    logf.write(proto_str)
    logf.write("\n")
    summary_dir = model_dir / 'summary'
    summary_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(summary_dir))

    total_step_elapsed = 0
    remain_steps = train_cfg.steps - net.get_global_step()
    t = time.time()
    ckpt_start_time = t

    total_loop = train_cfg.steps // train_cfg.steps_per_eval + 1
    # total_loop = remain_steps // train_cfg.steps_per_eval + 1
    clear_metrics_every_epoch = train_cfg.clear_metrics_every_epoch

    if train_cfg.steps % train_cfg.steps_per_eval == 0:
        total_loop -= 1
    mixed_optimizer.zero_grad()
    try:
        print(total_loop)
        for _ in range(total_loop):
            if total_step_elapsed + train_cfg.steps_per_eval > train_cfg.steps:
                steps = train_cfg.steps % train_cfg.steps_per_eval
            else:
                steps = train_cfg.steps_per_eval
            print(steps)
            for step in range(steps):
                lr_scheduler.step()
                try:
                    example = next(data_iter)
                except StopIteration:
                    print("end epoch")
                    if clear_metrics_every_epoch:
                        net.clear_metrics()
                    data_iter = iter(dataloader)
                    example = next(data_iter)
                example_torch = example_convert_to_torch(example, float_dtype)

                batch_size = example["anchors"].shape[0]

                ret_dict = net(example_torch)

                # box_preds = ret_dict["box_preds"]
                cls_preds = ret_dict["cls_preds"]
                loss = ret_dict["loss"].mean()
                cls_loss_reduced = ret_dict["cls_loss_reduced"].mean()
                loc_loss_reduced = ret_dict["loc_loss_reduced"].mean()
                cls_pos_loss = ret_dict["cls_pos_loss"]
                cls_neg_loss = ret_dict["cls_neg_loss"]
                loc_loss = ret_dict["loc_loss"]
                cls_loss = ret_dict["cls_loss"]
                dir_loss_reduced = ret_dict["dir_loss_reduced"]
                cared = ret_dict["cared"]
                labels = example_torch["labels"]
                if train_cfg.enable_mixed_precision:
                    loss *= loss_scale
                loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), 10.0)
                mixed_optimizer.step()
                mixed_optimizer.zero_grad()
                net.update_global_step()
                net_metrics = net.update_metrics(cls_loss_reduced,
                                                 loc_loss_reduced, cls_preds,
                                                 labels, cared)

                step_time = (time.time() - t)
                t = time.time()
                metrics = {}
                num_pos = int((labels > 0)[0].float().sum().cpu().numpy())
                num_neg = int((labels == 0)[0].float().sum().cpu().numpy())
                if 'anchors_mask' not in example_torch:
                    num_anchors = example_torch['anchors'].shape[1]
                else:
                    num_anchors = int(example_torch['anchors_mask'][0].sum())
                global_step = net.get_global_step()
                if global_step % display_step == 0:
                    loc_loss_elem = [
                        float(loc_loss[:, :, i].sum().detach().cpu().numpy() /
                              batch_size) for i in range(loc_loss.shape[-1])
                    ]
                    metrics["step"] = global_step
                    metrics["steptime"] = step_time
                    metrics.update(net_metrics)
                    metrics["loss"] = {}
                    metrics["loss"]["loc_elem"] = loc_loss_elem
                    metrics["loss"]["cls_pos_rt"] = float(
                        cls_pos_loss.detach().cpu().numpy())
                    metrics["loss"]["cls_neg_rt"] = float(
                        cls_neg_loss.detach().cpu().numpy())
                    # if unlabeled_training:
                    #     metrics["loss"]["diff_rt"] = float(
                    #         diff_loc_loss_reduced.detach().cpu().numpy())
                    if model_cfg.use_direction_classifier:
                        metrics["loss"]["dir_rt"] = float(
                            dir_loss_reduced.detach().cpu().numpy())
                    metrics["num_vox"] = int(example_torch["voxels"].shape[0])
                    metrics["num_pos"] = int(num_pos)
                    metrics["num_neg"] = int(num_neg)
                    metrics["num_anchors"] = int(num_anchors)
                    metrics["lr"] = float(
                        mixed_optimizer.param_groups[0]['lr'])
                    metrics["image_idx"] = example['image_idx'][0]
                    flatted_metrics = flat_nested_json_dict(metrics)
                    flatted_summarys = flat_nested_json_dict(metrics, "/")
                    for k, v in flatted_summarys.items():
                        if isinstance(v, (list, tuple)):
                            v = {str(i): e for i, e in enumerate(v)}
                            writer.add_scalars(k, v, global_step)
                        else:
                            writer.add_scalar(k, v, global_step)
                    metrics_str_list = []
                    for k, v in flatted_metrics.items():
                        if isinstance(v, float):
                            metrics_str_list.append(f"{k}={v:.3}")
                        elif isinstance(v, (list, tuple)):
                            if v and isinstance(v[0], float):
                                v_str = ', '.join([f"{e:.3}" for e in v])
                                metrics_str_list.append(f"{k}=[{v_str}]")
                            else:
                                metrics_str_list.append(f"{k}={v}")
                        else:
                            metrics_str_list.append(f"{k}={v}")
                    log_str = ', '.join(metrics_str_list)
                    print(log_str, file=logf)
                    print(log_str)
                ckpt_elasped_time = time.time() - ckpt_start_time
                if ckpt_elasped_time > train_cfg.save_checkpoints_secs:
                    torchplus.train.save_models(model_dir, [net, optimizer],
                                                net.get_global_step())
                    ckpt_start_time = time.time()
            total_step_elapsed += steps
            torchplus.train.save_models(model_dir, [net, optimizer],
                                        net.get_global_step())
            net.eval()
            result_path_step = result_path / f"step_{net.get_global_step()}"
            result_path_step.mkdir(parents=True, exist_ok=True)
            print("#################################")
            print("#################################", file=logf)
            print("# EVAL")
            print("# EVAL", file=logf)
            print("#################################")
            print("#################################", file=logf)
            print("Generate output labels...")
            print("Generate output labels...", file=logf)
            t = time.time()
            dt_annos = []
            prog_bar = ProgressBar()
            prog_bar.start(len(eval_dataset) // eval_input_cfg.batch_size + 1)
            for example in iter(eval_dataloader):
                example = example_convert_to_torch(example, float_dtype)
                if pickle_result:
                    dt_annos += predict_kitti_to_anno(net, example,
                                                      class_names,
                                                      center_limit_range,
                                                      model_cfg.lidar_input)
                else:
                    _predict_kitti_to_file(net, example, result_path_step,
                                           class_names, center_limit_range,
                                           model_cfg.lidar_input)

                prog_bar.print_bar()

            sec_per_ex = len(eval_dataset) / (time.time() - t)
            print(f"avg forward time per example: {net.avg_forward_time:.3f}")
            print(
                f"avg postprocess time per example: {net.avg_postprocess_time:.3f}"
            )

            net.clear_time_metrics()
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:')
            print(f'generate label finished({sec_per_ex:.2f}/s). start eval:',
                  file=logf)
            gt_annos = [
                info["annos"] for info in eval_dataset.dataset.kitti_infos
            ]
            if not pickle_result:
                dt_annos = kitti.get_label_annos(result_path_step)
            result = get_official_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            writer.add_text('eval_result', result, global_step)
            result = get_coco_eval_result(gt_annos, dt_annos, class_names)
            print(result, file=logf)
            print(result)
            if pickle_result:
                with open(result_path_step / "result.pkl", 'wb') as f:
                    pickle.dump(dt_annos, f)
            writer.add_text('eval_result', result, global_step)
            net.train()
    except Exception as e:
        torchplus.train.save_models(model_dir, [net, optimizer],
                                    net.get_global_step())
        logf.close()
        raise e
    # save model before exit
    torchplus.train.save_models(model_dir, [net, optimizer],
                                net.get_global_step())
    logf.close()
Exemplo n.º 7
0
class MtcnnTrainer(object):
    """ Train Templet

    Generic trainer: wires datasets, loss, optimizer and LR scheduler
    together, logs scalars to TensorBoard, and keeps a rotating window of
    the ``num_to_keep`` most recent checkpoints on disk.
    """

    def __init__(self, configer, net, params, trainset, validset, testset, criterion, 
                    optimizer, lr_scheduler, num_to_keep=5, valid_freq=1):

        self.configer = configer
        self.valid_freq = valid_freq    # validate every `valid_freq` epochs; 0 = never validate

        self.net = net
        
        ## directory for log and checkpoints
        self.logdir = os.path.join(configer.logdir, self.net._get_name())
        if not os.path.exists(self.logdir): os.makedirs(self.logdir)
        self.ckptdir = configer.ckptdir
        if not os.path.exists(self.ckptdir): os.makedirs(self.ckptdir)
        
        ## datasets
        self.trainset = trainset
        self.validset = validset
        self.testset  = testset
        self.trainloader = DataLoader(trainset, configer.batchsize, True,  collate_fn=collate_fn)
        self.validloader = DataLoader(validset, configer.batchsize, True,  collate_fn=collate_fn)
        self.testloader  = DataLoader(testset,  configer.batchsize, False, collate_fn=collate_fn)

        ## for optimization
        self.criterion = criterion
        self.optimizer = optimizer(params, configer.lrbase, weight_decay=4e-5)
        # self.lr_scheduler = lr_scheduler(self.optimizer, configer.adjstep, configer.gamma)      # MultiStepLR
        self.lr_scheduler = lr_scheduler(self.optimizer, configer.gamma)                                   # ExponentialLR
        self.writer = SummaryWriter(configer.logdir)
        self.writer.add_graph(self.net, (torch.rand([1] + trainset.image_size), ))
        
        ## initialize
        self.valid_loss = float('inf')  # best (lowest) validation loss seen so far
        self.elapsed_time = 0
        self.cur_epoch = 0
        self.cur_batch = 0
        self.save_times = 0             # monotonically increasing checkpoint index
        self.num_to_keep = num_to_keep

        ## print information
        # stat(self.net, trainset.image_size)
        if configer.cuda and cuda.is_available(): 
            self.net.cuda()
            
        print("==============================================================================================")
        print("model:           {}".format(self.net._get_name()))
        print("logdir:          {}".format(self.logdir))
        print("ckptdir:         {}".format(self.ckptdir))
        print("train samples:   {}k".format(len(trainset)/1000))
        print("valid samples:   {}k".format(len(validset)/1000))
        print("batch size:      {}".format(configer.batchsize))
        print("batch per epoch: {}".format(len(trainset)/configer.batchsize))
        print("epoch:           [{:4d}]/[{:4d}]".format(self.cur_epoch, configer.n_epoch))
        print("val frequency:   {}".format(self.valid_freq))
        print("learning rate:   {}".format(configer.lrbase))
        print("==============================================================================================")

    def train(self):
        """Run the remaining epochs, validating and checkpointing as configured.

        A checkpoint is saved every epoch when ``valid_freq == 0``, otherwise
        only when the validation loss improves.
        """
        n_epoch = self.configer.n_epoch - self.cur_epoch
        print("Start training! current epoch: {}, remain epoch: {}".format(self.cur_epoch, n_epoch))

        bar = ProcessBar(n_epoch)
        loss_train = 0.; loss_valid = 0.

        for i_epoch in range(n_epoch):
            
            if self.configer.cuda and cuda.is_available(): cuda.empty_cache()

            self.cur_epoch += 1
            bar.step()

            self.lr_scheduler.step(self.cur_epoch)
            cur_lr = self.lr_scheduler.get_lr()[-1]
            self.writer.add_scalar('{}/lr'.format(self.net._get_name()), cur_lr, self.cur_epoch)

            loss_train = self.train_epoch()
            # print("----------------------------------------------------------------------------------------------")
            
            if self.valid_freq != 0 and self.cur_epoch % self.valid_freq == 0:
                loss_valid = self.valid_epoch()
            else:
                # No validation this epoch; carry the best loss forward for logging.
                loss_valid = self.valid_loss
            # print("----------------------------------------------------------------------------------------------")

            self.writer.add_scalars('loss', {'train': loss_train, 'valid': loss_valid}, self.cur_epoch)

            if self.valid_freq == 0:
                self.save_checkpoint()
                
            else:
                if loss_valid < self.valid_loss:
                    self.valid_loss = loss_valid
                    self.save_checkpoint()
                
            # print("==============================================================================================")


    def train_epoch(self):
        """Train for one epoch; returns the mean batch loss."""
        self.net.train()
        avg_loss = []
        n_batch = len(self.trainset) // self.configer.batchsize

        bar = ProcessBar(n_batch, title='    [Train|Epoch %d] ' % self.cur_epoch)
        for i_batch, (images, labels, offsets, landmarks) in enumerate(self.trainloader):
            
            bar.step(i_batch)
            self.cur_batch += 1

            if self.configer.cuda and cuda.is_available(): 
                images = images.cuda()
                labels = labels.cuda()
                offsets = offsets.cuda()
                landmarks = landmarks.cuda()
            
            pred = self.net(images)
            loss_i, loss_cls, loss_offset, loss_landmark = self.criterion(pred, labels, offsets, landmarks)
            
            # Face/non-face accuracy on samples with label >= 0 only
            # (labels 1 and -2 both count as positives; they are mutually
            # exclusive so XOR behaves as OR here).
            cls_pred = torch.where(torch.sigmoid(pred[:, 0].squeeze()) > 0.5, torch.ones_like(labels), torch.zeros_like(labels))
            cls_gt   = torch.where((labels == 1)^(labels == -2),    torch.ones_like(labels), torch.zeros_like(labels))
            mask     = labels >= 0
            cls_pred = torch.masked_select(cls_pred, mask)
            cls_gt   = torch.masked_select(cls_gt,   mask)
            acc_i = torch.mean((cls_pred == cls_gt).float())

            self.optimizer.zero_grad()
            loss_i.backward()
            self.optimizer.step()

            global_step = self.cur_epoch*n_batch + i_batch
            self.writer.add_scalar('{}/train/loss_i'.format(self.net._get_name()), loss_i, global_step=global_step)
            self.writer.add_scalar('{}/train/loss_cls'.format(self.net._get_name()), loss_cls, global_step=global_step)
            self.writer.add_scalar('{}/train/loss_offset'.format(self.net._get_name()), loss_offset, global_step=global_step)
            self.writer.add_scalar('{}/train/loss_landmark'.format(self.net._get_name()), loss_landmark, global_step=global_step)
            self.writer.add_scalar('{}/train/acc_i'.format(self.net._get_name()), acc_i, global_step=global_step)
            
            avg_loss += [loss_i.detach().cpu().numpy()]

        avg_loss = np.mean(np.array(avg_loss))
        return avg_loss


    def valid_epoch(self):
        """Evaluate on the validation set; returns the mean batch loss."""
        self.net.eval()
        avg_loss = []
        n_batch = len(self.validset) // self.configer.batchsize

        bar = ProcessBar(n_batch, title='    [Valid|Epoch %d] ' % self.cur_epoch)
        
        with torch.no_grad():

            for i_batch, (images, labels, offsets, landmarks) in enumerate(self.validloader):
                
                bar.step(i_batch)

                if self.configer.cuda and cuda.is_available(): 
                    images = images.cuda()
                    labels = labels.cuda()
                    offsets = offsets.cuda()
                    landmarks = landmarks.cuda()
                
                pred = self.net(images)
                loss_i, loss_cls, loss_offset, loss_landmark = self.criterion(pred, labels, offsets, landmarks)
                
                cls_pred = torch.where(torch.sigmoid(pred[:, 0].squeeze()) > 0.5, torch.ones_like(labels), torch.zeros_like(labels))
                cls_gt   = torch.where((labels == 1)^(labels == -2),    torch.ones_like(labels), torch.zeros_like(labels))
                mask     = labels >= 0
                cls_pred = torch.masked_select(cls_pred, mask)
                cls_gt   = torch.masked_select(cls_gt, mask)
                acc_i = torch.mean((cls_pred == cls_gt).float())

                global_step = self.cur_epoch*n_batch + i_batch
                self.writer.add_scalar('{}/valid/loss_i'.format(self.net._get_name()), loss_i, global_step=global_step)
                self.writer.add_scalar('{}/valid/loss_cls'.format(self.net._get_name()), loss_cls, global_step=global_step)
                self.writer.add_scalar('{}/valid/loss_offset'.format(self.net._get_name()), loss_offset, global_step=global_step)
                self.writer.add_scalar('{}/valid/loss_landmark'.format(self.net._get_name()), loss_landmark, global_step=global_step)
                self.writer.add_scalar('{}/valid/acc_i'.format(self.net._get_name()), acc_i, global_step=global_step)
            
                avg_loss += [loss_i.detach().cpu().numpy()]

        avg_loss = np.mean(np.array(avg_loss))
        return avg_loss

    def test(self):
        """Not implemented yet."""
        pass

    def save_checkpoint(self):
        """Save a checkpoint and delete the one that fell out of the keep window."""
        checkpoint_state = {
            'save_time': getTime(),

            'cur_epoch': self.cur_epoch,
            'cur_batch': self.cur_batch,
            'elapsed_time': self.elapsed_time,
            'valid_loss': self.valid_loss,
            'save_times': self.save_times,
            
            'net_state': self.net.state_dict(),
            # BUGFIX: load_checkpoint() restores these two states, but they
            # were never written, so every resume raised KeyError.
            'optimizer_state': self.optimizer.state_dict(),
            'lr_scheduler_state': self.lr_scheduler.state_dict(),
        }

        checkpoint_path = os.path.join(self.ckptdir, "{}_{:04d}.pkl".\
                            format(self.net._get_name(), self.save_times))
        torch.save(checkpoint_state, checkpoint_path)
        
        # Drop the checkpoint that just left the `num_to_keep` window.
        checkpoint_path = os.path.join(self.ckptdir, "{}_{:04d}.pkl".\
                            format(self.net._get_name(), self.save_times-self.num_to_keep))
        if os.path.exists(checkpoint_path): os.remove(checkpoint_path)

        self.save_times += 1
        
        # print("checkpoint saved at {}".format(checkpoint_path))


    def load_checkpoint(self, index):
        """Restore trainer state from checkpoint number `index`."""
        checkpoint_path = os.path.join(self.ckptdir, "{}_{:04d}.pkl".\
                            format(self.net._get_name(), index))
        checkpoint_state = torch.load(checkpoint_path, map_location='cuda' if cuda.is_available() else 'cpu')
        
        self.cur_epoch = checkpoint_state['cur_epoch']
        self.cur_batch = checkpoint_state['cur_batch']
        self.elapsed_time = checkpoint_state['elapsed_time']
        self.valid_loss = checkpoint_state['valid_loss']
        self.save_times = checkpoint_state['save_times']

        self.net.load_state_dict(checkpoint_state['net_state'])
        # Guarded for legacy checkpoints written before these states were saved.
        if 'optimizer_state' in checkpoint_state:
            self.optimizer.load_state_dict(checkpoint_state['optimizer_state'])
        if 'lr_scheduler_state' in checkpoint_state:
            self.lr_scheduler.load_state_dict(checkpoint_state['lr_scheduler_state'])
Exemplo n.º 8
0
          bg_cnt = rois_label.data.numel() - fg_cnt

        print("[session %d][epoch %2d][iter %4d/%4d] loss: %.4f, lr: %.2e" \
                                % (args.session, epoch, step, iters_per_epoch, loss_temp, lr))
        print("\t\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, end-start))
        print("\t\t\trpn_cls: %.4f, rpn_box: %.4f, rcnn_cls: %.4f, rcnn_box %.4f" \
                      % (loss_rpn_cls, loss_rpn_box, loss_rcnn_cls, loss_rcnn_box))
        if args.use_tfboard:
          info = {
            'loss': loss_temp,
            'loss_rpn_cls': loss_rpn_cls,
            'loss_rpn_box': loss_rpn_box,
            'loss_rcnn_cls': loss_rcnn_cls,
            'loss_rcnn_box': loss_rcnn_box
          }
          logger.add_scalars("logs_s_{}/losses".format(args.session), info, (epoch - 1) * iters_per_epoch + step)

        loss_temp = 0
        start = time.time()

    
    save_name = os.path.join(output_dir, 'faster_rcnn_{}_{}_{}.pth'.format(args.session, epoch, step))
    save_checkpoint({
      'session': args.session,
      'epoch': epoch + 1,
      'model': fasterRCNN.module.state_dict() if args.mGPUs else fasterRCNN.state_dict(),
      'optimizer': optimizer.state_dict(),
      'pooling_mode': cfg.POOLING_MODE,
      'class_agnostic': args.class_agnostic,
    }, save_name)
    print('save model: {}'.format(save_name))
Exemplo n.º 9
0
# Fixed PR-curve sample data (consumed by a later pr_curve example).
true_positive_counts = [75, 64, 21, 5, 0]
false_positive_counts = [150, 105, 18, 0, 0]
true_negative_counts = [0, 45, 132, 150, 150]
false_negative_counts = [0, 11, 54, 70, 75]
precision = [0.3333333, 0.3786982, 0.5384616, 1.0, 0.0]
recall = [1.0, 0.8533334, 0.28, 0.0666667, 0.0]

for step in range(100):
    scalar_keep = torch.rand(1)  # value to keep
    scalar_drop = torch.rand(1)  # drawn only to keep RNG state moving
    # A slash in the tag groups scalars together in the TensorBoard UI.
    writer.add_scalar('data/scalar1', scalar_keep[0], step)
    writer.add_scalars(
        'data/scalar_group', {
            "xsinx": step * np.sin(step),
            "xcosx": step * np.cos(step),
            "arctanx": np.arctan(step)
        }, step)
    batch = torch.rand(32, 3, 64, 64)  # output from network
    if step % 10 != 0:
        continue
    # Every 10th step, additionally log an image grid, audio and text.
    grid = vutils.make_grid(batch, normalize=True, scale_each=True)
    writer.add_image('Image', grid, step)  # Tensor
    #writer.add_image('astronaut', skimage.data.astronaut(), step) # numpy
    #writer.add_image('imread', skimage.io.imread('screenshots/audio.png'), step) # numpy
    wave = torch.zeros(sample_rate * 2)
    for i in range(wave.size(0)):
        # sound amplitude should stay within [-1, 1]
        wave[i] = np.cos(freqs[step // 10] * np.pi * float(i) /
                         float(sample_rate))
    writer.add_audio('myAudio', wave, step)
    writer.add_text('Text', 'text logged at step:' + str(step), step)
Exemplo n.º 10
0
                                     cnn,
                                     train_loader,
                                     phase='training')
    val_epoch_loss, val_epoch_accuracy = fit(epoch,
                                             cnn,
                                             val_loader,
                                             phase='validation')

    train_losses.append(epoch_loss)
    train_accuracy.append(epoch_accuracy)

    val_losses.append(val_epoch_loss)
    val_accuracy.append(val_epoch_accuracy)

    writer_train.add_scalars("losses", {
        'train_bm': epoch_loss,
        'val_bm': val_epoch_loss
    }, int(epoch))
    writer_train.add_scalars("accuracies", {
        'train_bm': epoch_accuracy,
        'val_bm': val_epoch_accuracy
    }, int(epoch))

    # Learning rate scheduler update
    scheduler.step(val_epoch_loss)

writer_train.add_histogram("error_bm", np.array(train_losses))

elapsed = clock() - start

print(elapsed)
Exemplo n.º 11
0
        iou_test += env.iou()
    reward_test_total = reward_test_total / N_iteration_test
    IOU_test_total = iou_test / N_iteration_test
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60
    print('Epodise: ', episode, '| Ep_reward_test:', reward_test_total,
          '| Ep_IOU_test: ', IOU_test_total)
    print(" | time in %d minutes, %d seconds\n" % (mins, secs))
    if agent.greedy_epsilon > FINAL_EPSILON:
        agent.greedy_epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / N_iteration
    if reward_test_total >= best_reward:
        torch.save(agent.Eval_net.state_dict(),
                   log_path + 'Eval_net_episode_%d.pth' % (episode))
        torch.save(agent.Target_net.state_dict(),
                   log_path + 'Target_net_episode_%d.pth' % (episode))
        best_reward = reward_test_total
    writer.add_scalars(
        OUT_FILE_NAME, {
            'train_loss': train_loss,
            'train_reward': reward_train,
            'train_iou': train_iou,
            'test_reward': reward_test_total,
            'test_iou': IOU_test_total,
        }, episode)
# Export all logged scalars to JSON, then release the writer.
JSON_log_PATH = "./JSON/"
# makedirs(..., exist_ok=True) replaces the racy `exists() == False` check
# (TOCTOU: the directory could appear between the check and the makedirs).
os.makedirs(JSON_log_PATH, exist_ok=True)
writer.export_scalars_to_json(JSON_log_PATH + OUT_FILE_NAME + ".json")
writer.close()
Exemplo n.º 12
0
def train(season_id, dm_train_set, dm_test_set, features, edges):
    """Train and evaluate a GCN danmaku classifier.

    Args:
        season_id: season identifier; kept for interface compatibility
            (not read inside this function).
        dm_train_set / dm_test_set: datasets yielding dicts with
            'sentence' (token-id sequences) and 'label' entries.
        features: per-node feature matrix used to build the graph.
        edges: edge list used to build the graph.

    Returns:
        None. Prints the best test accuracy reached.
    """

    EMBEDDING_DIM = 200
    batch_size = 128
    epoch_num = 300
    max_acc = 0

    dm_dataloader = data.DataLoader(dataset=dm_train_set,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    drop_last=True,
                                    num_workers=8)

    dm_test_dataloader = data.DataLoader(dataset=dm_test_set,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         drop_last=False,
                                         num_workers=8)

    graph = build_graph(features, edges)
    # Kept for the (currently disabled) embedding initialisation below.
    features = torch.FloatTensor(features)
    graph = graph.to(device)

    model = GCN(graph, EMBEDDING_DIM, 256, dropout=0.5)
    # model.init_emb(features)
    print(model)
    model.to(device)

    if torch.cuda.is_available():
        print("CUDA : On")
    else:
        print("CUDA : Off")

    optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.99))
    # scheduler = StepLR(optimizer, step_size=100, gamma=0.1)

    # Renamed from `logging` to avoid shadowing the stdlib logging module.
    log_enabled = False
    if log_enabled:
        writer = SummaryWriter()
        log_name = 'gcn'

    # The loss module is stateless: build it once instead of once per batch.
    cross_entropy = nn.CrossEntropyLoss()

    for epoch in tqdm(range(epoch_num)):
        model.train(mode=True)
        # scheduler.step()
        for batch_idx, sample_dict in enumerate(dm_dataloader):
            sentence = torch.LongTensor(sample_dict['sentence'])
            label = torch.LongTensor(sample_dict['label'])

            sentence = sentence.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            pred = model.forward(sentence)
            loss = cross_entropy(pred, label)
            if batch_idx % 10 == 0:
                accuracy = valid_util.running_accuracy(pred, label)
                print('epoch: %d batch %d : loss: %4.6f accuracy: %4.6f' %
                      (epoch, batch_idx, loss.item(), accuracy))
                if log_enabled:
                    writer.add_scalar(log_name + '_data/loss', loss.item(),
                                      epoch * 10 + batch_idx // 10)
            loss.backward()
            optimizer.step()

        model.eval()
        accuracy = valid_util.validate(model,
                                       dm_test_set,
                                       dm_test_dataloader,
                                       mode='output',
                                       type='normal')
        if accuracy > max_acc:
            max_acc = accuracy

        if log_enabled:
            # Per-class precision/recall/F1 plus accuracy curves.
            result_dict = valid_util.validate(model,
                                              dm_test_set,
                                              dm_test_dataloader,
                                              mode='report',
                                              type='normal')
            writer.add_scalars(
                log_name + '_data/0-PRF', {
                    '0-Precision': result_dict['0']['precision'],
                    '0-Recall': result_dict['0']['recall'],
                    '0-F1-score': result_dict['0']['f1-score']
                }, epoch)
            writer.add_scalars(
                log_name + '_data/1-PRF', {
                    '1-Precision': result_dict['1']['precision'],
                    '1-Recall': result_dict['1']['recall'],
                    '1-F1-score': result_dict['1']['f1-score']
                }, epoch)
            writer.add_scalars(log_name + '_data/accuracy', {
                'accuracy': result_dict['accuracy'],
                'max_accuracy': max_acc
            }, epoch)

    if log_enabled:
        writer.close()
    print("Max Accuracy: %4.6f" % max_acc)
    return
Exemplo n.º 13
0
    def train(self):
        """Run the full training loop.

        Every ``model_dump_gap`` iterations, evaluates on the train (and,
        if present, validation) sets, logs to TensorBoard, saves plots,
        and snapshots the best model state.

        NOTE(review): "best" is tracked as LOWER kappa (init np.inf),
        while Cohen's kappa is conventionally better when higher — the
        direction matches the initialisation here but should be confirmed.
        """
        img_size = Constants.IMAGE_SIZE
        # Pretrained backbones expect 3-channel input, otherwise grayscale.
        if not Constants.PRETRAINED:
            summary(self.network, (1, img_size, img_size))
        else:
            summary(self.network, (3, img_size, img_size))
        writer = SummaryWriter(Path(self.base_out_dir) / "tensorboard")

        print(f"Run ID : {self.config.run_id}")
        print("Training started at: {}".format(
            time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())))

        self.best_result["train_kappa"] = np.inf
        self.best_result["val_kappa"] = np.inf

        start = self.start_epoch
        end = start + self.model_config.epoch

        self.network = self.network.train()
        for epoch in range(start, end):
            for iteration, batch_set in enumerate(self.train_data):
                # Positive and negative halves are concatenated into one batch.
                inputs = torch.cat((batch_set["positive_x"].to(self.device),
                                    batch_set["negative_x"].to(self.device)))
                target = torch.cat((batch_set["positive_y"].to(self.device),
                                    batch_set["negative_y"].to(self.device)))
                self.optimizer.zero_grad()
                predictions = self.network(inputs)
                loss = self.loss_function(predictions, target)
                loss.backward()
                self.optimizer.step()

                if iteration % self.model_config.model_dump_gap == 0 \
                        and iteration != 0:
                    train_loss, train_kappa = self.test(self.train_data,
                                                        ds_type="train_set")
                    # Fractional epoch for smooth plotting on the x-axis.
                    self.results["epochs"].append(epoch + iteration /
                                                  len(self.train_data))
                    self.results["train_loss"].append(train_loss)
                    self.results["train_kappa"].append(train_kappa)

                    print("lr: {:.2E}".format(
                        self.optimizer.param_groups[0]['lr']))
                    print(
                        "{} Epoch: {}, Iteration: {}, Train loss: {:.4f}, Train Cohen Kappa Score: {:.4f}"
                        .format(
                            time.strftime("%Y-%m-%d-%H:%M:%S",
                                          time.localtime()), epoch, iteration,
                            train_loss, train_kappa))
                    writer.add_scalars("Loss", {"train_loss": train_loss},
                                       epoch)

                    if self.val_data:
                        val_loss, val_kappa = self.test(self.val_data)
                        self.results["val_loss"].append(val_loss)
                        self.results["val_kappa"].append(val_kappa)

                        print(
                            "{} Epoch: {}, Iteration: {}, Val loss: {:.4f}, Val Cohen Kappa Score: {:.4f}"
                            .format(
                                time.strftime("%Y-%m-%d-%H:%M:%S",
                                              time.localtime()), epoch,
                                iteration, val_loss, val_kappa))
                        writer.add_scalars("Loss", {"val_loss": val_loss},
                                           epoch)

                    # Now plotting
                    loss_fig = generate_plot(self.results["epochs"],
                                             self.results["train_loss"],
                                             self.results["val_loss"],
                                             title="Loss Progression",
                                             y_label="loss")
                    writer.add_figure("Loss", loss_fig)
                    loss_fig.savefig(Path(self.base_out_dir) / "loss.png")
                    plt.show()
                    plt.close(loss_fig)

                    kappa_score_fig = generate_plot(
                        self.results["epochs"],
                        self.results["train_kappa"],
                        self.results["val_kappa"],
                        title="Cohen Kappa Score",
                        y_label="CKS")
                    writer.add_figure("Cohen Kappa Score", kappa_score_fig)
                    # BUGFIX: previously saved loss_fig here, so
                    # kappa_score.png contained the loss plot.
                    kappa_score_fig.savefig(
                        Path(self.base_out_dir) / "kappa_score.png")
                    plt.show()
                    plt.close(kappa_score_fig)

                    # BUGFIX: guard on val_kappa being non-empty — without
                    # val_data this indexed an empty list (IndexError).
                    if (self.results["val_kappa"]
                            and self.results["train_kappa"][-1] <=
                            self.best_result["train_kappa"]
                            and self.results["val_kappa"][-1] <
                            self.best_result["val_kappa"]):

                        self.best_result["train_kappa"] = self.results[
                            "train_kappa"][-1]
                        self.best_result["val_kappa"] = self.results[
                            "val_kappa"][-1]
                        self.best_result["state"] = {
                            "epoch": epoch,
                            "network_dict":
                            deepcopy(self.network.state_dict()),
                            "optimizer_dict":
                            deepcopy(self.optimizer.state_dict()),
                            "results": deepcopy(self.results)
                        }
                    self.save_model(epoch, tag=iteration)

            self.scheduler.step()

        # BUGFIX: the epoch is stored under best_result["state"]["epoch"];
        # best_result["epoch"] was never set and raised KeyError.
        # NOTE(review): still requires at least one improvement to have
        # populated "state" — confirm val_data is always provided.
        print(
            "Best results: Epoch{}, Train Cohen Kappa Score: {}, Val Cohen Kappa Score: {}"
            .format(self.best_result["state"]["epoch"],
                    self.best_result["train_kappa"],
                    self.best_result["val_kappa"]))

        model_save_dir = "best_model_{}.pth".format(
            str(self.best_result["state"]["epoch"]))

        torch.save(self.best_result["state"],
                   Path(self.model_out_dir) / model_save_dir)

        if self.test_data:
            test_loss, test_kappa = self.test(self.test_data)
            print(
                "{} Epoch: {}, Test loss: {:.4f}, Test Cohen Kappa Score: {:.4f}"
                .format(time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()),
                        epoch, test_loss, test_kappa))
Exemplo n.º 14
0
    def train(self, protocol_name, subset='development', n_calls=1):
        """Tune the pipeline hyper-parameters on one protocol subset.

        Runs up to ``n_calls`` tuning trials, logging the latest and best
        loss/params to TensorBoard, then dumps the overall best parameters
        to ``params.yml`` and prints a summary.

        Args:
            protocol_name: name of the evaluation protocol.
            subset: protocol subset to tune on (default 'development').
            n_calls: number of tuning trials to run.
        """

        train_dir = self.TRAIN_DIR.format(
            experiment_dir=self.experiment_dir,
            protocol=protocol_name,
            subset=subset)

        mkdir_p(train_dir)

        protocol = get_protocol(protocol_name, progress=False,
                                preprocessors=self.preprocessors_)

        tune_db = f'{train_dir}/tune.db'
        params_yml = f'{train_dir}/params.yml'
        params_yml_lock = f'{train_dir}/params.yml.lock'

        # One summary sub-directory per process so concurrent runs don't clash.
        pid = os.getpid()
        writer = SummaryWriter(log_dir=f"{train_dir}/{pid}")

        progress_bar = tqdm(unit='trial')
        progress_bar.set_description('Trial #1 : ...')
        progress_bar.update(0)

        iterations = self.pipeline_.tune_iter(
            tune_db, protocol, subset=subset,
            sampler=self.sampler_)

        # BUGFIX: these were only bound inside the 'new_best' branch, so the
        # progress-bar description below raised UnboundLocalError until the
        # first improvement arrived.
        best_loss = None
        n_trials = None

        for s, status in enumerate(iterations):

            if s+1 == n_calls:
                break

            loss = status['latest']['loss']
            writer.add_scalar(f'train/{protocol_name}.{subset}/loss/latest',
                              loss, global_step=s + 1)
            writer.add_scalars(
                f'train/{protocol_name}.{subset}/params/latest',
                status['latest']['params'], global_step=s + 1)

            if 'new_best' in status:
                _ = self.dump(status['new_best'], params_yml, params_yml_lock)
                n_trials = status['new_best']['n_trials']
                best_loss = status['new_best']['loss']
                writer.add_scalar(f'train/{protocol_name}.{subset}/loss/best',
                                  best_loss, global_step=n_trials)
                writer.add_scalars(
                    f'train/{protocol_name}.{subset}/params/best',
                    status['new_best']['params'], global_step=n_trials)

            # progress bar
            desc = f"Trial #{s+1}"
            loss = status['latest']['loss']
            if abs(loss) < 1:
                # Small losses are displayed as percentages.
                desc += f" = {100 * loss:.3f}%"
                if best_loss is not None:
                    desc += f" : Best = {100 * best_loss:.3f}% after {n_trials} trials"
            else:
                desc += f" = {loss:.3f}"
                if best_loss is not None:
                    desc += f" : Best = {best_loss:.3f} after {n_trials} trials"

            progress_bar.set_description(desc=desc)
            progress_bar.update(1)

        best = self.pipeline_.best(tune_db)
        content = self.dump(best, params_yml, params_yml_lock)

        sep = "=" * max(len(params_yml),
                        max(len(l) for l in content.split('\n')))
        print(f"\n{sep}\n{params_yml}\n{sep}\n{content}{sep}")
        print(f"Loss = {best['loss']:g} | {best['n_trials']} trials")
        print(f"{sep}")
                info_loss.backward()
                optimizerInfo.step()

                total_g_loss += g_loss.data / batch_size
                total_d_loss += d_loss.data / batch_size
                total_info_loss += info_loss.data / batch_size

                total_real_prob += real.mean(0).data
                total_fake_prob += fake.mean(0).data
                total_fake2_prob += fake2.mean(0).data
                t2.update()

                if i % plot_every == 0 and i != 0:
                    writer.add_scalars(
                        'infogan/loss', {
                            'g_loss': total_g_loss / plot_every,
                            'd_loss': total_d_loss / plot_every,
                            'info_loss': total_info_loss / plot_every
                        }, plot)

                    writer.add_scalars(
                        'infogan/prob', {
                            'real_data': total_real_prob / plot_every,
                            'fake_data_before': total_fake_prob / plot_every,
                            'fake_data_after': total_fake2_prob / plot_every
                        }, plot)
                    plot += 1
                    total_g_loss = 0.0
                    total_d_loss = 0.0
                    total_info_loss = 0.0
                    total_real_prob = 0.0
                    total_fake_prob = 0.0
def main():
  """Train `cfg.num_models` ReID models jointly with deep mutual learning.

  Each model runs in its own daemon thread. For every batch, training is
  split into two globally synchronized phases:
    1) each model forwards the batch and computes its own (separate) losses;
    2) once ALL models finished phase 1, each model adds mutual losses
       (KL on probabilities, L2 on global/local distance matrices) against
       its peers' phase-1 outputs, then backpropagates and steps.
  Phase hand-off is coordinated with two `threading.Event`s plus per-model
  done flags; the main thread drives batches, logging, TensorBoard and
  checkpointing.

  Relies on module-level names defined elsewhere in this file: Config,
  ReDirectSTD, set_devices_for_ml, set_seed, create_dataset, Model,
  TripletLoss, global_loss, local_loss, load_ckpt, save_ckpt,
  ExtractFeature, AverageMeter, to_scalar, adjust_lr_*, may_set_mode, etc.
  """
  cfg = Config()

  # Redirect logs to both console and file.
  if cfg.log_to_file:
    ReDirectSTD(cfg.stdout_file, 'stdout', False)
    ReDirectSTD(cfg.stderr_file, 'stderr', False)

  # Lazily create SummaryWriter
  writer = None

  # One (TVT, TMO) pair per model: TVT moves tensors to that model's
  # device; TMO moves modules/optimizers. -- presumably; confirm against
  # set_devices_for_ml's definition.
  TVTs, TMOs, relative_device_ids = set_devices_for_ml(cfg.sys_device_ids)

  if cfg.seed is not None:
    set_seed(cfg.seed)

  # Dump the configurations to log.
  import pprint
  print('-' * 60)
  print('cfg.__dict__')
  pprint.pprint(cfg.__dict__)
  print('-' * 60)

  ###########
  # Dataset #
  ###########

  train_set = create_dataset(**cfg.train_set_kwargs)

  test_sets = []
  test_set_names = []
  if cfg.dataset == 'combined':
    for name in ['market1501', 'cuhk03', 'duke']:
      cfg.test_set_kwargs['name'] = name
      test_sets.append(create_dataset(**cfg.test_set_kwargs))
      test_set_names.append(name)
  else:
    test_sets.append(create_dataset(**cfg.test_set_kwargs))
    test_set_names.append(cfg.dataset)

  ###########
  # Models  #
  ###########

  # One peer model per mutual-learning participant.
  models = [Model(local_conv_out_channels=cfg.local_conv_out_channels,
                  num_classes=len(train_set.ids2labels))
            for _ in range(cfg.num_models)]
  # Model wrappers
  model_ws = [DataParallel(models[i], device_ids=relative_device_ids[i])
              for i in range(cfg.num_models)]

  #############################
  # Criteria and Optimizers   #
  #############################

  id_criterion = nn.CrossEntropyLoss()
  g_tri_loss = TripletLoss(margin=cfg.global_margin)
  l_tri_loss = TripletLoss(margin=cfg.local_margin)

  optimizers = [optim.Adam(m.parameters(),
                           lr=cfg.base_lr,
                           weight_decay=cfg.weight_decay)
                for m in models]

  # Bind them together just to save some codes in the following usage.
  modules_optims = models + optimizers

  ################################
  # May Resume Models and Optims #
  ################################

  if cfg.resume:
    resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file)

  # May Transfer Models and Optims to Specified Device. Transferring optimizers
  # is to cope with the case when you load the checkpoint to a new device.
  for TMO, model, optimizer in zip(TMOs, models, optimizers):
    TMO([model, optimizer])

  ########
  # Test #
  ########

  # Test each model using different distance settings.
  def test(load_model_weight=False):
    # Evaluate every model on every test set; optionally reload weights
    # from the checkpoint file first.
    if load_model_weight:
      load_ckpt(modules_optims, cfg.ckpt_file)

    use_local_distance = (cfg.l_loss_weight > 0) \
                         and cfg.local_dist_own_hard_sample

    for i, (model_w, TVT) in enumerate(zip(model_ws, TVTs)):
      for test_set, name in zip(test_sets, test_set_names):
        test_set.set_feat_func(ExtractFeature(model_w, TVT))
        print('\n=========> Test Model #{} on dataset: {} <=========\n'
              .format(i + 1, name))
        test_set.eval(
          normalize_feat=cfg.normalize_feature,
          use_local_distance=use_local_distance)

  if cfg.only_test:
    test(load_model_weight=True)
    return

  ############
  # Training #
  ############

  # Storing things that can be accessed cross threads.

  ims_list = [None for _ in range(cfg.num_models)]
  labels_list = [None for _ in range(cfg.num_models)]

  # done_list1[i]/done_list2[i]: model i finished phase 1 / phase 2.
  done_list1 = [False for _ in range(cfg.num_models)]
  done_list2 = [False for _ in range(cfg.num_models)]

  # Phase-1 outputs shared with peers for the mutual losses.
  probs_list = [None for _ in range(cfg.num_models)]
  g_dist_mat_list = [None for _ in range(cfg.num_models)]
  l_dist_mat_list = [None for _ in range(cfg.num_models)]

  # Two phases for each model:
  # 1) forward and single-model loss;
  # 2) further add mutual loss and backward.
  # The 2nd phase is only ready to start when the 1st is finished for
  # all models.
  run_event1 = threading.Event()
  run_event2 = threading.Event()

  # This event is meant to be set to stop threads. However, as I found, with
  # `daemon` set to true when creating threads, manually stopping is
  # unnecessary. I guess some main-thread variables required by sub-threads
  # are destroyed when the main thread ends, thus the sub-threads throw errors
  # and exit too.
  # Real reason should be further explored.
  exit_event = threading.Event()

  # The function to be called by threads.
  def thread_target(i):
    # Worker loop for model i: wait for phase-1 signal, forward + separate
    # losses, publish outputs, wait for phase-2 signal, add mutual losses,
    # backward + step, record meters (model 0 only), mark done.
    while not exit_event.isSet():
      # If the run event is not set, the thread just waits.
      if not run_event1.wait(0.001): continue

      ######################################
      # Phase 1: Forward and Separate Loss #
      ######################################

      TVT = TVTs[i]
      model_w = model_ws[i]
      ims = ims_list[i]
      labels = labels_list[i]
      optimizer = optimizers[i]

      ims_var = Variable(TVT(torch.from_numpy(ims).float()))
      labels_t = TVT(torch.from_numpy(labels).long())
      labels_var = Variable(labels_t)

      global_feat, local_feat, logits = model_w(ims_var)
      # NOTE(review): no `dim` argument — relies on the legacy PyTorch
      # default softmax axis; confirm intended axis on newer versions.
      probs = F.softmax(logits)
      log_probs = F.log_softmax(logits)

      g_loss, p_inds, n_inds, g_dist_ap, g_dist_an, g_dist_mat = global_loss(
        g_tri_loss, global_feat, labels_t,
        normalize_feature=cfg.normalize_feature)

      if cfg.l_loss_weight == 0:
        l_loss, l_dist_mat = 0, 0
      elif cfg.local_dist_own_hard_sample:
        # Let local distance find its own hard samples.
        l_loss, l_dist_ap, l_dist_an, l_dist_mat = local_loss(
          l_tri_loss, local_feat, None, None, labels_t,
          normalize_feature=cfg.normalize_feature)
      else:
        l_loss, l_dist_ap, l_dist_an = local_loss(
          l_tri_loss, local_feat, p_inds, n_inds, labels_t,
          normalize_feature=cfg.normalize_feature)
        l_dist_mat = 0

      id_loss = 0
      if cfg.id_loss_weight > 0:
        id_loss = id_criterion(logits, labels_var)

      # Publish phase-1 outputs so peers can compute mutual losses.
      probs_list[i] = probs
      g_dist_mat_list[i] = g_dist_mat
      l_dist_mat_list[i] = l_dist_mat

      done_list1[i] = True

      # Wait for event to be set, meanwhile checking if need to exit.
      while True:
        phase2_ready = run_event2.wait(0.001)
        if exit_event.isSet():
          return
        if phase2_ready:
          break

      #####################################
      # Phase 2: Mutual Loss and Backward #
      #####################################

      # Probability Mutual Loss (KL Loss)
      pm_loss = 0
      if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
        for j in range(cfg.num_models):
          if j != i:
            pm_loss += F.kl_div(log_probs, TVT(probs_list[j]).detach(), False)
        pm_loss /= 1. * (cfg.num_models - 1) * len(ims)

      # Global Distance Mutual Loss (L2 Loss)
      gdm_loss = 0
      if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
        for j in range(cfg.num_models):
          if j != i:
            gdm_loss += torch.sum(torch.pow(
              g_dist_mat - TVT(g_dist_mat_list[j]).detach(), 2))
        gdm_loss /= 1. * (cfg.num_models - 1) * len(ims) * len(ims)

      # Local Distance Mutual Loss (L2 Loss)
      ldm_loss = 0
      if (cfg.num_models > 1) \
          and cfg.local_dist_own_hard_sample \
          and (cfg.ldm_loss_weight > 0):
        for j in range(cfg.num_models):
          if j != i:
            ldm_loss += torch.sum(torch.pow(
              l_dist_mat - TVT(l_dist_mat_list[j]).detach(), 2))
        ldm_loss /= 1. * (cfg.num_models - 1) * len(ims) * len(ims)

      loss = g_loss * cfg.g_loss_weight \
             + l_loss * cfg.l_loss_weight \
             + id_loss * cfg.id_loss_weight \
             + pm_loss * cfg.pm_loss_weight \
             + gdm_loss * cfg.gdm_loss_weight \
             + ldm_loss * cfg.ldm_loss_weight

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      ##################################
      # Step Log For One of the Models #
      ##################################

      # These meters are outer-scope variables

      # Just record for the first model
      if i == 0:

        # precision
        g_prec = (g_dist_an > g_dist_ap).data.float().mean()
        # the proportion of triplets that satisfy margin
        g_m = (g_dist_an > g_dist_ap + cfg.global_margin).data.float().mean()
        g_d_ap = g_dist_ap.data.mean()
        g_d_an = g_dist_an.data.mean()

        g_prec_meter.update(g_prec)
        g_m_meter.update(g_m)
        g_dist_ap_meter.update(g_d_ap)
        g_dist_an_meter.update(g_d_an)
        g_loss_meter.update(to_scalar(g_loss))

        if cfg.l_loss_weight > 0:
          # precision
          l_prec = (l_dist_an > l_dist_ap).data.float().mean()
          # the proportion of triplets that satisfy margin
          l_m = (l_dist_an > l_dist_ap + cfg.local_margin).data.float().mean()
          l_d_ap = l_dist_ap.data.mean()
          l_d_an = l_dist_an.data.mean()

          l_prec_meter.update(l_prec)
          l_m_meter.update(l_m)
          l_dist_ap_meter.update(l_d_ap)
          l_dist_an_meter.update(l_d_an)
          l_loss_meter.update(to_scalar(l_loss))

        if cfg.id_loss_weight > 0:
          id_loss_meter.update(to_scalar(id_loss))

        if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
          pm_loss_meter.update(to_scalar(pm_loss))

        if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
          gdm_loss_meter.update(to_scalar(gdm_loss))

        if (cfg.num_models > 1) \
            and cfg.local_dist_own_hard_sample \
            and (cfg.ldm_loss_weight > 0):
          ldm_loss_meter.update(to_scalar(ldm_loss))

        loss_meter.update(to_scalar(loss))

      ###################
      # End Up One Step #
      ###################

      # Clearing both events here (from whichever worker gets here first)
      # re-arms the two-phase handshake for the next batch.
      run_event1.clear()
      run_event2.clear()

      done_list2[i] = True

  threads = []
  for i in range(cfg.num_models):
    thread = threading.Thread(target=thread_target, args=(i,))
    # Set the thread in daemon mode, so that the main program ends normally.
    thread.daemon = True
    thread.start()
    threads.append(thread)

  start_ep = resume_ep if cfg.resume else 0
  for ep in range(start_ep, cfg.total_epochs):

    # Adjust Learning Rate
    for optimizer in optimizers:
      if cfg.lr_decay_type == 'exp':
        adjust_lr_exp(
          optimizer,
          cfg.base_lr,
          ep + 1,
          cfg.total_epochs,
          cfg.exp_decay_at_epoch)
      else:
        adjust_lr_staircase(
          optimizer,
          cfg.base_lr,
          ep + 1,
          cfg.staircase_decay_at_epochs,
          cfg.staircase_decay_multiply_factor)

    may_set_mode(modules_optims, 'train')

    epoch_done = False

    # Fresh meters each epoch; thread 0 updates them (closure over these
    # outer-scope names).
    g_prec_meter = AverageMeter()
    g_m_meter = AverageMeter()
    g_dist_ap_meter = AverageMeter()
    g_dist_an_meter = AverageMeter()
    g_loss_meter = AverageMeter()

    l_prec_meter = AverageMeter()
    l_m_meter = AverageMeter()
    l_dist_ap_meter = AverageMeter()
    l_dist_an_meter = AverageMeter()
    l_loss_meter = AverageMeter()

    id_loss_meter = AverageMeter()

    # Global Distance Mutual Loss
    gdm_loss_meter = AverageMeter()
    # Local Distance Mutual Loss
    ldm_loss_meter = AverageMeter()
    # Probability Mutual Loss
    pm_loss_meter = AverageMeter()

    loss_meter = AverageMeter()

    ep_st = time.time()
    step = 0
    while not epoch_done:

      step += 1
      step_st = time.time()

      ims, im_names, labels, mirrored, epoch_done = train_set.next_batch()

      # All models see the same batch.
      for i in range(cfg.num_models):
        ims_list[i] = ims
        labels_list[i] = labels
        done_list1[i] = False
        done_list2[i] = False

      run_event1.set()
      # Waiting for phase 1 done
      # NOTE(review): busy-waiting burns a CPU core; a Condition/Barrier
      # would be gentler, but is kept as-is to preserve the exact protocol.
      while not all(done_list1): continue

      run_event2.set()
      # Waiting for phase 2 done
      while not all(done_list2): continue

      ############
      # Step Log #
      ############

      if step % cfg.log_steps == 0:
        time_log = '\tStep {}/Ep {}, {:.2f}s'.format(
          step, ep + 1, time.time() - step_st, )

        if cfg.g_loss_weight > 0:
          g_log = (', gp {:.2%}, gm {:.2%}, '
                   'gd_ap {:.4f}, gd_an {:.4f}, '
                   'gL {:.4f}'.format(
            g_prec_meter.val, g_m_meter.val,
            g_dist_ap_meter.val, g_dist_an_meter.val,
            g_loss_meter.val, ))
        else:
          g_log = ''

        if cfg.l_loss_weight > 0:
          l_log = (', lp {:.2%}, lm {:.2%}, '
                   'ld_ap {:.4f}, ld_an {:.4f}, '
                   'lL {:.4f}'.format(
            l_prec_meter.val, l_m_meter.val,
            l_dist_ap_meter.val, l_dist_an_meter.val,
            l_loss_meter.val, ))
        else:
          l_log = ''

        if cfg.id_loss_weight > 0:
          id_log = (', idL {:.4f}'.format(id_loss_meter.val))
        else:
          id_log = ''

        if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
          pm_log = (', pmL {:.4f}'.format(pm_loss_meter.val))
        else:
          pm_log = ''

        if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
          gdm_log = (', gdmL {:.4f}'.format(gdm_loss_meter.val))
        else:
          gdm_log = ''

        if (cfg.num_models > 1) \
            and cfg.local_dist_own_hard_sample \
            and (cfg.ldm_loss_weight > 0):
          ldm_log = (', ldmL {:.4f}'.format(ldm_loss_meter.val))
        else:
          ldm_log = ''

        total_loss_log = ', loss {:.4f}'.format(loss_meter.val)

        log = time_log + \
              g_log + l_log + id_log + \
              pm_log + gdm_log + ldm_log + \
              total_loss_log
        print(log)

    #############
    # Epoch Log #
    #############

    time_log = 'Ep {}, {:.2f}s'.format(ep + 1, time.time() - ep_st, )

    if cfg.g_loss_weight > 0:
      g_log = (', gp {:.2%}, gm {:.2%}, '
               'gd_ap {:.4f}, gd_an {:.4f}, '
               'gL {:.4f}'.format(
        g_prec_meter.avg, g_m_meter.avg,
        g_dist_ap_meter.avg, g_dist_an_meter.avg,
        g_loss_meter.avg, ))
    else:
      g_log = ''

    if cfg.l_loss_weight > 0:
      l_log = (', lp {:.2%}, lm {:.2%}, '
               'ld_ap {:.4f}, ld_an {:.4f}, '
               'lL {:.4f}'.format(
        l_prec_meter.avg, l_m_meter.avg,
        l_dist_ap_meter.avg, l_dist_an_meter.avg,
        l_loss_meter.avg, ))
    else:
      l_log = ''

    if cfg.id_loss_weight > 0:
      id_log = (', idL {:.4f}'.format(id_loss_meter.avg))
    else:
      id_log = ''

    if (cfg.num_models > 1) and (cfg.pm_loss_weight > 0):
      pm_log = (', pmL {:.4f}'.format(pm_loss_meter.avg))
    else:
      pm_log = ''

    if (cfg.num_models > 1) and (cfg.gdm_loss_weight > 0):
      gdm_log = (', gdmL {:.4f}'.format(gdm_loss_meter.avg))
    else:
      gdm_log = ''

    if (cfg.num_models > 1) \
        and cfg.local_dist_own_hard_sample \
        and (cfg.ldm_loss_weight > 0):
      ldm_log = (', ldmL {:.4f}'.format(ldm_loss_meter.avg))
    else:
      ldm_log = ''

    total_loss_log = ', loss {:.4f}'.format(loss_meter.avg)

    log = time_log + \
          g_log + l_log + id_log + \
          pm_log + gdm_log + ldm_log + \
          total_loss_log
    print(log)

    # Log to TensorBoard

    if cfg.log_to_file:
      if writer is None:
        writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard'))
      writer.add_scalars(
        'loss',
        dict(global_loss=g_loss_meter.avg,
             local_loss=l_loss_meter.avg,
             id_loss=id_loss_meter.avg,
             pm_loss=pm_loss_meter.avg,
             gdm_loss=gdm_loss_meter.avg,
             ldm_loss=ldm_loss_meter.avg,
             loss=loss_meter.avg, ),
        ep)
      writer.add_scalars(
        'tri_precision',
        dict(global_precision=g_prec_meter.avg,
             local_precision=l_prec_meter.avg, ),
        ep)
      writer.add_scalars(
        'satisfy_margin',
        dict(global_satisfy_margin=g_m_meter.avg,
             local_satisfy_margin=l_m_meter.avg, ),
        ep)
      writer.add_scalars(
        'global_dist',
        dict(global_dist_ap=g_dist_ap_meter.avg,
             global_dist_an=g_dist_an_meter.avg, ),
        ep)
      writer.add_scalars(
        'local_dist',
        dict(local_dist_ap=l_dist_ap_meter.avg,
             local_dist_an=l_dist_an_meter.avg, ),
        ep)

    # save ckpt
    if cfg.log_to_file:
      save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file)

  ########
  # Test #
  ########

  test(load_model_weight=False)
Exemplo n.º 17
0
class Monitor(Thread):
    """Background thread that logs CPU/RAM/GPU usage to TensorBoard.

    Starts itself in ``__init__`` and keeps polling until :meth:`stop`
    is called. Relies on the third-party ``GPUtil``, ``psutil``,
    ``cpuinfo`` and ``SummaryWriter`` names imported at module level.
    """

    def __init__(self, log_dir, delay=1, gpu_id=0, verbose=False):
        """Initialize monitor, log_dir and gpu_id are needed.

        Args:
            log_dir: directory for the TensorBoard event files.
            delay: seconds to wait between two polling rounds.
            gpu_id: index into ``GPUtil.getGPUs()``; pass None to skip GPU
                monitoring.
            verbose: if True, print the device status once at start-up.
        """
        super(Monitor, self).__init__()

        DEVICE_ID_LIST = GPUtil.getAvailable(
            order="memory", limit=1)  # get the fist gpu with the lowest load
        # GPU monitoring is only enabled when at least one GPU is visible
        # and a gpu_id was requested.
        if len(DEVICE_ID_LIST) < 1 or gpu_id is None:
            self.hasgpu = False
        else:
            self.hasgpu = True

        self.gpu_id = gpu_id
        self.start_time = time.time()  # Start time
        self.verbose = verbose  # if update the usage status during the process
        self.stopped = False  # flag for stop the monitor
        self.delay = delay  # Time between calls to GPUtil
        self.pid = os.getpid()
        self.writer = SummaryWriter(log_dir=log_dir)  # tensorboard writer
        # NOTE(review): newer py-cpuinfo versions renamed "brand" to
        # "brand_raw" — confirm against the pinned dependency version.
        self.writer.add_text(
            "device/CPU",
            "cpu count: {:d} \t brand: {:s}".format(
                os.cpu_count(),
                cpuinfo.get_cpu_info()["brand"]),
            0,
        )
        self.writer.add_text(
            "device/RAM",
            "Current RAM - total:\t {:.3f}GB;".format(
                psutil.virtual_memory().total / 2.0**30),
            0,
        )
        self.count = 0  # Count for calculate the average usage

        # Histories kept for computing averages after the run.
        self.GPU_memoryUsed = []
        self.GPU_memoryFree = []
        self.CPU_load = []
        self.memoryUsed = []

        if self.hasgpu:
            self.GPU = GPUtil.getGPUs()[self.gpu_id]
            self.GPU_memoryTotal = (self.GPU.memoryTotal / 2.0**10
                                    )  # Total gpu memory amount in GB
            self.writer.add_text(
                "device/GPU",
                "Current GPU (ID:{:d}) name:{:s} ".format(
                    self.gpu_id, self.GPU.name) +
                "Total_GPU_memory: {:.3f}GB;".format(self.GPU_memoryTotal),
                0,
            )

        if verbose:
            devices_status()
        self.start()

    def write_cpu_status(self):
        """Write CPU status."""
        # cpu_percent(interval=1) blocks for one second while sampling.
        CPU_load = psutil.Process(self.pid).cpu_percent(interval=1)
        self.writer.add_scalars(
            "device/cpu",
            {"CPU_load (%)": CPU_load},
            self.count,
        )
        self.CPU_load.append(CPU_load)

    def write_mem_status(self):
        """Write memory usage status."""
        memoryUsed = (psutil.Process(self.pid).memory_info()[0] / 2.0**30
                      )  # current app memory use in GB
        self.writer.add_scalars(
            "device/mem",
            {"memory_used (GB)": memoryUsed},
            self.count,
        )
        self.memoryUsed.append(memoryUsed)

    def write_gpu_status(self):
        """Write gpu usage status."""
        self.GPU = GPUtil.getGPUs()[self.gpu_id]
        GPU_load = self.GPU.load * 100
        GPU_memoryUsed = self.GPU.memoryUsed / self.GPU_memoryTotal * 100
        GPU_memoryFree = self.GPU.memoryFree / self.GPU_memoryTotal * 100
        self.writer.add_scalars(
            "device/GPU",
            {
                "GPU_load (%)": GPU_load,
                "GPU_memory_used (%)": GPU_memoryUsed,
                "GPU_memory_free (%)": GPU_memoryFree,
            },
            self.count,
        )
        self.GPU_memoryUsed.append(GPU_memoryUsed)
        self.GPU_memoryFree.append(GPU_memoryFree)

    def run(self):
        """Run the monitor, polling every ``self.delay`` seconds."""
        while not self.stopped:
            self.count += 1
            self.write_cpu_status()
            self.write_mem_status()
            if self.hasgpu:
                self.write_gpu_status()
            # FIX: `delay` was documented as the time between polls but was
            # never used, so the loop spun as fast as the (1 s) CPU sampling
            # allowed. Honor the configured interval.
            time.sleep(self.delay)

    def stop(self):
        """Stop the monitor and return the total running time in seconds."""
        self.run_time = time.time() - self.start_time
        print("Program running time:%d seconds" % self.run_time)
        self.stopped = True
        return self.run_time
Exemplo n.º 18
0
def main():
    """Train/evaluate a multi-attribute classifier on CelebA with FocalLoss.

    Parses CLI args via the module-level ``parser``, optionally sets up
    distributed training, builds the model/criterion/optimizer, optionally
    resumes from a checkpoint, then runs the epoch loop with TensorBoard
    logging and best-checkpoint saving. Uses module-level globals
    ``args`` and ``best_prec1``.
    """
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)
    # Use CUDA
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    use_cuda = torch.cuda.is_available()

    # Random seed
    if args.manual_seed is None:
        args.manual_seed = random.randint(1, 10000)
    random.seed(args.manual_seed)
    torch.manual_seed(args.manual_seed)
    if use_cuda:
        torch.cuda.manual_seed_all(args.manual_seed)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            baseWidth=args.base_width,
            cardinality=args.cardinality,
        )
    elif args.arch.startswith('shufflenet'):
        model = models.__dict__[args.arch](groups=args.groups)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if not args.distributed:
        # alexnet/vgg parallelize only the conv features (classifier stays
        # on one device); other archs wrap the whole model.
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    else:
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(model)

    # define loss function (criterion) and optimizer
    # criterion = nn.CrossEntropyLoss().cuda()
    criterion = FocalLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    title = 'CelebA-' + args.arch
    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            args.checkpoint = os.path.dirname(args.resume)
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title,
                            resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            # FIX: `logger` was left unbound on this path, causing a
            # NameError at `logger.append(...)` in the epoch loop.
            # Fall back to starting a fresh log, as in the non-resume case.
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title)
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
                'Valid Acc.'
            ])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
            'Valid Acc.'
        ])

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = CelebA(
        args.data, 'training.txt',
        transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    val_dataset = CelebA(
        args.data, 'validation.txt',
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    # NOTE(review): the "test" loader also reads validation.txt — looks
    # intentional (held-out evaluation on the validation split), but
    # confirm a real test split isn't expected here.
    test_dataset = CelebA(
        args.data, 'validation.txt',
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.train_batch,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.test_batch,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    if args.evaluate:
        validate(test_loader, model, criterion)
        return

    # visualization
    writer = SummaryWriter(os.path.join(args.checkpoint, 'logs'))

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        lr = adjust_learning_rate(optimizer, epoch)

        print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, lr))

        # train for one epoch
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch)

        # evaluate on validation set
        val_loss, prec1, _ = validate(val_loader, model, criterion)

        # append logger file
        logger.append([lr, train_loss, val_loss, train_acc, prec1])

        # tensorboardX
        writer.add_scalar('learning rate', lr, epoch + 1)
        writer.add_scalars('loss', {
            'train loss': train_loss,
            'validation loss': val_loss
        }, epoch + 1)
        writer.add_scalars('accuracy', {
            'train accuracy': train_acc,
            'validation accuracy': prec1
        }, epoch + 1)
        #for name, param in model.named_parameters():
        #    writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch + 1)

        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.checkpoint)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))
    writer.close()

    print('Best accuracy:')
    print(best_prec1)

    # Reload the best checkpoint for the final per-attribute accuracy report.
    checkpoint = torch.load(os.path.join(args.checkpoint,
                                         'model_best.pth.tar'))
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])

    print("Printing training set attribute accuracy")
    _, _, train_top1 = validate(train_loader, model, criterion)
    print(train_top1)

    print("Printing validation set attribute accuracy")
    _, _, val_top1 = validate(val_loader, model, criterion)
    print(val_top1)
Exemplo n.º 19
0
def train():
    """Train a face model on HyperECUST with per-epoch validation.

    Reads hyper-parameters from the module-level ``configer``, logs to
    console/file/TensorBoard, and saves the model each epoch (or only on
    validation-loss improvement when early stopping is enabled).
    """
    learning_rate  = configer.learningrate
    batch_size     = configer.batchsize
    n_epoch        = configer.n_epoch
    early_stopping = configer.earlystopping
    modelname      = configer.modelname
    logger         = init_logger()

    log_dir = os.path.join(configer.logspath, modelname)
    if not os.path.exists(log_dir): os.mkdir(log_dir)
    writer = SummaryWriter(log_dir)

    trainsets = HyperECUST(configer.splitmode, configer.facesize, 'train')
    trainloader = DataLoader(trainsets, batch_size, shuffle=True)
    validsets  = HyperECUST(configer.splitmode, configer.facesize, 'valid')
    validloader  = DataLoader(validsets, batch_size)

    model, modelpath = init_model()
    # writer.add_graph(model, input_to_model=torch.Tensor(batch_size, configer.getint('global', 'N_CHANNLES'),
    #             eval(configer.get('global', 'N_CHANNLES'))[0], eval(configer.get('global', 'N_CHANNLES'))[1]))
    print_log = 'load model: {}'.format(modelpath)
    print(print_log); logger.debug(print_log)

    loss = init_loss()
    optimizor = optim.Adam(model.parameters(), learning_rate,  betas=(0.9, 0.95), weight_decay=0.0005)
    scheduler = lr_scheduler.StepLR(optimizor, configer.stepsize, configer.gamma)

    acc_train_epoch = 0.; acc_valid_epoch = 0.
    loss_train_epoch = float('inf'); loss_valid_epoch = float('inf')
    acc_train_epoch_last = acc_train_epoch; acc_valid_epoch_last = acc_valid_epoch
    loss_train_epoch_last = loss_train_epoch; loss_valid_epoch_last = loss_valid_epoch

    for i_epoch in range(n_epoch):

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # NOTE(review): stepping the scheduler at epoch start (before any
        # optimizer.step) matches the old-PyTorch epoch-indexed API; newer
        # versions warn about this ordering — confirm the pinned version.
        scheduler.step(i_epoch)

        acc_train_epoch = []; acc_valid_epoch = []
        loss_train_epoch = []; loss_valid_epoch = []


        model.train()
        for i_batch, (X, y) in enumerate(trainloader):
            X = Variable(X.float())

            if torch.cuda.is_available():
                X = X.cuda()
                y = y.cuda()

            y_pred_prob = model(X)

            loss_train_batch = loss(y_pred_prob, y)
            optimizor.zero_grad()
            loss_train_batch.backward()
            optimizor.step()

            acc_train_batch  = accuracy(y_pred_prob, y, multi=False)
            print_log = 'training...    epoch [{:3d}]/[{:3d}] | batch [{:2d}]/[{:2d}] || accuracy: {:2.2%}, loss: {:4.4f}'.\
                        format(i_epoch+1, n_epoch, i_batch+1, len(trainsets)//batch_size, acc_train_batch, loss_train_batch)
            print(print_log); logger.debug(print_log)

            acc_train_epoch.append(acc_train_batch.cpu().numpy())
            loss_train_epoch.append(loss_train_batch.detach().cpu().numpy())

        acc_train_epoch = np.mean(np.array(acc_train_epoch))
        loss_train_epoch = np.mean(np.array(loss_train_epoch))


        model.eval()
        # FIX: the validation pass previously ran with autograd enabled,
        # building throwaway computation graphs and wasting GPU memory.
        # no_grad() disables gradient tracking; the printed/accumulated
        # values are unchanged.
        with torch.no_grad():
            for i_batch, (X, y) in enumerate(validloader):
                X = Variable(X.float())

                if torch.cuda.is_available():
                    X = X.cuda()
                    y = y.cuda()

                y_pred_prob = model(X)

                loss_valid_batch = loss(y_pred_prob, y)
                acc_valid_batch  = accuracy(y_pred_prob, y, multi=False)
                print_log = 'validating...  epoch [{:3d}]/[{:3d}] | batch [{:2d}]/[{:2d}] || accuracy: {:2.2%}, loss: {:4.4f}'.\
                            format(i_epoch+1, n_epoch, i_batch+1, len(validsets)//batch_size, acc_valid_batch, loss_valid_batch)
                print(print_log); logger.debug(print_log)

                acc_valid_epoch.append(acc_valid_batch.cpu().numpy())
                loss_valid_epoch.append(loss_valid_batch.detach().cpu().numpy())


        acc_valid_epoch = np.mean(np.array(acc_valid_epoch))
        loss_valid_epoch = np.mean(np.array(loss_valid_epoch))

        writer.add_scalars('accuracy', {'train': acc_train_epoch,  'valid': acc_valid_epoch},  i_epoch)
        writer.add_scalars('logloss',  {'train': loss_train_epoch, 'valid': loss_valid_epoch}, i_epoch)
        writer.add_scalar('lr', scheduler.get_lr()[-1], i_epoch)

        print_log = '--------------------------------------------------------------------'
        print(print_log); logger.debug(print_log)
        print_log = 'epoch [{:3d}]/[{:3d}] || training: accuracy: {:2.2%}, loss: {:4.4f} | validing: accuracy: {:2.2%}, loss: {:4.4f}'.\
                        format(i_epoch, n_epoch, acc_train_epoch, loss_train_epoch, acc_valid_epoch, loss_valid_epoch)
        print(print_log); logger.debug(print_log)


        if early_stopping:
            # Save only when validation loss improved over the last saved model.
            if loss_valid_epoch_last > loss_valid_epoch:
                torch.save(model, modelpath)
                acc_train_epoch_last = acc_train_epoch; acc_valid_epoch_last = acc_valid_epoch
                loss_train_epoch_last = loss_train_epoch; loss_valid_epoch_last = loss_valid_epoch
                print_log = 'model saved!'
                print(print_log); logger.debug(print_log)
        else:
            torch.save(model, modelpath)
            acc_train_epoch_last = acc_train_epoch; acc_valid_epoch_last = acc_valid_epoch
            loss_train_epoch_last = loss_train_epoch; loss_valid_epoch_last = loss_valid_epoch
            print_log = 'model saved!'
            print(print_log); logger.debug(print_log)


        print_log = '===================================================================='
        print(print_log); logger.debug(print_log)
Exemplo n.º 20
0
def main():
    """End-to-end training/evaluation driver for an occlusion-augmented
    person re-identification classifier.

    Builds datasets, model, criterion and optimizer from the parsed
    command-line config, optionally resumes from a checkpoint, then either
    runs test-only evaluation or trains for ``cfg.total_epochs`` epochs
    with periodic testing, TensorBoard logging and per-epoch checkpointing.
    All artifacts go under ``exp/<train_set>_train_<task>``.
    """
    cfg = parse_args()

    exp_dir = 'exp/{}_train_{}'.format(cfg.train_set, cfg.task)
    # Redirect logs to both console and file.
    ReDirectSTD(osp.join(exp_dir, 'stdout_{}.txt'.format(time_str())),
                'stdout', False)
    ReDirectSTD(osp.join(exp_dir, 'stderr_{}.txt'.format(time_str())),
                'stderr', False)
    ckpt_file = osp.join(exp_dir, 'ckpt.pth')
    model_weight_file = osp.join(exp_dir, 'model_weight.pth')
    writer = SummaryWriter(log_dir=osp.join(exp_dir, 'tensorboard'))

    # Dump the configurations to log.
    import pprint
    print('-' * 60)
    print('cfg.__dict__')
    pprint.pprint(cfg.__dict__)
    print('-' * 60)

    ###########
    # Dataset #
    ###########

    # ImageNet-style per-channel statistics used to normalize input images.
    im_mean = [0.486, 0.459, 0.408]
    im_std = [0.229, 0.224, 0.225]

    # kwargs shared by train and test sets.
    dataset_kwargs = dict(
        resize_h_w=cfg.resize_h_w,
        scale=True,
        im_mean=im_mean,
        im_std=im_std,
        batch_dims='NCHW',
        num_prefetch_threads=cfg.num_prefetch_threads,
        prefetch_size=cfg.prefetch_size,
    )

    train_set_kwargs = dict(
        name=cfg.train_set,
        part='trainval',
        batch_size=cfg.train_batch_size,
        final_batch=False,
        shuffle=True,
        crop_prob=cfg.crop_prob,
        crop_ratio=cfg.crop_ratio,
        mirror_type='random',
        prng=np.random,
    )

    test_set_kwargs = dict(
        part='test',
        batch_size=cfg.test_batch_size,
        final_batch=True,
        shuffle=False,
        mirror_type=None,
        prng=np.random,
    )

    train_set_kwargs.update(dataset_kwargs)
    train_set = create_dataset(**train_set_kwargs)

    # One test dataset per name in cfg.test_sets.
    test_set_kwargs.update(dataset_kwargs)
    test_sets = []
    for name in cfg.test_sets:
        test_set_kwargs['name'] = name
        test_sets.append(create_dataset(**test_set_kwargs))

    ###########
    # Models  #
    ###########

    TVT, TMO = set_devices(cfg.sys_device_ids)

    # You may find that dropout=0 is also OK under current lr settings.
    if cfg.dropout_rate is not None:
        dropout_rate = cfg.dropout_rate
    elif cfg.train_set == 'market1501':
        dropout_rate = 0.6
    else:
        dropout_rate = 0.5
    model = Model(
        last_conv_stride=cfg.last_conv_stride,
        max_or_avg=cfg.max_or_avg,
        dropout_rate=dropout_rate,
        num_classes=len(set(train_set.labels)),
    )
    # Model wrapper
    model_w = DataParallel(model)

    #############################
    # Criteria and Optimizers   #
    #############################

    criterion = torch.nn.CrossEntropyLoss()

    # To finetune from ImageNet weights
    finetuned_params = list(model.base.parameters())
    # To train from scratch
    new_params = [
        p for n, p in model.named_parameters() if not n.startswith('base.')
    ]
    # Two learning rates: a small one for the ImageNet backbone, a larger
    # one for the newly-initialized layers.
    param_groups = [{
        'params': finetuned_params,
        'lr': cfg.finetuned_params_lr
    }, {
        'params': new_params,
        'lr': cfg.new_params_lr
    }]
    optimizer = optim.SGD(
        param_groups,
        momentum=0.9,
        weight_decay=5e-4,
    )

    # Bind them together just to save some codes in the following usage.
    modules_optims = [model, optimizer]

    ################################
    # May Resume Models and Optims #
    ################################

    # NOTE(review): `scores` is never used; `resume_ep` is only defined when
    # cfg.resume is set, and its only read below is guarded by cfg.resume.
    if cfg.resume:
        resume_ep, scores = load_ckpt(modules_optims, ckpt_file)

    # May Transfer Models and Optims to Specified Device. Transferring optimizer
    # is to cope with the case when you load the checkpoint to a new device.
    TMO(modules_optims)

    ########
    # Test #
    ########

    def extract_feat(ims):
        """Feature-extraction hook handed to the test sets: numpy batch in,
        numpy feature batch out (model switched to eval mode)."""
        model.eval()
        ims = Variable(TVT(torch.from_numpy(ims).float()))
        feat, logits = model_w(ims)
        feat = feat.data.cpu().numpy()
        return feat

    def test(load_model_weight=False):
        """Evaluate on every configured test set, optionally reloading
        weights from model_weight_file (or the checkpoint) first."""
        if load_model_weight:
            # NOTE(review): model_weight_file is built with osp.join above, so
            # it is never '' here and the load_ckpt fallback is unreachable.
            if model_weight_file != '':
                sd = torch.load(model_weight_file,
                                map_location=(lambda storage, loc: storage))
                load_state_dict(model, sd)
                print('Loaded model weights from {}'.format(model_weight_file))
            else:
                load_ckpt(modules_optims, ckpt_file)

        for test_set, name in zip(test_sets, cfg.test_sets):
            if test_set.extract_feat_func is None:
                test_set.set_feat_func(extract_feat)
            print('\n=========> Test on dataset: {} <=========\n'.format(name))
            test_set.eval(
                normalize_feat=True,
                to_re_rank=False,
                verbose=False,
            )

    if cfg.only_test:
        test(load_model_weight=True)
        return

    ############
    # Training #
    ############

    # For the occlusion tasks, load the precomputed occlusion-sensitivity
    # maps (prob_diff) and candidate masks produced by the sw_occlusion run.
    prob_diff, all_masks = None, None
    if cfg.task in ['No-Adversary', 'Hard-1', 'Sampling']:
        prob_diff = load_pickle('exp/{}_sw_occlusion/prob_diff.pkl'.format(
            cfg.train_set))
        prob_diff = blur_prob_diff(prob_diff)
        all_masks = load_pickle('exp/{}_sw_occlusion/all_masks.pkl'.format(
            cfg.train_set))

    start_ep = resume_ep if cfg.resume else 0
    for ep in range(start_ep, cfg.total_epochs):

        # Adjust Learning Rate
        adjust_lr_staircase(
            optimizer.param_groups,
            [cfg.finetuned_params_lr, cfg.new_params_lr],
            ep + 1,
            cfg.staircase_decay_at_epochs,
            cfg.staircase_decay_multiply_factor,
        )

        model.train()

        # For recording loss
        loss_meter = AverageMeter(name='cls loss')

        ep_st = time.time()
        step = 0
        epoch_done = False
        while not epoch_done:

            step += 1
            step_st = time.time()

            ims, im_names, labels, mirrored, epoch_done = train_set.next_batch(
            )

            # Occlude images before feeding to network
            if cfg.task != 'Baseline':
                masks = get_masks(im_names,
                                  mirrored,
                                  cfg,
                                  all_masks=all_masks,
                                  prob_diff=prob_diff)
                # Broadcast the (N, H, W) masks over the channel dimension.
                ims = ims * np.expand_dims(masks, 1)

            ims_var = Variable(TVT(torch.from_numpy(ims).float()))
            labels_var = Variable(TVT(torch.from_numpy(labels).long()))

            _, logits = model_w(ims_var)
            loss = criterion(logits, labels_var)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_meter.update(to_scalar(loss))

            if step % cfg.steps_per_log == 0:
                time_log = '\tStep {}/Ep {}, {:.2f}s'.format(
                    step,
                    ep + 1,
                    time.time() - step_st,
                )
                loss_log = loss_meter.val_str
                log = join([time_log, loss_log], ', ')
                print(log)

        #############
        # Epoch Log #
        #############

        time_log = 'Ep {}, {:.2f}s'.format(
            ep + 1,
            time.time() - ep_st,
        )
        loss_log = loss_meter.avg_str
        log = join([time_log, loss_log], ', ')
        print(log)

        writer.add_scalars(
            loss_meter.name,
            {loss_meter.name: loss_meter.avg},
            ep,
        )

        ########
        # Test #
        ########

        # Evaluate every epochs_per_val epochs and always at the last epoch.
        if ((ep + 1) % cfg.epochs_per_val == 0) or ((ep + 1)
                                                    == cfg.total_epochs):
            test(load_model_weight=False)

        #############
        # Save CKPT #
        #############

        save_ckpt(modules_optims, ep + 1, 0, ckpt_file)
Exemplo n.º 21
0
def trainAgent(net):
    """Train a DQN agent to play Pong with experience replay.

    The agent follows an epsilon-greedy policy whose epsilon anneals
    linearly from INITIAL_EPSILON to FINAL_EPSILON over EXPLORE steps.
    Frames are resized to 84x84, binarized, and stacked four deep to form
    the network input. Weights are checkpointed to ./params.pkl every
    10000 steps and training stops after 1M steps.

    Args:
        net: network class (not instance); instantiated and moved to CUDA
            here. Its forward must map a stacked frame tensor to ACTIONS
            Q-values.
    """
    # initialize our game
    game = pong.PongGame()

    # experience-replay buffer; the oldest transitions fall off the left
    # end once REPLAY_MEMORY entries are stored
    D = deque(maxlen=REPLAY_MEMORY)

    # initial frame: grayscale, 84x84, binarized to black/white
    frame = game.getPresentFrame()
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    # stack 4 identical frames to bootstrap the input tensor
    inp_t = np.stack((frame, frame, frame, frame), axis=2)

    Network = net().cuda()

    optimizer = optim.Adam(Network.parameters(), lr=1e-5)
    criterion = nn.MSELoss()

    writer = SummaryWriter(log_dir='./logs')

    if os.path.exists('./params.pkl'):
        print('Restore from exists model')
        Network.load_state_dict(torch.load('./params.pkl'))
        # TODO: persist steps with the weights so epsilon resumes correctly
        steps = 0
    else:
        steps = 0

    # resume epsilon on the linear annealing schedule, clipped at FINAL_EPSILON
    expected_epsilon = INITIAL_EPSILON - steps * (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
    if expected_epsilon > FINAL_EPSILON:
        epsilon = expected_epsilon
    else:
        epsilon = FINAL_EPSILON
    total_observe = steps + ADDITIONAL_OB

    # training time
    while True:
        out_t = Network(to_tensor(inp_t))
        # one-hot action vector
        argmax_t = np.zeros([ACTIONS])

        # epsilon-greedy action selection
        if random.random() <= epsilon:
            maxIndex = random.randrange(ACTIONS)
        else:
            _, maxIndex = torch.max(out_t, 1)
            maxIndex = maxIndex.cpu().numpy()[0]
        argmax_t[maxIndex] = 1

        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # reward tensor if score is positive
        reward_t, frame, hit_rate, hit_rate_100 = game.getNextFrame(argmax_t)
        # preprocess the new frame exactly like the initial one
        frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
        _, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
        frame = np.reshape(frame, (84, 84, 1))
        # new input tensor: newest frame plus the three most recent old ones
        inp_t1 = np.append(frame, inp_t[:, :, 0:3], axis=2)

        # store the transition (s, a, r, s') in the replay memory
        D.append((inp_t, argmax_t, reward_t, inp_t1))

        # training iteration (only after the observation phase)
        if steps > total_observe:

            # sample a minibatch from our replay memory
            minibatch = random.sample(D, BATCH)

            inp_batch = [d[0] for d in minibatch]
            argmax_batch = [d[1] for d in minibatch]
            reward_batch = [d[2] for d in minibatch]
            inp_t1_batch = [d[3] for d in minibatch]

            out_prev_batch = Network(to_tensor(inp_batch))
            out_batch = Network(to_tensor(inp_t1_batch))

            # Q(s, a) for the actions actually taken (one-hot mask + sum)
            action = torch.sum(out_prev_batch.mul(torch.FloatTensor(argmax_batch).cuda()), dim=1)
            # TD target: r + gamma * max_a' Q(s', a'), held fixed (no grad)
            gt_batch = torch.FloatTensor(reward_batch).cuda() + GAMMA * out_batch.max(1)[0]
            gt_batch = torch.autograd.Variable(gt_batch, requires_grad=False)

            loss = criterion(action, gt_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # update our input tensor to the next frame
        inp_t = inp_t1
        steps += 1

        # record the agent's performance every 100 steps
        # (empty main tag kept as-is for TensorBoard log continuity)
        if steps % 100 == 0:
            writer.add_scalars('', {'hit_rate': hit_rate,
                                    'hit_rate_100': hit_rate_100}, steps)

        # checkpoint weights every 10000 steps
        if steps % 10000 == 0:
            torch.save(Network.state_dict(), './params.pkl')

        print("TIMESTEP", steps, "/ EPSILON %7.5f" % epsilon, "/ ACTION", maxIndex,
              "/ REWARD", reward_t, "/ Q_MAX %e" % torch.max(out_t))

        # stop training after 1M steps
        if steps > 1000000:
            break
Exemplo n.º 22
0
def pruning_DDPG(model,test_loader,criterion,pruning_rate,num_episode,warmup,lbound=0,rbound=0.8,output='./',nb_states=8,nb_actions=1,hidden1=300,hidden2=300,lr_a=1e-4,lr_c=1e-4,device='cuda'):
	"""Search per-layer pruning ratios for `model` with a DDPG agent.

	Runs `num_episode` episodes of a pruning environment: the first `warmup`
	episodes take random actions to fill the replay buffer, after which the
	agent's policy selects actions and is updated at each episode end (every
	stored transition is credited with the episode's final reward). Returns
	a new model pruned with the best action list found by the environment.

	NOTE(review): `lbound` is accepted but never used — the per-layer lower
	bounds come from get_left_bound instead; confirm that is intended.
	"""
	writer = SummaryWriter("./DDPG")
	# per-layer action bounds: lower bounds derived from the model, upper fixed at rbound
	left_bound = get_left_bound(model,pruning_rate/3.0)
	right_bound = []
	for i in range(len(left_bound)):
		left_bound[i] = float(left_bound[i].cpu().numpy())
		right_bound.append(rbound)
	print("left_bound:",left_bound)
	print("right_bound:",right_bound)
	agent = DDPG(warmup,nb_states, nb_actions,left_bound,right_bound,hidden1,hidden2,lr_a,lr_c)
	agent.is_training = True
	env = Pruning_Env(model,test_loader,criterion,pruning_rate,left_bound,right_bound,device)
	step = episode = episode_steps = 0
	episode_reward = 0.
	observation = None
	T = []  # trajectory
	while episode < num_episode:  # counting based on episode
		# reset if it is the start of episode
		if observation is None:
			observation = deepcopy(env.reset())
			agent.reset(observation)
		# agent pick action ...
		if episode <= warmup:
			action = agent.random_action()
			# action = sample_from_truncated_normal_distribution(lower=0., upper=1., mu=env.pruning_rate, sigma=0.5)
		else:
			action = agent.select_action(observation, episode=episode)
		# env response with next_observation, reward, terminate_info

		observation2, reward, done, info,action = env.step(action)
		if (episode>warmup):
			writer.add_scalars('Acc_DDPG',{'Acc':100+reward},episode-warmup)
		observation2 = deepcopy(observation2)
		#print(observation)
		T.append([reward, deepcopy(observation), deepcopy(observation2), action, done])
		# fix-length, never reach here
		# if max_episode_length and episode_steps >= max_episode_length - 1:
		#	 done = True
		# [optional] save intermediate model
		# NOTE(review): int(num_episode / 3) is 0 for num_episode < 3 (ZeroDivisionError),
		# and the condition holds for every step of a qualifying episode, so the model
		# is re-saved once per step there.
		if episode % int(num_episode / 3) == 0:
			agent.save_model(output)
		# update
		step += 1
		episode_steps += 1
		episode_reward += reward
		observation = deepcopy(observation2)
		if done:  # end of episode
			print('#{}: episode_reward:{:.4f} acc: {:.4f}, ratio: {:.4f}'.format(episode,episode_reward,info['accuracy'],info['pruning_ratio']))
			# every transition of this episode is credited with the final reward
			final_reward = T[-1][0]
			#if final_reward > best_reward - 5.0:
			# print('final_reward: {}'.format(final_reward))
			# agent observe and update policy
			for i in range(len(T)):
				r_t, s_t, s_t1, a_t, done = T[i]
				agent.observe(final_reward, s_t, s_t1, a_t, done)
				if episode > warmup:
					agent.update_policy()
			#agent.memory.append(
			#	observation,
			#	agent.select_action(observation, episode=episode),
			#	0., False
			#)
			# reset
			observation = None
			episode_steps = 0
			episode_reward = 0.
			episode += 1
			T = []
	print("best_action_list:")
	print(env.best_action_list)
	model_new = pruning_by_action_list(model,env.best_action_list)
	return model_new
Exemplo n.º 23
0
                            lr=args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay,
                            nesterov=args.nesterov)

writer = SummaryWriter(log_dir=args.logdir)

for epoch in range(5):
    # train for one epoch
    train_loss = train(train_data, model, optimizer, epoch)

    # evaluate on validation set
    val_loss = validate(valid_data, model, epoch)

    writer.add_scalars('data/scalar_group', {
        'train loss': train_loss,
        'val loss': val_loss
    }, epoch)
# Release all weights
for param in model.module.parameters():
    param.requires_grad = True

trainable_vars = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(trainable_vars,
                            lr=args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay,
                            nesterov=args.nesterov)

lr_scheduler = ReduceLROnPlateau(optimizer,
                                 mode='min',
                                 factor=0.8,
Exemplo n.º 24
0
class Solver():
    """Offline evaluation of UNet monocular-depth checkpoints on KITTI.

    Iterates over checkpoints saved every 1000 iterations, runs inference
    on the KITTI test split, computes standard depth metrics (abs_rel,
    sq_rel, rmse, rmse_log, delta accuracies) and logs each metric to
    TensorBoard under the experiment name.
    """

    def __init__(self, exp):
        """Build the network, transforms, and validation dataloader for
        experiment `exp` ('UNet_Baseline_NEW' or 'UNet_Baseline_bicubic_NEW')."""
        self.root_dir = '/vulcan/scratch/koutilya/projects/Domain_Adaptation/Common_Domain_Adaptation-Lighting/UNet_Baseline'

        # Seed every RNG for reproducible evaluation
        self.seed = 1729
        random.seed(self.seed)
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        torch.cuda.manual_seed_all(self.seed)

        # Initialize networks (3-channel RGB in, 1-channel depth out)
        self.netT = all_networks.define_G(3, 1, 64, 4, 'batch', 'PReLU',
                                          'UNet', 'kaiming', 0, False, [0],
                                          0.1)
        self.netT.cuda()

        # Initialize Loss
        self.netT_loss_fn = nn.L1Loss()

        self.netT_loss_fn = self.netT_loss_fn.cuda()

        # Training Configuration details
        self.batch_size = 16
        joint_transform_list = [
            RandomImgAugment(no_flip=True,
                             no_rotation=True,
                             no_augment=True,
                             size=(192, 640))
        ]
        img_transform_list = [
            tr.ToTensor(),
            tr.Normalize([.5, .5, .5], [.5, .5, .5])
        ]

        self.joint_transform = tr.Compose(joint_transform_list)

        self.img_transform = tr.Compose(img_transform_list)

        self.depth_transform = tr.Compose([DepthToTensor()])

        # Checkpoint filename suffix depends on which experiment is evaluated
        self.exp = exp
        if self.exp == 'UNet_Baseline_NEW':
            self.model_string = ''
        elif self.exp == 'UNet_Baseline_bicubic_NEW':
            self.model_string = '_bicubic'

        self.writer = SummaryWriter(
            os.path.join(self.root_dir,
                         '../tensorboard_logs/Vkitti-kitti/test/' + self.exp))

        # Initialize Data
        self.get_validation_data()

        # Evaluation crop: Garg (ECCV16) by default; see Validation()
        self.garg_crop = True
        self.eigen_crop = False
        self.kitti = KITTI()

    def compute_errors(self, ground_truth, predication):
        """Return (abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3) depth metrics.

        `predication` (sic — name kept for caller compatibility) is the
        predicted depth; both arguments are numpy arrays of matching shape
        holding strictly positive depths (log/ratio terms are undefined
        otherwise).
        """
        # accuracy: fraction of pixels whose ratio to GT is within 1.25**k
        threshold = np.maximum((ground_truth / predication),
                               (predication / ground_truth))
        a1 = (threshold < 1.25).mean()
        a2 = (threshold < 1.25**2).mean()
        a3 = (threshold < 1.25**3).mean()

        # RMSE
        rmse = (ground_truth - predication)**2
        rmse = np.sqrt(rmse.mean())

        # RMSE(log)
        rmse_log = (np.log(ground_truth) - np.log(predication))**2
        rmse_log = np.sqrt(rmse_log.mean())

        # Abs Relative difference
        abs_rel = np.mean(np.abs(ground_truth - predication) / ground_truth)

        # Squared Relative difference
        sq_rel = np.mean(((ground_truth - predication)**2) / ground_truth)

        return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3

    def get_validation_data(self):
        """Create the KITTI test-split dataset and its (unshuffled) loader."""
        self.real_val_dataset = real_dataset(
            data_file='test.txt',
            phase='test',
            img_transform=self.img_transform,
            joint_transform=self.joint_transform,
            depth_transform=self.depth_transform)
        self.real_val_dataloader = DataLoader(self.real_val_dataset,
                                              shuffle=False,
                                              batch_size=self.batch_size,
                                              num_workers=4)

    def load_prev_model(self):
        """Load the checkpoint for the current self.iteration into netT.

        Returns True when a matching checkpoint file was found and loaded.
        """
        saved_models = glob.glob(
            os.path.join(
                self.root_dir, 'saved_models_all_iters', 'UNet_baseline-' +
                str(self.iteration) + self.model_string + '.pth.tar'))
        if len(saved_models) > 0:
            model_state = torch.load(saved_models[0])
            self.netT.load_state_dict(model_state['netT_state_dict'])
            # self.iteration = model_state['iteration']
            return True
        return False

    def tensor2im(self, depth):
        """Convert a [-1, 1]-normalized NCHW depth tensor to a NHWC numpy
        array of metric depths in [0, 80] meters."""
        depth_numpy = depth.cpu().data.float().numpy().transpose(0, 2, 3, 1)
        depth_numpy = (depth_numpy + 1.0) / 2.0  # Unnormalize between 0 and 1
        return depth_numpy * 80.0

    def get_depth_manually(self, depth_file):
        """Load the Velodyne ground-truth depth map matching `depth_file`,
        scaled to float32 by 255."""
        root_dir = '/vulcan/scratch/koutilya/kitti/Depth_from_velodyne/'
        depth_split = depth_file.split('/')
        main_file = osp.join(root_dir, 'test', depth_split[0], depth_split[1],
                             depth_split[-1].split('.')[0] + '.png')

        depth = Image.open(main_file)
        depth = np.array(depth, dtype=np.float32) / 255.0
        return depth

    def Validate(self):
        """Evaluate every available checkpoint (iterations 999, 1999, ...).

        NOTE(review): Validation() closes self.writer at its end, so
        TensorBoard logging after the first checkpoint may be lost — verify
        against the tensorboardX version in use.
        """
        self.netT.eval()
        saved_models_list = glob.glob(
            os.path.join(self.root_dir, 'saved_models_all_iters',
                         'UNet_baseline-*999' + self.model_string +
                         '.pth.tar'))
        for self.iteration in range(999, 1000 * len(saved_models_list), 1000):
            self.load_prev_model()
            self.Validation()

    def Validation(self):
        """Run inference over the test split and log per-checkpoint metrics.

        Predictions are resized to each ground-truth resolution, clamped to
        [1, 50] m, and evaluated only inside the Garg/Eigen crop on pixels
        with valid ground truth in (1, 50) m.
        """
        num_samples = len(self.real_val_dataset)
        abs_rel = np.zeros(num_samples, np.float32)
        sq_rel = np.zeros(num_samples, np.float32)
        rmse = np.zeros(num_samples, np.float32)
        rmse_log = np.zeros(num_samples, np.float32)
        a1 = np.zeros(num_samples, np.float32)
        a2 = np.zeros(num_samples, np.float32)
        a3 = np.zeros(num_samples, np.float32)

        with torch.no_grad():
            for i, (data, depth_filenames) in tqdm(
                    enumerate(self.real_val_dataloader)):
                self.real_val_image = data[
                    'left_img']  #, data['depth'] # self.real_depth is a numpy array
                self.real_val_image = Variable(self.real_val_image.cuda())

                # Network returns multi-scale outputs; keep the final scale.
                depth = self.netT(self.real_val_image)
                depth = depth[-1]
                depth_numpy = self.tensor2im(depth)  # 0-80m
                for t_id in range(depth_numpy.shape[0]):
                    t_id_global = (i * self.batch_size) + t_id
                    h, w = self.real_val_image.shape[
                        2], self.real_val_image.shape[3]
                    datafiles1 = self.real_val_dataset.files[t_id_global]
                    ground_depth = self.get_depth_manually(datafiles1['depth'])
                    height, width = ground_depth.shape

                    predicted_depth = cv2.resize(
                        depth_numpy[t_id], (width, height),
                        interpolation=cv2.INTER_LINEAR)
                    # Clamp predictions to the evaluated depth range [1, 50] m
                    predicted_depth[predicted_depth < 1.0] = 1.0
                    predicted_depth[predicted_depth > 50.0] = 50.0

                    mask = np.logical_and(ground_depth > 1.0,
                                          ground_depth < 50.0)

                    # crop used by Garg ECCV16
                    if self.garg_crop:
                        self.crop = np.array([
                            0.40810811 * height, 0.99189189 * height,
                            0.03594771 * width, 0.96405229 * width
                        ]).astype(np.int32)

                    # crop we found by trail and error to reproduce Eigen NIPS14 results
                    elif self.eigen_crop:
                        self.crop = np.array([
                            0.3324324 * height, 0.91351351 * height,
                            0.0359477 * width, 0.96405229 * width
                        ]).astype(np.int32)

                    crop_mask = np.zeros(mask.shape)
                    crop_mask[self.crop[0]:self.crop[1],
                              self.crop[2]:self.crop[3]] = 1
                    mask = np.logical_and(mask, crop_mask)

                    abs_rel[t_id_global], sq_rel[t_id_global], rmse[
                        t_id_global], rmse_log[t_id_global], a1[
                            t_id_global], a2[t_id_global], a3[
                                t_id_global] = self.compute_errors(
                                    ground_depth[mask], predicted_depth[mask])

            print('{:>10},{:>10},{:>10},{:>10},{:>10},{:>10},{:>10}'.format(
                'abs_rel', 'sq_rel', 'rmse', 'rmse_log', 'a1', 'a2', 'a3'))
            print(
                '{:10.4f},{:10.4f},{:10.4f},{:10.4f},{:10.4f},{:10.4f},{:10.4f}'
                .format(abs_rel.mean(), sq_rel.mean(), rmse.mean(),
                        rmse_log.mean(), a1.mean(), a2.mean(), a3.mean()))

            # Tag typo fixed: 'Kitti_Validatoin_metrics' -> 'Kitti_Validation_metrics'
            self.writer.add_scalars('Kitti_Validation_metrics/Abs_Rel',
                                    {self.exp: abs_rel.mean()}, self.iteration)
            self.writer.add_scalars('Kitti_Validation_metrics/Sq_Rel',
                                    {self.exp: sq_rel.mean()}, self.iteration)
            self.writer.add_scalars('Kitti_Validation_metrics/RMSE',
                                    {self.exp: rmse.mean()}, self.iteration)
            self.writer.add_scalars('Kitti_Validation_metrics/RMSE_log',
                                    {self.exp: rmse_log.mean()},
                                    self.iteration)
            self.writer.add_scalars('Kitti_Validation_metrics/del<1.25',
                                    {self.exp: a1.mean()}, self.iteration)
            self.writer.add_scalars('Kitti_Validation_metrics/del<1.25^2',
                                    {self.exp: a2.mean()}, self.iteration)
            self.writer.add_scalars('Kitti_Validation_metrics/del<1.25^3',
                                    {self.exp: a3.mean()}, self.iteration)

        self.writer.close()
Exemplo n.º 25
0
class SemanticSeg(object):
    '''
    Control the training, evaluation, and inference process.
    Args:
    - net_name: string
    - lr: float, learning rate.
    - n_epoch: integer, the epoch number
    - channels: integer, the channel number of the input
    - num_classes: integer, the number of class
    - input_shape: tuple of integer, input dim
    - crop: integer, cropping size
    - batch_size: integer
    - num_workers: integer, how many subprocesses to use for data loading.
    - device: string, use the specified device
    - pre_trained: True or False, default False
    - weight_path: weight path of pre-trained model
    - mode: string __all__ = ['cls','seg','cls_and_seg','cls_or_seg']
    '''
    def __init__(self,
                 net_name=None,
                 lr=1e-3,
                 n_epoch=1,
                 channels=1,
                 num_classes=2,
                 roi_number=1,
                 scale=None,
                 seq_len=3,
                 input_shape=None,
                 crop=0,
                 batch_size=6,
                 num_workers=0,
                 device=None,
                 pre_trained=False,
                 ckpt_point=True,
                 weight_path=None,
                 weight_decay=0.,
                 momentum=0.95,
                 gamma=0.1,
                 milestones=[40, 80],
                 T_max=5,
                 mode='cls',
                 topk=10,
                 freeze=None):
        """Store the training configuration, build the network by name and
        optionally restore pre-trained weights (see the class docstring for
        parameter meanings)."""
        super(SemanticSeg, self).__init__()

        self.net_name = net_name
        self.lr = lr
        self.n_epoch = n_epoch
        self.channels = channels
        self.num_classes = num_classes
        self.roi_number = roi_number
        self.scale = scale
        self.seq_len = seq_len
        self.input_shape = input_shape
        self.crop = crop
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device
        self.net = self._get_net(self.net_name)
        self.pre_trained = pre_trained
        self.ckpt_point = ckpt_point
        self.weight_path = weight_path

        # Bookkeeping for resuming; loss_threshold is the best val loss so
        # far — checkpoints are only written when val loss drops below it.
        self.start_epoch = 0
        self.global_step = 0
        self.loss_threshold = 2.0

        # Optimizer / LR-scheduler hyperparameters
        self.weight_decay = weight_decay
        self.momentum = momentum
        self.gamma = gamma
        self.milestones = milestones
        self.T_max = T_max

        self.mode = mode
        self.topk = topk
        self.freeze = freeze

        # NOTE(review): CUDA_VISIBLE_DEVICES is set *after* _get_net above —
        # confirm the network is not placed on a GPU before this point.
        os.environ['CUDA_VISIBLE_DEVICES'] = self.device

        if self.pre_trained:
            self._get_pre_trained(self.weight_path,ckpt_point)


        # Binary segmentation is assumed whenever a specific ROI is selected.
        if self.roi_number is not None:
            assert self.num_classes == 2, "num_classes must be set to 2 for binary segmentation"

    def trainer(self,
                train_path,
                val_path,
                cur_fold,
                output_dir=None,
                log_dir=None,
                optimizer='Adam',
                loss_fun='Cross_Entropy',
                class_weight=None,
                lr_scheduler=None):

        torch.manual_seed(1000)
        np.random.seed(1000)
        torch.cuda.manual_seed_all(1000)
        print('Device:{}'.format(self.device))
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True


        output_dir = os.path.join(output_dir, "fold" + str(cur_fold))
        log_dir = os.path.join(log_dir, "fold" + str(cur_fold))

        if os.path.exists(log_dir):
            if not self.pre_trained:
                shutil.rmtree(log_dir)
                os.makedirs(log_dir)
        else:
            os.makedirs(log_dir)

        if os.path.exists(output_dir):
            if not self.pre_trained:
                shutil.rmtree(output_dir)
                os.makedirs(output_dir)
        else:
            os.makedirs(output_dir)
        self.step_pre_epoch = len(train_path) // self.batch_size
        self.writer = SummaryWriter(log_dir)
        self.global_step = self.start_epoch * math.ceil(
            len(train_path[0]) / self.batch_size)

        net = self.net

        # only for deeplab
        if self.freeze is not None and 'deeplab' in self.net_name:
            if self.freeze == 'backbone':
                net.freeze_backbone()
            elif self.freeze == 'classifier':
                net.freeze_classifier()

        lr = self.lr
        loss = self._get_loss(loss_fun, class_weight)

        if len(self.device.split(',')) > 1:
            net = DataParallel(net)

        # dataloader setting
        if self.mode == 'cls':
            train_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
                RandomEraseHalf(scale_flag=False),
                RandomDistortHalf(),
                RandomTranslationRotationZoomHalf(num_class=self.num_classes),
                RandomFlipHalf(mode='hv'),
                RandomAdjustHalf(),
                To_Tensor(num_class=self.num_classes)
            ])
        else:
            train_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
                RandomEraseHalf(scale_flag=False),
                RandomDistortHalf(),
                RandomTranslationRotationZoomHalf(num_class=self.num_classes),
                # RandomFlipHalf(mode='hv'),
                # RandomAdjustHalf(),
                RandomNoiseHalf(),
                To_Tensor(num_class=self.num_classes)
            ])
        train_dataset = DataGenerator(train_path,
                                      roi_number=self.roi_number,
                                      num_class=self.num_classes,
                                      transform=train_transformer,
                                      seq_len=self.seq_len)

        train_loader = DataLoader(train_dataset,
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  num_workers=self.num_workers,
                                  pin_memory=True)

        # copy to gpu
        net = net.cuda()
        loss = loss.cuda()

        # optimizer setting
        optimizer = self._get_optimizer(optimizer, net, lr)
        if self.pre_trained and self.ckpt_point:
            checkpoint = torch.load(self.weight_path)
            optimizer.load_state_dict(checkpoint['optimizer'])

        if lr_scheduler is not None:
            lr_scheduler = self._get_lr_scheduler(lr_scheduler, optimizer)

        # loss_threshold = 1.0
        early_stopping = EarlyStopping(patience=20,verbose=True,monitor='val_loss',op_type='min')

        for epoch in range(self.start_epoch, self.n_epoch):
            train_loss, train_dice, train_acc = self._train_on_epoch(epoch, net, loss, optimizer, train_loader)

            val_loss, val_dice, val_acc = self._val_on_epoch(epoch, net, loss, val_path)

            if lr_scheduler is not None:
                lr_scheduler.step(val_loss)

            torch.cuda.empty_cache()
            print('epoch:{},train_loss:{:.5f},val_loss:{:.5f}'.format(epoch, train_loss, val_loss))

            print('epoch:{},train_dice:{:.5f},val_dice:{:.5f}'.format(epoch, train_dice, val_dice))

            self.writer.add_scalars('data/loss', {
                'train': train_loss,
                'val': val_loss
            }, epoch)
            self.writer.add_scalars('data/dice', {
                'train': train_dice,
                'val': val_dice
            }, epoch)
            self.writer.add_scalars('data/acc', {
                'train': train_acc,
                'val': val_acc
            }, epoch)
            self.writer.add_scalar('data/lr', optimizer.param_groups[0]['lr'],epoch)

            early_stopping(val_loss)
            #save
            if val_loss <= self.loss_threshold:
                self.loss_threshold = val_loss

                if len(self.device.split(',')) > 1:
                    state_dict = net.module.state_dict()
                else:
                    state_dict = net.state_dict()

                saver = {
                    'epoch': epoch,
                    'save_dir': output_dir,
                    'state_dict': state_dict,
                    'optimizer': optimizer.state_dict()
                }

                file_name = 'epoch:{}-train_loss:{:.5f}-train_dice:{:.5f}-train_acc:{:.5f}-val_loss:{:.5f}-val_dice:{:.5f}-val_acc:{:.5f}.pth'.format(
                    epoch, train_loss, train_dice, train_acc, val_loss,
                    val_dice, val_acc)
                
                save_path = os.path.join(output_dir, file_name)
                print("Save as %s" % file_name)

                torch.save(saver, save_path)
            
            if early_stopping.early_stop:
                print('Early Stopping!')
                break

        self.writer.close()

    def _train_on_epoch(self, epoch, net, criterion, optimizer, train_loader):
        """Run one training epoch over ``train_loader``.

        Args:
            epoch: current epoch index (used for console logging only).
            net: model returning a ``(seg_logits, cls_logits)`` tuple —
                seg_logits (N, seq_len, num_class, H, W),
                cls_logits (N, seq_len, num_class-1).
            criterion: loss module; expected inputs depend on ``self.mode``
                ('cls' / 'seg' / joint).
            optimizer: optimizer updating ``net``'s parameters.
            train_loader: yields dicts with 'image', 'mask' and 'label'.

        Returns:
            Tuple ``(train_loss.avg, train_dice.avg, train_acc.avg)``.
        """
        net.train()

        train_loss = AverageMeter()
        train_dice = AverageMeter()
        train_acc = AverageMeter()

        from metrics import RunningDice
        run_dice = RunningDice(labels=[0,1],ignore_label=-1)
        for step, sample in enumerate(train_loader):

            data = sample['image']  # img:(N,cin,seq_len, H, W),
            target = sample['mask'] # mask:(N,num_class,seq_len, H, W),
            label = sample['label'] # label (N,seq_len,num_class-1)

            data = data.cuda()
            target = target.cuda()
            label = label.cuda()

            output = net(data) #(N,seq_len,num_class,H,W) (N,seq_len,num_class-1)
            # Per-slice loss averaged over the sequence dimension.
            loss = 0.
            if self.mode == 'cls':
                for i in range(data.size(2)):
                    loss += criterion(output[1][:,i], label[:,i])
            elif self.mode == 'seg':
                for i in range(data.size(2)):
                    loss += criterion(output[0][:,i], target[:,:,i])
            else:
                # joint mode: criterion consumes [seg, cls] predictions/targets
                for i in range(data.size(2)):
                    loss += criterion([output[0][:,i],output[1][:,i]],[target[:,:,i],label[:,i]])

            loss /= data.size(2)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss = loss.float()
            train_loss.update(loss.item(), data.size(0))

            total_dice = 0.
            total_acc = 0.
            for i in range(data.size(2)):
                cls_output = output[1][:,i] #(N,num_class-1)
                # torch.sigmoid replaces the long-deprecated F.sigmoid
                cls_output = torch.sigmoid(cls_output).float()

                seg_output = output[0][:,i].float() #(N,num_class,H,W)
                seg_output = F.softmax(seg_output, dim=1)

                # measure acc
                acc = accuracy(cls_output.detach(), label[:,i])
                train_acc.update(acc.item(), data.size(0))
                total_acc += acc.item()

                # measure dice and record loss
                dice = compute_dice(seg_output.detach(), target[:,:,i])
                train_dice.update(dice.item(), data.size(0))
                total_dice += dice.item()

                # accumulate the confusion matrix for the running dice
                seg_output = torch.argmax(seg_output,1).detach().cpu().numpy()  #N*H*W
                tmp_target = torch.argmax(target[:,:,i],1).detach().cpu().numpy()
                run_dice.update_matrix(tmp_target,seg_output)

            torch.cuda.empty_cache()

            if self.global_step % 10 == 0:
                if self.mode == 'cls':
                    print('epoch:{},step:{},train_loss:{:.5f},train_acc:{:.5f},lr:{}'.format(epoch, step, loss.item(), total_acc/data.size(2), optimizer.param_groups[0]['lr']))
                elif self.mode == 'seg':
                    rundice, dice_list = run_dice.compute_dice() 
                    print("Category Dice: ", dice_list)
                    print('epoch:{},step:{},train_loss:{:.5f},train_dice:{:.5f},run_dice:{:.5f},lr:{}'.format(epoch, step, loss.item(), total_dice/data.size(2), rundice, optimizer.param_groups[0]['lr']))
                    run_dice.init_op()
                else:
                    print('epoch:{},step:{},train_loss:{:.5f},train_dice:{:.5f},train_acc:{:.5f},lr:{}'.format(epoch, step, loss.item(), total_dice/data.size(2),total_acc/data.size(2), optimizer.param_groups[0]['lr']))


                self.writer.add_scalars('data/train_loss_dice', {
                    'train_loss': loss.item(),
                    'train_dice': total_dice/data.size(2),
                    'train_acc': total_acc/data.size(2)
                }, self.global_step)

            self.global_step += 1

        return train_loss.avg, train_dice.avg, train_acc.avg

    def _val_on_epoch(self, epoch, net, criterion, val_path, val_transformer=None):
        """Run one validation pass over the samples in ``val_path``.

        Args:
            epoch: current epoch index (console logging only).
            net: model returning a ``(seg_logits, cls_logits)`` tuple.
            criterion: loss module matching ``self.mode``.
            val_path: sample list handed to ``DataGenerator``.
            val_transformer: optional transform pipeline. Previously this
                argument was accepted but unconditionally overwritten; it is
                now honored, with the default deterministic pipeline built
                only when it is ``None``.

        Returns:
            Tuple ``(val_loss.avg, val_dice.avg, val_acc.avg)``.
        """
        net.eval()

        # Both 'cls' and 'seg' modes used the exact same deterministic
        # pipeline here, so the duplicated if/else was collapsed.
        if val_transformer is None:
            val_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
                To_Tensor(num_class=self.num_classes)
            ])

        val_dataset = DataGenerator(val_path,
                                    roi_number=self.roi_number,
                                    num_class=self.num_classes,
                                    transform=val_transformer,
                                    seq_len=-1)

        val_loader = DataLoader(val_dataset,
                                batch_size=1,
                                shuffle=False,
                                num_workers=self.num_workers,
                                pin_memory=True)

        val_loss = AverageMeter()
        val_dice = AverageMeter()
        val_acc = AverageMeter()

        from metrics import RunningDice
        run_dice = RunningDice(labels=[0,1],ignore_label=-1)
        with torch.no_grad():
            for step, sample in enumerate(val_loader):
                data = sample['image']
                target = sample['mask']
                label = sample['label']

                data = data.cuda()
                target = target.cuda()
                label = label.cuda()

                output = net(data)
                # Per-slice loss averaged over the sequence dimension.
                loss = 0.
                if self.mode == 'cls':
                    for i in range(data.size(2)):
                        loss += criterion(output[1][:,i], label[:,i])
                elif self.mode == 'seg':
                    for i in range(data.size(2)):
                        loss += criterion(output[0][:,i], target[:,:,i])
                else:
                    for i in range(data.size(2)):
                        loss += criterion([output[0][:,i],output[1][:,i]],[target[:,:,i],label[:,i]])

                loss /= data.size(2)

                loss = loss.float()
                val_loss.update(loss.item(), data.size(0))

                total_dice = 0.
                total_acc = 0.
                for i in range(data.size(2)):
                    cls_output = output[1][:,i] #(N,num_class-1)
                    # torch.sigmoid replaces the long-deprecated F.sigmoid
                    cls_output = torch.sigmoid(cls_output).float()

                    seg_output = output[0][:,i].float() #(N,num_class,H,W)
                    seg_output = F.softmax(seg_output, dim=1)

                    # measure acc
                    acc = accuracy(cls_output.detach(), label[:,i])
                    val_acc.update(acc.item(), data.size(0))
                    total_acc += acc.item()

                    # measure dice and record loss
                    dice = compute_dice(seg_output.detach(), target[:,:,i])
                    val_dice.update(dice.item(), data.size(0))
                    total_dice += dice.item()

                    # accumulate the confusion matrix for the running dice
                    seg_output = torch.argmax(seg_output,1).detach().cpu().numpy()  #N*H*W
                    tmp_target = torch.argmax(target[:,:,i],1).detach().cpu().numpy()
                    run_dice.update_matrix(tmp_target,seg_output)

                torch.cuda.empty_cache()

                if step % 10 == 0:
                    if self.mode == 'cls':
                        print('epoch:{},step:{},val_loss:{:.5f},val_acc:{:.5f}'.format(epoch, step, loss.item(), total_acc/data.size(2)))
                    elif self.mode == 'seg':
                        rundice, dice_list = run_dice.compute_dice() 
                        print("Category Dice: ", dice_list)
                        print('epoch:{},step:{},val_loss:{:.5f},val_dice:{:.5f},rundice:{:.5f}'.format(epoch, step, loss.item(), total_dice/data.size(2),rundice))
                        run_dice.init_op()
                    else:
                        print('epoch:{},step:{},val_loss:{:.5f},val_dice:{:.5f},val_acc:{:.5f}'.format(epoch, step, loss.item(), total_dice/data.size(2), total_acc/data.size(2)))

        return val_loss.avg, val_dice.avg, val_acc.avg
    
    
    def test(self, test_path, save_path, net=None, mode='seg', save_flag=False):
        """Evaluate a model on ``test_path`` and optionally save predicted masks.

        Args:
            test_path: list of sample file names fed to ``DataGenerator``;
                also used to name the saved PNGs.
            save_path: directory that receives the predicted masks.
            net: model to evaluate; defaults to ``self.net``.
            mode: 'seg', 'cls' or 'mtl'. In 'mtl' the per-class segmentation
                maps are gated by the binarized classification output.
            save_flag: when True (and ``mode != 'cls'``) save the argmax mask
                of every slice as an 8-bit PNG.

        Returns:
            dict with 'true', 'pred' and 'prob' classification lists.
        """
        if net is None:
            net = self.net

        net = net.cuda()
        net.eval()

        # NOTE(review): the 'cls' branch applies *random* augmentations at
        # test time — presumably deliberate test-time augmentation; confirm.
        if self.mode == 'cls':
            test_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
                RandomEraseHalf(scale_flag=False),
                RandomTranslationRotationZoomHalf(num_class=self.num_classes),
                RandomFlipHalf(mode='hv'),
                RandomAdjustHalf(),
                To_Tensor(num_class=self.num_classes)
            ])
        else:
            test_transformer = transforms.Compose([
                Trunc_and_Normalize(self.scale),
                CropResizeHalf(dim=self.input_shape,num_class=self.num_classes,crop=self.crop),
                To_Tensor(num_class=self.num_classes)
            ])

        test_dataset = DataGenerator(test_path,
                                    roi_number=self.roi_number,
                                    num_class=self.num_classes,
                                    transform=test_transformer,
                                    seq_len=-1)

        test_loader = DataLoader(test_dataset,
                                batch_size=1,
                                shuffle=False,
                                num_workers=self.num_workers,
                                pin_memory=True)

        test_dice = AverageMeter()
        test_acc = AverageMeter()
        from PIL import Image
        from metrics import RunningDice
        run_dice = RunningDice(labels=[0,1],ignore_label=-1)

        cls_result = {
            'true': [],
            'pred': [],
            'prob': []
        }

        with torch.no_grad():
            for step, sample in enumerate(test_loader):
                data = sample['image']
                target = sample['mask']
                label = sample['label']

                data = data.cuda()
                target = target.cuda()
                label = label.cuda()

                output = net(data)
                total_dice = 0.
                total_acc = 0.
                for i in range(data.size(2)):
                    cls_output = output[1][:,i]
                    # torch.sigmoid replaces the long-deprecated F.sigmoid
                    cls_output = torch.sigmoid(cls_output).float()

                    seg_output = output[0][:,i].float()
                    seg_output = F.softmax(seg_output, dim=1)

                    # measure acc
                    acc = accuracy(cls_output.detach(), label[:,i])
                    test_acc.update(acc.item(),data.size(0))
                    total_acc += acc.item()

                    # measure dice and iou for evaluation (float)
                    dice = compute_dice(seg_output.detach(), target[:,:,i], ignore_index=0)
                    test_dice.update(dice.item(), data.size(0))
                    total_dice += dice.item()

                    # record probabilities before thresholding at 0.5
                    cls_result['prob'].extend(cls_output.detach().squeeze().cpu().numpy().tolist())
                    cls_output = (cls_output > 0.5).float() # N*C
                    cls_result['pred'].extend(cls_output.detach().squeeze().cpu().numpy().tolist())
                    cls_result['true'].extend(label.detach().squeeze().cpu().numpy().tolist())

                    if mode == 'mtl':
                        # zero out foreground channels the classifier rejects
                        b, c, _, _ = seg_output.size()
                        seg_output[:,1:,...] = seg_output[:,1:,...] * cls_output.view(b,c-1,1,1).expand_as(seg_output[:,1:,...])

                    seg_output = torch.argmax(seg_output,1).detach().cpu().numpy()  #N*H*W N=1
                    tmp_target = torch.argmax(target[:,:,i],1).detach().cpu().numpy()
                    run_dice.update_matrix(tmp_target,seg_output)

                    # save
                    if mode != 'cls' and save_flag:
                        seg_output = np.squeeze(seg_output).astype(np.uint8) 
                        seg_output = Image.fromarray(seg_output, mode='L')
                        seg_output.save(os.path.join(save_path,test_path[step].split('.')[0] + '_' + str(i) +'.png'))

                torch.cuda.empty_cache()

                print('step:{},test_dice:{:.5f},test_acc:{:.5f}'.format(step,total_dice/data.size(2),total_acc/data.size(2)))

        rundice, dice_list = run_dice.compute_dice() 
        print("Category Dice: ", dice_list)
        print('avg_dice:{:.5f},avg_acc:{:.5f},rundice:{:.5f}'.format(test_dice.avg, test_acc.avg, rundice))

        return cls_result
    

    def _get_net(self, net_name):
        if net_name == 'rcnn_unet' :
            from rcnn.model.unet import rcnn_unet
            net = rcnn_unet(n_channels=self.channels,n_classes=self.num_classes,seq_len=self.seq_len)    
   
        return net

    def _get_loss(self, loss_fun, class_weight=None):
        if class_weight is not None:
            class_weight = torch.tensor(class_weight)

        if loss_fun == 'Cross_Entropy':
            from rcnn.loss.cross_entropy import CrossentropyLoss
            loss = CrossentropyLoss(weight=class_weight)
        if loss_fun == 'DynamicTopKLoss':
            from rcnn.loss.cross_entropy import DynamicTopKLoss
            loss = DynamicTopKLoss(weight=class_weight,step_threshold=self.step_pre_epoch)
        
        elif loss_fun == 'DynamicTopkCEPlusDice':
            from rcnn.loss.combine_loss import DynamicTopkCEPlusDice
            loss = DynamicTopkCEPlusDice(weight=class_weight, ignore_index=0, step_threshold=self.step_pre_epoch)
        
        elif loss_fun == 'TopKLoss':
            from rcnn.loss.cross_entropy import TopKLoss
            loss = TopKLoss(weight=class_weight, k=self.topk)
        
        elif loss_fun == 'DiceLoss':
            from rcnn.loss.dice_loss import DiceLoss
            loss = DiceLoss(weight=class_weight, ignore_index=0, p=1)
        elif loss_fun == 'ShiftDiceLoss':
            from rcnn.loss.dice_loss import ShiftDiceLoss
            loss = ShiftDiceLoss(weight=class_weight,ignore_index=0, reduction='topk',shift=0.5, p=1, k=self.topk)
        elif loss_fun == 'TopkDiceLoss':
            from rcnn.loss.dice_loss import DiceLoss
            loss = DiceLoss(weight=class_weight, ignore_index=0,reduction='topk', k=self.topk)

        elif loss_fun == 'PowDiceLoss':
            from rcnn.loss.dice_loss import DiceLoss
            loss = DiceLoss(weight=class_weight, ignore_index=0, p=2)
        
        elif loss_fun == 'TverskyLoss':
            from rcnn.loss.tversky_loss import TverskyLoss
            loss = TverskyLoss(weight=class_weight, ignore_index=0, alpha=0.7)
        
        elif loss_fun == 'FocalTverskyLoss':
            from rcnn.loss.tversky_loss import TverskyLoss
            loss = TverskyLoss(weight=class_weight, ignore_index=0, alpha=0.7, gamma=0.75)

        elif loss_fun == 'BCEWithLogitsLoss':
            loss = nn.BCEWithLogitsLoss(class_weight)
        
        elif loss_fun == 'BCEPlusDice':
            from rcnn.loss.combine_loss import BCEPlusDice
            loss = BCEPlusDice(weight=class_weight,ignore_index=0,p=1)
        
        elif loss_fun == 'CEPlusDice':
            from rcnn.loss.combine_loss import CEPlusDice
            loss = CEPlusDice(weight=class_weight, ignore_index=0)
        
        elif loss_fun == 'CEPlusTopkDice':
            from rcnn.loss.combine_loss import CEPlusTopkDice
            loss = CEPlusTopkDice(weight=class_weight, ignore_index=0, reduction='topk', k=self.topk)
        
        elif loss_fun == 'TopkCEPlusTopkDice':
            from rcnn.loss.combine_loss import TopkCEPlusTopkDice
            loss = TopkCEPlusTopkDice(weight=class_weight, ignore_index=0, reduction='topk', k=self.topk)
        
        elif loss_fun == 'TopkCEPlusDice':
            from rcnn.loss.combine_loss import TopkCEPlusDice
            loss = TopkCEPlusDice(weight=class_weight, ignore_index=0, k=self.topk)
        
        elif loss_fun == 'TopkCEPlusShiftDice':
            from rcnn.loss.combine_loss import TopkCEPlusShiftDice
            loss = TopkCEPlusShiftDice(weight=class_weight,ignore_index=0, shift=0.5,k=self.topk)
        
        elif loss_fun == 'TopkCEPlusTopkShiftDice':
            from rcnn.loss.combine_loss import TopkCEPlusTopkShiftDice
            loss = TopkCEPlusTopkShiftDice(weight=class_weight,ignore_index=0, reduction='topk',shift=0.5,k=self.topk)
        
        return loss

    def _get_optimizer(self, optimizer, net, lr):
        if optimizer == 'Adam':
            optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
                                         lr=lr,
                                         weight_decay=self.weight_decay)

        elif optimizer == 'SGD':
            optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                                        lr=lr,
                                        weight_decay=self.weight_decay,
                                        momentum=self.momentum)

        return optimizer

    def _get_lr_scheduler(self, lr_scheduler, optimizer):
        if lr_scheduler == 'ReduceLROnPlateau':
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, mode='min', patience=5, verbose=True)
        elif lr_scheduler == 'MultiStepLR':
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, self.milestones, gamma=self.gamma)
        elif lr_scheduler == 'CosineAnnealingLR':
            lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=self.T_max)
        return lr_scheduler

    def _get_pre_trained(self, weight_path, ckpt_point=True):
        """Load pretrained weights into ``self.net`` from a checkpoint file.

        Args:
            weight_path: path to a torch checkpoint dict containing a
                'state_dict' key and, when resuming, an 'epoch' key.
            ckpt_point: when True, also resume the epoch counter so training
                continues from the epoch after the checkpointed one.
        """
        checkpoint = torch.load(weight_path)
        self.net.load_state_dict(checkpoint['state_dict'])
        if ckpt_point:
            # resume from the next epoch after the one stored in the checkpoint
            self.start_epoch = checkpoint['epoch'] + 1
Exemplo n.º 26
0
def train_det(opt, cfg):
    """Train an EfficientDet detector with periodic validation.

    Args:
        opt: command-line options (data_path, saved_path, log_path, config,
            num_workers, debug, load_weights, es_min_delta, es_patience).
        cfg: project configuration (batch_size, compound_coef, mean/std,
            anchor settings, optimizer choice, learning_rate, no_epochs,
            num_gpus, training_layer, augment_list, only_best_weights, ...).

    Side effects: writes checkpoints and a tensorboard run under
    ``opt.saved_path`` / ``opt.log_path``, plus a backlog/history json.
    """
    training_params = {
        'batch_size': cfg.batch_size,
        'shuffle': True,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    val_params = {
        'batch_size': cfg.batch_size,
        'shuffle': False,
        'drop_last': True,
        'collate_fn': collater,
        'num_workers': opt.num_workers
    }

    # Input resolution per compound coefficient d0..d7 (d6 and d7 share 1280).
    input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]

    training_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Train'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Augmenter(),
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])
        ]),
        pre_augments=['', *[f'{aug}_' for aug in cfg.augment_list]]
        if cfg.augment_list else None)
    training_generator = DataLoader(training_set, **training_params)

    val_set = DataGenerator(
        data_path=os.path.join(opt.data_path, 'Validation'),
        class_ids=cfg.dictionary_class_name.keys(),
        transform=transforms.Compose([
            Normalizer(mean=cfg.mean, std=cfg.std),
            Resizer(input_sizes[cfg.compound_coef])
        ]))
    val_generator = DataLoader(val_set, **val_params)

    # NOTE(review): eval() on config-supplied anchor strings is safe only as
    # long as the project .yml files are trusted input.
    model = EfficientDetBackbone(num_classes=len(cfg.dictionary_class_name),
                                 compound_coef=cfg.compound_coef,
                                 ratios=eval(cfg.anchor_ratios),
                                 scales=eval(cfg.anchor_scales))

    # load last weights
    if opt.load_weights is not None:
        if opt.load_weights.endswith('.pth'):
            weights_path = opt.load_weights
        else:
            weights_path = get_last_weights(opt.saved_path)
        try:
            # weight files are expected to be named ..._<step>.pth
            last_step = int(
                os.path.basename(weights_path).split('_')[-1].split('.')[0])
        except (IndexError, ValueError):
            # file name carries no step suffix; start counting from 0
            last_step = 0

        try:
            model.load_state_dict(torch.load(weights_path), strict=False)
        except RuntimeError as e:
            print(f'[Warning] Ignoring {e}')
            print(
                '[Warning] Don\'t panic if you see this, '
                'this might be because you load a pretrained weights with different number of classes. '
                'The rest of the weights should be loaded already.')

        print(
            f'[Info] loaded weights: {os.path.basename(weights_path)}, resuming checkpoint from step: {last_step}'
        )
    else:
        last_step = 0
        print('[Info] initializing weights...')
        init_weights(model)

    # freeze backbone if train head_only
    if cfg.training_layer.lower() == 'heads':

        def freeze_backbone(m):
            # freeze every parameter of backbone (EfficientNet) and FPN modules
            classname = m.__class__.__name__
            for ntl in ['EfficientNet', 'BiFPN']:
                if ntl in classname:
                    for param in m.parameters():
                        param.requires_grad = False

        model.apply(freeze_backbone)
        print('[Info] freezed backbone')

    # https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
    # apply sync_bn when using multiple gpu and batch_size per gpu is lower than 4
    #  useful when gpu memory is limited.
    # because when bn is disable, the training will be very unstable or slow to converge,
    # apply sync_bn can solve it,
    # by packing all mini-batch across all gpus as one batch and normalize, then send it back to all gpus.
    # but it would also slow down the training by a little bit.
    if cfg.num_gpus > 1 and cfg.batch_size // cfg.num_gpus < 4:
        model.apply(replace_w_sync_bn)
        use_sync_bn = True
    else:
        use_sync_bn = False

    # warp the model with loss function, to reduce the memory usage on gpu0 and speedup
    model = ModelWithLoss(model, debug=opt.debug)

    if cfg.num_gpus > 0:
        model = model.cuda()
        if cfg.num_gpus > 1:
            model = CustomDataParallel(model, cfg.num_gpus)
            if use_sync_bn:
                patch_replication_callback(model)

    # BUGFIX: the second test was a plain `if`, so a configured 'adamw'
    # optimizer was silently overwritten by SGD in the trailing `else`.
    if cfg.optimizer.lower() == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), cfg.learning_rate)
    elif cfg.optimizer.lower() == 'srsgd':
        optimizer = SRSGD(model.parameters(),
                          lr=cfg.learning_rate,
                          weight_decay=5e-4,
                          iter_count=100)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    cfg.learning_rate,
                                    momentum=0.9,
                                    nesterov=True)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           patience=3,
                                                           verbose=True)

    # Setup complete, then start training
    now = datetime.datetime.now()
    opt.saved_path = opt.saved_path + f'/trainlogs_{now.strftime("%Y%m%d_%H%M%S")}'
    if opt.log_path is None:
        opt.log_path = opt.saved_path
    os.makedirs(opt.log_path, exist_ok=True)
    os.makedirs(opt.saved_path, exist_ok=True)

    # Write history: a fresh run gets a .backlog.json, a resumed/derived
    # config a .history.json, both holding the full configuration snapshot.
    if 'backlog' not in opt.config:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.backlog.json'),
                'w') as f:
            backlog = dict(cfg.to_pascal_case())
            backlog['__metadata__'] = 'Backlog at ' + now.strftime(
                "%Y/%m/%d %H:%M:%S")
            json.dump(backlog, f)
    else:
        with open(
                os.path.join(opt.saved_path,
                             f'{now.strftime("%Y%m%d%H%M%S")}.history.json'),
                'w') as f:
            history = dict(cfg.to_pascal_case())
            history['__metadata__'] = now.strftime("%Y/%m/%d %H:%M:%S")
            json.dump(history, f)

    writer = SummaryWriter(opt.log_path + f'/tensorboard')

    epoch = 0
    best_loss = 1e5
    best_epoch = 0
    step = max(0, last_step)
    model.train()

    num_iter_per_epoch = len(training_generator)

    try:
        for epoch in range(cfg.no_epochs):
            # When resuming, fast-forward to the epoch/iteration of the
            # loaded checkpoint instead of retraining from scratch.
            last_epoch = step // num_iter_per_epoch
            if epoch < last_epoch:
                continue

            epoch_loss = []
            progress_bar = tqdm(training_generator)
            for batch_idx, data in enumerate(progress_bar):
                if batch_idx < step - last_epoch * num_iter_per_epoch:
                    progress_bar.set_description(
                        f'Skip {batch_idx} < {step} - {last_epoch} * {num_iter_per_epoch}'
                    )
                    progress_bar.update()
                    continue
                try:
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        # if only one gpu, just send it to cuda:0
                        # elif multiple gpus, send it to multiple gpus in CustomDataParallel, not here
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    optimizer.zero_grad()
                    cls_loss, reg_loss = model(
                        imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    # skip degenerate batches (no targets / NaN-inf loss)
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss.backward()
                    optimizer.step()

                    epoch_loss.append(float(loss))

                    progress_bar.set_description(
                        'Step: {}. Epoch: {}/{}. Iteration: {}/{}. Cls loss: {:.5f}. Reg loss: {:.5f}. '
                        'Total loss: {:.5f}'.format(step, epoch, cfg.no_epochs,
                                                    batch_idx + 1,
                                                    num_iter_per_epoch,
                                                    cls_loss.item(),
                                                    reg_loss.item(),
                                                    loss.item()))
                    writer.add_scalars('Loss', {'train': loss}, step)
                    writer.add_scalars('Regression_loss', {'train': reg_loss},
                                       step)
                    writer.add_scalars('Classification_loss',
                                       {'train': cls_loss}, step)

                    # log learning_rate
                    current_lr = optimizer.param_groups[0]['lr']
                    writer.add_scalar('learning_rate', current_lr, step)

                    step += 1

                except Exception as e:
                    # keep training through sporadic bad batches, but log them
                    print('[Error]', traceback.format_exc())
                    print(e)
                    continue
            scheduler.step(np.mean(epoch_loss))

            # ----- validation -----
            model.eval()
            loss_regression_ls = []
            loss_classification_ls = []
            for batch_idx, data in enumerate(val_generator):
                with torch.no_grad():
                    imgs = data['img']
                    annot = data['annot']

                    if cfg.num_gpus == 1:
                        imgs = imgs.cuda()
                        annot = annot.cuda()

                    cls_loss, reg_loss = model(
                        imgs, annot, obj_list=cfg.dictionary_class_name.keys())
                    cls_loss = cls_loss.mean()
                    reg_loss = reg_loss.mean()

                    loss = cls_loss + reg_loss
                    if loss == 0 or not torch.isfinite(loss):
                        continue

                    loss_classification_ls.append(cls_loss.item())
                    loss_regression_ls.append(reg_loss.item())

            cls_loss = np.mean(loss_classification_ls)
            reg_loss = np.mean(loss_regression_ls)
            loss = cls_loss + reg_loss

            progress_bar.set_description(
                'Val. Epoch: {}/{}. Classification loss: {:1.5f}. Regression loss: {:1.5f}.'
                ' Total loss: {:1.5f}'.format(epoch, cfg.no_epochs, cls_loss,
                                              reg_loss, loss))

            writer.add_scalars('Loss', {'val': loss}, step)
            writer.add_scalars('Regression_loss', {'val': reg_loss}, step)
            writer.add_scalars('Classification_loss', {'val': cls_loss}, step)

            # save a checkpoint: either only on improvement, or every epoch
            if cfg.only_best_weights:
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                    save_checkpoint(
                        model,
                        f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                    )
            else:
                if loss + opt.es_min_delta < best_loss:
                    best_loss = loss
                    best_epoch = epoch
                save_checkpoint(
                    model,
                    f"{opt.saved_path}/det_d{cfg.compound_coef}_{epoch}_{step}.pth"
                )

            model.train()

            # Early stopping (es_patience <= 0 disables it)
            if epoch - best_epoch > opt.es_patience > 0:
                print(
                    '[Info] Stop training at epoch {}. The lowest loss achieved is {}'
                    .format(epoch, best_loss))
                break
        print(
            f'[Info] Finished training. Best loss achieved {best_loss} at epoch {best_epoch}.'
        )
    except KeyboardInterrupt:
        # persist progress on Ctrl-C before shutting the writer down
        save_checkpoint(
            model, f"{opt.saved_path}/d{cfg.compound_coef}_{epoch}_{step}.pth")
        writer.close()
    writer.close()
            t_rmse = rmse_loss(output, targets)
            rmse.update(t_rmse.item())

            output_np = np.clip(output.detach().cpu().numpy(), 0, 1)
            target_np = np.clip(targets.detach().cpu().numpy(), 0, 1)
            logging.info('[{0}][{1}][{2}]\t'
                         'lr: {lr:.5f}\t'
                         'loss: {loss.val:.6f} ({loss.avg:.6f})\t'
                         'RMSE: {rmse.val:.6f} ({rmse.avg:.6f})'.format(
                             epoch,
                             headid,
                             ind,
                             lr=optimizer.param_groups[-1]['lr'],
                             loss=losses,
                             rmse=rmse))
            writer.add_scalars("trainloss", {"train": losses.val}, step)
            step += 1

            ###############################################tenosrboard太麻烦######
            lossx.append(losses.val)
            rmsex.append(rmse.val)
            x = range(len(lossx))
            plt.figure(1)
            plt.title("this is loss and rmse")
            plt.plot(x, lossx, label='loss')
            plt.plot(x, rmsex, label='rmse')
            plt.legend()
            #changepoint 方便查看tensorboard太麻烦
            plt.savefig(
                '/media/workdir/hujh/hujh-new/huaweirader_baseline/log/demolog/unetloss.png'
            )
Exemplo n.º 28
0
def main(argv):
    """Train a single SPIN agent against three SimpleAgents in Pommerman 'Team'.

    Command-line options:
        -r                      always render the environment
        -h                      print usage and exit
        -c, --checkpoint FILE   resume training from a saved checkpoint
        -a, --agent VERSION     SPIN agent version to train ('0', '1' or '2')
        -g, --cuda VALUE        whether to use CUDA (parsed but unused here)
    """
    checkpointFilePath = ''
    alwaysRender = False
    useCuda = True
    # Default agent version. Previously this name was only bound when -a was
    # passed, so the plain invocation crashed with NameError at the
    # `if agentName == "2"` check below.
    agentName = '0'
    try:
        opts, args = getopt.getopt(argv, "hrc:a:g:",
                                   ["checkpoint=", "agent=", "cuda="])
    except getopt.GetoptError:
        print(
            'Error in command arguments. Run this for help:\n\ttrain_singleAgent.py -h'
        )
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-r':
            alwaysRender = True
        elif opt == '-h':
            print(
                'train_singleAgent.py\n-c <checkpointfile> => Resume training from a saved checkpoint\n-a(--agent) <agent version> => Version of agent to train (default=0)\n-r => Always render'
            )
            sys.exit()
        elif opt in ("-c", "--checkpoint"):
            checkpointFilePath = arg
        elif opt in ("-a", "--agent"):
            agentName = arg
        elif opt in ("-g", "--cuda"):
            useCuda = arg

    # Create a set of agents (exactly four); index 3 is the learning agent.
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    if agentName == "2":
        agent_list.append(SPINAgents.SPIN_2())
    elif agentName == "1":
        agent_list.append(SPINAgents.SPIN_1())
    else:
        agent_list.append(SPINAgents.SPIN_0())

    # Make the "Team" environment using the agent list
    env = pommerman.make('PommeTeam-v0', agent_list)
    memory = ReplayMemory(100000)
    batch_size = 128
    epsilon = 1
    rewards = []
    start_epoch = 0

    # Writer to log data to tensorboard
    writer = SummaryWriter()

    if checkpointFilePath != '':
        start_epoch = load_checkpoint(agent_list[3], checkpointFilePath)

    # Run the episodes just like OpenAI Gym
    for i in range(start_epoch, 5750):
        state = env.reset()
        done = False
        total_reward = [0] * len(agent_list)
        epsilon *= 0.995  # exponential epsilon decay per episode
        # Stop the episode early once our learning agent dies.
        while not done and agent_list[3]._character.is_alive:
            if i > 4990 or alwaysRender:
                env.render()
            # Set epsilon for our learning agent (floored at 0.1)
            agent_list[3].epsilon = max(epsilon, 0.1)

            actions = env.act(state)
            agentAction = actions[3]
            actions[3] = actions[3].data.numpy()[0]
            obs_input = Variable(
                torch.from_numpy(agent_list[3].prepInput(state[3])).type(
                    torch.FloatTensor))
            next_obs, reward, done, _ = env.step(actions)
            state = next_obs
            # Penalize death explicitly; env reward alone may not reflect it.
            if not agent_list[3]._character.is_alive:
                reward[3] = -1
            # Fill replay memory for our learning agent
            memory.push(
                agent_list[3].Input, actions[3],
                torch.from_numpy(agent_list[3].prepInput(state[3])).type(
                    torch.FloatTensor), torch.Tensor([reward[3]]),
                torch.Tensor([done]))
            total_reward = [x + y for x, y in zip(total_reward, reward)]
        rewards.append(total_reward)

        # Creates a dictionary with agent name and rewards to be displayed on tensorboard
        total_reward_list = []
        for j in range(len(total_reward)):
            total_reward_list.append(
                (type(agent_list[j]).__name__ + '(' + str(j) + ')',
                 total_reward[j]))
        writer.add_scalars('data/rewards', dict(total_reward_list), i)
        writer.add_scalar('data/epsilon', agent_list[3].epsilon, i)
        writer.add_scalar('data/memory', len(memory), i)

        print("Episode : ", i)
        # Only learn once the replay buffer has a reasonable amount of data.
        if len(memory) > 10000:
            batch = memory.sample(batch_size)
            agent_list[3].backward(batch)
        # Periodic checkpoint every 750 episodes.
        if i > 0 and i % 750 == 0:
            save_checkpoint(
                {
                    'epoch': i + 1,
                    'arch': 0,
                    'state_dict_Q': agent_list[3].Q.state_dict(),
                    'state_dict_target_Q': agent_list[3].target_Q.state_dict(),
                    'best_prec1': 0,
                    'optimizer': agent_list[3].optimizer.state_dict(),
                }, agent_list[3].__class__.__name__)
    env.close()

    # Final checkpoint after the training loop completes.
    save_checkpoint(
        {
            'epoch': 5000 + 1,
            'arch': 0,
            'state_dict_Q': agent_list[3].Q.state_dict(),
            'state_dict_target_Q': agent_list[3].target_Q.state_dict(),
            'best_prec1': 0,
            'optimizer': agent_list[3].optimizer.state_dict(),
        }, agent_list[3].__class__.__name__)

    writer.close()
Exemplo n.º 29
0
                    dim=1)
                viz_window = viz.line(
                    X=x_axis,
                    Y=y_axis,
                    opts=opts,
                )
            if args.tensorboard and \
                            package[
                                'loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
                for i in range(start_epoch):
                    values = {
                        'Avg Train Loss': loss_results[i],
                        'Avg WER': wer_results[i],
                        'Avg CER': cer_results[i]
                    }
                    tensorboard_writer.add_scalars(args.id, values, i + 1)
    else:
        with open(args.labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
Exemplo n.º 30
0
                    dim=1)
                viz_window = viz.line(
                    X=x_axis,
                    Y=y_axis,
                    opts=opts,
                )
            if main_proc and args.tensorboard and \
                            package[
                                'loss_results'] is not None and start_epoch > 0:  # Previous scores to tensorboard logs
                for i in range(start_epoch):
                    values = {
                        'Avg Train Loss': loss_results[i],
                        'Avg WER': wer_results[i],
                        'Avg CER': cer_results[i]
                    }
                    tensorboard_writer.add_scalars(args.id, values, i + 1)
    else:
        with open(args.labels_path) as label_file:
            labels = str(''.join(json.load(label_file)))

        audio_conf = dict(sample_rate=args.sample_rate,
                          window_size=args.window_size,
                          window_stride=args.window_stride,
                          window=args.window,
                          noise_dir=args.noise_dir,
                          noise_prob=args.noise_prob,
                          noise_levels=(args.noise_min, args.noise_max))

        rnn_type = args.rnn_type.lower()
        assert rnn_type in supported_rnns, "rnn_type should be either lstm, rnn or gru"
        model = DeepSpeech(rnn_hidden_size=args.hidden_size,
Exemplo n.º 31
0
class WGAN_GP(object):
    """Wasserstein GAN with gradient penalty (WGAN-GP).

    The critic (``self.D``) is updated ``n_critic`` times per generator
    update, and its 1-Lipschitz constraint is enforced with a gradient
    penalty computed on random interpolates between real and fake samples.
    """

    def __init__(self, args):
        # parameters
        self.epoch = args.epoch
        self.sample_num = 100
        self.batch_size = args.batch_size
        self.save_dir = args.save_dir
        self.result_dir = args.result_dir
        self.dataset = args.dataset
        self.log_dir = args.log_dir
        self.gpu_mode = args.gpu_mode
        self.model_name = args.gan_type
        self.z_dim = 62    # latent noise dimensionality
        self.lambda_ = 10  # gradient-penalty weight
        self.n_critic = 5  # the number of iterations of the critic per generator iteration

        print('run at WGAN_GP')
        # load dataset
        # self.data_loader = dataloader(self.dataset, self.input_size, self.batch_size)
        # data = self.data_loader.__iter__().__next__()[0]
        self.data_loader = testToGAN(self.dataset, 'train')
        # NOTE: this overrides the dataset name used in every save/result path.
        self.dataset = 'trainAgain'
        data = next(iter(self.data_loader))[0]

        # networks init (the first batch fixes the feature dimensionality)
        self.G = generator(input_dim=self.z_dim,
                           output_dim=data.shape[1],
                           input_size=self.input_size)
        self.D = discriminator(input_dim=data.shape[1],
                               output_dim=1,
                               input_size=self.input_size)
        self.G_optimizer = optim.Adam(self.G.parameters(),
                                      lr=args.lrG,
                                      betas=(args.beta1, args.beta2))
        self.D_optimizer = optim.Adam(self.D.parameters(),
                                      lr=args.lrD,
                                      betas=(args.beta1, args.beta2))

        if self.gpu_mode:
            self.G.cuda()
            self.D.cuda()

        print('---------- Networks architecture -------------')
        utils.print_network(self.G)
        utils.print_network(self.D)
        print('-----------------------------------------------')
        self.writer = SummaryWriter()  #log_dir=log_dir,
        self.X = 0  # global step counter for TensorBoard scalars

        # fixed noise used by visualize_results(fix=True)
        self.sample_z_ = torch.rand((self.batch_size, self.z_dim))
        if self.gpu_mode:
            self.sample_z_ = self.sample_z_.cuda()

    def train(self):
        """Run the WGAN-GP training loop.

        NOTE: training is hard-coded to resume at epoch 100 from
        ``WGAN_GP_100_{G,D}.pkl`` snapshots (see the reload below).
        """
        self.train_hist = {}
        self.train_hist['D_loss'] = []
        self.train_hist['G_loss'] = []
        self.train_hist['per_epoch_time'] = []
        self.train_hist['total_time'] = []

        # Kept for API parity with other GAN variants; WGAN-GP itself does
        # not use real/fake label tensors.
        # self.y_real_, self.y_fake_ = torch.ones(self.batch_size, 1), torch.zeros(self.batch_size, 1)
        self.y_real_, self.y_fake_ = torch.zeros(self.batch_size,
                                                 1), torch.ones(
                                                     self.batch_size, 1)

        if self.gpu_mode:
            self.y_real_, self.y_fake_ = self.y_real_.cuda(
            ), self.y_fake_.cuda()

        self.D.train()
        print('WGAN_GP training start!!,epoch:{},module stored at:{}'.format(
            self.epoch, self.dataset))
        start_time = time.time()
        url = os.path.join(self.save_dir, self.dataset, self.model_name)

        # Step decay (StepLR) — kept for reference:
        # schedule_G = torch.optim.lr_scheduler.StepLR(self.G_optimizer, 20, gamma=0.1, last_epoch=-1)
        # schedule_D = torch.optim.lr_scheduler.StepLR(self.D_optimizer, 30, gamma=0.1, last_epoch=-1)

        # Cosine annealing (CosineAnnealingLR) — kept for reference:
        # schedule_D=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=0, last_epoch=-1)

        # Adaptive decay (ReduceLROnPlateau) — kept for reference:
        # reduces the learning rate when the validation loss stops improving
        # (or when a monitored accuracy stops increasing).
        # torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False,
        # threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)
        # schedule_G = torch.optim.lr_scheduler.ReduceLROnPlateau(self.G_optimizer, mode='min', factor=0.1, patience=10, verbose=False,
        # threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)

        for epoch in range(100, self.epoch):
            if epoch == 100:
                # Hard-coded resume from the epoch-100 snapshot.
                self.G = torch.load(os.path.join(url, 'WGAN_GP_100_G.pkl'))
                self.D = torch.load(os.path.join(url, 'WGAN_GP_100_D.pkl'))
                print('reload success!', '*' * 40)
            self.G.train()
            epoch_start_time = time.time()
            # for iter, (x_, _) in enumerate(self.data_loader):
            for batch_idx, x_ in enumerate(self.data_loader):
                x_ = x_[0]
                # Drop the trailing partial batch.
                if batch_idx == len(self.data_loader.dataset) // self.batch_size:
                    break

                z_ = torch.rand((self.batch_size, self.z_dim))
                if self.gpu_mode:
                    x_, z_ = x_.cuda(), z_.cuda()

                # update D network
                self.D_optimizer.zero_grad()

                D_real = self.D(x_)
                D_real_loss = -torch.mean(D_real)

                G_ = self.G(z_)
                D_fake = self.D(G_)
                D_fake_loss = torch.mean(D_fake)

                # gradient penalty on interpolates between real and fake
                alpha = torch.rand((self.batch_size, 1, 1, 1))
                if self.gpu_mode:
                    alpha = alpha.cuda()

                x_hat = alpha * x_.data + (1 - alpha) * G_.data
                x_hat.requires_grad = True

                pred_hat = self.D(x_hat)
                if self.gpu_mode:
                    gradients = grad(outputs=pred_hat,
                                     inputs=x_hat,
                                     grad_outputs=torch.ones(
                                         pred_hat.size()).cuda(),
                                     create_graph=True,
                                     retain_graph=True,
                                     only_inputs=True)[0]
                else:
                    gradients = grad(outputs=pred_hat,
                                     inputs=x_hat,
                                     grad_outputs=torch.ones(pred_hat.size()),
                                     create_graph=True,
                                     retain_graph=True,
                                     only_inputs=True)[0]

                # lambda * E[(||grad||_2 - 1)^2]
                gradient_penalty = self.lambda_ * (
                    (gradients.view(gradients.size()[0], -1).norm(2, 1) - 1)**
                    2).mean()

                D_loss = D_real_loss + D_fake_loss + gradient_penalty

                D_loss.backward()
                self.D_optimizer.step()

                # update G network once every n_critic critic updates
                if ((batch_idx + 1) % self.n_critic) == 0:
                    self.G_optimizer.zero_grad()

                    G_ = self.G(z_)
                    D_fake = self.D(G_)
                    G_loss = -torch.mean(D_fake)
                    self.train_hist['G_loss'].append(G_loss.item())

                    G_loss.backward()
                    self.G_optimizer.step()

                    self.train_hist['D_loss'].append(D_loss.item())

                if ((batch_idx + 1) % 100) == 0:
                    # G_loss holds the most recent generator loss (first set
                    # at batch n_critic, so it is always bound here).
                    print("Epoch: [%2d] [%4d/%4d] D_loss: %.8f, G_loss: %.8f" %
                          ((epoch + 1),
                           (batch_idx + 1), len(self.data_loader.dataset) //
                           self.batch_size, D_loss.item(), G_loss.item()))
                    self.writer.add_scalar('G_loss', G_loss.item(), self.X)
                    # writer.add_scalar('G_loss', -G_loss_D, X)
                    self.writer.add_scalar('D_loss', D_loss.item(), self.X)
                    # BUGFIX: the 'G_loss' entry previously logged
                    # D_loss.item(), duplicating the critic loss.
                    self.writer.add_scalars('cross loss', {
                        'G_loss': G_loss.item(),
                        'D_loss': D_loss.item()
                    }, self.X)
                    self.X += 1

            self.train_hist['per_epoch_time'].append(time.time() -
                                                     epoch_start_time)
            # with torch.no_grad():
            #     self.visualize_results((epoch+1))
            if epoch % 5 == 0:
                self.load_interval(epoch)

        self.train_hist['total_time'].append(time.time() - start_time)
        print("Avg one epoch time: %.2f, total %d epochs time: %.2f" %
              (np.mean(self.train_hist['per_epoch_time']), self.epoch,
               self.train_hist['total_time'][0]))
        print("Training finish!... save training results")

        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)

        # Append-mode so successive runs accumulate histories in one file.
        with open(os.path.join(save_dir, self.model_name + '_train_hist.json'),
                  "a") as f:
            json.dump(self.train_hist, f)

        self.writer.export_scalars_to_json(
            os.path.join(save_dir, self.model_name + '.json'))
        self.writer.close()
        self.load_interval(epoch)

        # self.save()
        # utils.generate_animation(self.result_dir + '/' + self.dataset + '/' + self.model_name + '/' + self.model_name,
        #                          self.epoch)
        utils.loss_plot(
            self.train_hist,
            os.path.join(self.save_dir, self.dataset, self.model_name),
            self.model_name)

    def visualize_results(self, epoch, fix=True):
        """Save a grid of generated samples for the given epoch.

        When ``fix`` is True the fixed noise ``self.sample_z_`` is used so
        successive epochs are visually comparable; otherwise fresh noise is
        drawn.
        """
        self.G.eval()

        if not os.path.exists(self.result_dir + '/' + self.dataset + '/' +
                              self.model_name):
            os.makedirs(self.result_dir + '/' + self.dataset + '/' +
                        self.model_name)

        tot_num_samples = min(self.sample_num, self.batch_size)
        image_frame_dim = int(np.floor(np.sqrt(tot_num_samples)))

        if fix:
            """ fixed noise """
            samples = self.G(self.sample_z_)
        else:
            """ random noise """
            sample_z_ = torch.rand((self.batch_size, self.z_dim))
            if self.gpu_mode:
                sample_z_ = sample_z_.cuda()

            samples = self.G(sample_z_)

        if self.gpu_mode:
            samples = samples.cpu().data.numpy().transpose(0, 2, 3, 1)
        else:
            samples = samples.data.numpy().transpose(0, 2, 3, 1)

        # Map from [-1, 1] (tanh output) to [0, 1] for image saving.
        samples = (samples + 1) / 2
        utils.save_images(
            samples[:image_frame_dim * image_frame_dim, :, :, :],
            [image_frame_dim, image_frame_dim],
            self.result_dir + '/' + self.dataset + '/' + self.model_name +
            '/' + self.model_name + '_epoch%03d' % epoch + '.png')

    def save(self):
        """Save G/D state dicts plus the training history pickle."""
        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        torch.save(self.G.state_dict(),
                   os.path.join(save_dir, self.model_name + '_G.pkl'))
        torch.save(self.D.state_dict(),
                   os.path.join(save_dir, self.model_name + '_D.pkl'))

        with open(os.path.join(save_dir, self.model_name + '_history.pkl'),
                  'wb') as f:
            pickle.dump(self.train_hist, f)

    def load(self):
        """Load G/D state dicts previously written by save()."""
        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)

        self.G.load_state_dict(
            torch.load(os.path.join(save_dir, self.model_name + '_G.pkl')))
        self.D.load_state_dict(
            torch.load(os.path.join(save_dir, self.model_name + '_D.pkl')))

    def load_interval(self, epoch):
        """Snapshot the *whole* G and D modules (not state dicts) for `epoch`.

        These full-module pickles are what train() reloads at epoch 100.
        """
        save_dir = os.path.join(self.save_dir, self.dataset, self.model_name)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        # Save the model objects themselves
        torch.save(
            self.G,
            os.path.join(
                save_dir, self.model_name +
                '_{}_G.pkl'.format(epoch)))  #dictionary ['bias', 'weight']
        torch.save(
            self.D,
            os.path.join(save_dir,
                         self.model_name + '_{}_D.pkl'.format(epoch)))
Exemplo n.º 32
0
def main():
  """Train, validate and test a part-based person re-identification model.

  Everything is driven by the global ``Config`` object: devices, datasets,
  model hyper-parameters, optimizer settings, logging and checkpointing.
  Presumably a PCB-style model (per-stripe classifiers), given
  ``num_stripes`` and the list of per-stripe logits — TODO confirm against
  the Model definition.
  """
  cfg = Config()

  # Redirect logs to both console and file.
  if cfg.log_to_file:
    ReDirectSTD(cfg.stdout_file, 'stdout', False)
    ReDirectSTD(cfg.stderr_file, 'stderr', False)

  # Lazily create SummaryWriter
  writer = None

  # TVT/TMO are device-transfer helpers returned by set_devices; the names
  # suggest "transfer Tensor/Variable" and "transfer Modules/Optims" —
  # semantics defined elsewhere, TODO confirm.
  TVT, TMO = set_devices(cfg.sys_device_ids)

  if cfg.seed is not None:
    set_seed(cfg.seed)

  # Dump the configurations to log.
  import pprint
  print('-' * 60)
  print('cfg.__dict__')
  pprint.pprint(cfg.__dict__)
  print('-' * 60)

  ###########
  # Dataset #
  ###########

  train_set = create_dataset(**cfg.train_set_kwargs)
  num_classes = len(train_set.ids2labels)
  # The combined dataset does not provide val set currently.
  val_set = None if cfg.dataset == 'combined' else create_dataset(**cfg.val_set_kwargs)

  # For 'combined' training, test on each constituent dataset separately.
  test_sets = []
  test_set_names = []
  if cfg.dataset == 'combined':
    for name in ['market1501', 'cuhk03', 'duke']:
      cfg.test_set_kwargs['name'] = name
      test_sets.append(create_dataset(**cfg.test_set_kwargs))
      test_set_names.append(name)
  else:
    test_sets.append(create_dataset(**cfg.test_set_kwargs))
    test_set_names.append(cfg.dataset)

  ###########
  # Models  #
  ###########

  model = Model(
    last_conv_stride=cfg.last_conv_stride,
    num_stripes=cfg.num_stripes,
    local_conv_out_channels=cfg.local_conv_out_channels,
    num_classes=num_classes
  )
  # Model wrapper
  model_w = DataParallel(model)

  #############################
  # Criteria and Optimizers   #
  #############################

  criterion = torch.nn.CrossEntropyLoss()

  # To finetune from ImageNet weights
  finetuned_params = list(model.base.parameters())
  # To train from scratch
  new_params = [p for n, p in model.named_parameters()
                if not n.startswith('base.')]
  # Two parameter groups so the pretrained backbone and the new layers can
  # use different learning rates.
  param_groups = [{'params': finetuned_params, 'lr': cfg.finetuned_params_lr},
                  {'params': new_params, 'lr': cfg.new_params_lr}]
  optimizer = optim.SGD(
    param_groups,
    momentum=cfg.momentum,
    weight_decay=cfg.weight_decay)

  # Bind them together just to save some codes in the following usage.
  modules_optims = [model, optimizer]

  ################################
  # May Resume Models and Optims #
  ################################

  if cfg.resume:
    resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file)

  # May Transfer Models and Optims to Specified Device. Transferring optimizer
  # is to cope with the case when you load the checkpoint to a new device.
  TMO(modules_optims)

  ########
  # Test #
  ########

  def test(load_model_weight=False):
    """Evaluate on every test set; optionally (re)load weights first.

    A non-empty cfg.model_weight_file takes precedence over the checkpoint.
    """
    if load_model_weight:
      if cfg.model_weight_file != '':
        # map_location keeps tensors on their storage device (no GPU move).
        map_location = (lambda storage, loc: storage)
        sd = torch.load(cfg.model_weight_file, map_location=map_location)
        load_state_dict(model, sd)
        print('Loaded model weights from {}'.format(cfg.model_weight_file))
      else:
        load_ckpt(modules_optims, cfg.ckpt_file)

    for test_set, name in zip(test_sets, test_set_names):
      test_set.set_feat_func(ExtractFeature(model_w, TVT))
      print('\n=========> Test on dataset: {} <=========\n'.format(name))
      test_set.eval(
        normalize_feat=True,
        verbose=True)

  def validate():
    """Evaluate on the validation set; returns (mAP, Rank-1 CMC score)."""
    if val_set.extract_feat_func is None:
      val_set.set_feat_func(ExtractFeature(model_w, TVT))
    print('\n===== Test on validation set =====\n')
    mAP, cmc_scores, _, _ = val_set.eval(
      normalize_feat=True,
      to_re_rank=False,
      verbose=True)
    print()
    return mAP, cmc_scores[0]

  if cfg.only_test:
    test(load_model_weight=True)
    return

  ############
  # Training #
  ############

  start_ep = resume_ep if cfg.resume else 0
  for ep in range(start_ep, cfg.total_epochs):

    # Adjust Learning Rate
    adjust_lr_staircase(
      optimizer.param_groups,
      [cfg.finetuned_params_lr, cfg.new_params_lr],
      ep + 1,
      cfg.staircase_decay_at_epochs,
      cfg.staircase_decay_multiply_factor)

    may_set_mode(modules_optims, 'train')

    # For recording loss
    loss_meter = AverageMeter()

    ep_st = time.time()
    step = 0
    epoch_done = False
    while not epoch_done:

      step += 1
      step_st = time.time()

      # train_set signals epoch completion via the last tuple element.
      ims, im_names, labels, mirrored, epoch_done = train_set.next_batch()

      ims_var = Variable(TVT(torch.from_numpy(ims).float()))
      labels_var = Variable(TVT(torch.from_numpy(labels).long()))

      _, logits_list = model_w(ims_var)
      # Sum the cross-entropy over the per-stripe logits.
      # NOTE(review): torch.cat over scalar (0-dim) losses relies on older
      # PyTorch semantics — confirm on the targeted torch version.
      loss = torch.sum(
        torch.cat([criterion(logits, labels_var) for logits in logits_list]))

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      ############
      # Step Log #
      ############

      loss_meter.update(to_scalar(loss))

      if step % cfg.steps_per_log == 0:
        log = '\tStep {}/Ep {}, {:.2f}s, loss {:.4f}'.format(
          step, ep + 1, time.time() - step_st, loss_meter.val)
        print(log)

    #############
    # Epoch Log #
    #############

    log = 'Ep {}, {:.2f}s, loss {:.4f}'.format(
      ep + 1, time.time() - ep_st, loss_meter.avg)
    print(log)

    ##########################
    # Test on Validation Set #
    ##########################

    mAP, Rank1 = 0, 0
    if ((ep + 1) % cfg.epochs_per_val == 0) and (val_set is not None):
      mAP, Rank1 = validate()

    # Log to TensorBoard

    if cfg.log_to_file:
      # Writer is created on first use so a pure-console run opens no files.
      if writer is None:
        writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard'))
      writer.add_scalars(
        'val scores',
        dict(mAP=mAP,
             Rank1=Rank1),
        ep)
      writer.add_scalars(
        'loss',
        dict(loss=loss_meter.avg, ),
        ep)

    # save ckpt (every epoch when file logging is enabled)
    if cfg.log_to_file:
      save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file)

  ########
  # Test #
  ########

  test(load_model_weight=False)
Exemplo n.º 33
0
def main():
    """Train a semantic-segmentation model on PASCAL VOC.

    The YAML config chooses the model variant (DANet / CANet / plain dilated
    ResNet), the optimizer, and all training hyper-parameters. The best
    mean-IoU model, periodic checkpoints and the final model are written to
    ``CONFIG.result_path``.
    """

    args = get_arguments()

    # configuration (close the file handle instead of leaking it)
    with open(args.config) as config_file:
        CONFIG = Dict(yaml.safe_load(config_file))

    # writer (None disables all TensorBoard logging below)
    if CONFIG.writer_flag:
        writer = SummaryWriter(CONFIG.result_path)
    else:
        writer = None

    # DataLoaders
    train_data = PASCALVOC(
        CONFIG,
        mode="train",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            RandomFlip(),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )

    # Validation uses no random flip so results are deterministic per image.
    val_data = PASCALVOC(
        CONFIG,
        mode="val",
        transform=Compose([
            RandomCrop(CONFIG),
            Resize(CONFIG),
            ToTensor(),
            Normalize(mean=get_mean(), std=get_std()),
        ])
    )

    train_loader = DataLoader(
        train_data,
        batch_size=CONFIG.batch_size,
        shuffle=True,
        num_workers=CONFIG.num_workers,
        drop_last=True
    )

    val_loader = DataLoader(
        val_data,
        batch_size=CONFIG.batch_size,
        shuffle=False,
        num_workers=CONFIG.num_workers
    )

    # load model
    print('\n------------------------Loading Model------------------------\n')

    if CONFIG.attention == 'dual':
        model = DANet(CONFIG)
        print('Dual Attention modules will be added to this base model')
    elif CONFIG.attention == 'channel':
        model = CANet(CONFIG)
        print('Channel Attention modules will be added to this base model')
    else:
        if CONFIG.model == 'drn_d_22':
            print(
                'Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)
        elif CONFIG.model == 'drn_d_38':
            # Message previously said "D 28" although drn_d_38 is built.
            print(
                'Dilated ResNet D 38 w/o Dual Attention modules will be used as a model.')
            model = drn_d_38(pretrained=True, num_classes=CONFIG.n_classes)
        else:
            print('There is no option you chose as a model.')
            print(
                'Therefore, Dilated ResNet D 22 w/o Dual Attention modules will be used as a model.')
            model = drn_d_22(pretrained=True, num_classes=CONFIG.n_classes)

    # set optimizer, lr_scheduler
    if CONFIG.optimizer == 'Adam':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.Adam(model.parameters(), lr=CONFIG.learning_rate)
    elif CONFIG.optimizer == 'SGD':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)
    elif CONFIG.optimizer == 'AdaBound':
        print(CONFIG.optimizer + ' will be used as an optimizer.')
        optimizer = adabound.AdaBound(
            model.parameters(),
            lr=CONFIG.learning_rate,
            final_lr=CONFIG.final_lr,
            weight_decay=CONFIG.weight_decay)
    else:
        print('There is no optimizer which suits to your option. \
            Instead, SGD will be used as an optimizer.')
        optimizer = optim.SGD(
            model.parameters(),
            lr=CONFIG.learning_rate,
            momentum=CONFIG.momentum,
            dampening=CONFIG.dampening,
            weight_decay=CONFIG.weight_decay,
            nesterov=CONFIG.nesterov)

    # learning rate scheduler (only defined for SGD; see scheduler.step below)
    if CONFIG.optimizer == 'SGD':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=CONFIG.lr_patience)
    else:
        scheduler = None

    # send the model to cuda/cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    if device == 'cuda':
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True

    # resume if you want
    begin_epoch = 0
    if args.resume:
        if os.path.exists(os.path.join(CONFIG.result_path, 'checkpoint.pth')):
            print('loading the checkpoint...')
            begin_epoch, model, optimizer, scheduler = \
                resume(CONFIG, model, optimizer, scheduler)
            print('training will start from {} epoch'.format(begin_epoch))

    # criterion for loss; 255 is the VOC "ignore" label
    if CONFIG.class_weight:
        criterion = nn.CrossEntropyLoss(
            weight=get_class_weight().to(device),
            ignore_index=255
        )
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=255)

    # train and validate model
    print('\n------------------------Start training------------------------\n')
    losses_train = []
    losses_val = []
    val_ious = []
    mean_ious = []
    mean_ious_without_bg = []
    best_mean_iou = 0.0

    for epoch in range(begin_epoch, CONFIG.max_epoch):
        # training
        loss_train = train(
            model, train_loader, criterion, optimizer, CONFIG, device)
        losses_train.append(loss_train)

        # validation
        val_iou, loss_val = validation(
            model, val_loader, criterion, CONFIG, device)
        val_ious.append(val_iou)
        losses_val.append(loss_val)
        if CONFIG.optimizer == 'SGD':
            scheduler.step(loss_val)

        # mean IoU with and without the background class (index 0)
        mean_ious.append(val_ious[-1].mean().item())
        mean_ious_without_bg.append(val_ious[-1][1:].mean().item())

        # save checkpoint every 5 epoch
        if epoch % 5 == 0 and epoch != 0:
            save_checkpoint(CONFIG, epoch, model, optimizer, scheduler)

        # save a model every 50 epoch
        if epoch % 50 == 0 and epoch != 0:
            torch.save(
                model.state_dict(), os.path.join(CONFIG.result_path, 'epoch_{}_model.prm'.format(epoch)))

        # keep the best model by validation mean IoU
        if best_mean_iou < mean_ious[-1]:
            best_mean_iou = mean_ious[-1]
            torch.save(
                model.state_dict(), os.path.join(CONFIG.result_path, 'best_mean_iou_model.prm'))

        # tensorboardx
        if writer:
            writer.add_scalars(
                "loss", {
                    'loss_train': losses_train[-1],
                    'loss_val': losses_val[-1]}, epoch)
            writer.add_scalar(
                "mean_iou", mean_ious[-1], epoch)
            writer.add_scalar(
                "mean_iou_w/o_bg", mean_ious_without_bg[-1], epoch)

        print(
            'epoch: {}\tloss_train: {:.5f}\tloss_val: {:.5f}\tmean IOU: {:.3f}\tmean IOU w/o bg: {:.3f}'.format(
                epoch, losses_train[-1], losses_val[-1], mean_ious[-1], mean_ious_without_bg[-1])
        )

    torch.save(
        model.state_dict(), os.path.join(CONFIG.result_path, 'final_model.prm'))
Exemplo n.º 34
0
class SummaryWorker(multiprocessing.Process):
    """Asynchronous TensorBoard writer running in a child process.

    The training process invokes the worker like a function, e.g.
    ``worker('scalar', step=..., loss=...)``.  ``__call__`` rate-limits the
    request, snapshots the tensors into plain numpy arrays via the matching
    ``copy_<name>`` method (so the payload can be pickled across the process
    boundary) and pushes it onto a ``multiprocessing.Queue``.  ``run`` pops
    payloads in the child process and dispatches to the matching
    ``summary_<name>`` method, which performs the actual writing.
    """

    def __init__(self, env):
        super(SummaryWorker, self).__init__()
        self.env = env
        self.config = env.config
        self.queue = multiprocessing.Queue()
        # Each summary family (scalar/image/histogram) is rate-limited by an
        # interval read from the [summary] section; a missing option disables
        # that family (its timer always reports False).
        try:
            self.timer_scalar = utils.train.Timer(env.config.getfloat('summary', 'scalar'))
        except configparser.NoOptionError:
            self.timer_scalar = lambda: False
        try:
            self.timer_image = utils.train.Timer(env.config.getfloat('summary', 'image'))
        except configparser.NoOptionError:
            self.timer_image = lambda: False
        try:
            self.timer_histogram = utils.train.Timer(env.config.getfloat('summary', 'histogram'))
        except configparser.NoOptionError:
            self.timer_histogram = lambda: False
        # Regex whitelist deciding which parameters get histogrammed.
        with open(os.path.expanduser(os.path.expandvars(env.config.get('summary_histogram', 'parameters'))), 'r') as f:
            self.histogram_parameters = utils.RegexList([line.rstrip() for line in f])
        self.draw_bbox = utils.visualize.DrawBBox(env.config, env.category)
        self.draw_iou = utils.visualize.DrawIou(env.config)

    def __call__(self, name, **kwargs):
        """Enqueue summary ``name`` if its rate-limit timer fires."""
        if getattr(self, 'timer_' + name)():
            kwargs = getattr(self, 'copy_' + name)(**kwargs)
            self.queue.put((name, kwargs))

    def stop(self):
        """Push the (None, {}) sentinel that makes run() exit its loop."""
        self.queue.put((None, {}))

    def run(self):
        # The SummaryWriter is created here, in the child process, because it
        # cannot be pickled along with the Process object.
        self.writer = SummaryWriter(os.path.join(self.env.model_dir, self.env.args.run))
        while True:
            name, kwargs = self.queue.get()
            if name is None:
                break
            func = getattr(self, 'summary_' + name)
            try:
                func(**kwargs)
            except Exception:
                # Keep serving the queue even if one summary fails.  (Was a
                # bare ``except:``, which also swallowed KeyboardInterrupt
                # and SystemExit, making the worker unkillable.)
                traceback.print_exc()

    def copy_scalar(self, **kwargs):
        """Snapshot scalar losses to numpy for pickling across processes."""
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        loss_total = loss_total.data.clone().cpu().numpy()
        loss = {key: loss[key].data.clone().cpu().numpy() for key in loss}
        loss_hparam = {key: loss_hparam[key].data.clone().cpu().numpy() for key in loss_hparam}
        return dict(
            step=step,
            loss_total=loss_total,
            loss=loss, loss_hparam=loss_hparam,
        )

    def summary_scalar(self, **kwargs):
        """Write per-component losses (and optionally hparam losses) to TB."""
        step, loss_total, loss, loss_hparam = (kwargs[key] for key in 'step, loss_total, loss, loss_hparam'.split(', '))
        for key in loss:
            self.writer.add_scalar('loss/' + key, loss[key][0], step)
        if self.config.getboolean('summary_scalar', 'loss_hparam'):
            self.writer.add_scalars('loss_hparam', {key: loss_hparam[key][0] for key in loss_hparam}, step)
        self.writer.add_scalar('loss_total', loss_total[0], step)

    def copy_image(self, **kwargs):
        """Snapshot image batch, predictions and matching mask to numpy."""
        step, height, width, rows, cols, data, pred, debug = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, debug'.split(', '))
        data = {key: data[key].clone().cpu().numpy() for key in 'image, yx_min, yx_max, cls'.split(', ')}
        pred = {key: pred[key].data.clone().cpu().numpy() for key in 'yx_min, yx_max, iou, logits'.split(', ') if key in pred}
        # Encode positive/negative anchors as 1/0 with 0.5 for "neither".
        matching = (debug['positive'].float() - debug['negative'].float() + 1) / 2
        matching = matching.data.clone().cpu().numpy()
        return dict(
            step=step, height=height, width=width, rows=rows, cols=cols,
            data=data, pred=pred,
            matching=matching,
        )

    def summary_image(self, **kwargs):
        """Render ground-truth/predicted boxes and IoU maps into TB images."""
        step, height, width, rows, cols, data, pred, matching = (kwargs[key] for key in 'step, height, width, rows, cols, data, pred, matching'.split(', '))
        image = data['image']
        limit = min(self.config.getint('summary_image', 'limit'), image.shape[0])
        image = image[:limit, :, :, :]
        yx_min, yx_max, iou = (pred[key] for key in 'yx_min, yx_max, iou'.split(', '))
        # Predictions live on the feature grid; scale back to pixel space.
        scale = [height / rows, width / cols]
        yx_min, yx_max = (a * scale for a in (yx_min, yx_max))
        if 'logits' in pred:
            cls = np.argmax(F.softmax(torch.autograd.Variable(torch.from_numpy(pred['logits'])), -1).data.cpu().numpy(), -1)
        else:
            # np.int was removed in NumPy 1.24; the builtin int is equivalent.
            cls = np.zeros(iou.shape, int)
        if self.config.getboolean('summary_image', 'bbox'):
            # data
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/data')(self.draw_bbox_data)(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')))
            self.writer.add_image('bbox/data', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
            # pred
            canvas = np.copy(image)
            canvas = pybenchmark.profile('bbox/pred')(self.draw_bbox_pred)(canvas, yx_min, yx_max, cls, iou, nms=True)
            self.writer.add_image('bbox/pred', torchvision.utils.make_grid(torch.from_numpy(np.stack(canvas)).permute(0, 3, 1, 2).float(), normalize=True, scale_each=True), step)
        if self.config.getboolean('summary_image', 'iou'):
            # bbox
            canvas = np.copy(image)
            canvas_data = self.draw_bbox_data(canvas, *(data[key] for key in 'yx_min, yx_max, cls'.split(', ')), colors=['g'])
            # data
            for i, canvas in enumerate(pybenchmark.profile('iou/data')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, matching, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/data%d' % i, canvas, step)
            # pred
            for i, canvas in enumerate(pybenchmark.profile('iou/pred')(self.draw_bbox_iou)(list(map(np.copy, canvas_data)), yx_min, yx_max, cls, iou, rows, cols, colors=['w'])):
                canvas = np.stack(canvas)
                canvas = torch.from_numpy(canvas).permute(0, 3, 1, 2)
                canvas = torchvision.utils.make_grid(canvas.float(), normalize=True, scale_each=True)
                self.writer.add_image('iou/pred%d' % i, canvas, step)

    def draw_bbox_data(self, canvas, yx_min, yx_max, cls, colors=None):
        """Draw ground-truth boxes on each image of the batch."""
        batch_size = len(canvas)
        if len(cls.shape) == len(yx_min.shape):
            cls = np.argmax(cls, -1)
        yx_min, yx_max, cls = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls))
        return [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_pred(self, canvas, yx_min, yx_max, cls, iou, colors=None, nms=False):
        """Draw predicted boxes above the IoU threshold, optionally NMS'd."""
        batch_size = len(canvas)
        mask = iou > self.config.getfloat('detect', 'threshold')
        yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
        cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
        yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
        yx_min, yx_max, cls, iou = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls, iou))
        if nms:
            overlap = self.config.getfloat('detect', 'overlap')
            keep = [pybenchmark.profile('nms')(utils.postprocess.nms)(torch.Tensor(iou), torch.Tensor(yx_min), torch.Tensor(yx_max), overlap) if iou.shape[0] > 0 else [] for yx_min, yx_max, iou in zip(yx_min, yx_max, iou)]
            keep = [np.array(k, int) for k in keep]
            yx_min, yx_max, cls = ([a[k] for a, k in zip(l, keep)] for l in (yx_min, yx_max, cls))
        return [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(canvas, yx_min, yx_max, cls)]

    def draw_bbox_iou(self, canvas_share, yx_min, yx_max, cls, iou, rows, cols, colors=None):
        """Per anchor, overlay thresholded boxes and the IoU heat map."""
        batch_size = len(canvas_share)
        # Split per anchor: one result list per anchor index.
        yx_min, yx_max = ([np.squeeze(a, -2) for a in np.split(a, a.shape[-2], -2)] for a in (yx_min, yx_max))
        cls, iou = ([np.squeeze(a, -1) for a in np.split(a, a.shape[-1], -1)] for a in (cls, iou))
        results = []
        for i, (yx_min, yx_max, cls, iou) in enumerate(zip(yx_min, yx_max, cls, iou)):
            mask = iou > self.config.getfloat('detect', 'threshold')
            yx_min, yx_max = (np.reshape(a, [a.shape[0], -1, 2]) for a in (yx_min, yx_max))
            cls, iou, mask = (np.reshape(a, [a.shape[0], -1]) for a in (cls, iou, mask))
            yx_min, yx_max, cls, iou, mask = ([a[b] for b in range(batch_size)] for a in (yx_min, yx_max, cls, iou, mask))
            yx_min, yx_max, cls = ([a[m] for a, m in zip(l, mask)] for l in (yx_min, yx_max, cls))
            canvas = [self.draw_bbox(canvas, yx_min.astype(int), yx_max.astype(int), cls, colors=colors) for canvas, yx_min, yx_max, cls in zip(np.copy(canvas_share), yx_min, yx_max, cls)]
            iou = [np.reshape(a, [rows, cols]) for a in iou]
            canvas = [self.draw_iou(_canvas, iou) for _canvas, iou in zip(canvas, iou)]
            results.append(canvas)
        return results

    def copy_histogram(self, **kwargs):
        """Snapshot tensors to numpy; non-tensors pass through unchanged."""
        return {key: kwargs[key].data.clone().cpu().numpy() if torch.is_tensor(kwargs[key]) else kwargs[key] for key in 'step, dnn'.split(', ')}

    def summary_histogram(self, **kwargs):
        """Histogram every parameter whose name matches the regex whitelist."""
        step, dnn = (kwargs[key] for key in 'step, dnn'.split(', '))
        for name, param in dnn.named_parameters():
            if self.histogram_parameters(name):
                self.writer.add_histogram(name, param, step)
Exemplo n.º 35
0
def epoch_train(model, dataloader, dataset, criterion, optimizer, scheduler, device, data_const):
    """Train and validate ``model`` for ``args.epoch`` epochs.

    Relies on module-level ``args`` for all hyper-parameters and paths, and on
    project helpers (``vis_img``, ``HicoDataset``, ``io``) defined elsewhere.
    ``scheduler`` and ``data_const`` are accepted but effectively unused here
    (``scheduler.step()`` is commented out below).
    """
    print('epoch training...')

    # set visualization and create folder to save checkpoints
    writer = SummaryWriter(log_dir=args.log_dir + '/' + args.exp_ver + '/' + 'epoch_train')
    io.mkdir_if_not_exists(os.path.join(args.save_dir, args.exp_ver, 'epoch_train'), recursive=True)

    for epoch in range(args.start_epoch, args.epoch):
        # each epoch has a training and validation step
        epoch_loss = 0
        for phase in ['train', 'val']:
            start_time = time.time()
            running_loss = 0.0
            idx = 0

            HicoDataset.data_sample_count=0
            for data in tqdm(dataloader[phase]): 
                train_data = data
                img_name = train_data['img_name']
                det_boxes = train_data['det_boxes']
                roi_labels = train_data['roi_labels']
                roi_scores = train_data['roi_scores']
                node_num = train_data['node_num']
                edge_labels = train_data['edge_labels']
                edge_num = train_data['edge_num']
                features = train_data['features']
                spatial_feat = train_data['spatial_feat']
                word2vec = train_data['word2vec']
                features, spatial_feat, word2vec, edge_labels = features.to(device), spatial_feat.to(device), word2vec.to(device), edge_labels.to(device)
                # NOTE(review): debug leftover? caps every phase at 10 batches
                # — confirm before a real training run.
                if idx == 10: break    
                if phase == 'train':
                    model.train()
                    model.zero_grad()
                    outputs = model(node_num, features, spatial_feat, word2vec, roi_labels)
                    loss = criterion(outputs, edge_labels.float())
                    # import ipdb; ipdb.set_trace()
                    loss.backward()
                    optimizer.step()

                else:
                    model.eval()
                    # turn off the gradients for validation, save memory and computations
                    with torch.no_grad():
                        outputs = model(node_num, features, spatial_feat, word2vec, roi_labels, validation=True)
                        loss = criterion(outputs, edge_labels.float())
                    # print result every 1000 iteration during validation
                    if idx==0 or idx % round(1000/args.batch_size)==round(1000/args.batch_size)-1:
                        # ipdb.set_trace()
                        image = Image.open(os.path.join(args.img_data, img_name[0])).convert('RGB')
                        image_temp = image.copy()
                        # raw sigmoid scores for the first sample's edges
                        raw_outputs = nn.Sigmoid()(outputs[0:int(edge_num[0])])
                        raw_outputs = raw_outputs.cpu().detach().numpy()
                        # class_img = vis_img(image, det_boxes, roi_labels, roi_scores)
                        class_img = vis_img(image, det_boxes[0], roi_labels[0], roi_scores[0], edge_labels[0:int(edge_num[0])].cpu().numpy(), score_thresh=0.7)
                        action_img = vis_img(image_temp, det_boxes[0], roi_labels[0], roi_scores[0], raw_outputs, score_thresh=0.7)
                        # NOTE(review): add_image is called without a global_step,
                        # so successive images overwrite each other — confirm intended.
                        writer.add_image('gt_detection', np.array(class_img).transpose(2,0,1))
                        writer.add_image('action_detection', np.array(action_img).transpose(2,0,1))
                        writer.add_text('img_name', img_name[0], epoch)

                idx+=1
                # accumulate loss of each batch
                running_loss += loss.item() * edge_labels.shape[0]
            # calculate the loss and accuracy of each epoch
            epoch_loss = running_loss / len(dataset[phase])
            # import ipdb; ipdb.set_trace()
            # log trainval datas, and visualize them in the same graph
            if phase == 'train':
                # train_loss is read in the 'val' branch below; safe because
                # 'train' always runs first in each epoch.
                train_loss = epoch_loss 
                HicoDataset.displaycount() 
            else:
                writer.add_scalars('trainval_loss_epoch', {'train': train_loss, 'val': epoch_loss}, epoch)
            # print data
            if (epoch % args.print_every) == 0:
                end_time = time.time()
                print("[{}] Epoch: {}/{} Loss: {} Execution time: {}".format(\
                        phase, epoch+1, args.epoch, epoch_loss, (end_time-start_time)))
                        
        # scheduler.step()
        # save model
        # NOTE(review): 'or' binds looser than 'and' — this saves when
        # loss < 0.0405, OR when (periodic AND epoch >= 199); confirm intended.
        if epoch_loss<0.0405 or epoch % args.save_every == (args.save_every - 1) and epoch >= (200-1):
            checkpoint = { 
                            'lr': args.lr,
                           'b_s': args.batch_size,
                          'bias': args.bias, 
                            'bn': args.bn, 
                       'dropout': args.drop_prob,
                        'layers': args.layers,
                     'feat_type': args.feat_type,
                    'multi_head': args.multi_attn,
                     'diff_edge': args.diff_edge,
                    'state_dict': model.state_dict()
            }
            save_name = "checkpoint_" + str(epoch+1) + '_epoch.pth'
            torch.save(checkpoint, os.path.join(args.save_dir, args.exp_ver, 'epoch_train', save_name))

    writer.close()
    print('Finishing training!')
def main():
  """End-to-end re-ID training/evaluation driver.

  Reads everything from ``Config()``: redirects logs, builds train/test
  datasets, the model with global/local/id losses, and the Adam optimizer;
  optionally resumes from a checkpoint or runs test-only; otherwise trains
  with per-step and per-epoch console logging, TensorBoard scalars (when
  ``cfg.log_to_file``), a checkpoint after every epoch, and a final test.
  """
  cfg = Config()

  # Redirect logs to both console and file.
  if cfg.log_to_file:
    ReDirectSTD(cfg.stdout_file, 'stdout', False)
    ReDirectSTD(cfg.stderr_file, 'stderr', False)

  # Lazily create SummaryWriter
  writer = None

  # TVT: transfer-to-device helper; TMO: transfer modules/optims helper.
  TVT, TMO = set_devices(cfg.sys_device_ids)

  if cfg.seed is not None:
    set_seed(cfg.seed)

  # Dump the configurations to log.
  import pprint
  print('-' * 60)
  print('cfg.__dict__')
  pprint.pprint(cfg.__dict__)
  print('-' * 60)

  ###########
  # Dataset #
  ###########

  train_set = create_dataset(**cfg.train_set_kwargs)

  test_sets = []
  test_set_names = []
  if cfg.dataset == 'combined':
    for name in ['market1501', 'cuhk03', 'duke']:
      cfg.test_set_kwargs['name'] = name
      test_sets.append(create_dataset(**cfg.test_set_kwargs))
      test_set_names.append(name)
  else:
    test_sets.append(create_dataset(**cfg.test_set_kwargs))
    test_set_names.append(cfg.dataset)

  ###########
  # Models  #
  ###########

  model = Model(local_conv_out_channels=cfg.local_conv_out_channels,
                num_classes=len(train_set.ids2labels))
  # Model wrapper
  model_w = DataParallel(model)

  #############################
  # Criteria and Optimizers   #
  #############################

  id_criterion = nn.CrossEntropyLoss()
  g_tri_loss = TripletLoss(margin=cfg.global_margin)
  l_tri_loss = TripletLoss(margin=cfg.local_margin)

  optimizer = optim.Adam(model.parameters(),
                         lr=cfg.base_lr,
                         weight_decay=cfg.weight_decay)

  # Bind them together just to save some codes in the following usage.
  modules_optims = [model, optimizer]

  ################################
  # May Resume Models and Optims #
  ################################

  if cfg.resume:
    # NOTE: resume_ep is only bound here; it is read below under the same
    # cfg.resume guard.
    resume_ep, scores = load_ckpt(modules_optims, cfg.ckpt_file)

  # May Transfer Models and Optims to Specified Device. Transferring optimizer
  # is to cope with the case when you load the checkpoint to a new device.
  TMO(modules_optims)

  ########
  # Test #
  ########

  def test(load_model_weight=False):
    """Evaluate on every test set; optionally load weights first."""
    if load_model_weight:
      if cfg.model_weight_file != '':
        map_location = (lambda storage, loc: storage)
        sd = torch.load(cfg.model_weight_file, map_location=map_location)
        load_state_dict(model, sd)
        print('Loaded model weights from {}'.format(cfg.model_weight_file))
      else:
        load_ckpt(modules_optims, cfg.ckpt_file)

    use_local_distance = (cfg.l_loss_weight > 0) \
                         and cfg.local_dist_own_hard_sample

    for test_set, name in zip(test_sets, test_set_names):
      test_set.set_feat_func(ExtractFeature(model_w, TVT))
      print('\n=========> Test on dataset: {} <=========\n'.format(name))
      test_set.eval(
        normalize_feat=cfg.normalize_feature,
        use_local_distance=use_local_distance)

  if cfg.only_test:
    test(load_model_weight=True)
    return

  ############
  # Training #
  ############

  start_ep = resume_ep if cfg.resume else 0
  for ep in range(start_ep, cfg.total_epochs):

    # Adjust Learning Rate
    if cfg.lr_decay_type == 'exp':
      adjust_lr_exp(
        optimizer,
        cfg.base_lr,
        ep + 1,
        cfg.total_epochs,
        cfg.exp_decay_at_epoch)
    else:
      adjust_lr_staircase(
        optimizer,
        cfg.base_lr,
        ep + 1,
        cfg.staircase_decay_at_epochs,
        cfg.staircase_decay_multiply_factor)

    may_set_mode(modules_optims, 'train')

    # Per-epoch meters: g_* = global branch, l_* = local branch.
    g_prec_meter = AverageMeter()
    g_m_meter = AverageMeter()
    g_dist_ap_meter = AverageMeter()
    g_dist_an_meter = AverageMeter()
    g_loss_meter = AverageMeter()

    l_prec_meter = AverageMeter()
    l_m_meter = AverageMeter()
    l_dist_ap_meter = AverageMeter()
    l_dist_an_meter = AverageMeter()
    l_loss_meter = AverageMeter()

    id_loss_meter = AverageMeter()

    loss_meter = AverageMeter()

    ep_st = time.time()
    step = 0
    epoch_done = False
    while not epoch_done:

      step += 1
      step_st = time.time()

      ims, im_names, labels, mirrored, epoch_done = train_set.next_batch()

      ims_var = Variable(TVT(torch.from_numpy(ims).float()))
      labels_t = TVT(torch.from_numpy(labels).long())
      labels_var = Variable(labels_t)

      global_feat, local_feat, logits = model_w(ims_var)

      g_loss, p_inds, n_inds, g_dist_ap, g_dist_an, g_dist_mat = global_loss(
        g_tri_loss, global_feat, labels_t,
        normalize_feature=cfg.normalize_feature)

      if cfg.l_loss_weight == 0:
        l_loss = 0
      elif cfg.local_dist_own_hard_sample:
        # Let local distance find its own hard samples.
        l_loss, l_dist_ap, l_dist_an, _ = local_loss(
          l_tri_loss, local_feat, None, None, labels_t,
          normalize_feature=cfg.normalize_feature)
      else:
        # Reuse the global branch's hard-sample indices.
        l_loss, l_dist_ap, l_dist_an = local_loss(
          l_tri_loss, local_feat, p_inds, n_inds, labels_t,
          normalize_feature=cfg.normalize_feature)

      id_loss = 0
      if cfg.id_loss_weight > 0:
        id_loss = id_criterion(logits, labels_var)

      # Weighted sum of the three loss branches.
      loss = g_loss * cfg.g_loss_weight \
             + l_loss * cfg.l_loss_weight \
             + id_loss * cfg.id_loss_weight

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      ############
      # Step Log #
      ############

      # precision
      g_prec = (g_dist_an > g_dist_ap).data.float().mean()
      # the proportion of triplets that satisfy margin
      g_m = (g_dist_an > g_dist_ap + cfg.global_margin).data.float().mean()
      g_d_ap = g_dist_ap.data.mean()
      g_d_an = g_dist_an.data.mean()

      g_prec_meter.update(g_prec)
      g_m_meter.update(g_m)
      g_dist_ap_meter.update(g_d_ap)
      g_dist_an_meter.update(g_d_an)
      g_loss_meter.update(to_scalar(g_loss))

      if cfg.l_loss_weight > 0:
        # precision
        l_prec = (l_dist_an > l_dist_ap).data.float().mean()
        # the proportion of triplets that satisfy margin
        l_m = (l_dist_an > l_dist_ap + cfg.local_margin).data.float().mean()
        l_d_ap = l_dist_ap.data.mean()
        l_d_an = l_dist_an.data.mean()

        l_prec_meter.update(l_prec)
        l_m_meter.update(l_m)
        l_dist_ap_meter.update(l_d_ap)
        l_dist_an_meter.update(l_d_an)
        l_loss_meter.update(to_scalar(l_loss))

      if cfg.id_loss_weight > 0:
        id_loss_meter.update(to_scalar(id_loss))

      loss_meter.update(to_scalar(loss))

      if step % cfg.log_steps == 0:
        time_log = '\tStep {}/Ep {}, {:.2f}s'.format(
          step, ep + 1, time.time() - step_st, )

        if cfg.g_loss_weight > 0:
          g_log = (', gp {:.2%}, gm {:.2%}, '
                   'gd_ap {:.4f}, gd_an {:.4f}, '
                   'gL {:.4f}'.format(
            g_prec_meter.val, g_m_meter.val,
            g_dist_ap_meter.val, g_dist_an_meter.val,
            g_loss_meter.val, ))
        else:
          g_log = ''

        if cfg.l_loss_weight > 0:
          l_log = (', lp {:.2%}, lm {:.2%}, '
                   'ld_ap {:.4f}, ld_an {:.4f}, '
                   'lL {:.4f}'.format(
            l_prec_meter.val, l_m_meter.val,
            l_dist_ap_meter.val, l_dist_an_meter.val,
            l_loss_meter.val, ))
        else:
          l_log = ''

        if cfg.id_loss_weight > 0:
          id_log = (', idL {:.4f}'.format(id_loss_meter.val))
        else:
          id_log = ''

        total_loss_log = ', loss {:.4f}'.format(loss_meter.val)

        log = time_log + \
              g_log + l_log + id_log + \
              total_loss_log
        print(log)

    #############
    # Epoch Log #
    #############

    time_log = 'Ep {}, {:.2f}s'.format(ep + 1, time.time() - ep_st, )

    if cfg.g_loss_weight > 0:
      g_log = (', gp {:.2%}, gm {:.2%}, '
               'gd_ap {:.4f}, gd_an {:.4f}, '
               'gL {:.4f}'.format(
        g_prec_meter.avg, g_m_meter.avg,
        g_dist_ap_meter.avg, g_dist_an_meter.avg,
        g_loss_meter.avg, ))
    else:
      g_log = ''

    if cfg.l_loss_weight > 0:
      l_log = (', lp {:.2%}, lm {:.2%}, '
               'ld_ap {:.4f}, ld_an {:.4f}, '
               'lL {:.4f}'.format(
        l_prec_meter.avg, l_m_meter.avg,
        l_dist_ap_meter.avg, l_dist_an_meter.avg,
        l_loss_meter.avg, ))
    else:
      l_log = ''

    if cfg.id_loss_weight > 0:
      id_log = (', idL {:.4f}'.format(id_loss_meter.avg))
    else:
      id_log = ''

    total_loss_log = ', loss {:.4f}'.format(loss_meter.avg)

    log = time_log + \
          g_log + l_log + id_log + \
          total_loss_log
    print(log)

    # Log to TensorBoard

    if cfg.log_to_file:
      if writer is None:
        writer = SummaryWriter(log_dir=osp.join(cfg.exp_dir, 'tensorboard'))
      writer.add_scalars(
        'loss',
        dict(global_loss=g_loss_meter.avg,
             local_loss=l_loss_meter.avg,
             id_loss=id_loss_meter.avg,
             loss=loss_meter.avg, ),
        ep)
      writer.add_scalars(
        'tri_precision',
        dict(global_precision=g_prec_meter.avg,
             local_precision=l_prec_meter.avg, ),
        ep)
      writer.add_scalars(
        'satisfy_margin',
        dict(global_satisfy_margin=g_m_meter.avg,
             local_satisfy_margin=l_m_meter.avg, ),
        ep)
      writer.add_scalars(
        'global_dist',
        dict(global_dist_ap=g_dist_ap_meter.avg,
             global_dist_an=g_dist_an_meter.avg, ),
        ep)
      writer.add_scalars(
        'local_dist',
        dict(local_dist_ap=l_dist_ap_meter.avg,
             local_dist_an=l_dist_an_meter.avg, ),
        ep)

    # save ckpt
    if cfg.log_to_file:
      save_ckpt(modules_optims, ep + 1, 0, cfg.ckpt_file)

  ########
  # Test #
  ########

  test(load_model_weight=False)
Exemplo n.º 37
0
class BaseTrainer:
    """
    Base class for all trainers
    """
    def __init__(self, model, loss, resume, config, train_logger=None):
        """Set up logging, device placement, optimizer and checkpointing.

        :param model: model exposing .to()/.parallelize()/.optimize()
        :param loss: loss object, stored for subclasses
        :param resume: checkpoint path to resume from, or falsy to start fresh
        :param config: dict-like experiment configuration
        :param train_logger: optional logger collecting per-epoch entries
        """
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.model = model
        self.loss = loss
        self.name = config['name']
        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.summary_writer = SummaryWriter()

        # check cuda available
        if torch.cuda.is_available():
            if config['cuda']:
                self.with_cuda = True
                self.gpus = {
                    i: item
                    for i, item in enumerate(self.config['gpus'])
                }
                device = 'cuda'
                # Only parallelize when both the machine and the config
                # provide more than one GPU.
                if torch.cuda.device_count() > 1 and len(self.gpus) > 1:
                    self.model.parallelize()
                torch.cuda.empty_cache()
            else:
                self.with_cuda = False
                device = 'cpu'
        else:
            self.logger.warning(
                'Warning: There\'s no CUDA support on this machine, training is performed on CPU.'
            )
            self.with_cuda = False
            device = 'cpu'

        self.device = torch.device(device)
        self.model.to(self.device)

        # log
        self.logger.debug('Model is initialized.')
        self._log_memory_useage()
        self.train_logger = train_logger

        # optimizer — delegated to the model so subclasses stay agnostic
        self.optimizer = self.model.optimize(config['optimizer_type'],
                                             config['optimizer'])

        # train monitor: which logged metric decides the "best" checkpoint
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf

        # checkpoint path
        self.start_epoch = 1
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           self.name)
        make_dir(self.checkpoint_dir)

        if resume:
            self._resume_checkpoint(resume)

    def _train_epoch(self, epoch):
        """
        Training logic for an epoch

        :param epoch: Current epoch number
        """
        raise NotImplementedError

    def train(self):
        """
        Full training logic
        """
        print('Total epochs: {}'.format(self.epochs))
        for epoch in range(self.start_epoch, self.epochs + 1):
            try:
                result = self._train_epoch(epoch)
            except torch.cuda.CudaError:
                self._log_memory_useage()

            log = {'epoch': epoch}
            for key, value in result.items():
                log[key] = value

            # log info
            if self.train_logger is not None:
                self.train_logger.add_entry(log)
                if self.verbosity >= 1:
                    for key, value in log.items():
                        self.logger.info('    {:15s}: {}'.format(
                            str(key), value))

            # save checkpoints
            if (self.monitor_mode == 'min' and log[self.monitor] < self.monitor_best) \
                    or (self.monitor_mode == 'max' and log[self.monitor] > self.monitor_best):
                self.monitor_best = log[self.monitor]
                self._save_checkpoint(epoch, log, save_best=True)

            if epoch % self.save_freq == 0:
                self._save_checkpoint(epoch, log)

            self.summary_writer.add_scalars('HMEAN',
                                            {'hmean': result['hmean']}, epoch)
            self.summary_writer.add_scalars('LOSS',
                                            {'train_loss': result['loss']},
                                            epoch)

        self.summary_writer.close()

    def _log_memory_useage(self):
        if not self.with_cuda:
            return

        template = """Memory Usage: \n{}"""
        usage = []
        for deviceID, device in self.gpus.items():
            deviceID = int(deviceID)
            allocated = torch.cuda.memory_allocated(deviceID) / (1024 * 1024)
            cached = torch.cuda.memory_cached(deviceID) / (1024 * 1024)
            usage.append(
                '    CUDA: {}  Allocated: {} MB Cached: {} MB \n'.format(
                    device, allocated, cached))

        content = ''.join(usage)
        content = template.format(content)

        self.logger.debug(content)

    def _save_checkpoint(self, epoch, log, save_best=False):
        """
        Saving checkpoints

        :param epoch: current epoch number
        :param log: logging information of the epoch
        :param save_best: if True, rename the saved checkpoint to 'model_best.pth.tar'
        """
        arch = type(self.model).__name__
        if save_best:
            state = {
                'arch': arch,
                'epoch': epoch,
                'state_dict': self.model.state_dict()
            }
            filename = os.path.join(self.checkpoint_dir, 'model_best.pth.tar')
            torch.save(state, filename)
            self.logger.info(
                "Saving current best: {} ...".format('model_best.pth.tar'))
        else:
            state = {
                'arch': arch,
                'epoch': epoch,
                'logger': self.train_logger,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'monitor_best': self.monitor_best
            }
            filename = os.path.join(
                self.checkpoint_dir,
                'checkpoint-epoch{:03d}-loss-{:.4f}.pth.tar'.format(
                    epoch, log['loss']))
            torch.save(state, filename)
            self.logger.info("Saving checkpoint: {} ...".format(filename))

    def _resume_checkpoint(self, resume_path):
        """
        Resume from saved checkpoints

        :param resume_path: Checkpoint path to be resumed
        """
        self.logger.info("Loading checkpoint: {} ...".format(resume_path))
        checkpoint = torch.load(resume_path)
        # Resume from the epoch *after* the one stored in the checkpoint.
        self.start_epoch = checkpoint['epoch'] + 1
        self.monitor_best = checkpoint['monitor_best']
        self.model.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        # Move restored optimizer state tensors onto the GPU when training
        # with CUDA, so later step() calls do not mix devices.
        if self.with_cuda:
            for state in self.optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda(torch.device('cuda'))
        self.train_logger = checkpoint['logger']
        self.logger.info("Checkpoint '{}' (epoch {}) loaded".format(
            resume_path, self.start_epoch))