Example #1
def _get_lr_scheduler(opt):
    if 'lr_factor' not in opt or opt.lr_factor >= 1:
        return opt.lr, None
    global lr_steps, batch_size
    lr, lr_factor = opt.lr, opt.lr_factor
    start_epoch = opt.start_epoch
    num_examples = get_num_examples(opt.dataset)
    its_per_epoch = math.ceil(num_examples / batch_size)

    # move forward to start epoch
    for s in lr_steps:
        if start_epoch >= s:
            lr *= lr_factor
    if lr != opt.lr:
        logger.info('Adjust learning rate to %e for epoch %d', lr, start_epoch)

    steps = [
        its_per_epoch * (epoch - start_epoch) for epoch in lr_steps
        if epoch - start_epoch > 0
    ]
    if steps:
        return lr, lr_scheduler.MultiFactorScheduler(step=steps,
                                                     factor=lr_factor)
    else:
        return lr, None
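
A minimal sketch (hypothetical wiring; the original snippet ends at the return) of how the returned pair is typically consumed when building the optimizer:

lr, sched = _get_lr_scheduler(opt)
optimizer_params = {'learning_rate': lr}
if sched is not None:
    optimizer_params['lr_scheduler'] = sched  # MXNet queries it once per update
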
Example #2
def learning_rate_scheduler(self):
    print("learning_rate \t base_lr: %f, scheduler: MultiFactorScheduler" %
          self._base_lr)
    lr_sch = lr_scheduler.MultiFactorScheduler(
        step=[int(self._num_epochs * 0.5),
              int(self._num_epochs * 0.75)],
        factor=0.1)
    lr_sch.base_lr = self._base_lr
    return lr_sch
Example #3
def init_trainer_0(neural_network, number_of_batches):
    steps_iterations = [s * number_of_batches for s in SCHEDULER_STEPS]
    schedule = lr_scheduler.MultiFactorScheduler(step=steps_iterations,
                                                 factor=SCHEDULER_FACTOR)
    schedule.base_lr = LEARNING_RATE
    sgd_optimizer = optimizer.SGD(learning_rate=LEARNING_RATE,
                                  momentum=MOMENTUM,
                                  lr_scheduler=schedule)
    trainer = gluon.Trainer(params=neural_network.collect_params(),
                            optimizer=sgd_optimizer)
    return trainer
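
The module-level constants referenced above are defined elsewhere in the source project; illustrative values (assumptions, not the originals) might look like:

SCHEDULER_STEPS = [30, 60]  # decay after these many epochs
SCHEDULER_FACTOR = 0.1      # multiply the learning rate by this at each step
LEARNING_RATE = 0.1
MOMENTUM = 0.9
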
Example #4
def generate_lr_scheduler(ls_dict):
    scheduler_type = ls_dict['type']
    scheduler_param = ls_dict['lr_scheduler_config']
    factor = float(scheduler_param['factor'])
    if scheduler_type == 'Factor':
        step = int(scheduler_param['step'])
        stop_factor_lr = float(scheduler_param['stop_factor_lr'])
        return ls.FactorScheduler(step, factor, stop_factor_lr)
    elif scheduler_type == 'MultiFactor':
        steps = scheduler_param['steps']
        step_list = [int(step) for step in steps]
        return ls.MultiFactorScheduler(step=step_list, factor=factor)
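
A hypothetical ls_dict this helper accepts, with key names taken from the lookups above (the values are illustrative):

import mxnet.lr_scheduler as ls  # the `ls` alias assumed by the snippet

config = {
    'type': 'MultiFactor',
    'lr_scheduler_config': {'factor': '0.1', 'steps': ['30000', '60000']},
}
sched = generate_lr_scheduler(config)  # -> ls.MultiFactorScheduler
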
Example #5
num_workers = hvd.size()
rank = hvd.rank()
local_rank = hvd.local_rank()

num_classes = 1000
num_training_samples = 1281167
batch_size = args.batch_size
epoch_size = \
    int(math.ceil(int(num_training_samples // num_workers) / batch_size))

if args.lr_mode == 'step':
    lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    steps = [epoch_size * x for x in lr_decay_epoch]
    lr_sched = lr_scheduler.MultiFactorScheduler(
        step=steps,
        factor=args.lr_decay,
        base_lr=(args.lr * num_workers),
        warmup_steps=(args.warmup_epochs * epoch_size),
        warmup_begin_lr=args.warmup_lr)
elif args.lr_mode == 'poly':
    lr_sched = lr_scheduler.PolyScheduler(args.num_epochs * epoch_size,
                                          base_lr=(args.lr * num_workers),
                                          pwr=2,
                                          warmup_steps=(args.warmup_epochs *
                                                        epoch_size),
                                          warmup_begin_lr=args.warmup_lr)
elif args.lr_mode == 'cosine':
    lr_sched = lr_scheduler.CosineScheduler(args.num_epochs * epoch_size,
                                            base_lr=(args.lr * num_workers),
                                            warmup_steps=(args.warmup_epochs *
                                                          epoch_size),
                                            warmup_begin_lr=args.warmup_lr)
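
A sketch (an assumption; the snippet ends before the optimizer is built) of how lr_sched is usually attached in a Horovod run; the argument names on args are also assumed:

opt = mx.optimizer.SGD(momentum=args.momentum, wd=args.wd,
                       lr_scheduler=lr_sched,
                       rescale_grad=1.0 / batch_size)
opt = hvd.DistributedOptimizer(opt)  # averages gradients across workers
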
Example #6
def train(
        backbone,
        root_dir,
        train_index_fp,
        pretrain_model,
        optimizer,
        epochs=50,
        lr=0.001,
        wd=5e-4,
        momentum=0.9,
        batch_size=4,
        ctx=mx.cpu(),
        verbose_step=5,
        output_dir='ckpt',
):
    output_dir = os.path.join(output_dir, backbone)
    os.makedirs(output_dir, exist_ok=True)
    num_kernels = 3
    dataset = StdDataset(root_dir=root_dir,
                         train_idx_fp=train_index_fp,
                         num_kernels=num_kernels - 1)
    if not isinstance(ctx, (list, tuple)):
        ctx = [ctx]
    batch_size = batch_size * len(ctx)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    net = PSENet(base_net_name=backbone,
                 num_kernels=num_kernels,
                 ctx=ctx,
                 pretrained=True)
    # initial params
    net.initialize(mx.init.Xavier(), ctx=ctx)
    net.collect_params("extra_.*_weight|decoder_.*_weight").initialize(
        mx.init.Xavier(), ctx=ctx, force_reinit=True)
    net.collect_params("extra_.*_bias|decoder_.*_bias").initialize(
        mx.init.Zero(), ctx=ctx, force_reinit=True)

    if pretrain_model is not None:
        net.load_parameters(pretrain_model,
                            ctx=ctx,
                            allow_missing=True,
                            ignore_extra=True)

    # pse_loss = DiceLoss(lam=0.7, num_kernels=num_kernels)
    pse_loss = DiceLoss_with_OHEM(lam=0.7,
                                  num_kernels=num_kernels,
                                  debug=False)

    # lr_scheduler = ls.PolyScheduler(
    #     max_update=icdar_loader.length * epochs // batch_size, base_lr=lr
    # )
    max_update = len(dataset) * epochs // batch_size
    lr_scheduler = ls.MultiFactorScheduler(
        base_lr=lr, step=[max_update // 3, max_update * 2 // 3], factor=0.1)

    optimizer_params = {
        'learning_rate': lr,
        'wd': wd,
        'momentum': momentum,
        'lr_scheduler': lr_scheduler,
    }
    if optimizer.lower() == 'adam':
        optimizer_params.pop('momentum')

    trainer = Trainer(net.collect_params(),
                      optimizer=optimizer,
                      optimizer_params=optimizer_params)
    summary_writer = SummaryWriter(output_dir)
    for e in range(epochs):
        cumulative_loss = 0

        num_batches = 0
        for i, item in enumerate(loader):
            item_ctxs = [split_and_load(field, ctx) for field in item]
            loss_list = []
            for im, gt_text, gt_kernels, training_masks, ori_img in zip(
                    *item_ctxs):
                gt_text = gt_text[:, ::4, ::4]
                gt_kernels = gt_kernels[:, :, ::4, ::4]
                training_masks = training_masks[:, ::4, ::4]

                with autograd.record():
                    kernels_pred = net(im)  # kernels_pred[0] is the prediction for the complete text
                    loss = pse_loss(gt_text, gt_kernels, kernels_pred,
                                    training_masks)
                    loss_list.append(loss)
            mean_loss = []
            for loss in loss_list:
                loss.backward()
                mean_loss.append(mx.nd.mean(to_cpu(loss)).asscalar())
            mean_loss = np.mean(mean_loss)
            trainer.step(batch_size)

            if i % verbose_step == 0:
                global_steps = dataset.length * e + i * batch_size
                summary_writer.add_scalar('loss', mean_loss, global_steps)
                summary_writer.add_scalar(
                    'c_loss',
                    mx.nd.mean(to_cpu(pse_loss.C_loss)).asscalar(),
                    global_steps,
                )
                summary_writer.add_scalar(
                    'kernel_loss',
                    mx.nd.mean(to_cpu(pse_loss.kernel_loss)).asscalar(),
                    global_steps,
                )
                summary_writer.add_scalar('pixel_accuracy', pse_loss.pixel_acc,
                                          global_steps)
            if i % 1 == 0:  # i.e. every iteration
                logger.info(
                    "step: {}, lr: {}, "
                    "loss: {}, score_loss: {}, kernel_loss: {}, pixel_acc: {}, kernel_acc: {}"
                    .format(
                        i * batch_size,
                        trainer.learning_rate,
                        mean_loss,
                        mx.nd.mean(to_cpu(pse_loss.C_loss)).asscalar(),
                        mx.nd.mean(to_cpu(pse_loss.kernel_loss)).asscalar(),
                        pse_loss.pixel_acc,
                        pse_loss.kernel_acc,
                    ))
            cumulative_loss += mean_loss
            num_batches += 1
        summary_writer.add_scalar('mean_loss_per_epoch',
                                  cumulative_loss / num_batches, global_steps)
        logger.info("Epoch {}, mean loss: {}\n".format(
            e, cumulative_loss / num_batches))
        net.save_parameters(
            os.path.join(output_dir, model_fn_prefix(backbone, e)))

    summary_writer.add_image('complete_gt', to_cpu(gt_text[0:1, :, :]),
                             global_steps)
    summary_writer.add_image('complete_pred',
                             to_cpu(kernels_pred[0:1, 0, :, :]), global_steps)
    summary_writer.add_images(
        'kernels_gt',
        to_cpu(gt_kernels[0:1, :, :, :]).reshape(-1, 1, 0, 0),
        global_steps,
    )
    summary_writer.add_images(
        'kernels_pred',
        to_cpu(kernels_pred[0:1, 1:, :, :]).reshape(-1, 1, 0, 0),
        global_steps,
    )

    summary_writer.close()
Example #7
    number_classes = len(classes)

    net = ssd.SSD(number_classes)

    if os.path.exists(pretrained):
        net.load_parameters(pretrained)
        print('finetune based on ', pretrained)
    net.hybridize(static_alloc=True, static_shape=True)
    #net.hybridize()
    net.collect_params().reset_ctx(ctx)

    #lr_sch = lr_scheduler.MultiFactorScheduler(step=[int(num_epochs * 0.45), int(num_epochs * 0.7) ], factor=0.1, base_lr = base_lr, warmup_steps = 10)
    #lr_sch = lr_scheduler.MultiFactorScheduler(step=[int(num_epochs * 0.7) ], factor=0.1)
    #lr_sch.base_lr = base_lr
    #lr_sch = lr_schs.CosineScheduler(num_epochs,base_lr=base_lr,warmup=10)
    lr_sch = lr_scheduler.MultiFactorScheduler(
        step=[int(num_epochs * 0.45),
              int(num_epochs * 0.7)], factor=0.1)
    #lr_sch = lr_scheduler.MultiFactorScheduler(step=[int(num_epochs * 0.7)], factor=0.1)
    lr_sch.base_lr = base_lr

    trainer = Trainer(net.collect_params(),
                      optimizer="sgd",
                      optimizer_params={
                          "wd": wd,
                          "momentum": momentum
                      })

    train_ssd_custom(net, train_iter, test_iter, batch_size, trainer, ctx,
                     num_epochs, lr_sch, output_prefix)
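
Since lr_sch is handed to the custom loop rather than to the Trainer here, a plausible per-epoch update inside train_ssd_custom (a sketch, not the original implementation) mirrors Examples #8-#10:

for epoch in range(num_epochs):
    trainer.set_learning_rate(lr_sch(epoch))  # scheduler queried with the epoch index
    # ... run one epoch of training ...
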
Example #8
def train_eval(opt):
    mx.random.seed(123)
    np.random.seed(123)
    os.environ['CUDA_VISIBLE_DEVICES']='0,1,2,3'
    gpus = [] if opt.gpus is None or opt.gpus == '' else [
        int(gpu) for gpu in opt.gpus.split(',')]
    num_gpus = len(gpus)
    batch_size = opt.batch_per_device*max(1,num_gpus)
    context = [mx.gpu(i) for i in gpus] if num_gpus>0 else [mx.cpu()]
    steps = [int(step) for step in opt.lr_scheduler_steps.split(',')]

    vis_env = opt.dataset + opt.output
    vis = Visulizer(env=vis_env)
    vis.log(opt)

    # choose dataset (ucf101 or meitu): build the net, loss criterion, and train/val loaders
    if opt.dataset=='ucf101' or opt.dataset=='ucf':
        net = R2Plus2D(num_class=101,model_depth=opt.model_depth)
        loss_criterion = gloss.SoftmaxCrossEntropyLoss() # loss function
        train_loader, val_loader = get_ucf101trainval(datadir='/data/jh/notebooks/hudengjun/DeepVideo/UCF-101',
                                                      batch_size=batch_size,
                                                      n_frame=opt.n_frame,
                                                      crop_size=opt.crop_size,
                                                      scale_h=opt.scale_h,
                                                      scale_w=opt.scale_w,
                                                      num_workers=opt.num_workers)  # the train and evaluation data loader
    elif opt.dataset =='meitu':
        net = R2Plus2D(num_class=63,model_depth=opt.model_depth,final_temporal_kernel=opt.n_frame//8) # labels set 63

        # train_loader,val_loader = get_meitu_dataloader(data_dir=opt.meitu_dir,
        #                                                device_id=opt.decoder_gpu,
        #                                                batch_size=batch_size,
        #                                                n_frame=opt.n_frame,
        #                                                crop_size=opt.crop_size,
        #                                                scale_h=opt.scale_h,
        #                                                scale_w=opt.scale_w,
        #                                                num_workers=opt.num_workers) # use multi gpus to load data
        train_loader, val_loader = get_meitu_dataloader(data_dir=opt.meitu_dir,
                                                        device_id=opt.decoder_gpu,
                                                        batch_size=batch_size,
                                                        num_workers=opt.num_workers,
                                                        n_frame=opt.n_frame,
                                                        crop_size=opt.crop_size,
                                                        scale_h=opt.scale_h,
                                                        scale_w=opt.scale_w,
                                                        cache_size=opt.cache_size)

    # debug notes: an error occurred around step 66 in data/nvvl_meitu.py;
    # create find_nvv_error.py and port train_nvvl_r3d.py piece by piece
    # to locate it

    loss_dict = {'bce':gloss.SigmoidBinaryCrossEntropyLoss,
                 'warp_nn':WarpLoss,
                 'warp_fn':WARP_funcLoss,
                 'lsep_nn':LsepLoss,
                 'lsep_fn':LSEP_funcLoss}
    if opt.loss_type == 'lsep_nnh':
        loss_criterion = LsepLossHy(batch_size=batch_size//num_gpus,num_class=opt.num_class)
        loss_criterion.hybridize()
    elif opt.loss_type =='bce':
        loss_criterion = gloss.SigmoidBinaryCrossEntropyLoss()
        loss_criterion.hybridize()
    else:
        loss_criterion = loss_dict[opt.loss_type]()

    # initialize parameters on all devices
    net.initialize(mx.init.Xavier(), ctx=context)
    if opt.pretrained is not None:
        if opt.pretrained.endswith('.pkl'):
            net.load_from_caffe2_pickle(opt.pretrained)
        elif opt.pretrained.endswith('.params'):
            try:
                print("load pretrained params ",opt.pretrained)
                net.load_from_sym_params(opt.pretrained,ctx = context)
            except Exception as e:
                print("load as sym params failed,reload as gluon params")
                net.load_params(opt.pretrained,ctx=context)
                #load params to net context

    net.hybridize()
    trainer = gluon.Trainer(net.collect_params(),'sgd',
                            {'learning_rate':opt.lr,'momentum':0.9,'wd':opt.wd},
                            kvstore=opt.kvstore) # the trainer

    lr_steps = lr_schedualer.MultiFactorScheduler(steps,opt.lr_schedualer_factor)
    lr_steps.base_lr = opt.lr

    best_eval = 0.0
    for epoch in range(opt.num_epoch):
        tic = time()
        pre_loss,cumulative_loss = 0.0,0.0
        trainer.set_learning_rate(lr_steps(epoch))
        vis.log('Epoch %d learning rate %f'%(epoch,trainer.learning_rate))

        for i,(data,label) in enumerate(train_loader):
            try:
                data_list = gluon.utils.split_and_load(data,ctx_list=context,batch_axis=0)
                label_list = gluon.utils.split_and_load(label,ctx_list=context,batch_axis=0)
            except Exception as e:
                logging.info(e)
                continue
            Ls =[]
            with autograd.record():
                for x,y in zip(data_list,label_list):
                    y_hat = net(x)
                    loss = loss_criterion(y_hat,y)
                    Ls.append(loss)
                    cumulative_loss +=nd.mean(loss).asscalar()
                for L in Ls:
                    L.backward()
            trainer.step(data.shape[0])
            if (i+1)%opt.log_interval ==0:
                vis.log('[Epoch %d,Iter %d ] training loss= %f'%(
                    epoch,i+1,cumulative_loss-pre_loss
                ))
                vis.plot('loss',cumulative_loss-pre_loss)
                pre_loss =cumulative_loss
                if opt.debug:
                    if (i+1)//(opt.log_interval)==3:
                        break
        vis.log('[Epoch %d] training loss=%f'%(epoch,cumulative_loss))
        vis.log('[Epoch %d] time used: %f'%(epoch,time()-tic))
        vis.log('[Epoch %d] saving net' % epoch)
        save_path = './{0}/{1}_test-val{2}.params'.format(opt.output, str(opt.dataset + opt.loss_type), str(epoch))
        vis.log("save path %s" % (save_path))
        net.save_parameters(save_path)

        best_iou=0.0
        if opt.dataset=='ucf101' or opt.dataset =='ucf':
            acc = nd.array([0],ctx=mx.cpu())
            test_iter = 0
            for i,(data,label) in enumerate(val_loader):
                try:
                    data_list = gluon.utils.split_and_load(data,ctx_list=context,batch_axis=0)
                    label_list = gluon.utils.split_and_load(label,ctx_list=context,batch_axis=0)
                except Exception as e:
                    logging.info(e)
                    continue
                for x,y in zip(data_list,label_list):
                    y_hat = net(x)
                    test_iter +=1 # single iter
                    y_pred = y_hat.argmax(axis=1)
                    acc += (y_pred == y.astype('float32')).mean().asscalar() # acc in cpu

                val_acc = acc.asscalar() / test_iter
                if (i+1) %(opt.log_interval)==0:
                    logging.info("[Epoch %d,Iter %d],acc=%f" % (epoch,i,val_acc))
                    if opt.debug:
                        if (i+1)//opt.log_interval ==3:
                            break
            vis.plot('acc',val_acc)
        elif opt.dataset=='meitu':
            k=4
            topk_inter = np.array([1e-4] * k)  # epsilon to avoid division by zero
            topk_union = np.array([1e-4] * k)

            for i,(data,label) in enumerate(val_loader):
                try:
                    data_list = gluon.utils.split_and_load(data,ctx_list=context,batch_axis=0)
                    label_list = gluon.utils.split_and_load(label,ctx_list=context,batch_axis=0)
                except Exception as e:
                    logging.info(e)
                    continue
                for x,y in zip(data_list,label_list):
                    y_hat = net(x)
                    pred_order = y_hat.argsort()[:, ::-1]  # class indices in descending order of score
                    #just compute top1 label
                    pred_order_np = pred_order.asnumpy()
                    y_np = y.asnumpy()
                    if opt.debug:
                        print("pred shape and target shape",pred_order_np.shape,y_np.shape)
                    for pred_vec, y_vec in zip(pred_order_np, y_np):
                        label_set = set(index for index, value in enumerate(y_vec) if value > 0.1)
                        pred_topk = [set(pred_vec[:kk]) for kk in range(1, k + 1)]
                        topk_inter += np.array([len(p_k.intersection(label_set)) for p_k in pred_topk])
                        topk_union += np.array([len(p_k.union(label_set)) for p_k in pred_topk])
                if (i+1) %(opt.log_interval)==0:
                    logging.info("[Epoch %d,Iter %d],time %s,Iou %s" % (epoch, i, \
                                                                        tmm.strftime("%Y-%D:%H-%S"), \
                                                                        str(topk_inter / topk_union)))

                    for j in range(k):
                        vis.plot('val_iou_{0}'.format(j + 1), topk_inter[j] / topk_union[j])
                    if opt.debug:
                        if (i + 1) // (opt.log_interval) == 2:
                            break
    vis.log("""----------------------------------------
               ----XXXX------finished------------------
               ----------------------------------------""")
Example #9
def train_decision(opt):
    mx.random.seed(123)
    np.random.seed(123)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    gpus = [] if opt.gpus is None or opt.gpus == '' else [
        int(gpu) for gpu in opt.gpus.split(',')
    ]
    num_gpus = len(gpus)
    batch_size = opt.batch_per_device * max(1, num_gpus)
    context = [mx.gpu(i) for i in gpus] if num_gpus > 0 else [mx.cpu()]
    steps = [int(step) for step in opt.lr_scheduler_steps.split(',')]

    feature_net = R2Plus2D(num_class=62, model_depth=34)
    model = Decision_thresh(thresh_size=62)

    if opt.ranking_model is not None:
        feature_net.load_params(opt.ranking_model, ctx=context)
    model.initialize(init=mx.init.Xavier(), ctx=context)
    trainer = mx.gluon.Trainer(model.collect_params(),'sgd',\
                               {'learning_rate':opt.lr,'momentum':0.9,'wd':opt.wd},
                               kvstore=opt.kvstore)
    train_loader, val_loader = get_simple_meitu_dataloader(
        datadir=opt.meitu_dir,
        batch_size=batch_size,
        n_frame=opt.n_frame,
        crop_size=opt.crop_size,
        scale_h=opt.scale_h,
        scale_w=opt.scale_w,
        num_workers=opt.num_workers)
    loss_criterion = gluon.loss.SigmoidBinaryCrossEntropyLoss()
    lr_steps = lr_schedualer.MultiFactorScheduler(steps,
                                                  opt.lr_scheduler_factor)
    best_eval = 0.0
    for epoch in range(opt.num_epoch):
        tic = time()
        pre_loss, cumulative_loss = 0.0, 0.0
        trainer.set_learning_rate(lr_steps(epoch))
        logging.info(
            'Epoch %d learning rate %f to make decision through threshold' %
            (epoch, trainer.learning_rate))
        for i, (data, label) in enumerate(train_loader):
            try:
                data_list = gluon.utils.split_and_load(data,
                                                       ctx_list=context,
                                                       batch_axis=0)
                label_list = gluon.utils.split_and_load(label,
                                                        ctx_list=context,
                                                        batch_axis=0)
            except Exception as e:
                logging.info(e)
                continue
            Ls = []
            confidences = []
            for x in data_list:
                confidences.append(feature_net(x))
            with autograd.record():
                Ls = []
                for conf, y in zip(confidences, label_list):
                    decision = model(conf)
                    loss = loss_criterion(decision, y)
                    Ls.append(loss)
                    cumulative_loss += nd.mean(loss).asscalar()
                for L in Ls:
                    L.backward()
            trainer.step(data.shape[0])
            if (i + 1) % opt.log_interval == 0:
                logging.info('[Epoch %d, Iter %d] training loss=%f' %
                             (epoch + 1, i + 1, cumulative_loss - pre_loss))
                pre_loss = cumulative_loss
                print(model.collect_params()['decision_thresh0_thresh'].data())
            if opt.debug:
                break
        logging.info('[Epoch %d] training loss = %f' %
                     (epoch, cumulative_loss))
        logging.info('[Epoch %d] time used:%f' % (epoch, time() - tic))
        logging.info('[Epoch %d] saving net' % epoch)
        model.save_parameters('./{0}/{1}_decisionmodel_{2}.params'.format(
            opt.output, str(opt.dataset + opt.loss_type), str(epoch)))

        # begin to evaluation the model

        inter = 1e-4
        union = 1e-4
        tic = time()
        for i, (data, label) in enumerate(val_loader):
            try:
                data_list = gluon.utils.split_and_load(data,
                                                       ctx_list=context,
                                                       batch_axis=0)
                label_list = gluon.utils.split_and_load(label,
                                                        ctx_list=context,
                                                        batch_axis=0)
            except Exception as e:
                logging.info(e)
                continue
            try:
                for x, y in zip(data_list, label_list):
                    conf = feature_net(x)
                    sig_label = model(conf)
                    y_np = y.asnumpy()
                    sig_np = sig_label.asnumpy()
                    rows, indexs = np.where(sig_np > 0.5)
                    labelset_list = []
                    for j in range(x.shape[0]):
                        labelset_list.append(set())
                    # (not [set()] * x.shape[0], which would alias one shared set)
                    for row, index in zip(rows, indexs):
                        labelset_list[row].add(index)
                    for pred_set, gt_vec in zip(labelset_list, y_np):
                        gt_set = set([
                            index for index, value in enumerate(gt_vec)
                            if value > 0.1
                        ])
                        inter += len(pred_set.intersection(gt_set))
                        union += len(pred_set.union(gt_set))
            except Exception as e:
                print(e)
                continue
            if (i + 1) % (opt.log_interval) == 0:
                logging.info('[Epoch %d,Iter %d],time %s,IoU %s' %
                             (epoch, i, tmm.strftime("%Y-%D:%H-%S"),
                              str(inter / union)))
                if opt.debug:
                    break
        logging.info("finish one epoch validataion")
        logging.info("[Epoch %d],validation time used %d" %
                     (epoch, time() - tic))
    logging.info("finish all epoch trainning and test")
Example #10
def train_eval(opt):
    mx.random.seed(123)
    np.random.seed(123)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
    gpus = [] if opt.gpus is None or opt.gpus == '' else [
        int(gpu) for gpu in opt.gpus.split(',')]
    num_gpus = len(gpus)
    batch_size = opt.batch_per_device * max(1, num_gpus)
    context = mx.gpu(gpus[0]) if num_gpus > 0 else mx.cpu()  # single device; as_in_context needs a Context, not a list
    steps = [int(step) for step in opt.lr_scheduler_steps.split(',')]

    vis_env = opt.dataset + opt.output
    vis = Visulizer(env=vis_env)
    vis.log(opt)


    net = R2Plus2D_MT(num_scenes=19, num_actions=44, model_depth=opt.model_depth,
                      final_temporal_kernel=opt.n_frame // 8)  # 19 + 44 = 63 labels

    # train_loader,val_loader = get_meitu_dataloader(data_dir=opt.meitu_dir,
    #                                                device_id=opt.decoder_gpu,
    #                                                batch_size=batch_size,
    #                                                n_frame=opt.n_frame,
    #                                                crop_size=opt.crop_size,
    #                                                scale_h=opt.scale_h,
    #                                                scale_w=opt.scale_w,
    #                                                num_workers=opt.num_workers) # use multi gpus to load data
    train_loader, val_loader,sample_weight = get_meitu_multi_task_dataloader(data_dir=opt.meitu_dir,
                                                    device_id=opt.decoder_gpu,
                                                    batch_size=batch_size,
                                                    num_workers=opt.num_workers,
                                                    n_frame=opt.n_frame,
                                                    crop_size=opt.crop_size,
                                                    scale_h=opt.scale_h,
                                                    scale_w=opt.scale_w,
                                                    cache_size=opt.cache_size)

    action_loss = gloss.SoftmaxCrossEntropyLoss()

    #scene_loss = LsepLoss()

    # debug notes: an error occurred around step 66 in data/nvvl_meitu.py;
    # create find_nvv_error.py and port train_nvvl_r3d.py piece by piece
    # to locate it

    loss_dict = {'bce': gloss.SigmoidBinaryCrossEntropyLoss,
                 'warp_nn': WarpLoss,
                 'warp_fn': WARP_funcLoss,
                 'lsep_nn': LsepLoss,
                 'lsep_fn': LSEP_funcLoss}
    scene_loss = loss_dict[opt.loss_type]()

    # if opt.loss_type == 'lsep_nnh':
    #     loss_criterion = LsepLossHy(batch_size=batch_size // num_gpus, num_class=opt.num_class)
    #     loss_criterion.hybridize()
    # elif opt.loss_type == 'bce':
    #     loss_criterion = gloss.SigmoidBinaryCrossEntropyLoss()
    #     loss_criterion.hybridize()
    # else:
    #

    # initialize parameters on the chosen device
    net.initialize(mx.init.Xavier(), ctx=context)
    if opt.pretrained is not None:
        if opt.pretrained.endswith('.pkl'):
            net.load_from_caffe2_pickle(opt.pretrained)
        elif opt.pretrained.endswith('.params'):
            try:
                print("load pretrained params ", opt.pretrained)
                net.load_from_sym_params(opt.pretrained, ctx=context)
            except Exception as e:
                print("load as sym params failed,reload as gluon params")
                net.load_params(opt.pretrained, ctx=context)
                # load params to net context

    #net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr, 'momentum': 0.9, 'wd': opt.wd},
                            kvstore=opt.kvstore)  # the trainer

    lr_steps = lr_schedualer.MultiFactorScheduler(steps, opt.lr_schedualer_factor)
    lr_steps.base_lr = opt.lr

    best_eval = 0.0
    for epoch in range(opt.num_epoch):
        tic = time()
        scene_pre_loss, scene_cumulative_loss = 0.0,0.0
        action_pre_loss,action_cumulative_loss = 0.0, 0.0
        trainer.set_learning_rate(lr_steps(epoch))
        vis.log('Epoch %d learning rate %f' % (epoch, trainer.learning_rate))

        for i, (data, scene_label,action_label) in enumerate(train_loader):
            # single card not split
            with autograd.record():
                data = data.as_in_context(context)
                scene_label = scene_label.as_in_context(context)
                action_label = action_label.as_in_context(context)

                pred_scene,pred_action = net(data)
                loss_scene = scene_loss(pred_scene,scene_label)
                loss_action = action_loss(pred_action,action_label)
                loss = loss_scene + opt.action_rate*loss_action.mean()
                scene_cumulative_loss += nd.mean(loss_scene).asscalar()
                action_cumulative_loss +=nd.mean(loss_action).asscalar()
                loss.backward()
            trainer.step(data.shape[0])
            if (i + 1) % opt.log_interval == 0:
                vis.log('[Epoch %d,Iter %d ] scene loss= %f' % (epoch, i + 1, scene_cumulative_loss - scene_pre_loss))
                vis.plot('scene_loss', scene_cumulative_loss - scene_pre_loss)
                scene_pre_loss = scene_cumulative_loss

                vis.log('[Epoch %d,Iter %d ] action loss= %f' % (epoch, i + 1, action_cumulative_loss - action_pre_loss ))
                vis.plot("action_loss", action_cumulative_loss - action_pre_loss)
                action_pre_loss = action_cumulative_loss

                if opt.debug:
                    if (i + 1) // (opt.log_interval) == 3:
                        break

        vis.log('[Epoch %d] scene loss=%f,action loss=%f' % (epoch, scene_cumulative_loss,action_cumulative_loss))
        vis.log('[Epoch %d] time used: %f' % (epoch, time() - tic))
        vis.log('[Epoch %d] saving net' % epoch)
        save_path = './{0}/{1}_test-val{2}.params'.format(opt.output, str(opt.dataset + 'multi'), str(epoch))
        vis.log("save path %s" % (save_path))
        net.save_parameters(save_path)

        label_inter = 1e-4  # epsilon to avoid division by zero
        label_union = 1e-4
        acc = nd.array([0], ctx=mx.cpu())
        val_iter = 0
        for i,(data,scene_label,action_label) in enumerate(val_loader):
            data = data.as_in_context(context)
            action_label = action_label.as_in_context(context)
            scene_pred,action_pred = net(data)
            scene_order = scene_pred.argsort()[:,::-1]
            scene_order_np = scene_order.asnumpy()
            scene_label_np = scene_label.asnumpy()
            for scene_pred_v,scene_label_v in zip(scene_order_np,scene_label_np):
                label_set = set([index for index,value in enumerate(scene_label_v) if value>0.1])
                pred_top1 = set([scene_pred_v[0]])
                label_inter += len(pred_top1.intersection(label_set))
                label_union += len(pred_top1.union(label_set))

            action_pred = action_pred.argmax(axis=1)
            acc += (action_pred == action_label.astype('float32')).mean().asscalar()
            val_iter +=1
            if (i + 1) % (opt.log_interval) == 0:
                vis.log("[Epoch %d,Iter %d],action_acc= %f"%(epoch,i,acc.asscalar()/val_iter))
                vis.log("[Epoch %d,Iter %d],scene_top1=%f"%(epoch,i,label_inter/label_union))
                if opt.debug:
                    if (i + 1) // (opt.log_interval) == 2:
                        break
    vis.log("""----------------------------------------
               ----XXXX------finished------------------
               ----------------------------------------""")