Example No. 1
def init_trainer_0(neural_network, number_of_batches):
    steps_iterations = [s * number_of_batches for s in SCHEDULER_STEPS]
    schedule = lr_scheduler.MultiFactorScheduler(step=steps_iterations,
                                                 factor=SCHEDULER_FACTOR)
    schedule.base_lr = LEARNING_RATE
    sgd_optimizer = optimizer.SGD(learning_rate=LEARNING_RATE,
                                  momentum=MOMENTUM,
                                  lr_scheduler=schedule)
    trainer = gluon.Trainer(params=neural_network.collect_params(),
                            optimizer=sgd_optimizer)
    return trainer
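The snippet above assumes MXNet's gluon, optimizer and lr_scheduler modules are imported and that the scheduling constants exist at module level. A minimal sketch of that surrounding context, with placeholder values chosen only for illustration:

from mxnet import gluon, lr_scheduler, optimizer

# Placeholder hyperparameters (not taken from the original project)
LEARNING_RATE = 0.1
MOMENTUM = 0.9
SCHEDULER_STEPS = [30, 60, 90]   # epochs at which the learning rate decays
SCHEDULER_FACTOR = 0.1           # multiplicative decay applied at each step

# trainer = init_trainer_0(net, number_of_batches)
# after loss.backward(), call trainer.step(batch_size) once per batch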
Example No. 2
def train(params, loader, model=None):
    epoch = params.get('epoch', 10)
    verbose = params.get("verbose", True)
    batch_size = params.get("batch_size", 32)
    if model is None:
        class_name = params["class_name"]
        layer_num = params.get("layer_num", 5)
        class_num = params.get("class_num", 3)
        s = params.get("s", 4)
        b = params.get("b", 2)
        yolo = Yolo(layer_num, class_num, s=s, b=b, class_name=class_name)

        yolo.initialize(init=Xavier(magnitude=0.02))
    else:
        print("model load finish")
        layer_num = model.layer_num
        class_num = model.class_num
        s = model.s
        b = model.b
        yolo = model
    if verbose:
        print("train params: \n\tepoch:%d \n\tlayer_num:%d \n\tclass_num:%d  \n\ts:%d  \n\tb:%d" % \
              (epoch, layer_num, class_num, s, b))

    ngd = optimizer.SGD(momentum=0.7, learning_rate=0.005)
    trainer = gluon.Trainer(yolo.collect_params(), ngd)

    for ep in range(epoch):
        loader.reset()
        mean_loss = 0
        t1 = time()
        for i, batch in enumerate(loader):
            x = batch.data[0]
            y = batch.label[0].reshape((-1, 5))
            y = translate_y(y, yolo.s, yolo.b, yolo.class_num)
            y = nd.array(y)
            with autograd.record():
                loss_func = TotalLoss(s=s, c=class_num, b=b)
                ypre = yolo(x)  # (32,output_dim)
                loss = nd.mean(loss_func(ypre, y))
                mean_loss += loss.asscalar()
            loss.backward()
            trainer.step(batch_size)
        t2 = time()
        if verbose:
            print("epoch:%d/%d  loss:%.5f  time:%4f" %
                  (ep + 1, epoch, mean_loss / 32, t2 - t1),
                  flush=True)

        print()
    return yolo
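train() reads its hyperparameters from a plain dict; a sketch of the expected keys, inferred from the lookups above (the values shown are placeholders):

params = {
    "class_name": ["cat", "dog", "person"],   # hypothetical label names
    "epoch": 10,
    "verbose": True,
    "batch_size": 32,
    "layer_num": 5,
    "class_num": 3,
    "s": 4,
    "b": 2,
}
# yolo = train(params, loader)   # loader: an MXNet data iterator supporting reset()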
Example No. 3
def train2(params, loader: BaseDataLoader, model=None):
    epoch = params.get('epoch', 10)
    verbose = params.get("verbose", True)
    batch_size = params.get("batch_size", 32)
    if model is None:
        layer_num = params.get("layer_num", 5)
        class_num = params.get("class_num", 3)
        s = params.get("s", 4)
        b = params.get("b", 2)
        yolo = Yolo(layer_num, class_num, s=s, b=b)

        yolo.initialize(init=Xavier(magnitude=0.02))
    else:
        print("model load finish")
        layer_num = model.layer_num
        class_num = model.class_num
        s = model.s
        b = model.b
        yolo = model
    if verbose:
        print("train params: \n\tepoch:%d \n\tlayer_num:%d \n\tclass_num:%d  \n\ts:%d  \n\tb:%d" % \
              (epoch, layer_num, class_num, s, b))

    ngd = optimizer.SGD(momentum=0.7, learning_rate=0.0025)
    trainer = gluon.Trainer(yolo.collect_params(), ngd)

    for ep in range(epoch):
        loss = 0
        all_batch = int(loader.data_number() / batch_size)
        t1 = time()
        for _ in range(all_batch):
            x, y = loader.next_batch(batch_size)
            with autograd.record():
                loss_func = TotalLoss(s=s, c=class_num, b=b)
                ypre = yolo(x)  # (32,output_dim)
                loss = nd.mean(loss_func(ypre, y))
            loss.backward()
            trainer.step(batch_size)

        t2 = time()
        if verbose:
            print("epoch:%d/%d  loss:%.5f  time:%4f" %
                  (ep + 1, epoch, loss.asscalar(), t2 - t1),
                  flush=True)

    return yolo
Example No. 4
def build_optimizer(type, lr, kerasDefaults):


    if type == 'sgd':
        if kerasDefaults['nesterov_sgd']:
            return optimizer.NAG(learning_rate=lr,
                                 momentum=kerasDefaults['momentum_sgd'],
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)
        else:
            return optimizer.SGD(learning_rate=lr,
                                 momentum=kerasDefaults['momentum_sgd'],
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)
    
    elif type == 'rmsprop':
        return optimizer.RMSProp(learning_rate=lr,
                                 gamma1=kerasDefaults['rho'],
                                 epsilon=kerasDefaults['epsilon'],
                                 centered=False,
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)

    elif type == 'adagrad':
        # MXNet's AdaGrad takes `eps` rather than `epsilon`
        return optimizer.AdaGrad(learning_rate=lr,
                                 eps=kerasDefaults['epsilon'])#,
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'])

    elif type == 'adadelta':
        return optimizer.AdaDelta(epsilon=kerasDefaults['epsilon'],
                                  rho=kerasDefaults['rho'])#,
                                  #rescale_grad=kerasDefaults['clipnorm'],
                                  #clip_gradient=kerasDefaults['clipvalue'])

    elif type == 'adam':
        # MXNet's Adam takes `beta1`/`beta2`, not Keras-style `beta_1`/`beta_2`
        return optimizer.Adam(learning_rate=lr, beta1=kerasDefaults['beta_1'],
                              beta2=kerasDefaults['beta_2'],
                              epsilon=kerasDefaults['epsilon'])#,
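build_optimizer pulls Keras-style defaults out of the kerasDefaults dict; a minimal sketch of a call, with illustrative values for exactly the keys the function reads:

kerasDefaults = {
    "nesterov_sgd": False,
    "momentum_sgd": 0.9,
    "rho": 0.9,
    "epsilon": 1e-8,
    "beta_1": 0.9,
    "beta_2": 0.999,
    "clipnorm": 1.0,      # only referenced by the commented-out arguments
    "clipvalue": 0.5,
}
opt = build_optimizer("adam", 0.001, kerasDefaults)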
Example No. 5
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in range(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
      os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    network, num_layers = args.network.split(',')
    print('num_layers', num_layers)
    if args.per_batch_size==0:
      args.per_batch_size = 128
    args.batch_size = args.per_batch_size*args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    os.environ['BETA'] = str(args.beta)
    data_dir_list = args.data_dir.split(',')
    path_imgrecs = []
    path_imglist = None
    args.num_classes = []
    for data_dir in data_dir_list:
      prop = face_image.load_property(data_dir)
      args.num_classes.append(prop.num_classes)
      image_size = prop.image_size
      args.image_h = image_size[0]
      args.image_w = image_size[1]
      print('image_size', image_size)
      assert(args.num_classes[-1]>0)
      print('num_classes', args.num_classes)
      path_imgrecs.append(os.path.join(data_dir, "train.rec"))

    # args.num_classes is a list here (one entry per data_dir), so compare its maximum
    if args.loss_type==1 and max(args.num_classes)>20000:
      args.beta_freeze = 5000
      args.gamma = 0.06

    print('Called with argument:', args)
    data_shape = (args.image_channel,image_size[0],image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained)==0:
      arg_params = None
      aux_params = None
      sym, arg_params, aux_params = get_symbol(network, int(num_layers), args, arg_params, aux_params)
    else:
      vec = args.pretrained.split(',')
      _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
      sym, arg_params, aux_params = get_symbol(network, int(num_layers), args, arg_params, aux_params)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    ctx_group = dict(zip(['dev%d' % (i+1) for i in range(len(ctx))], ctx))
    ctx_group['dev0'] = ctx
    model = mx.mod.Module(
        context       = ctx,
        symbol        = sym,
        data_names    = ['bn1_orig', 'bn1_mask', 'fc1_orig', 'fc1_mask'],
        label_names   = ['softmax_label'],
        group2ctxs    = ctx_group
    )
    val_dataiter = None

    from config import cutout

    data_shapes = [('data', (args.batch_size, args.image_channel, image_size[0], image_size[1]))]
    pretrain_model = get_module(args, data_shapes)

    train_dataiter = FaceImageIter(
        batch_size           = args.batch_size,
        data_shape           = data_shape,
        path_imgrecs         = path_imgrecs,
        shuffle              = True,
        rand_mirror          = args.rand_mirror,
        mean                 = mean,
        cutout               = cutout,
        loss_type            = args.loss_type,
        #margin_m             = args.margin_m,
        #margin_policy        = args.margin_policy,
        #max_steps            = args.max_steps,
        data_names           = ['bn1', 'fc1'],
        downsample_back      = args.downsample_back,
        motion_blur          = args.motion_blur,
        mx_model             = pretrain_model,
    )

    #if args.loss_type<10:
    #  _metric = AccMetric()
    #else:
    #_metric = LossValueMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    for loss_name, loss_idx in zip(['distill_loss', 'softmax_orig', 'softmax_mask'], [2, 3, 4]):
      eval_metrics.add(LossValueMetric(loss_name, loss_idx))
    #eval_metrics = None

    if args.network[1]=='r' or args.network[1]=='y':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    else:
      initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    _rescale = 1.0/args.ctx_num
    #opt = AdaBound()
    #opt = AdaBound(lr=base_lr, wd=base_wd, gamma = 2. / args.max_steps)

    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    lr_scheduler =  mx.lr_scheduler.MultiFactorScheduler(lr_steps, factor=0.1, base_lr=base_lr)
    optimizer_params = {'learning_rate':base_lr,
                        'momentum':base_mom,
                        'wd':base_wd,
                        'rescale_grad':_rescale, 
                        'lr_scheduler': lr_scheduler}

    opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    som = 200
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []

    def ver_test(nbatch):
      return [0]

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in range(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps)==0:
      lr_steps = [40000, 60000, 80000]
      if args.loss_type>=1 and args.loss_type<=7:
        lr_steps = [100000, 140000, 160000]
      p = 512.0/args.batch_size
      for l in range(len(lr_steps)):
        lr_steps[l] = int(lr_steps[l]*p)
    else:
      lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)
    def _batch_callback(param):
      #global global_step
      global_step[0]+=1
      mbatch = global_step[0]
      #for _lr in lr_steps:
      #  if mbatch==args.beta_freeze+_lr:
      #    opt.lr *= 0.1
      #    print('lr change to', opt.lr)
      #    break

      _cb(param)
      if mbatch%10000==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)

      if mbatch>=0 and mbatch%args.verbose==0:
        acc_list = ver_test(mbatch)
        save_step[0]+=1
        msave = save_step[0]
        do_save = False
        if len(acc_list)>0:
          lfw_score = acc_list[0]
          if lfw_score>highest_acc[0]:
            highest_acc[0] = lfw_score
            if lfw_score>=0.998:
              do_save = True
          if acc_list[-1]>=highest_acc[-1]:
            highest_acc[-1] = acc_list[-1]
            if lfw_score>=0.99:
              do_save = True
        if args.ckpt==0:
          do_save = False
        elif args.ckpt>1:
          do_save = True
        if do_save:
          print('saving', msave)
          arg, aux = model.get_params()
          mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
        print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
      if mbatch<=args.beta_freeze:
        _beta = args.beta
      else:
        move = max(0, mbatch-args.beta_freeze)
        _beta = max(args.beta_min, args.beta*math.pow(1+args.gamma*move, -1.0*args.power))
      #print('beta', _beta)
      os.environ['BETA'] = str(_beta)
      if args.max_steps>0 and mbatch>args.max_steps:
        sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(train_dataiter,
        begin_epoch        = begin_epoch,
        num_epoch          = end_epoch,
        eval_data          = val_dataiter,
        eval_metric        = eval_metrics,
        kvstore            = 'device',
        #optimizer          = opt,
        optimizer_params   = optimizer_params,
        initializer        = initializer,
        arg_params         = arg_params,
        aux_params         = aux_params,
        allow_missing      = True,
        #allow_extra        = True,
        batch_end_callback = _batch_callback,
        epoch_end_callback = epoch_cb )
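Note that the explicit opt object built above is never passed to model.fit (the optimizer argument is commented out), so the call relies on Module.fit's default optimizer='sgd' together with optimizer_params. A sketch of the equivalent explicit form, assuming the same hyperparameters as above:

opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd,
                    rescale_grad=_rescale, lr_scheduler=lr_scheduler)
# model.fit(train_dataiter, ..., optimizer=opt)   # instead of optimizer_params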
Example No. 6
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = "%s-%s-p%s" % (args.prefix, args.network, args.patch)
    end_epoch = args.end_epoch
    pretrained = args.pretrained
    load_epoch = args.load_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
        if args.network[0] == 'r':
            args.per_batch_size = 128
        else:
            if args.num_layers >= 64:
                args.per_batch_size = 120
        if args.ctx_num == 2:
            args.per_batch_size *= 2
        elif args.ctx_num == 3:
            args.per_batch_size = 170
        if args.network[0] == 'm':
            args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3
    ppatch = [int(x) for x in args.patch.split('_')]
    image_size = [int(x) for x in args.image_size.split(',')]
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    assert len(ppatch) == 5
    #if args.patch%2==1:
    #  args.image_channel = 1

    #os.environ['GLOBAL_STEP'] = "0"
    os.environ['BETA'] = str(args.beta)
    args.use_val = False
    path_imgrec = None
    path_imglist = None
    val_rec = None

    #path_imglist = "/raid5data/dplearn/faceinsight_align_webface.lst.new"
    #path_imglist = "/raid5data/dplearn/faceinsight_align_webface_clean.lst.new"
    for line in open(os.path.join(args.data_dir, 'property')):
        args.num_classes = int(line.strip())
    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)

    #path_imglist = "/raid5data/dplearn/MS-Celeb-Aligned/lst2"
    path_imgrec = os.path.join(args.data_dir, "train.rec")
    val_rec = os.path.join(args.data_dir, "val.rec")
    if os.path.exists(val_rec):
        args.use_val = True
    else:
        val_rec = None
    #args.num_classes = 10572 #webface
    #args.num_classes = 81017
    #args.num_classes = 82395

    if args.loss_type == 1 and args.num_classes > 40000:
        args.beta_freeze = 5000
        args.gamma = 0.06

    print('Called with argument:', args)

    data_shape = (args.image_channel, image_size[0], image_size[1])
    #mean = [127.5,127.5,127.5]
    mean = None

    if args.use_val:
        val_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=val_rec,
            #path_imglist         = val_path,
            shuffle=False,
            rand_mirror=False,
            mean=mean,
        )
    else:
        val_dataiter = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = 0.9
    if not args.retrain:
        #load and initialize params
        #print(pretrained)
        #_, arg_params, aux_params = mx.model.load_checkpoint(pretrained, load_epoch)
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
        #arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        data_shape_dict = {
            'data': (args.batch_size, ) + data_shape,
            'softmax_label': (args.batch_size, )
        }
        if args.network[0] == 's':
            arg_params, aux_params = spherenet.init_weights(
                sym, data_shape_dict, args.num_layers)
        elif args.network[0] == 'm':
            arg_params, aux_params = marginalnet.init_weights(
                sym, data_shape_dict, args.num_layers)
        #resnet_dcn.init_weights(sym, data_shape_dict, arg_params, aux_params)
    else:
        #sym, arg_params, aux_params = mx.model.load_checkpoint(pretrained, load_epoch)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            pretrained, load_epoch)
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
        #begin_epoch = load_epoch
        #end_epoch = begin_epoch+10
        #base_wd = 0.00005

    if args.loss_type != 10:
        model = mx.mod.Module(
            context=ctx,
            symbol=sym,
        )
    else:
        data_names = ('data', 'extra')
        model = mx.mod.Module(
            context=ctx,
            symbol=sym,
            data_names=data_names,
        )

    if args.loss_type <= 9:
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=True,
            mean=mean,
        )
    elif args.loss_type == 10:
        train_dataiter = FaceImageIter4(
            batch_size=args.batch_size,
            ctx_num=args.ctx_num,
            images_per_identity=args.images_per_identity,
            data_shape=data_shape,
            path_imglist=path_imglist,
            shuffle=True,
            rand_mirror=True,
            mean=mean,
            patch=ppatch,
            use_extra=True,
            model=model,
        )
    elif args.loss_type == 11:
        train_dataiter = FaceImageIter5(
            batch_size=args.batch_size,
            ctx_num=args.ctx_num,
            images_per_identity=args.images_per_identity,
            data_shape=data_shape,
            path_imglist=path_imglist,
            shuffle=True,
            rand_mirror=True,
            mean=mean,
            patch=ppatch,
        )
    #args.epoch_size = int(math.ceil(train_dataiter.num_samples()/args.batch_size))

    #_dice = DiceMetric()
    _acc = AccMetric()
    eval_metrics = [mx.metric.create(_acc)]

    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    #for child_metric in [fcn_loss_metric]:
    #    eval_metrics.add(child_metric)

    # callback
    #batch_end_callback = callback.Speedometer(input_batch_size, frequent=args.frequent)
    #epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True)

    # decide learning rate
    #lr_step = '10,20,30'
    #train_size = 4848
    #nrof_batch_in_epoch = int(train_size/input_batch_size)
    #print('nrof_batch_in_epoch:', nrof_batch_in_epoch)
    #lr_factor = 0.1
    #lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    #lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    #lr_iters = [int(epoch * train_size / batch_size) for epoch in lr_epoch_diff]
    #print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters

    #lr_scheduler = MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    #optimizer_params = {'momentum': 0.9,
    #                    'wd': 0.0005,
    #                    'learning_rate': base_lr,
    #                    'rescale_grad': 1.0,
    #                    'clip_gradient': None}
    if args.network[0] == 'r':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    #_rescale = 1.0
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
    #opt = optimizer.RMSProp(learning_rate=base_lr, wd=base_wd, rescale_grad=_rescale)
    #opt = optimizer.AdaGrad(learning_rate=base_lr, wd=base_wd, rescale_grad=_rescale)
    #opt = optimizer.AdaGrad(learning_rate=base_lr, wd=base_wd, rescale_grad=1.0)
    _cb = mx.callback.Speedometer(args.batch_size, 10)

    lfw_dir = os.path.join(args.data_dir, 'lfw')
    lfw_set = lfw.load_dataset(lfw_dir, image_size)

    def lfw_test(nbatch):
        acc1, std1, acc2, std2, xnorm, embeddings_list = lfw.test(
            lfw_set, model, args.batch_size)
        print('[%d]XNorm: %f' % (nbatch, xnorm))
        print('[%d]Accuracy: %1.5f+-%1.5f' % (nbatch, acc1, std1))
        print('[%d]Accuracy-Flip: %1.5f+-%1.5f' % (nbatch, acc2, std2))
        return acc2, embeddings_list

    def val_test():
        acc = AccMetric()
        val_metric = mx.metric.create(acc)
        val_metric.reset()
        val_dataiter.reset()
        for i, eval_batch in enumerate(val_dataiter):
            model.forward(eval_batch, is_train=False)
            model.update_metric(val_metric, eval_batch.label)
        acc_value = val_metric.get_name_value()[0][1]
        print('VACC: %f' % (acc_value))

    #global_step = 0
    highest_acc = [0.0]
    last_save_acc = [0.0]
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        #lr_steps = [40000, 70000, 90000]
        lr_steps = [40000, 60000, 80000]
        if args.loss_type == 1:
            lr_steps = [100000, 140000, 160000]
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == args.beta_freeze + _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        #os.environ['GLOBAL_STEP'] = str(mbatch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc, embeddings_list = lfw_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if acc >= highest_acc[0]:
                highest_acc[0] = acc
                if acc >= 0.996:
                    do_save = True
            if mbatch > lr_steps[-1] and mbatch % 10000 == 0:
                do_save = True
            if do_save:
                print('saving', msave, acc)
                if val_dataiter is not None:
                    val_test()
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
                if acc >= highest_acc[0]:
                    lfw_npy = "%s-lfw-%04d" % (prefix, msave)
                    X = np.concatenate(embeddings_list, axis=0)
                    print('saving lfw npy', X.shape)
                    np.save(lfw_npy, X)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[0]))
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
            #_beta = max(args.beta_min, args.beta*math.pow(0.7, move//500))
        #print('beta', _beta)
        os.environ['BETA'] = str(_beta)

    #epoch_cb = mx.callback.do_checkpoint(prefix, 1)
    epoch_cb = None

    #def _epoch_callback(epoch, sym, arg_params, aux_params):
    #  print('epoch-end', epoch)

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
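The manual decay in _batch_callback (multiplying opt.lr by 0.1 when mbatch reaches args.beta_freeze plus each entry of lr_steps) mirrors what an MXNet MultiFactorScheduler would do; a sketch of the scheduler-based alternative under the same settings:

scheduler = mx.lr_scheduler.MultiFactorScheduler(
    step=[args.beta_freeze + s for s in lr_steps], factor=0.1)
opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd,
                    rescale_grad=_rescale, lr_scheduler=scheduler)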
Example No. 7
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = os.path.join(args.models_root,
                          '%s-%s-%s' % (args.network, args.loss, args.dataset),
                          'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]

    data_dir = config.dataset_path
    path_imgrec = None
    path_imglist = None
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args, config)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym = get_symbol(args)
        if config.net_name == 'spherenet':
            data_shape_dict = {'data': (args.per_batch_size, ) + data_shape}
            spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:
        print('loading', args.pretrained, args.pretrained_epoch)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            args.pretrained, args.pretrained_epoch)
        sym = get_symbol(args)

    if config.count_flops:
        all_layers = sym.get_internals()
        _sym = all_layers['fc1_output']
        FLOPs = flops_counter.count_flops(_sym,
                                          data=(1, 3, image_size[0],
                                                image_size[1]))
        print('Network FLOPs: %d' % FLOPs)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )
    val_dataiter = None

    if config.loss_name.find('triplet') >= 0:
        from triplet_image_iter import FaceImageIter
        triplet_params = [
            config.triplet_bag_size, config.triplet_alpha,
            config.triplet_max_ap
        ]
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            ctx_num=args.ctx_num,
            images_per_identity=config.images_per_identity,
            triplet_params=triplet_params,
            mx_model=model,
        )
        _metric = LossValueMetric()
        eval_metrics = [mx.metric.create(_metric)]
    else:
        from image_iter import FaceImageIter
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            color_jittering=config.data_color,
            images_filter=config.data_images_filter,
        )
        metric1 = AccMetric()
        eval_metrics = [mx.metric.create(metric1)]
        if config.ce_loss:
            metric2 = LossValueMetric()
            eval_metrics.append(mx.metric.create(metric2))

    if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=args.lr,
                        momentum=args.mom,
                        wd=args.wd,
                        rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for step in lr_steps:
            if mbatch == step:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list) > 0:
                #lfw_score = acc_list[0]
                #if lfw_score>highest_acc[0]:
                #  highest_acc[0] = lfw_score
                #  if lfw_score>=0.998:
                #    do_save = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                    #if lfw_score>=0.99:
                    #  do_save = True
            if is_highest:
                do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1

            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                if config.ckpt_embedding:
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):
                            _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix, msave, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, msave, model.symbol, arg,
                                             aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if config.max_steps > 0 and mbatch > config.max_steps:
            sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=999999,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore=args.kvstore,
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
Example No. 8
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in range(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
      os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size==0:
      args.per_batch_size = 128
      if args.loss_type==10:
        args.per_batch_size = 256
    args.batch_size = args.per_batch_size*args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3
    ppatch = [int(x) for x in args.patch.split('_')]
    assert len(ppatch)==5


    os.environ['BETA'] = str(args.beta)
    args.use_val = False
    path_imgrec = None
    path_imglist = None
    val_rec = None
    prop = face_image.load_property(args.data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)

    assert(args.num_classes>0)
    print('num_classes', args.num_classes)

    #path_imglist = "/raid5data/dplearn/MS-Celeb-Aligned/lst2"
    path_imgrec = os.path.join(args.data_dir, "train.rec")
    val_rec = os.path.join(args.data_dir, "val.rec")
    if os.path.exists(val_rec) and args.loss_type<10:
      args.use_val = True
    else:
      val_rec = None
    #args.num_classes = 10572 #webface
    #args.num_classes = 81017
    #args.num_classes = 82395



    if args.loss_type==1 and args.num_classes>40000:
      args.beta_freeze = 5000
      args.gamma = 0.06

    if args.loss_type==11:
      args.images_per_identity = 2
    elif args.loss_type==10:
      args.images_per_identity = 16

    if args.loss_type<10:
      assert args.images_per_identity==0
    else:
      assert args.images_per_identity>=2
      args.per_identities = int(args.per_batch_size/args.images_per_identity)

    print('Called with argument:', args)

    data_shape = (args.image_channel,image_size[0],image_size[1])
    mean = None

    if args.use_val:
      val_dataiter = FaceImageIter(
          batch_size           = args.batch_size,
          data_shape           = data_shape,
          path_imgrec          = val_rec,
          #path_imglist         = val_path,
          shuffle              = False,
          rand_mirror          = False,
          mean                 = mean,
      )
    else:
      val_dataiter = None



    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained)==0:
      arg_params = None
      aux_params = None
      sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
      vec = args.pretrained.split(',')
      _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
      sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    data_extra = None
    hard_mining = False
    if args.loss_type==10:
      hard_mining = True
      _shape = (args.batch_size, args.per_batch_size)
      data_extra = np.full(_shape, -1.0, dtype=np.float32)
      c = 0
      while c<args.batch_size:
        a = 0
        while a<args.per_batch_size:
          b = a+args.images_per_identity
          data_extra[(c+a):(c+b),a:b] = 1.0
          #print(c+a, c+b, a, b)
          a = b
        c += args.per_batch_size
    elif args.loss_type==11:
      data_extra = np.zeros( (args.batch_size, args.per_identities), dtype=np.float32)
      c = 0
      while c<args.batch_size:
        for i in range(args.per_identities):
          data_extra[c+i][i] = 1.0
        c+=args.per_batch_size

    label_name = 'softmax_label'
    if data_extra is None:
      model = mx.mod.Module(
          context       = ctx,
          symbol        = sym,
      )
    else:
      data_names = ('data', 'extra')
      #label_name = ''
      model = mx.mod.Module(
          context       = ctx,
          symbol        = sym,
          data_names    = data_names,
          label_names   = (label_name,),
      )


    train_dataiter = FaceImageIter(
        batch_size           = args.batch_size,
        data_shape           = data_shape,
        path_imgrec          = path_imgrec,
        shuffle              = True,
        rand_mirror          = True,
        mean                 = mean,
        ctx_num              = args.ctx_num,
        images_per_identity  = args.images_per_identity,
        data_extra           = data_extra,
        hard_mining          = hard_mining,
        mx_model             = model,
        label_name           = label_name,
    )

    if args.loss_type<10:
      _metric = AccMetric()
    else:
      _metric = LossValueMetric()
    eval_metrics = [mx.metric.create(_metric)]

    if args.network[0]=='r':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    elif args.network[0]=='i' or args.network[0]=='x':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) #inception
    else:
      initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    _rescale = 1.0/args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, 20)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
      path = os.path.join(args.data_dir,name+".bin")
      if os.path.exists(path):
        data_set = verification.load_bin(path, image_size)
        ver_list.append(data_set)
        ver_name_list.append(name)
        print('ver', name)



    def ver_test(nbatch):
      results = []
      for i in range(len(ver_list)):
        acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, data_extra)
        print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
        #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
        print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
        results.append(acc2)
      return results


    def val_test():
      acc = AccMetric()
      val_metric = mx.metric.create(acc)
      val_metric.reset()
      val_dataiter.reset()
      for i, eval_batch in enumerate(val_dataiter):
        model.forward(eval_batch, is_train=False)
        model.update_metric(val_metric, eval_batch.label)
      acc_value = val_metric.get_name_value()[0][1]
      print('VACC: %f'%(acc_value))


    highest_acc = []
    for i in range(len(ver_list)):
      highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps)==0:
      lr_steps = [40000, 60000, 70000]
      if args.loss_type==1:
        lr_steps = [50000, 70000, 80000]
      p = 512.0/args.batch_size
      for l in range(len(lr_steps)):
        lr_steps[l] = int(lr_steps[l]*p)
    else:
      lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)
    def _batch_callback(param):
      #global global_step
      global_step[0]+=1
      mbatch = global_step[0]
      for _lr in lr_steps:
        if mbatch==args.beta_freeze+_lr:
          opt.lr *= 0.1
          print('lr change to', opt.lr)
          break

      _cb(param)
      if mbatch%1000==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)

      if mbatch>=0 and mbatch%args.verbose==0:
        acc_list = ver_test(mbatch)
        save_step[0]+=1
        msave = save_step[0]
        do_save = False
        lfw_score = acc_list[0]
        for i in range(len(acc_list)):
          acc = acc_list[i]
          if acc>=highest_acc[i]:
            highest_acc[i] = acc
            if lfw_score>=0.99:
              do_save = True
        if args.loss_type==1 and mbatch>lr_steps[-1] and mbatch%10000==0:
          do_save = True
        if do_save:
          print('saving', msave, acc)
          if val_dataiter is not None:
            val_test()
          arg, aux = model.get_params()
          mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
          #if acc>=highest_acc[0]:
          #  lfw_npy = "%s-lfw-%04d" % (prefix, msave)
          #  X = np.concatenate(embeddings_list, axis=0)
          #  print('saving lfw npy', X.shape)
          #  np.save(lfw_npy, X)
        #print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[0]))
      if mbatch<=args.beta_freeze:
        _beta = args.beta
      else:
        move = max(0, mbatch-args.beta_freeze)
        _beta = max(args.beta_min, args.beta*math.pow(1+args.gamma*move, -1.0*args.power))
      #print('beta', _beta)
      os.environ['BETA'] = str(_beta)

    #epoch_cb = mx.callback.do_checkpoint(prefix, 1)
    epoch_cb = None



    #def _epoch_callback(epoch, sym, arg_params, aux_params):
    #  print('epoch-end', epoch)

    model.fit(train_dataiter,
        begin_epoch        = begin_epoch,
        num_epoch          = end_epoch,
        eval_data          = val_dataiter,
        eval_metric        = eval_metrics,
        kvstore            = 'device',
        optimizer          = opt,
        #optimizer_params   = optimizer_params,
        initializer        = initializer,
        arg_params         = arg_params,
        aux_params         = aux_params,
        allow_missing      = True,
        batch_end_callback = _batch_callback,
        epoch_end_callback = epoch_cb )
Example No. 9
def train_net(args):
    ctx = []
    cvd = '0'  # os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0

    print('num_classes', args.num_classes)

    print('Called with argument:', args)
    train_dataiter, val_dataiter, emb_size, data_size = load_data(args)
    args.emb_size = emb_size
    print('emb_size', emb_size)
    print('data_size', data_size)

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    arg_params = None
    aux_params = None
    sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )

    metric1 = AccMetric()
    eval_metrics = [mx.metric.create(metric1)]
    if args.ce_loss:
        metric2 = LossValueMetric()
        eval_metrics.append(mx.metric.create(metric2))

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=args.mom,
                        wd=args.wd,
                        rescale_grad=_rescale)
    #opt = optimizer.Nadam(learning_rate=base_lr, wd=args.wd, rescale_grad=_rescale, clip_gradient=5.0)
    #opt = optimizer.Adam(learning_rate=base_lr, wd=args.wd, rescale_grad=_rescale)
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)
    lr_step = [int(x) for x in args.lr_step.split(',')]
    begin_epoch = args.begin_epoch
    epoches = lr_step
    end_epoch = epoches[-1]
    lr_factor = 0.1
    #lr_epoch = [int(epoch) for epoch in lr_step.split(',')]
    #lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    #lr_iters = [int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff]

    #lr_iters = [36000,42000] #TODO
    #lr_iters = [40000,50000,60000] #TODO
    #lr_iters = [40,50,60] #TODO
    #logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    #lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    #lr_steps = [6000,10000,12000]
    lr_steps = []
    for ep in epoches:
        lr_steps.append(data_size * ep // args.batch_size)
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

    epoch_cb = mx.callback.do_checkpoint(args.prefix, period=end_epoch)
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
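In this variant the decay points are derived from epochs rather than given as iteration counts: each entry of lr_step is converted via data_size * ep // args.batch_size. A small worked example with illustrative numbers:

# Illustrative values only
data_size, batch_size = 400000, 256
lr_step = [8, 12, 16]
lr_steps = [data_size * ep // batch_size for ep in lr_step]   # -> [12500, 18750, 25000]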
Example No. 10
def train_net(args):
    #_seed = 727
    #random.seed(_seed)
    #np.random.seed(_seed)
    #mx.random.seed(_seed)
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in range(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    if len(args.extra_model_name)==0:
      prefix = os.path.join(args.models_root, '%s-%s-%s'%(args.network, args.loss, args.dataset), 'model')
    else:
      prefix = os.path.join(args.models_root, '%s-%s-%s-%s'%(args.network, args.loss, args.dataset, args.extra_model_name), 'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
      os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)
    if args.per_batch_size==0:
      args.per_batch_size = 128
    args.batch_size = args.per_batch_size*args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    config.per_batch_size = args.per_batch_size
    data_dir = config.dataset_path
    path_imgrec = None
    path_imglist = None
    image_size = config.image_shape[0:2]
    assert len(image_size)==2
    assert image_size[0]==image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    data_shape = (args.image_channel,image_size[0],image_size[1])

    num_workers = config.num_workers
    global_num_ctx = num_workers * args.ctx_num
    if config.num_classes%global_num_ctx==0:
      args.ctx_num_classes = config.num_classes//global_num_ctx
    else:
      args.ctx_num_classes = config.num_classes//global_num_ctx+1
    args.local_num_classes = args.ctx_num_classes * args.ctx_num
    args.local_class_start = args.local_num_classes * args.worker_id

    #if len(args.partial)==0:
    #  local_classes_range = (0, args.num_classes)
    #else:
    #  _vec = args.partial.split(',')
    #  local_classes_range = (int(_vec[0]), int(_vec[1]))

    #args.partial_num_classes = local_classes_range[1] - local_classes_range[0]
    #args.partial_start = local_classes_range[0]

    print('Called with argument:', args, config)
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    arg_params = None
    aux_params = None
    if len(args.pretrained)==0:
      esym = get_symbol_embedding()
      asym = get_symbol_arcface
    else:
      assert False

    if config.count_flops:
      all_layers = esym.get_internals()
      _sym = all_layers['fc1_output']
      FLOPs = flops_counter.count_flops(_sym, data=(1,3,image_size[0],image_size[1]))
      _str = flops_counter.flops_str(FLOPs)
      print('Network FLOPs: %s'%_str)

    if config.num_workers==1:
      from parall_module_local_v1 import ParallModule
    else:
      from parall_module_dist import ParallModule

    model = ParallModule(
        context       = ctx,
        symbol        = esym,
        data_names    = ['data'],
        label_names    = ['softmax_label'],
        asymbol       = asym,
        args = args,
    )
    val_dataiter = None
    train_dataiter = FaceImageIter(
        batch_size           = args.batch_size,
        data_shape           = data_shape,
        path_imgrec          = path_imgrec,
        shuffle              = True,
        rand_mirror          = config.data_rand_mirror,
        mean                 = mean,
        cutoff               = config.data_cutoff,
        color_jittering      = config.data_color,
        images_filter        = config.data_images_filter,
    )


    
    if config.net_name=='fresnet' or config.net_name=='fmobilefacenet':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    else:
      initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)

    _rescale = 1.0/args.batch_size
    opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)


    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
      path = os.path.join(data_dir,name+".bin")
      if os.path.exists(path):
        data_set = verification.load_bin(path, image_size)
        ver_list.append(data_set)
        ver_name_list.append(name)
        print('ver', name)


    def ver_test(nbatch):
      results = []
      for i in range(len(ver_list)):
        acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, None)
        print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
        #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
        print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
        results.append(acc2)
      return results


    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in range(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)
    def _batch_callback(param):
      #global global_step
      global_step[0]+=1
      mbatch = global_step[0]
      for step in lr_steps:
        if mbatch==step:
          opt.lr *= 0.1
          print('lr change to', opt.lr)
          break

      _cb(param)
      if mbatch%1000==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)

      if mbatch>=0 and mbatch%args.verbose==0:
        acc_list = ver_test(mbatch)
        save_step[0]+=1
        msave = save_step[0]
        do_save = False
        is_highest = False
        if len(acc_list)>0:
          #lfw_score = acc_list[0]
          #if lfw_score>highest_acc[0]:
          #  highest_acc[0] = lfw_score
          #  if lfw_score>=0.998:
          #    do_save = True
          score = sum(acc_list)
          if acc_list[-1]>=highest_acc[-1]:
            if acc_list[-1]>highest_acc[-1]:
              is_highest = True
            else:
              if score>=highest_acc[0]:
                is_highest = True
                highest_acc[0] = score
            highest_acc[-1] = acc_list[-1]
            #if lfw_score>=0.99:
            #  do_save = True
        if is_highest:
          do_save = True
        if args.ckpt==0:
          do_save = False
        elif args.ckpt==2:
          do_save = True
        elif args.ckpt==3:
          msave = 1

        if do_save:
          print('saving', msave)
          arg, aux = model.get_export_params()
          all_layers = model.symbol.get_internals()
          _sym = all_layers['fc1_output']
          mx.model.save_checkpoint(prefix, msave, _sym, arg, aux)
        print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
      if config.max_steps>0 and mbatch>config.max_steps:
        sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(train_dataiter,
        begin_epoch        = begin_epoch,
        num_epoch          = 999999,
        eval_data          = val_dataiter,
        #eval_metric        = eval_metrics,
        kvstore            = args.kvstore,
        optimizer          = opt,
        #optimizer_params   = optimizer_params,
        initializer        = initializer,
        arg_params         = arg_params,
        aux_params         = aux_params,
        allow_missing      = True,
        batch_end_callback = _batch_callback,
        epoch_end_callback = epoch_cb )
Example No. 11
def main(args):
    _seed = 727
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)
    ctx = []
    #   cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    #   if len(cvd)>0:
    #     for i in range(len(cvd.split(','))):
    #       ctx.append(mx.gpu(i))
    #   if len(ctx)==0:
    #     ctx = [mx.cpu()]
    #     print('use cpu')
    #   else:
    #     print('gpu num:', len(ctx))
    ctx = [mx.cpu()]
    args.ctx_num = len(ctx)

    args.batch_size = args.per_batch_size * args.ctx_num
    config.per_batch_size = args.per_batch_size

    print('Call with', args, config)
    train_iter = FaceSegIter(
        path_imgrec=os.path.join(config.dataset_path, 'train.rec'),
        batch_size=args.batch_size,
        per_batch_size=args.per_batch_size,
        aug_level=1,
        exf=args.exf,
        args=args,
    )

    data_shape = train_iter.get_data_shape()
    #label_shape = train_iter.get_label_shape()
    sym = sym_heatmap.get_symbol(num_classes=config.num_classes)
    if len(args.pretrained) == 0:
        #data_shape_dict = {'data' : (args.per_batch_size,)+data_shape, 'softmax_label' : (args.per_batch_size,)+label_shape}
        data_shape_dict = train_iter.get_shape_dict()
        arg_params, aux_params = sym_heatmap.init_weights(sym, data_shape_dict)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=train_iter.get_label_names(),
    )
    #lr = 1.0e-3
    #lr = 2.5e-4
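    # gradients are summed across devices, so rescaling by 1/ctx_num averages them over the context list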
    _rescale_grad = 1.0 / args.ctx_num
    #_rescale_grad = 1.0/args.batch_size
    #lr = args.lr
    #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
    if args.optimizer == 'onadam':
        opt = ONadam(learning_rate=args.lr,
                     wd=args.wd,
                     rescale_grad=_rescale_grad,
                     clip_gradient=5.0)
    elif args.optimizer == 'nadam':
        opt = optimizer.Nadam(learning_rate=args.lr,
                              rescale_grad=_rescale_grad)
    elif args.optimizer == 'rmsprop':
        opt = optimizer.RMSProp(learning_rate=args.lr,
                                rescale_grad=_rescale_grad)
    elif args.optimizer == 'adam':
        opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    else:
        opt = optimizer.SGD(learning_rate=args.lr,
                            momentum=0.9,
                            wd=args.wd,
                            rescale_grad=_rescale_grad)
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    _metric = LossValueMetric()
    #_metric = NMEMetric()
    #_metric2 = AccMetric()
    #eval_metrics = [_metric, _metric2]
    eval_metrics = [_metric]
    lr_steps = [int(x) for x in args.lr_step.split(',')]
    print('lr-steps', lr_steps)
    global_step = [0]

    def val_test():
        all_layers = sym.get_internals()
        vsym = all_layers['heatmap_output']
        vmodel = mx.mod.Module(symbol=vsym, context=ctx, label_names=None)
        #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])
        vmodel.bind(data_shapes=[('data', (args.batch_size, ) + data_shape)])
        arg_params, aux_params = model.get_params()
        vmodel.set_params(arg_params, aux_params)
        for target in config.val_targets:
            _file = os.path.join(config.dataset_path, '%s.rec' % target)
            if not os.path.exists(_file):
                continue
            val_iter = FaceSegIter(
                path_imgrec=_file,
                batch_size=args.batch_size,
                #batch_size = 4,
                aug_level=0,
                args=args,
            )
            _metric = NMEMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            val_iter.reset()
            for i, eval_batch in enumerate(val_iter):
                #print(eval_batch.data[0].shape, eval_batch.label[0].shape)
                batch_data = mx.io.DataBatch(eval_batch.data)
                model.forward(batch_data, is_train=False)
                model.update_metric(val_metric, eval_batch.label)
            nme_value = val_metric.get_name_value()[0][1]
            print('[%d][%s]NME: %f' % (global_step[0], target, nme_value))

    def _batch_callback(param):
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.2
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            val_test()
            if args.ckpt == 1:
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
        if mbatch == lr_steps[-1]:
            if args.ckpt == 2:
                #msave = mbatch//args.verbose
                msave = 1
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
            sys.exit(0)

    train_iter = mx.io.PrefetchingIter(train_iter)

    model.fit(
        train_iter,
        begin_epoch=0,
        num_epoch=9999,
        #eval_data          = val_iter,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=None,
    )
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in xrange(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    args.batch_size = args.batch_size1
    args.rescale_threshold = 0
    args.image_channel = 3

    os.environ['BETA'] = str(args.beta)
    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)
    path_imgrec1 = os.path.join(data_dir, "train.rec")

    data_dir_interclass_list = args.data_dir_interclass.split(',')
    assert len(data_dir_interclass_list) == 1
    data_dir_interclass = data_dir_interclass_list[0]
    path_imgrec2 = os.path.join(data_dir_interclass, "train.rec")

    if args.loss_type == 1 and args.num_classes > 20000:
        args.beta_freeze = 5000
        args.gamma = 0.06

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
        print('sym: ', sym)

    else:
        vec = args.pretrained.split(',')
        #print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        #if 'fc7_weight' in arg_params.keys():
        #  del arg_params['fc7_weight']
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    sym_test, _, _ = get_symbol(args, arg_params, aux_params)
    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )
    val_dataiter = None

    train_dataiter = FaceImageIter(
        mx_model=model,
        ctx=ctx,
        ctx_num=args.ctx_num,
        data_shape=data_shape,
        batch_size1=args.batch_size1,
        path_imgrec1=path_imgrec1,
        batchsize_id=args.batchsize_id,
        batch_size2=args.batch_size2,
        path_imgrec2=path_imgrec2,
        images_per_identity=args.images_per_identity,
        interclass_bag_size=args.bag_size,
        shuffle=True,
        aug_list=None,
        rand_mirror=True,
    )

    eval_metrics = [
        mx.metric.create(AccMetric()),
        mx.metric.create(LossValue()),
        mx.metric.create(LossValue2())
    ]

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
    som = 2
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        #path = os.path.join('/ssd/MegaFace/MF2_aligned_pic9/', name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    model_t = None

    def ver_test(nbatch, model_t):
        results = []

        if model_t is None:
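            # first call: build a separate evaluation module on the 'blockgrad0_output' embedding, bind it for batches of 10 images, and copy in the current training weights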
            all_layers = model.symbol.get_internals()
            symbol_t = all_layers['blockgrad0_output']
            model_t = mx.mod.Module(symbol=symbol_t,
                                    context=ctx,
                                    label_names=None)
            print([('data', (10, ) + data_shape)])
            model_t.bind(data_shapes=[('data', (10, ) + data_shape)])
            arg_t, aux_t = model.get_params()
            model_t.set_params(arg_t, aux_t)
        else:
            arg_t, aux_t = model.get_params()
            model_t.set_params(arg_t, aux_t)

        for i in xrange(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model_t, 10, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0, 0.0]  #lfw and targets
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [100000, 140000, 160000]
        p = 512.0 / args.batch_size
        for l in xrange(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l] * p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == args.beta_freeze + _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        #if mbatch%1==0:
        #print('mbatch:',mbatch)
        #arg, aux = model.get_params()

        if mbatch == 1:
            ver_test(mbatch, model_t)
            arg, aux = model.get_params()
            mx.model.save_checkpoint(prefix, 1000, model.symbol, arg, aux)
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            arg, aux = model.get_params()
            mx.model.save_checkpoint(prefix, 0, model.symbol, arg, aux)
            acc_list = ver_test(mbatch, model_t)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f %1.5f %1.5f' %
                  (mbatch, highest_acc[0], highest_acc[1], highest_acc[2]))
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
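            # after beta_freeze batches, anneal beta polynomially: beta = max(beta_min, beta * (1 + gamma*move)^(-power)); the loss reads it from the BETA env var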
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
        #print('beta', _beta)
        os.environ['BETA'] = str(_beta)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None

    #print('arg_params',arg_params,aux_params)
    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
Exemplo n.º 13
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    network, num_layers = args.network.split(',')
    print('num_layers', num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    os.environ['BETA'] = str(args.beta)
    data_dir_list = args.data_dir.split(',')
    path_imgrecs = []
    path_imglist = None
    args.num_classes = []
    for data_idx, data_dir in enumerate(data_dir_list):
        prop = face_image.load_property(data_dir)
        args.num_classes.append(prop.num_classes)
        image_size = prop.image_size
        if data_idx == 0:
            args.image_h = image_size[0]
            args.image_w = image_size[1]
        else:
            args.image_h = min(args.image_h, image_size[0])
            args.image_w = min(args.image_w, image_size[1])
        print('image_size', image_size)
        assert (args.num_classes[-1] > 0)
        print('num_classes', args.num_classes)
        path_imgrecs.append(os.path.join(data_dir, "train.rec"))

    # args.num_classes is a list here (one entry per dataset), so compare its largest entry
    if args.loss_type == 1 and max(args.num_classes) > 20000:
        args.beta_freeze = 5000
        args.gamma = 0.06

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(network, int(num_layers),
                                                 args, arg_params, aux_params)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(network, int(num_layers),
                                                 args, arg_params, aux_params)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    ctx_group = dict(zip(['dev%d' % (i + 1) for i in range(len(ctx))], ctx))
    ctx_group['dev0'] = ctx
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        data_names=['data'] if args.loss_type != 6 else ['data', 'margin'],
        group2ctxs=ctx_group)
    val_dataiter = None

    from config import crop
    from config import cutout

    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrecs=path_imgrecs,
        shuffle=True,
        rand_mirror=args.rand_mirror,
        mean=mean,
        cutout=cutout,
        crop=crop,
        loss_type=args.loss_type,
        #margin_m             = args.margin_m,
        #margin_policy        = args.margin_policy,
        #max_steps            = args.max_steps,
        #data_names           = ['data', 'margin'],
        downsample_back=args.downsample_back,
        motion_blur=args.motion_blur,
    )

    _metric = AccMetric()
    #_metric = LossValueMetric()
    eval_metrics = [mx.metric.create(_metric)]

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num

    if len(args.lr_steps) == 0:
        print('Error: lr_steps is not set')
        sys.exit(0)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_steps,
                                                        factor=0.1,
                                                        base_lr=base_lr)
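    # the scheduler is handed to the optimizer below via optimizer_params, so LR decay (0.1x at each milestone) happens inside SGD rather than in the batch callback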
    optimizer_params = {
        'learning_rate': base_lr,
        'momentum': base_mom,
        'wd': base_wd,
        'rescale_grad': _rescale,
        'lr_scheduler': lr_scheduler
    }

    #opt = AdaBound()
    #opt = AdaBound(lr=base_lr, wd=base_wd, gamma = 2. / args.max_steps)
    opt = optimizer.SGD(**optimizer_params)

    som = 2000
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            _, issame_list = ver_list[i]
            if all(issame_list):
                fp_rates, fp_dict, thred_dict, recall_dict = verification.test(
                    ver_list[i],
                    model,
                    args.batch_size,
                    label_shape=(args.batch_size, len(path_imgrecs)))
                for k in fp_rates:
                    print("[%s] TPR at FPR %.2e[%.2e: %.4f]:\t%.5f" %
                          (ver_name_list[i], k, fp_dict[k], thred_dict[k],
                           recall_dict[k]))
            else:
                acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                    ver_list[i],
                    model,
                    args.batch_size,
                    10,
                    None,
                    label_shape=(args.batch_size, len(path_imgrecs)))
                print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
                #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
                print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                      (ver_name_list[i], nbatch, acc2, std2))
                results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in range(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]

        _cb(param)
        if mbatch % 10000 == 0:
            print('lr-batch-epoch:', opt.learning_rate, param.nbatch,
                  param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if len(acc_list) > 0:
                lfw_score = acc_list[0]
                if lfw_score > highest_acc[0]:
                    highest_acc[0] = lfw_score
                    if lfw_score >= 0.998:
                        do_save = True
                if acc_list[-1] >= highest_acc[-1]:
                    highest_acc[-1] = acc_list[-1]
                    if lfw_score >= 0.99:
                        do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
        #print('beta', _beta)
        os.environ['BETA'] = str(_beta)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(train_dataiter,
              begin_epoch=begin_epoch,
              num_epoch=end_epoch,
              eval_data=val_dataiter,
              eval_metric=eval_metrics,
              kvstore='device',
              optimizer=opt,
              optimizer_params=optimizer_params,
              initializer=initializer,
              arg_params=arg_params,
              aux_params=aux_params,
              allow_missing=True,
              batch_end_callback=_batch_callback,
              epoch_end_callback=epoch_cb)
Exemplo n.º 14
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in range(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
      os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    network, num_layers = args.network.split(',')
    print('num_layers', num_layers)
    if args.per_batch_size==0:
      args.per_batch_size = 128
    args.batch_size = args.per_batch_size*args.ctx_num
    args.image_channel = 3

    data_dir_list = args.data_dir.split(',')
    path_imgrecs = []
    path_imglist = None
    for data_idx, data_dir in enumerate(data_dir_list):
      image_size = (112, 112)
      if data_idx == 0:
        args.image_h = image_size[0]
        args.image_w = image_size[1]
      else:
        args.image_h = min(args.image_h, image_size[0]) 
        args.image_w = min(args.image_w, image_size[1])
      print('image_size', image_size)
      path_imgrecs.append(data_dir)

    args.use_val = False
    val_rec = None

    print('Called with argument:', args)

    data_shape = (args.image_channel,image_size[0],image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained)==0:
      arg_params = None
      aux_params = None
      sym, arg_params, aux_params = get_symbol(network, int(num_layers), args, arg_params, aux_params)
    else:
      vec = args.pretrained.split(',')
      print('loading', vec)
      _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
      sym, arg_params, aux_params = get_symbol(network, int(num_layers), args, arg_params, aux_params)
    if args.network[0]=='s':
      data_shape_dict = {'data' : (args.per_batch_size,)+data_shape}
      spherenet.init_weights(sym, data_shape_dict, int(num_layers))

    data_extra = None
    hard_mining = False
    model = mx.mod.Module(
        context       = ctx,
        symbol        = sym,
        #data_names = ('data',),
        #label_names = None,
        #label_names = ('softmax_label',),
    )
    label_shape = (args.batch_size,)

    val_dataiter = None

    from config import crop
    from config import cutout

    train_dataiter = FaceImageIter(
        batch_size           = args.batch_size,
        data_shape           = data_shape,
        path_imgrecs         = path_imgrecs,
        shuffle              = True,
        rand_mirror          = args.rand_mirror,
        mean                 = mean,
        cutout               = None, #cutout,
        crop                 = crop, 
        downsample_back      = args.downsample_back,
        motion_blur          = args.motion_blur,
        mx_model             = model,
        ctx_num              = args.ctx_num,
    )

    _metric = LossValueMetric()
    eval_metrics = [mx.metric.create(_metric)]

    if args.network[0]=='r':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    elif args.network[0]=='i' or args.network[0]=='x':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) #inception
    else:
      initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    _rescale = 1.0/args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    som = 200
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    """
    for name in args.target.split(','):
      path = os.path.join(os.path.dirname(data_dir),name+".bin")
      if os.path.exists(path):
        data_set = verification.load_bin(path, image_size)
        ver_list.append(data_set)
        ver_name_list.append(name)
        print('ver', name)
    """



    def ver_test(nbatch):
      results = []
      for i in range(len(ver_list)):
        _, issame_list = ver_list[i]
        if all(issame_list):
          fp_rates, fp_dict, thred_dict, recall_dict = verification.test(ver_list[i], model, args.batch_size, label_shape = (args.batch_size, len(path_imgrecs)))
          for k in fp_rates:
            print("[%s] TPR at FPR %.2e[%.2e: %.4f]:\t%.5f" %(ver_name_list[i], k, fp_dict[k], thred_dict[k], recall_dict[k]))
        else:
          acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, label_shape = (args.batch_size, len(path_imgrecs)))
          print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
          #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
          print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
          results.append(acc2)
      return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in range(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps)==0:
      lr_steps = [40000, 60000, 80000]
      if args.loss_type>=1 and args.loss_type<=7:
        lr_steps = [100000, 140000, 160000]
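      # the default milestones assume a global batch size of 512; rescale them in proportion to the actual batch size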
      p = 512.0/args.batch_size
      for l in range(len(lr_steps)):
        lr_steps[l] = int(lr_steps[l]*p)
    else:
      lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)
    def _batch_callback(param):
      #global global_step
      global_step[0]+=1
      mbatch = global_step[0]
      for _lr in lr_steps:
        if mbatch==_lr:
          opt.lr *= 0.1
          print('lr change to', opt.lr)
          break

      _cb(param)
      if mbatch%1000==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)

      if mbatch>=0 and mbatch%args.verbose==0:
        acc_list = ver_test(mbatch)
        save_step[0]+=1
        msave = save_step[0]
        do_save = False
        if len(acc_list)>0:
          lfw_score = acc_list[0]
          if lfw_score>highest_acc[0]:
            highest_acc[0] = lfw_score
            if lfw_score>=0.998:
              do_save = True
          if acc_list[-1]>=highest_acc[-1]:
            highest_acc[-1] = acc_list[-1]
            if lfw_score>=0.99:
              do_save = True
        if args.ckpt==0:
          do_save = False
        elif args.ckpt>1:
          do_save = True
        #for i in range(len(acc_list)):
        #  acc = acc_list[i]
        #  if acc>=highest_acc[i]:
        #    highest_acc[i] = acc
        #    if lfw_score>=0.99:
        #      do_save = True
        #if args.loss_type==1 and mbatch>lr_steps[-1] and mbatch%10000==0:
        #  do_save = True
        if do_save:
          print('saving', msave)
          if val_dataiter is not None:
            val_test()
          arg, aux = model.get_params()
          mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
        print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
      if args.max_steps>0 and mbatch>args.max_steps:
        sys.exit(0)

    #epoch_cb = mx.callback.do_checkpoint(prefix, 1)
    epoch_cb = None

    model.fit(train_dataiter,
        begin_epoch        = begin_epoch,
        num_epoch          = end_epoch,
        eval_data          = val_dataiter,
        eval_metric        = eval_metrics,
        kvstore            = 'device',
        optimizer          = opt,
        #optimizer_params   = optimizer_params,
        initializer        = initializer,
        arg_params         = arg_params,
        aux_params         = aux_params,
        allow_missing      = True,
        batch_end_callback = _batch_callback,
        epoch_end_callback = epoch_cb )
Exemplo n.º 15
0
def test_lr_scheduler():
    from mxnet import lr_scheduler, optimizer
    scheduler = lr_scheduler.FactorScheduler(base_lr=1, step=250, factor=0.5)
    optim = optimizer.SGD(learning_rate=0.1, lr_scheduler=scheduler)
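A minimal usage sketch, assuming only the mxnet lr_scheduler API already used above (the demo function name is arbitrary): MXNet LR schedulers are callable with the global update count, so the decay can be inspected directly before the scheduler is handed to optimizer.SGD.

def factor_scheduler_demo():
    from mxnet import lr_scheduler
    # base_lr=1 is multiplied by factor=0.5 each time another 250 updates have elapsed
    scheduler = lr_scheduler.FactorScheduler(base_lr=1, step=250, factor=0.5)
    for num_update in (1, 300, 600, 900):
        # expected learning rates: 1.0, 0.5, 0.25, 0.125
        print(num_update, scheduler(num_update))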
Exemplo n.º 16
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in xrange(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.image_channel = 3

    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)

    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)

    #path_imglist = "/raid5data/dplearn/MS-Celeb-Aligned/lst2"
    path_imgrec = os.path.join(data_dir, "train.rec")

    assert args.images_per_identity >= 2
    assert args.triplet_bag_size % args.batch_size == 0

    print('Called with argument:', args)

    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
        if args.network[0] == 's':
            data_shape_dict = {'data': (args.per_batch_size, ) + data_shape}
            spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        all_layers = sym.get_internals()
        sym = all_layers['fc1_output']
        sym, arg_params, aux_params = get_symbol(args,
                                                 arg_params,
                                                 aux_params,
                                                 sym_embedding=sym)

    data_extra = None
    hard_mining = False
    triplet_params = [
        args.triplet_bag_size, args.triplet_alpha, args.triplet_max_ap
    ]
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        #data_names = ('data',),
        #label_names = None,
        #label_names = ('softmax_label',),
    )
    label_shape = (args.batch_size, )

    val_dataiter = None

    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=args.rand_mirror,
        mean=mean,
        cutoff=args.cutoff,
        ctx_num=args.ctx_num,
        images_per_identity=args.images_per_identity,
        triplet_params=triplet_params,
        mx_model=model,
    )

    _metric = LossValueMetric()
    eval_metrics = [mx.metric.create(_metric)]

    if args.network[0] == 'r':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    if args.noise_sgd > 0.0:
        print('use noise sgd')
        opt = NoiseSGD(scale=args.noise_sgd,
                       learning_rate=base_lr,
                       momentum=base_mom,
                       wd=base_wd,
                       rescale_grad=_rescale)
    else:
        opt = optimizer.SGD(learning_rate=base_lr,
                            momentum=base_mom,
                            wd=base_wd,
                            rescale_grad=_rescale)
    som = 2
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in xrange(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, label_shape)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [1000000000]
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if len(acc_list) > 0:
                lfw_score = acc_list[0]
                if lfw_score > highest_acc[0]:
                    highest_acc[0] = lfw_score
                    if lfw_score >= 0.998:
                        do_save = True
                if acc_list[-1] >= highest_acc[-1]:
                    highest_acc[-1] = acc_list[-1]
                    if lfw_score >= 0.99:
                        do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            #for i in xrange(len(acc_list)):
            #  acc = acc_list[i]
            #  if acc>=highest_acc[i]:
            #    highest_acc[i] = acc
            #    if lfw_score>=0.99:
            #      do_save = True
            #if args.loss_type==1 and mbatch>lr_steps[-1] and mbatch%10000==0:
            #  do_save = True
            if do_save:
                print('saving', msave)
                if val_dataiter is not None:
                    val_test()
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    #epoch_cb = mx.callback.do_checkpoint(prefix, 1)
    epoch_cb = None

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
Exemplo n.º 17
0
def main(args):
    _seed = 727
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)
    ctx = []
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in xrange(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    #ctx = [mx.gpu(0)]
    args.ctx_num = len(ctx)

    args.batch_size = args.per_batch_size * args.ctx_num
    config.per_batch_size = args.per_batch_size

    print('Call with', args, config)
    train_iter = FaceSegIter(
        path_imgrec=os.path.join(config.dataset_path, 'train.rec'),
        batch_size=args.batch_size,
        per_batch_size=args.per_batch_size,
        aug_level=1,
        exf=args.exf,
        args=args,
    )

    data_shape, data_size = train_iter.get_data_shape()
    #label_shape = train_iter.get_label_shape()
    sym = eval(config.network).get_symbol(num_classes=config.num_classes)
    if len(args.pretrained) == 0:
        #data_shape_dict = {'data' : (args.per_batch_size,)+data_shape, 'softmax_label' : (args.per_batch_size,)+label_shape}
        data_shape_dict = train_iter.get_shape_dict()
        arg_params, aux_params = init_weights(sym, data_shape_dict)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=train_iter.get_label_names(),
    )
    #lr = 1.0e-3
    #lr = 2.5e-4
    _rescale_grad = 1.0 / args.ctx_num
    #_rescale_grad = 1.0/args.batch_size
    #lr = args.lr
    #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
    if args.optimizer == 'onadam':
        opt = ONadam(learning_rate=args.lr,
                     wd=args.wd,
                     rescale_grad=_rescale_grad,
                     clip_gradient=5.0)
    elif args.optimizer == 'nadam':
        opt = optimizer.Nadam(learning_rate=args.lr,
                              rescale_grad=_rescale_grad)
    elif args.optimizer == 'rmsprop':
        opt = optimizer.RMSProp(learning_rate=args.lr,
                                rescale_grad=_rescale_grad)
    elif args.optimizer == 'adam':
        opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    else:
        opt = optimizer.SGD(learning_rate=args.lr,
                            momentum=0.9,
                            wd=args.wd,
                            rescale_grad=_rescale_grad)
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    _metric = LossValueMetric()
    #_metric = NMEMetric()
    #_metric2 = AccMetric()
    #eval_metrics = [_metric, _metric2]
    eval_metrics = [_metric]
    lr_epoch_steps = [int(x) for x in args.lr_epoch_step.split(',')]
    print('lr-epoch-steps', lr_epoch_steps)

    global_step = [0]
    highest_acc = [1.0, 1.0]
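    # NME is an error metric, so lower is better: these track the best (lowest) ibug and mean scores seen so far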

    def _batch_callback(param):
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        mepoch = mbatch * args.batch_size // data_size
        pre = mbatch * args.batch_size % data_size
        is_highest = False
        for _lr in lr_epoch_steps[0:-1]:
            if mepoch == _lr and pre < args.batch_size:
                opt.lr *= 0.2
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr:', opt.lr, 'batch:', param.nbatch, 'epoch:', param.epoch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            acc_list = val_test(sym, model, ctx, data_shape, global_step)
            score = np.mean(acc_list)
            if acc_list[0] < highest_acc[0]:  # ibug
                is_highest = True
                highest_acc[0] = acc_list[0]
            if score < highest_acc[1]:  # mean
                is_highest = True
                highest_acc[1] = score
            if args.ckpt == 1 and is_highest == True:
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
        if mepoch == lr_epoch_steps[-1]:
            if args.ckpt == 1:
                acc_list = val_test(sym, model, ctx, data_shape, global_step)
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg,
                                         aux)
            sys.exit(0)

    train_iter = mx.io.PrefetchingIter(train_iter)

    model.fit(
        train_iter,
        begin_epoch=0,
        num_epoch=9999,
        #eval_data          = val_iter,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=None,
    )
Exemplo n.º 18
0
def fit(args, network, train_data_loader, val_data_loader, logger, **kwargs):
    """
    train a model
    args : parsed argparse namespace
    network : the symbol definition of the neural network
    train_data_loader, val_data_loader : the train and val data iterators
    """
    # kvstore
    kv = mx.kvstore.create(args.kv_store)
    if args.gc_type != 'none':
        kv.set_gradient_compression({
            'type': args.gc_type,
            'threshold': args.gc_threshold
        })

    # logging
    #head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
    #logging.basicConfig(level=logging.DEBUG, format=head)
    #logging.info('start with arguments %s', args)

    # data iterators
    #(train, val) = data_loader(args, kv)
    (train, val) = (train_data_loader, val_data_loader)
    if args.test_io:
        tic = time.time()
        for i, batch in enumerate(train):
            for j in batch.data:
                j.wait_to_read()
            if (i + 1) % args.disp_batches == 0:
                logging.info(
                    'Batch [%d]\tSpeed: %.2f samples/sec', i,
                    args.disp_batches * args.batch_size / (time.time() - tic))
                tic = time.time()
        return

    # load model
    if 'arg_params' in kwargs and 'aux_params' in kwargs:
        arg_params = kwargs['arg_params']
        aux_params = kwargs['aux_params']
    else:
        sym, arg_params, aux_params = _load_model(args, kv.rank)
        print("load model", args.load_epoch)
        #if sym is not None:
        #   assert sym.tojson() == network.tojson()

    # save model
    checkpoint = _save_model(args, kv.rank)

    # devices for training
    devs = mx.cpu() if args.gpus is None or args.gpus == "" else [
        mx.gpu(int(i)) for i in args.gpus.split(',')
    ]

    # learning rate
    lr, lr_scheduler = _get_lr_scheduler(args, kv)

    # create model
    model = mx.mod.Module(context=devs, symbol=network, logger=logger)

    optimizer_params = {
        'learning_rate': lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler,
        'multi_precision': True,
        # 'centered': False,
        # 'gamma1': 0.95,
    }

    # Only a limited number of optimizers have a 'momentum' property
    has_momentum = {'sgd', 'dcasgd', 'nag'}
    if args.optimizer in has_momentum:
        optimizer_params['momentum'] = args.mom

    monitor = mx.mon.Monitor(args.monitor,
                             pattern=".*") if args.monitor > 0 else None

    # A limited number of optimizers have a warmup period

    if args.initializer == 'default':
        if args.network == 'alexnet':
            # AlexNet will not converge using Xavier
            initializer = mx.init.Normal()
            # VGG tends not to converge with Xavier-Gaussian initialization
        elif 'vgg' in args.network:
            initializer = mx.init.Xavier()
        else:
            initializer = mx.init.Xavier(rnd_type='gaussian',
                                         factor_type="in",
                                         magnitude=2)
    # initializer   = mx.init.Xavier(factor_type="in", magnitude=2.34),
    elif args.initializer == 'xavier':
        initializer = mx.init.Xavier()
    elif args.initializer == 'msra':
        initializer = mx.init.MSRAPrelu()
    elif args.initializer == 'orthogonal':
        initializer = mx.init.Orthogonal()
    elif args.initializer == 'normal':
        initializer = mx.init.Normal()
    elif args.initializer == 'uniform':
        initializer = mx.init.Uniform()
    elif args.initializer == 'one':
        initializer = mx.init.One()
    elif args.initializer == 'zero':
        initializer = mx.init.Zero()

    # evaluation metrics
    eval_metrics = ['accuracy']
    if args.top_k > 0:
        eval_metrics.append(
            mx.metric.create('top_k_accuracy', top_k=args.top_k))

    supported_loss = ['ce', 'nll_loss']
    if len(args.loss) > 0:
        # ce or nll loss is only applicable to softmax output
        loss_type_list = args.loss.split(',')
        if 'softmax_output' in network.list_outputs():
            for loss_type in loss_type_list:
                loss_type = loss_type.strip()
                if loss_type == 'nll':
                    loss_type = 'nll_loss'
                if loss_type not in supported_loss:
                    logging.warning(loss_type + ' is not a valid loss type; only cross-entropy or ' \
                                    'negative log-likelihood loss is supported!')
                else:
                    eval_metrics.append(mx.metric.create(loss_type))
        else:
            logging.warning(
                "The output is not softmax_output, loss argument will be skipped!"
            )
    #optimizer
    base_wd = args.wd
    base_mom = 0.9
    base_lr = args.lr
    gpu_list = [int(i) for i in args.gpus.split(',')]
    _rescale = 1.0 / len(gpu_list)
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
    # callbacks that run after each batch
    #global global_step
    global_step = [0]
    save_step = [args.load_epoch]
    epoch_step = [0]
    som = args.display
    _cb = mx.callback.Speedometer(args.batch_size, som)
    model_prefix = args.model_prefix
    lr_steps = [int(l) for l in args.lr_step_epochs.split(',')]

    def _batch_callback(param):
        global_step[0] += 1
        mbatch = global_step[0]
        _cb(param)
        if mbatch % 1000 == 0:
            print("%s : " % (datetime.now()), 'lr-batch-epoch:', opt.lr,
                  param.nbatch, param.epoch)
        if mbatch >= 0 and mbatch % args.savestep == 0:
            save_step[0] += 1
            msave = save_step[0]
            print('saving', msave)
            arg, aux = model.get_params()
            mx.model.save_checkpoint(model_prefix, msave, model.symbol, arg,
                                     aux)

    def _epoch_callback(param1, param2, param3, param4):
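        # epoch-level LR schedule: once the epoch count passes each milestone (offset by args.beta_freeze), multiply the LR by args.lr_factor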
        epoch_step[0] += 1
        mepoch = epoch_step[0]
        for _lr in lr_steps:
            if mepoch == args.beta_freeze + _lr:
                opt.lr *= args.lr_factor
                print('lr change to', opt.lr, param1)
                break

    # run
    model.fit(train,
              begin_epoch=args.load_epoch if args.load_epoch else 0,
              num_epoch=args.num_epochs,
              eval_data=val,
              eval_metric=eval_metrics,
              kvstore='device',
              optimizer=opt,
              optimizer_params=optimizer_params,
              initializer=initializer,
              arg_params=arg_params,
              aux_params=aux_params,
              batch_end_callback=_batch_callback,
              epoch_end_callback=_epoch_callback,
              allow_missing=True)
Exemplo n.º 19
0
def train_net(args,
              ctx,
              pretrained,
              epoch,
              prefix,
              begin_epoch,
              end_epoch,
              lr=0.001,
              lr_step='5'):
    # setup config
    #init_config()
    #print(config)
    # setup multi-gpu

    input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx)

    # print config
    logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = [iset for iset in args.image_set.split('+')]
    roidbs = [
        load_gt_roidb(args.dataset,
                      image_set,
                      args.root_path,
                      args.dataset_path,
                      flip=not args.no_flip) for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb)

    # load symbol
    #sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS)
    #feat_sym = sym.get_internals()['rpn_cls_score_output']
    #train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle,
    #                          ctx=ctx, work_load_list=args.work_load_list,
    #                          feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES,
    #                          anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING)

    sym = eval('get_' + args.network + '_train')()
    #print(sym.get_internals())
    feat_sym = []
    for stride in config.RPN_FEAT_STRIDE:
        feat_sym.append(sym.get_internals()['rpn_cls_score_stride%s_output' %
                                            stride])

    #train_data = AnchorLoaderFPN(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle,
    #                              ctx=ctx, work_load_list=args.work_load_list)
    train_data = CropLoader(feat_sym,
                            roidb,
                            batch_size=input_batch_size,
                            shuffle=not args.no_shuffle,
                            ctx=ctx,
                            work_load_list=args.work_load_list)

    # infer max shape
    max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    #max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (1, roidb[0]['max_num_boxes'], 5)))
    logger.info('providing maximum shape %s %s' %
                (max_data_shape, max_label_shape))

    # infer shape
    data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
    arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict)
    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
    out_shape_dict = dict(zip(sym.list_outputs(), out_shape))
    aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
    logger.info('output shape %s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if args.resume:
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        #for k in ['rpn_conv_3x3', 'rpn_cls_score', 'rpn_bbox_pred', 'cls_score', 'bbox_pred']:
        #  _k = k+"_weight"
        #  if _k in arg_shape_dict:
        #    v = 0.001 if _k.startswith('bbox_') else 0.01
        #    arg_params[_k] = mx.random.normal(0, v, shape=arg_shape_dict[_k])
        #    print('init %s with normal %.5f'%(_k,v))
        #  _k = k+"_bias"
        #  if _k in arg_shape_dict:
        #    arg_params[_k] = mx.nd.zeros(shape=arg_shape_dict[_k])
        #    print('init %s with zero'%(_k))

        for k, v in arg_shape_dict.iteritems():
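            # reinitialize any upsampling (deconvolution) weights with a bilinear kernel instead of the pretrained values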
            if k.find('upsampling') >= 0:
                print('initializing upsampling_weight', k)
                arg_params[k] = mx.nd.zeros(shape=v)
                init = mx.init.Initializer()
                init._init_bilinear(k, arg_params[k])
                #print(args[k])

    # check parameter shapes
    #for k in sym.list_arguments():
    #    if k in data_shape_dict:
    #        continue
    #    assert k in arg_params, k + ' not initialized'
    #    assert arg_params[k].shape == arg_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape)
    #for k in sym.list_auxiliary_states():
    #    assert k in aux_params, k + ' not initialized'
    #    assert aux_params[k].shape == aux_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape)

    # create solver
    fixed_param_prefix = config.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]
    #mod = MutableModule(sym, data_names=data_names, label_names=label_names,
    #                    logger=logger, context=ctx, work_load_list=args.work_load_list,
    #                    max_data_shapes=max_data_shape, max_label_shapes=max_label_shape,
    #                    fixed_param_prefix=fixed_param_prefix)
    fixed_param_names = get_fixed_params(sym, fixed_param_prefix)
    print('fixed', fixed_param_names, file=sys.stderr)
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=args.work_load_list,
                 fixed_param_names=fixed_param_names)

    # decide training params
    # metric
    eval_metrics = mx.metric.CompositeEvalMetric()
    #if len(sym.list_outputs())>4:
    #  metric_names = ['RPNAccMetric', 'RPNLogLossMetric', 'RPNL1LossMetric', 'RCNNAccMetric', 'RCNNLogLossMetric', 'RCNNL1LossMetric']
    #else:#train rpn only
    #print('sym', sym.list_outputs())
    #metric_names = ['RPNAccMetric', 'RPNLogLossMetric', 'RPNL1LossMetric']
    mids = [0, 4, 8]
    for mid in mids:
        _metric = metric.RPNAccMetric(pred_idx=mid, label_idx=mid + 1)
        eval_metrics.add(_metric)
        #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1)
        #eval_metrics.add(_metric)
        _metric = metric.RPNL1LossMetric(loss_idx=mid + 2, weight_idx=mid + 3)
        eval_metrics.add(_metric)

    #rpn_eval_metric = metric.RPNAccMetric()
    #rpn_cls_metric = metric.RPNLogLossMetric()
    #rpn_bbox_metric = metric.RPNL1LossMetric()
    #eval_metric = metric.RCNNAccMetric()
    #cls_metric = metric.RCNNLogLossMetric()
    #bbox_metric = metric.RCNNL1LossMetric()
    #for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]:
    #    eval_metrics.add(child_metric)
    # callback
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS), config.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES)
    #epoch_end_callback = callback.do_checkpoint(prefix, means, stds)
    epoch_end_callback = None
    # decide learning rate
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff
    ]

    #lr_iters = [36000,42000] #TODO
    #lr_iters = [40000,50000,60000] #TODO
    #lr_iters = [40,50,60] #TODO
    end_epoch = 10000
    #lr_iters = [4,8] #TODO
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' %
                (lr, lr_epoch_diff, lr_iters))
    #lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)
    # optimizer
    opt = optimizer.SGD(learning_rate=lr,
                        momentum=0.9,
                        wd=0.0005,
                        rescale_grad=1.0 / len(ctx),
                        clip_gradient=None)
    initializer = mx.init.Xavier()
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style

    if len(ctx) > 1:
        train_data = mx.io.PrefetchingIter(train_data)

    _cb = mx.callback.Speedometer(train_data.batch_size,
                                  frequent=args.frequent,
                                  auto_reset=False)
    global_step = [0]

    def save_model(epoch):
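        # Build a deployable test symbol: for each FPN stride, apply a channel
        # softmax to the raw rpn_cls_score, reshape it back to 2*num_anchors
        # channels, pair it with the matching rpn_bbox_pred output, then group
        # all heads and checkpoint them with the current module weights.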
        arg, aux = mod.get_params()
        all_layers = mod.symbol.get_internals()
        outs = []
        for stride in config.RPN_FEAT_STRIDE:
            num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS']
            _name = 'rpn_cls_score_stride%d_output' % stride
            rpn_cls_score = all_layers[_name]

            # prepare rpn data
            rpn_cls_score_reshape = mx.symbol.Reshape(
                data=rpn_cls_score,
                shape=(0, 2, -1, 0),
                name="rpn_cls_score_reshape_stride%d" % stride)

            rpn_cls_prob = mx.symbol.SoftmaxActivation(
                data=rpn_cls_score_reshape,
                mode="channel",
                name="rpn_cls_prob_stride%d" % stride)
            rpn_cls_prob_reshape = mx.symbol.Reshape(
                data=rpn_cls_prob,
                shape=(0, 2 * num_anchors, -1, 0),
                name='rpn_cls_prob_reshape_stride%d' % stride)
            _name = 'rpn_bbox_pred_stride%d_output' % stride
            rpn_bbox_pred = all_layers[_name]
            outs.append(rpn_cls_prob_reshape)
            outs.append(rpn_bbox_pred)
        _sym = mx.sym.Group(outs)
        mx.model.save_checkpoint(prefix, epoch, _sym, arg, aux)

    def _batch_callback(param):
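        # Per-batch hook: feed the Speedometer, count global batches, multiply
        # opt.lr by 0.1 at every iteration listed in lr_iters, checkpoint every
        # 1000 batches, and write a last checkpoint (epoch 0) before exiting
        # once the final lr step is reached.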
        #global global_step
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for _iter in lr_iters:
            if mbatch == _iter:
                opt.lr *= 0.1
                print('lr change to',
                      opt.lr,
                      ' in batch',
                      mbatch,
                      file=sys.stderr)
                break

        if mbatch % 1000 == 0:
            print('saving checkpoint', mbatch, file=sys.stderr)
            save_model(mbatch)

        if mbatch == lr_iters[-1]:
            print('saving final checkpoint', mbatch, file=sys.stderr)
            save_model(0)
            #arg, aux = mod.get_params()
            #mx.model.save_checkpoint(prefix, 99, mod.symbol, arg, aux)
            sys.exit(0)

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=_batch_callback,
            kvstore=args.kvstore,
            optimizer=opt,
            initializer=initializer,
            allow_missing=True,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
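
# A minimal standalone sketch of the scheduler-driven alternative hinted at by
# the commented-out MultiFactorScheduler line above: instead of multiplying
# opt.lr by 0.1 inside _batch_callback, the decay points can be handed to the
# optimizer directly. The step iterations below are placeholders, not values
# taken from the example.
import mxnet as mx
from mxnet import optimizer

lr_iters = [40000, 50000, 60000]  # hypothetical decay points, in batches
sched = mx.lr_scheduler.MultiFactorScheduler(step=lr_iters, factor=0.1)
sgd_opt = optimizer.SGD(learning_rate=0.01, momentum=0.9, wd=0.0005,
                        lr_scheduler=sched)
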
Exemplo n.º 20
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    path_imgrec = None
    path_imglist = None
    image_size = [int(x) for x in args.image_size.split(',')]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    path_imgrec = os.path.join(data_dir, "train.rec")
    path_imgrec_val = os.path.join(data_dir, "val.rec")

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )
    val_dataiter = None

    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=args.rand_mirror,
        mean=mean,
        cutoff=args.cutoff,
        color_jittering=args.color,
    )
    val_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec_val,
        shuffle=False,
        rand_mirror=False,
        mean=mean,
    )

    metric = mx.metric.CompositeEvalMetric(
        [AccMetric(), MAEMetric(), CUMMetric()])

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
    #opt = optimizer.Nadam(learning_rate=base_lr, wd=base_wd, rescale_grad=_rescale)
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)
    lr_steps = [int(x) for x in args.lr_steps.split(',')]

    global_step = [0]
    save_step = [0]

    def _batch_callback(param):
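        # Per-batch hook: update the Speedometer, step opt.lr down by 10x at
        # each iteration in lr_steps, log lr/batch/epoch every 1000 batches,
        # and stop the process once args.max_steps is exceeded.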
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    def _epoch_callback(epoch, symbol, arg_params, aux_params):
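        # Epoch hook: depending on args.ckpt, export only the 'fc1_output'
        # embedding sub-graph (the training head is dropped) as checkpoint msave.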
        save_step[0] += 1
        msave = save_step[0]
        do_save = False
        if args.ckpt == 0:
            do_save = False
        elif args.ckpt == 2:
            do_save = True
        if do_save:
            print('saving %s' % msave)
            arg, aux = model.get_params()
            all_layers = model.symbol.get_internals()
            _sym = all_layers['fc1_output']
            mx.model.save_checkpoint(args.prefix, msave, _sym, arg, aux)

    train_dataiter = mx.io.PrefetchingIter(train_dataiter)
    print('start fitting')

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=metric,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=_epoch_callback)
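
# _epoch_callback above checkpoints only the embedding branch by slicing the
# symbol graph at 'fc1_output'. A standalone sketch of that slicing step,
# assuming a previously saved checkpoint; 'my-model' and epoch 1 are
# placeholder names, not files produced by the example.
import mxnet as mx

sym, arg_params, aux_params = mx.model.load_checkpoint('my-model', 1)
fc1_sym = sym.get_internals()['fc1_output']  # keep layers up to fc1, drop the loss head
mx.model.save_checkpoint('my-model-deploy', 0, fc1_sym, arg_params, aux_params)
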
Exemplo n.º 21
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in range(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    prefix = args.prefix
    end_epoch = args.end_epoch
    pretrained = '../model/resnet-152'
    load_epoch = args.load_epoch
    args.image_size = 160
    per_batch_size = 60
    args.ctx_num = len(ctx)
    args.batch_size = per_batch_size*args.ctx_num
    #args.all_batch_size = args.batch_size*args.ctx_num
    args.bag_size = 3600
    args.margin = 0.2
    args.num_classes = 10575 #webface


    data_shape = (3,args.image_size,args.image_size)

    begin_epoch = 0
    base_lr = 0.05
    base_wd = 0.0002
    base_mom = 0.0
    lr_decay = 0.98
    if not args.retrain:
      #load and initialize params
      print(pretrained)
      _, arg_params, aux_params = mx.model.load_checkpoint(pretrained, load_epoch)
      sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
      #arg_params, aux_params = load_param(pretrained, epoch, convert=True)
      data_shape_dict = {'data': (args.batch_size, 3, args.image_size, args.image_size), 'softmax_label': (args.batch_size,)}
      resnet_dcn.init_weights(sym, data_shape_dict, arg_params, aux_params)
    else:
      pretrained = args.prefix
      sym, arg_params, aux_params = mx.model.load_checkpoint(pretrained, load_epoch)
      begin_epoch = load_epoch
      end_epoch = begin_epoch+10
      base_wd = 0.00005
      lr_decay = 0.5
      base_lr = 0.015
    # infer max shape

    model = mx.mod.Module(
        context       = ctx,
        symbol        = sym,
        #label_names   = [],
        #fixed_param_prefix = fixed_param_prefix,
    )

    train_dataiter = FaceIter(
        path_imglist         = "/raid5data/dplearn/faceinsight_align_webface.lst",
        data_shape           = data_shape,
        mod                  = model,
        ctx_num              = args.ctx_num,
        batch_size           = args.batch_size,
        bag_size             = args.bag_size,
        images_per_person    = 5,
    )

    #_dice = DiceMetric()
    _acc = AccMetric()
    eval_metrics = [mx.metric.create(_acc)]

    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    #for child_metric in [fcn_loss_metric]:
    #    eval_metrics.add(child_metric)

    # callback
    #batch_end_callback = callback.Speedometer(input_batch_size, frequent=args.frequent)
    #epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True)

    # decide learning rate
    #lr_step = '10,20,30'
    #train_size = 4848
    #nrof_batch_in_epoch = int(train_size/input_batch_size)
    #print('nrof_batch_in_epoch:', nrof_batch_in_epoch)
    #lr_factor = 0.1
    #lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    #lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    #lr_iters = [int(epoch * train_size / batch_size) for epoch in lr_epoch_diff]
    #print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters

    #lr_scheduler = MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    #optimizer_params = {'momentum': 0.9,
    #                    'wd': 0.0005,
    #                    'learning_rate': base_lr,
    #                    'rescale_grad': 1.0,
    #                    'clip_gradient': None}
    initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
    #opt = optimizer.SGD(learning_rate=base_lr, momentum=0.9, wd=base_wd, rescale_grad=(1.0/args.batch_size))
    opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=1.0)
    #opt = optimizer.AdaGrad(learning_rate=base_lr, wd=base_wd, rescale_grad=1.0)
    _cb = mx.callback.Speedometer(args.batch_size, 10)

    lfw_dir = '/raid5data/dplearn/lfw_mtcnn'
    lfw_pairs = lfw.read_pairs(os.path.join(lfw_dir, 'pairs.txt'))
    lfw_paths, issame_list = lfw.get_paths(lfw_dir, lfw_pairs, 'png')
    imgs = []
    lfw_data_list = []
    for flip in [0,1]:
      lfw_data = nd.empty((len(lfw_paths), 3, args.image_size, args.image_size))
      i = 0
      for path in lfw_paths:
        with open(path, 'rb') as fin:
          _bin = fin.read()
          img = mx.image.imdecode(_bin)
          img = nd.transpose(img, axes=(2, 0, 1))
          if flip==1:
            img = img.asnumpy()
            for c in range(img.shape[0]):
              img[c,:,:] = np.fliplr(img[c,:,:])
            img = nd.array( img )
          #print(img.shape)
          lfw_data[i][:] = img
          i+=1
          if i%1000==0:
            print('loading lfw', i)
      print(lfw_data.shape)
      lfw_data_list.append(lfw_data)

    def lfw_test(nbatch):
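      # Embed the original and horizontally flipped LFW copies in mini-batches,
      # then report verification accuracy three ways: plain embeddings, the
      # concatenated flip embeddings after L2 normalization, and a 128-d PCA
      # projection of those; the best of the three scores is returned.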
      print('testing lfw..')
      embeddings_list = []
      for i in range( len(lfw_data_list) ):
        lfw_data = lfw_data_list[i]
        embeddings = None
        ba = 0
        while ba<lfw_data.shape[0]:
          bb = min(ba+args.batch_size, lfw_data.shape[0])
          _data = nd.slice_axis(lfw_data, axis=0, begin=ba, end=bb)
          _label = nd.ones( (bb-ba,) )
          db = mx.io.DataBatch(data=(_data,), label=(_label,))
          model.forward(db, is_train=False)
          net_out = model.get_outputs()
          _embeddings = net_out[0].asnumpy()
          if embeddings is None:
            embeddings = np.zeros( (lfw_data.shape[0], _embeddings.shape[1]) )
          embeddings[ba:bb,:] = _embeddings
          ba = bb
        embeddings_list.append(embeddings)

      acc_list = []
      embeddings = embeddings_list[0]
      _, _, accuracy, val, val_std, far = lfw.evaluate(embeddings, issame_list, nrof_folds=10)
      acc_list.append(np.mean(accuracy))
      print('[%d]Accuracy: %1.3f+-%1.3f' % (nbatch, np.mean(accuracy), np.std(accuracy)))
      print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
      embeddings = np.concatenate(embeddings_list, axis=1)
      embeddings = sklearn.preprocessing.normalize(embeddings)
      print(embeddings.shape)
      _, _, accuracy, val, val_std, far = lfw.evaluate(embeddings, issame_list, nrof_folds=10)
      acc_list.append(np.mean(accuracy))
      print('[%d]Accuracy-Flip: %1.3f+-%1.3f' % (nbatch, np.mean(accuracy), np.std(accuracy)))
      print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
      pca = PCA(n_components=128)
      embeddings = pca.fit_transform(embeddings)
      embeddings = sklearn.preprocessing.normalize(embeddings)
      print(embeddings.shape)
      _, _, accuracy, val, val_std, far = lfw.evaluate(embeddings, issame_list, nrof_folds=10)
      acc_list.append(np.mean(accuracy))
      print('[%d]Accuracy-PCA: %1.3f+-%1.3f' % (nbatch, np.mean(accuracy), np.std(accuracy)))
      print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
      return max(*acc_list)


    #global_step = 0
    highest_acc = [0.0]
    last_save_acc = [0.0]
    def _batch_callback(param):
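      # Per-batch hook: decay opt.lr by lr_decay every 4000 batches, log every
      # 100 batches, and run lfw_test every 400 batches; a checkpoint is saved
      # when accuracy exceeds 0.9 and beats the last saved score by >= 0.01.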
      #global global_step
      mbatch = param.nbatch+1
      if mbatch % 4000 == 0:
        opt.lr *= lr_decay

      #print(param.nbatch, opt.lr)
      _cb(param)
      if param.nbatch%100==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)
      if param.nbatch>=0 and param.nbatch%400==0:
        acc = lfw_test(param.nbatch)
        if acc>highest_acc[0]:
          highest_acc[0] = acc
        if acc>0.9 and acc-last_save_acc[0]>=0.01:
          print('saving', mbatch, acc, last_save_acc[0])
          _arg, _aux = model.get_params()
          mx.model.save_checkpoint(args.prefix, mbatch, model.symbol, _arg, _aux)
          last_save_acc[0] = acc
        print('[%d]highest Accu: %1.3f'%(param.nbatch, highest_acc[0]))




      sys.stdout.flush()
      sys.stderr.flush()

    epoch_cb = mx.callback.do_checkpoint(prefix, 1)
    #epoch_cb = None



    def _epoch_callback(epoch, sym, arg_params, aux_params):
      print('epoch-end', epoch)

    model.fit(train_dataiter,
        begin_epoch        = begin_epoch,
        num_epoch          = end_epoch,
        #eval_data          = val_dataiter,
        eval_metric        = eval_metrics,
        kvstore            = 'device',
        optimizer          = opt,
        #optimizer_params   = optimizer_params,
        initializer        = initializer,
        arg_params         = arg_params,
        aux_params         = aux_params,
        allow_missing      = True,
        batch_end_callback = _batch_callback,
        epoch_end_callback = epoch_cb )
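
# The callback above multiplies opt.lr by lr_decay every 4000 batches; the same
# exponential schedule can be expressed with mx.lr_scheduler.FactorScheduler.
# A minimal sketch reusing the example's base_lr, base_wd and lr_decay values:
import mxnet as mx
from mxnet import optimizer

sched = mx.lr_scheduler.FactorScheduler(step=4000, factor=0.98)
sgd_opt = optimizer.SGD(learning_rate=0.05, momentum=0.0, wd=0.0002,
                        rescale_grad=1.0, lr_scheduler=sched)
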
Exemplo n.º 22
0
def main():
    ratio_list = [0.25, 0.125, 0.0625, 0.03125]  # 1/4, 1/8, 1/16, 1/32
    if args.depth == 18:
        units = [2, 2, 2, 2]
    elif args.depth == 34:
        units = [3, 4, 6, 3]
    elif args.depth == 50:
        units = [3, 4, 6, 3]
    elif args.depth == 101:
        units = [3, 4, 23, 3]
    elif args.depth == 152:
        units = [3, 8, 36, 3]
    elif args.depth == 200:
        units = [3, 24, 36, 3]
    elif args.depth == 269:
        units = [3, 30, 48, 8]
    else:
        raise ValueError(
            "no experiments done on detph {}, you can do it youself".format(
                args.depth))
    symbol = resnext(units=units,
                     num_stage=4,
                     filter_list=[64, 256, 512, 1024, 2048]
                     if args.depth >= 50 else [64, 64, 128, 256, 512],
                     ratio_list=ratio_list,
                     num_class=args.num_classes,
                     num_group=args.num_group,
                     data_type="imagenet",
                     drop_out=args.drop_out,
                     bottle_neck=True if args.depth >= 50 else False,
                     bn_mom=args.bn_mom,
                     workspace=args.workspace,
                     memonger=args.memonger)

    kv = mx.kvstore.create(args.kv_store)
    devs = mx.cpu() if args.gpus is None else [
        mx.gpu(int(i)) for i in args.gpus.split(',')
    ]
    epoch_size = max(int(args.num_examples / args.batch_size / kv.num_workers),
                     1)
    begin_epoch = args.model_load_epoch if args.model_load_epoch else 0
    if not os.path.exists("./" + args.model_name):
        os.mkdir("./" + args.model_name)
    model_prefix = args.model_name + "/se-resnext-{}-{}-{}".format(
        args.data_type, args.depth, kv.rank)
    checkpoint = mx.callback.do_checkpoint(model_prefix)
    arg_params = None
    aux_params = None
    load_model_prefix = 'model/se-resnext-imagenet-50-0'
    if args.retrain:
        if args.finetune:
            (symbol, arg_params,
             aux_params) = get_fine_tune_model(load_model_prefix,
                                               args.model_load_epoch)
        else:
            symbol, arg_params, aux_params = mx.model.load_checkpoint(
                model_prefix, args.model_load_epoch)
    if args.memonger:
        import memonger
        symbol = memonger.search_plan(
            symbol,
            data=(args.batch_size, 3, 32,
                  32) if args.data_type == "cifar10" else
            (args.batch_size, 3, 224, 224))
    train = mx.io.ImageRecordIter(
        path_imgrec=args.data_train + '.rec',
        path_imgidx=args.data_train + '.idx',
        label_width=1,
        data_name='data',
        label_name='softmax_label',
        data_shape=(3, 224, 224),
        batch_size=args.batch_size,
        pad=0,
        fill_value=0,  # only used when pad is valid
        rand_crop=False,
        shuffle=True)
    train.reset()
    if (args.data_val == 'None'):
        val = None
    else:
        val = mx.io.ImageRecordIter(path_imgrec=args.data_val,
                                    label_width=1,
                                    data_name='data',
                                    label_name='softmax_label',
                                    batch_size=args.batch_size,
                                    data_shape=(3, 224, 224),
                                    rand_crop=False,
                                    rand_mirror=False,
                                    num_parts=kv.num_workers,
                                    part_index=kv.rank)

    fix_param = None
    if args.freeze:
        fix_param = [k for k in arg_params if 'fc' not in k]

    model = mx.mod.Module(symbol=symbol,
                          context=devs,
                          fixed_param_names=fix_param)
    model.bind(data_shapes=train.provide_data,
               label_shapes=train.provide_label)
    # sgd = mx.optimizer.Optimizer.create_optimizer('sgd')
    # finetune_lr = dict({k: 0 for k in arg_params})
    # sgd.set_lr_mult(finetune_lr)

    opt = optimizer.SGD(learning_rate=args.lr,
                        momentum=0.9,
                        wd=0.0005,
                        rescale_grad=1.0 / args.batch_size /
                        (len(args.gpus.split(','))))

    # training
    model.fit(train,
              val,
              num_epoch=args.num_epoch,
              arg_params=arg_params,
              aux_params=aux_params,
              allow_missing=True,
              kvstore='device',
              optimizer=opt,
              initializer=mx.init.Xavier(rnd_type='gaussian',
                                         factor_type="in",
                                         magnitude=2),
              batch_end_callback=mx.callback.Speedometer(
                  args.batch_size, args.frequent),
              epoch_end_callback=checkpoint,
              eval_metric=['acc', 'ce'])
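
# When args.freeze is set, the example fine-tunes only the fully connected
# layers by passing every other parameter name as fixed_param_names. A
# standalone sketch of the same idea, assuming an existing checkpoint;
# 'my-model' and epoch 0 are placeholders, not files from the example.
import mxnet as mx

sym, arg_params, aux_params = mx.model.load_checkpoint('my-model', 0)
fix_param = [k for k in arg_params if 'fc' not in k]  # freeze everything except fc* layers
mod = mx.mod.Module(symbol=sym, context=mx.cpu(), fixed_param_names=fix_param)
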
Exemplo n.º 23
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = os.path.join(args.models_root,
                          '%s-%s-%s' % (args.network, args.loss, args.dataset),
                          'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    data_dir = config.dataset_path
    path_imgrecs = None
    path_imglist = None
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrecs = [os.path.join(data_dir, "train.rec")]

    data_shape = (args.image_channel, image_size[0], image_size[1])

    num_workers = config.num_workers
    global_num_ctx = num_workers * args.ctx_num
    if config.num_classes % global_num_ctx == 0:
        args.ctx_num_classes = config.num_classes // global_num_ctx
    else:
        args.ctx_num_classes = config.num_classes // global_num_ctx + 1
    print(config.num_classes, global_num_ctx, args.ctx_num_classes)
    args.local_num_classes = args.ctx_num_classes * args.ctx_num
    args.local_class_start = args.local_num_classes * args.worker_id

    #if len(args.partial)==0:
    #  local_classes_range = (0, args.num_classes)
    #else:
    #  _vec = args.partial.split(',')
    #  local_classes_range = (int(_vec[0]), int(_vec[1]))

    #args.partial_num_classes = local_classes_range[1] - local_classes_range[0]
    #args.partial_start = local_classes_range[0]

    print('Called with argument:', args, config)
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    arg_params = None
    aux_params = None
    esym = get_symbol_embedding()
    asym = get_symbol_arcface
    if config.num_workers == 1:
        sys.path.append(os.path.join(os.path.dirname(__file__), 'utils'))
        from parall_module_local_v1 import ParallModule
    else:
        from parall_module_dist import ParallModule

    model = ParallModule(
        context=ctx,
        symbol=esym,
        data_names=['data'],
        label_names=['softmax_label'],
        asymbol=asym,
        args=args,
    )

    val_dataiter = None
    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrecs=path_imgrecs,
        shuffle=True,
        rand_mirror=config.data_rand_mirror,
        mean=mean,
        cutout=default.cutout if config.data_cutout else None,
        crop=default.crop if config.data_crop else None,
        mask=default.mask if config.data_mask else None,
        gridmask=default.gridmask if config.data_grid else None,
        #color_jittering      = config.data_color,
        #images_filter        = config.data_images_filter,
        loss_type=args.loss,
        #margin_m             = config.loss_m2,
        data_names=['data'],
        downsample_back=config.downsample_back,
        motion_blur=config.motion_blur,
        use_bgr=config.use_bgr)

    if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)

    _rescale = 1.0 / 8  #/ args.batch_size
    print(base_lr, base_mom, base_wd, args.batch_size)

    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_steps,
                                                        factor=0.1,
                                                        base_lr=base_lr)
    optimizer_params = {
        'learning_rate': base_lr,
        'momentum': base_mom,
        'wd': base_wd,
        'rescale_grad': _rescale,
        'lr_scheduler': lr_scheduler
    }

    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
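    # Note: model.fit below is driven by optimizer_params (which carries the
    # MultiFactorScheduler); this separately built SGD instance is not passed
    # to fit.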

    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
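        # For each loaded verification set: if every pair is labelled 'same',
        # report TPR at fixed FPR operating points; otherwise report the
        # flip-augmented accuracy and collect it for checkpoint selection.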
        results = []
        for i in range(len(ver_list)):
            _, issame_list = ver_list[i]
            if all(issame_list):
                fp_rates, fp_dict, thred_dict, recall_dict = verification.test(
                    ver_list[i],
                    model,
                    args.batch_size,
                    use_bgr=config.use_bgr,
                    label_shape=(args.batch_size, len(path_imgrecs)))
                for k in fp_rates:
                    print("[%s] TPR at FPR %.2e[%.2e: %.4f]:\t%.5f" %
                          (ver_name_list[i], k, fp_dict[k], thred_dict[k],
                           recall_dict[k]))

            else:
                acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                    ver_list[i],
                    model,
                    args.batch_size,
                    10,
                    None,
                    label_shape=(args.batch_size, len(path_imgrecs)),
                    use_bgr=config.use_bgr)
                print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
                #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
                print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                      (ver_name_list[i], nbatch, acc2, std2))
                results.append(acc2)

        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in range(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
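        # Per-batch hook: every args.verbose batches run ver_test and track the
        # best score; whether a checkpoint is written follows args.ckpt
        # (0 = never, 2 = always, otherwise only on a new best; 3 reuses save
        # slot 1), and training exits once args.max_steps is reached.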
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]

        #for step in lr_steps:
        #  if mbatch==step:
        #    opt.lr *= 0.1
        #    print('lr change to', opt.lr)
        #    break

        _cb(param)
        if mbatch % 1000 == 0:
            #print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)
            print('batch-epoch:', param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list) > 0:
                #lfw_score = acc_list[0]
                #if lfw_score>highest_acc[0]:
                #  highest_acc[0] = lfw_score
                #  if lfw_score>=0.998:
                #    do_save = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                    #if lfw_score>=0.99:
                    #  do_save = True
            if is_highest:
                do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1

            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()  #get_export_params()
                all_layers = model.symbol.get_internals()
                _sym = model.symbol  #all_layers['fc1_output']
                mx.model.save_checkpoint(prefix, msave, _sym, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    if len(args.pretrained) != 0:
        model_prefix, epoch = args.pretrained.split(',')
        begin_epoch = int(epoch)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            model_prefix, begin_epoch)
        #model.set_params(arg_params, aux_params)

    model.fit(
        train_dataiter,
        begin_epoch=0,  #begin_epoch,
        num_epoch=default.end_epoch,
        eval_data=val_dataiter,
        #eval_metric        = eval_metrics,
        kvstore=args.kvstore,
        #optimizer          = opt,
        optimizer_params=optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
Exemplo n.º 24
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.image_channel = 3

    data_dir = args.data_dir
    if args.task == 'gender':
        data_dir = args.gender_data_dir
    elif args.task == 'age':
        data_dir = args.age_data_dir
    print('data dir', data_dir)
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    net = get_model()
    #if args.task=='':
    #  test_net = get_model_test(net)
    #print(net.__class__)
    #net = net0[0]
    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    net.hybridize()
    if args.mode == 'gluon':
        if len(args.pretrained) == 0:
            pass
        else:
            net.load_params(args.pretrained,
                            allow_missing=True,
                            ignore_extra=True)
        net.initialize(initializer)
        net.collect_params().reset_ctx(ctx)

    val_iter = None
    if args.task == '':
        train_iter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=args.rand_mirror,
            mean=mean,
            cutoff=args.cutoff,
        )
    else:
        train_iter = FaceImageIterAge(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            task=args.task,
            shuffle=True,
            rand_mirror=args.rand_mirror,
            mean=mean,
            cutoff=args.cutoff,
        )

    if args.task == 'age':
        metric = CompositeEvalMetric([MAEMetric(), CUMMetric()])
    elif args.task == 'gender':
        metric = CompositeEvalMetric([AccMetric()])
    else:
        metric = CompositeEvalMetric([AccMetric()])

    ver_list = []
    ver_name_list = []
    if args.task == '':
        for name in args.eval.split(','):
            path = os.path.join(data_dir, name + ".bin")
            if os.path.exists(path):
                data_set = verification.load_bin(path, image_size)
                ver_list.append(data_set)
                ver_name_list.append(name)
                print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], net, ctx, batch_size=args.batch_size)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    def val_test(nbatch=0):
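        # Evaluate the gluon net on the held-out val.rec sets: MAE/CUM on the
        # age data and accuracy on the gender data, returning the CUM score for
        # the age task and the accuracy for the gender task.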
        acc = 0.0
        #if args.task=='age':
        if len(args.age_data_dir) > 0:
            val_iter = FaceImageIterAge(
                batch_size=args.batch_size,
                data_shape=data_shape,
                path_imgrec=os.path.join(args.age_data_dir, 'val.rec'),
                task=args.task,
                shuffle=False,
                rand_mirror=False,
                mean=mean,
            )
            _metric = MAEMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            _metric2 = CUMMetric()
            val_metric2 = mx.metric.create(_metric2)
            val_metric2.reset()
            val_iter.reset()
            for batch in val_iter:
                data = gluon.utils.split_and_load(batch.data[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0],
                                                   ctx_list=ctx,
                                                   batch_axis=0)
                outputs = []
                for x in data:
                    outputs.append(net(x)[2])
                val_metric.update(label, outputs)
                val_metric2.update(label, outputs)
            _value = val_metric.get_name_value()[0][1]
            print('[%d][VMAE]: %f' % (nbatch, _value))
            _value = val_metric2.get_name_value()[0][1]
            if args.task == 'age':
                acc = _value
            print('[%d][VCUM]: %f' % (nbatch, _value))
        if len(args.gender_data_dir) > 0:
            val_iter = FaceImageIterAge(
                batch_size=args.batch_size,
                data_shape=data_shape,
                path_imgrec=os.path.join(args.gender_data_dir, 'val.rec'),
                task=args.task,
                shuffle=False,
                rand_mirror=False,
                mean=mean,
            )
            _metric = AccMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            val_iter.reset()
            for batch in val_iter:
                data = gluon.utils.split_and_load(batch.data[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0],
                                                   ctx_list=ctx,
                                                   batch_axis=0)
                outputs = []
                for x in data:
                    outputs.append(net(x)[1])
                val_metric.update(label, outputs)
            _value = val_metric.get_name_value()[0][1]
            if args.task == 'gender':
                acc = _value
            print('[%d][VACC]: %f' % (nbatch, _value))
        return acc

    total_time = 0
    num_epochs = 0
    best_acc = [0]
    highest_acc = [0.0, 0.0]  #lfw and target
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [100000, 140000, 160000]
        p = 512.0 / args.batch_size
        for l in range(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l] * p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    kv = mx.kv.create('device')
    #kv = mx.kv.create('local')
    #_rescale = 1.0/args.ctx_num
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd)
    if args.mode == 'gluon':
        trainer = gluon.Trainer(net.collect_params(),
                                'sgd', {
                                    'learning_rate': args.lr,
                                    'wd': args.wd,
                                    'momentum': args.mom,
                                    'multi_precision': True
                                },
                                kvstore=kv)
    else:
        _rescale = 1.0 / args.ctx_num
        opt = optimizer.SGD(learning_rate=args.lr,
                            momentum=args.mom,
                            wd=args.wd,
                            rescale_grad=_rescale)
        _cb = mx.callback.Speedometer(args.batch_size, 20)
        arg_params = None
        aux_params = None
        data = mx.sym.var('data')
        label = mx.sym.var('softmax_label')
        if args.margin_a > 0.0:
            fc7 = net(data, label)
        else:
            fc7 = net(data)
        #sym = mx.symbol.SoftmaxOutput(data=fc7, label = label, name='softmax', normalization='valid')
        ceop = gluon.loss.SoftmaxCrossEntropyLoss()
        loss = ceop(fc7, label)
        #loss = loss/args.per_batch_size
        loss = mx.sym.mean(loss)
        sym = mx.sym.Group([
            mx.symbol.BlockGrad(fc7),
            mx.symbol.MakeLoss(loss, name='softmax')
        ])

    def _batch_callback():
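        # Shared per-batch hook for the symbolic and gluon paths: step the
        # learning rate down at lr_steps, evaluate every args.verbose batches
        # (val_test for age/gender, ver_test otherwise), save the gluon params
        # and exported symbol when the score qualifies, and exit at max_steps.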
        mbatch = global_step[0]
        global_step[0] += 1
        for _lr in lr_steps:
            if mbatch == _lr:
                args.lr *= 0.1
                if args.mode == 'gluon':
                    trainer.set_learning_rate(args.lr)
                else:
                    opt.lr = args.lr
                print('lr change to', args.lr)
                break

        #_cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', args.lr, mbatch)

        if mbatch > 0 and mbatch % args.verbose == 0:
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if args.task == 'age' or args.task == 'gender':
                acc = val_test(mbatch)
                if acc >= highest_acc[-1]:
                    highest_acc[-1] = acc
                    is_highest = True
                    do_save = True
            else:
                acc_list = ver_test(mbatch)
                if len(acc_list) > 0:
                    lfw_score = acc_list[0]
                    if lfw_score > highest_acc[0]:
                        highest_acc[0] = lfw_score
                        if lfw_score >= 0.998:
                            do_save = True
                    if acc_list[-1] >= highest_acc[-1]:
                        highest_acc[-1] = acc_list[-1]
                        if lfw_score >= 0.99:
                            do_save = True
                            is_highest = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                #print('saving gluon params')
                fname = os.path.join(args.prefix, 'model-gluon.params')
                net.save_params(fname)
                fname = os.path.join(args.prefix, 'model')
                net.export(fname, msave)
                #arg, aux = model.get_params()
                #mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    def _batch_callback_sym(param):
        _cb(param)
        _batch_callback()

    if args.mode != 'gluon':
        model = mx.mod.Module(
            context=ctx,
            symbol=sym,
        )
        model.fit(train_iter,
                  begin_epoch=0,
                  num_epoch=args.end_epoch,
                  eval_data=None,
                  eval_metric=metric,
                  kvstore='device',
                  optimizer=opt,
                  initializer=initializer,
                  arg_params=arg_params,
                  aux_params=aux_params,
                  allow_missing=True,
                  batch_end_callback=_batch_callback_sym,
                  epoch_end_callback=None)
    else:
        loss_weight = 1.0
        if args.task == 'age':
            loss_weight = 1.0 / AGE
        #loss = gluon.loss.SoftmaxCrossEntropyLoss(weight = loss_weight)
        loss = nd.SoftmaxOutput
        #loss = gluon.loss.SoftmaxCrossEntropyLoss()
        while True:
            #trainer = update_learning_rate(opt.lr, trainer, epoch, opt.lr_factor, lr_steps)
            tic = time.time()
            train_iter.reset()
            metric.reset()
            btic = time.time()
            for i, batch in enumerate(train_iter):
                _batch_callback()
                #data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
                #label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
                data = gluon.utils.split_and_load(batch.data[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0],
                                                   ctx_list=ctx,
                                                   batch_axis=0)
                outputs = []
                Ls = []
                with ag.record():
                    for x, y in zip(data, label):
                        #print(y.asnumpy())
                        if args.task == '':
                            if args.margin_a > 0.0:
                                z = net(x, y)
                            else:
                                z = net(x)
                            #print(z[0].shape, z[1].shape)
                        else:
                            z = net(x)
                        if args.task == 'gender':
                            L = loss(z[1], y)
                            #L = L/args.per_batch_size
                            Ls.append(L)
                            outputs.append(z[1])
                        elif args.task == 'age':
                            for k in range(AGE):
                                _z = nd.slice_axis(z[2],
                                                   axis=1,
                                                   begin=k * 2,
                                                   end=k * 2 + 2)
                                _y = nd.slice_axis(y,
                                                   axis=1,
                                                   begin=k,
                                                   end=k + 1)
                                _y = nd.flatten(_y)
                                L = loss(_z, _y)
                                #L = L/args.per_batch_size
                                #L /= AGE
                                Ls.append(L)
                            outputs.append(z[2])
                        else:
                            L = loss(z, y)
                            #L = L/args.per_batch_size
                            Ls.append(L)
                            outputs.append(z)
                        # store the loss and do backward after we have done forward
                        # on all GPUs for better speed on multiple GPUs.
                    ag.backward(Ls)
                #trainer.step(batch.data[0].shape[0], ignore_stale_grad=True)
                #trainer.step(args.ctx_num)
                n = batch.data[0].shape[0]
                #print(n,n)
                trainer.step(n)
                metric.update(label, outputs)
                if i > 0 and i % 20 == 0:
                    name, acc = metric.get()
                    if len(name) == 2:
                        logger.info(
                            'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f'
                            % (num_epochs, i, args.batch_size /
                               (time.time() - btic), name[0], acc[0], name[1],
                               acc[1]))
                    else:
                        logger.info(
                            'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'
                            % (num_epochs, i, args.batch_size /
                               (time.time() - btic), name[0], acc[0]))
                    #metric.reset()
                btic = time.time()

            epoch_time = time.time() - tic

            # The first epoch is usually much slower than subsequent epochs,
            # so don't factor it into the average
            if num_epochs > 0:
                total_time = total_time + epoch_time

            #name, acc = metric.get()
            #logger.info('[Epoch %d] training: %s=%f, %s=%f'%(num_epochs, name[0], acc[0], name[1], acc[1]))
            logger.info('[Epoch %d] time cost: %f' % (num_epochs, epoch_time))
            num_epochs = num_epochs + 1
            #name, val_acc = test(ctx, val_data)
            #logger.info('[Epoch %d] validation: %s=%f, %s=%f'%(epoch, name[0], val_acc[0], name[1], val_acc[1]))

            # save model if meet requirements
            #save_checkpoint(epoch, val_acc[0], best_acc)
        if num_epochs > 1:
            print('Average epoch time: {}'.format(
                float(total_time) / (num_epochs - 1)))
Exemplo n.º 25
0
def train_net(args,
              ctx,
              pretrained,
              epoch,
              prefix,
              begin_epoch,
              end_epoch,
              lr=0.001,
              lr_step='5'):
    # setup config
    #init_config()
    #print(config)
    # setup multi-gpu

    input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx)

    # print config
    logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = [iset for iset in args.image_set.split('+')]
    roidbs = [
        load_gt_roidb(args.dataset,
                      image_set,
                      args.root_path,
                      args.dataset_path,
                      flip=not args.no_flip) for image_set in image_sets
    ]
    #roidb = merge_roidb(roidbs)
    #roidb = filter_roidb(roidb)
    roidb = roidbs[0]

    # load symbol
    #sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS)
    #feat_sym = sym.get_internals()['rpn_cls_score_output']
    #train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle,
    #                          ctx=ctx, work_load_list=args.work_load_list,
    #                          feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES,
    #                          anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING)

    # load and initialize params
    sym = None
    if len(pretrained) == 0:
        arg_params = {}
        aux_params = {}
    else:
        logger.info('loading %s,%d' % (pretrained, epoch))
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            pretrained, epoch)
        #arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        #for k in ['rpn_conv_3x3', 'rpn_cls_score', 'rpn_bbox_pred', 'cls_score', 'bbox_pred']:
        #  _k = k+"_weight"
        #  if _k in arg_shape_dict:
        #    v = 0.001 if _k.startswith('bbox_') else 0.01
        #    arg_params[_k] = mx.random.normal(0, v, shape=arg_shape_dict[_k])
        #    print('init %s with normal %.5f'%(_k,v))
        #  _k = k+"_bias"
        #  if _k in arg_shape_dict:
        #    arg_params[_k] = mx.nd.zeros(shape=arg_shape_dict[_k])
        #    print('init %s with zero'%(_k))

    sym = eval('get_' + args.network + '_train')(sym)
    feat_sym = []
    for stride in config.RPN_FEAT_STRIDE:
        feat_sym.append(
            sym.get_internals()['face_rpn_cls_score_stride%s_output' % stride])

    train_data = CropLoader(feat_sym,
                            roidb,
                            batch_size=input_batch_size,
                            shuffle=not args.no_shuffle,
                            ctx=ctx,
                            work_load_list=args.work_load_list)

    # infer max shape
    max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    #max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (1, roidb[0]['max_num_boxes'], 5)))
    logger.info('providing maximum shape %s %s' %
                (max_data_shape, max_label_shape))

    # infer shape
    data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
    arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict)
    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
    out_shape_dict = dict(zip(sym.list_outputs(), out_shape))
    aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
    logger.info('output shape %s' % pprint.pformat(out_shape_dict))

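    # Fill the weights of any 'upsampling' layers with a bilinear-interpolation kernel
    # (via the built-in Initializer._init_bilinear helper the loop below calls), so these
    # deconvolution layers behave like a plain bilinear resize before training.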
    for k, v in arg_shape_dict.items():
        if k.find('upsampling') >= 0:
            print('initializing upsampling_weight', k)
            arg_params[k] = mx.nd.zeros(shape=v)
            init = mx.init.Initializer()
            init._init_bilinear(k, arg_params[k])
            #print(args[k])

    # check parameter shapes
    #for k in sym.list_arguments():
    #    if k in data_shape_dict:
    #        continue
    #    assert k in arg_params, k + ' not initialized'
    #    assert arg_params[k].shape == arg_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape)
    #for k in sym.list_auxiliary_states():
    #    assert k in aux_params, k + ' not initialized'
    #    assert aux_params[k].shape == aux_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape)

    fixed_param_prefix = config.FIXED_PARAMS
    # create solver
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]
    fixed_param_names = get_fixed_params(sym, fixed_param_prefix)
    print('fixed', fixed_param_names, file=sys.stderr)
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=args.work_load_list,
                 fixed_param_names=fixed_param_names)

    # metric
    eval_metrics = mx.metric.CompositeEvalMetric()
    mid = 0
    for m in range(len(config.RPN_FEAT_STRIDE)):
        stride = config.RPN_FEAT_STRIDE[m]
        #mid = m*MSTEP
        _metric = metric.RPNAccMetric(pred_idx=mid,
                                      label_idx=mid + 1,
                                      name='RPNAcc_s%s' % stride)
        eval_metrics.add(_metric)
        mid += 2
        #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1)
        #eval_metrics.add(_metric)

        _metric = metric.RPNL1LossMetric(loss_idx=mid,
                                         weight_idx=mid + 1,
                                         name='RPNL1Loss_s%s' % stride)
        eval_metrics.add(_metric)
        mid += 2
        if config.FACE_LANDMARK:
            _metric = metric.RPNL1LossMetric(loss_idx=mid,
                                             weight_idx=mid + 1,
                                             name='RPNLandMarkL1Loss_s%s' %
                                             stride)
            eval_metrics.add(_metric)
            mid += 2
        if config.HEAD_BOX:
            _metric = metric.RPNAccMetric(pred_idx=mid,
                                          label_idx=mid + 1,
                                          name='RPNAcc_head_s%s' % stride)
            eval_metrics.add(_metric)
            mid += 2
            #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1)
            #eval_metrics.add(_metric)

            _metric = metric.RPNL1LossMetric(loss_idx=mid,
                                             weight_idx=mid + 1,
                                             name='RPNL1Loss_head_s%s' %
                                             stride)
            eval_metrics.add(_metric)
            mid += 2

    # callback
    #means = np.tile(np.array(config.TRAIN.BBOX_MEANS), config.NUM_CLASSES)
    #stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES)
    #epoch_end_callback = callback.do_checkpoint(prefix)
    epoch_end_callback = None
    # decide learning rate
    #base_lr = lr
    #lr_factor = 0.1
    #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))

    lr_epoch = [int(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr_iters = [
        int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff
    ]

    lr_steps = []
    if len(lr_iters) == 5:
        factors = [0.5, 0.5, 0.4, 0.1, 0.1]
        for i in range(5):
            lr_steps.append((lr_iters[i], factors[i]))
    elif len(lr_iters) == 8:  #warmup
        for li in lr_iters[0:5]:
            lr_steps.append((li, 1.5849))
        for li in lr_iters[5:]:
            lr_steps.append((li, 0.1))
    else:
        for li in lr_iters:
            lr_steps.append((li, 0.1))
    #lr_steps = [ (20,0.1), (40, 0.1) ] #XXX
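    # Each lr_steps entry is (iteration, multiplier). With 8 boundaries the first five act
    # as a warmup: 1.5849 is approximately 10**(1/5), so the learning rate grows by about
    # 10x over those steps before the subsequent 0.1 decays take over.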

    end_epoch = 10000
    logger.info('lr %f lr_epoch_diff %s lr_steps %s' %
                (lr, lr_epoch_diff, lr_steps))
    # optimizer
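    # rescale_grad=1/len(ctx) averages, rather than sums, the per-device gradients when
    # training on multiple GPUs.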
    opt = optimizer.SGD(learning_rate=lr,
                        momentum=0.9,
                        wd=0.0005,
                        rescale_grad=1.0 / len(ctx),
                        clip_gradient=None)
    initializer = mx.init.Xavier()
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style

    train_data = mx.io.PrefetchingIter(train_data)

    _cb = mx.callback.Speedometer(train_data.batch_size,
                                  frequent=args.frequent,
                                  auto_reset=False)
    global_step = [0]

    def save_model(epoch):
        arg, aux = mod.get_params()
        all_layers = mod.symbol.get_internals()
        outs = []
        for stride in config.RPN_FEAT_STRIDE:
            num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS']
            _name = 'face_rpn_cls_score_stride%d_output' % stride
            rpn_cls_score = all_layers[_name]

            # prepare rpn data
            rpn_cls_score_reshape = mx.symbol.Reshape(
                data=rpn_cls_score,
                shape=(0, 2, -1, 0),
                name="face_rpn_cls_score_reshape_stride%d" % stride)

            rpn_cls_prob = mx.symbol.SoftmaxActivation(
                data=rpn_cls_score_reshape,
                mode="channel",
                name="face_rpn_cls_prob_stride%d" % stride)
            rpn_cls_prob_reshape = mx.symbol.Reshape(
                data=rpn_cls_prob,
                shape=(0, 2 * num_anchors, -1, 0),
                name='face_rpn_cls_prob_reshape_stride%d' % stride)
            _name = 'face_rpn_bbox_pred_stride%d_output' % stride
            rpn_bbox_pred = all_layers[_name]
            outs.append(rpn_cls_prob_reshape)
            outs.append(rpn_bbox_pred)
            if config.FACE_LANDMARK:
                _name = 'face_rpn_landmark_pred_stride%d_output' % stride
                rpn_landmark_pred = all_layers[_name]
                outs.append(rpn_landmark_pred)
        _sym = mx.sym.Group(outs)
        mx.model.save_checkpoint(prefix, epoch, _sym, arg, aux)

    def _batch_callback(param):
        #global global_step
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for step in lr_steps:
            if mbatch == step[0]:
                opt.lr *= step[1]
                print('lr change to',
                      opt.lr,
                      ' in batch',
                      mbatch,
                      file=sys.stderr)
                break

        if mbatch == lr_steps[-1][0]:
            print('saving final checkpoint', mbatch, file=sys.stderr)
            save_model(0)
            #arg, aux = mod.get_params()
            #mx.model.save_checkpoint(prefix, 99, mod.symbol, arg, aux)
            sys.exit(0)

    if args.checkpoint is not None:
        _, arg_params, aux_params = mx.model.load_checkpoint(
            args.checkpoint, 0)

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=checkpoint_callback('model/testR50'),
            batch_end_callback=_batch_callback,
            kvstore=args.kvstore,
            optimizer=opt,
            initializer=initializer,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
Exemplo n.º 26
0
def train_net(args):

    ctx = []
    # cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()  # e.g. '0': use the first GPU
    cvd = ''

    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))  # append each GPU context to ctx, e.g. ctx = [gpu(0)]

    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))  # training on GPU

    prefix = args.prefix  #../model-r100
    prefix_dir = os.path.dirname(prefix)  #..

    if not os.path.exists(prefix_dir):  # not executed if the directory already exists
        os.makedirs(prefix_dir)

    end_epoch = args.end_epoch  #100 000

    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])

    print('num_layers', args.num_layers)  #100

    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num  #10

    args.rescale_threshold = 0
    args.image_channel = 3

    os.environ['BETA'] = str(args.beta)  # 1000.0; see Eq. (6) of the ArcFace paper, the lambda for annealed training

    data_dir_list = args.data_dir.split(',')
    print('data_dir_list: ', data_dir_list)

    data_dir = data_dir_list[0]

    # load dataset properties
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    print('num_classes: ', args.num_classes)

    # path_imgrec = os.path.join(data_dir, "train.rec")
    path_imgrec = os.path.join(data_dir, "all.rec")

    if args.loss_type == 1 and args.num_classes > 20000:  #sphereface
        args.beta_freeze = 5000
        args.gamma = 0.06

    print('***Called with argument:', args)

    data_shape = (args.image_channel, image_size[0], image_size[1]
                  )  #(3L,112L,112L)

    mean = None

    begin_epoch = 0
    base_lr = args.lr  #0.1
    base_wd = args.wd  #weight decay = 0.0005
    base_mom = args.mom  # momentum: 0.9

    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
        vec = args.pretrained.split(
            ',')  #['../models/model-r50-am-lfw/model', '0000']
        print('***loading', vec)

        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
        # print(sym[1])
        # mx.viz.plot_network(sym[1]).view()  # visualize the network
        # sys.exit()
    if args.network[0] == 's':  # spherenet
        data_shape_dict = {'data': (args.per_batch_size, ) + data_shape}
        spherenet.init_weights(sym, data_shape_dict, args.num_layers)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )

    # print(args.batch_size)
    # print(data_shape)
    # print(path_imgrec)
    # print(args.rand_mirror)
    # print(mean)
    # print(args.cutoff)
    # sys.exit()

    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,  #(3L,112L,112L)
        path_imgrec=path_imgrec,  # train.rec
        shuffle=True,
        rand_mirror=args.rand_mirror,  # 1
        mean=mean,
        cutoff=args.cutoff,  # 0
    )

    if args.loss_type < 10:
        _metric = AccMetric()
    else:
        _metric = LossValueMetric()
    # create the evaluation metric
    eval_metrics = [mx.metric.create(_metric)]

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style  mobilefacenet
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)  # for multi-GPU training, rescale_grad averages the gradient across devices
    som = 64
    # callback that periodically reports training speed and accuracy
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [30000, 40000, 50000]
        if args.loss_type >= 1 and args.loss_type <= 7:
            lr_steps = [100000, 140000, 160000]
        # single GPU: drop the factor p = 512.0/args.batch_size that the multi-GPU version
        # used to rescale the step counts to roughly the same number of training samples
        # p = 512.0/args.batch_size
        for l in range(len(lr_steps)):
            # lr_steps[l] = int(lr_steps[l]*p)
            lr_steps[l] = int(lr_steps[l])
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step

        mbatch = global_step[0]
        global_step[0] += 1
        for _lr in lr_steps:
            if mbatch == args.beta_freeze + _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        print('mbatch=', mbatch)
        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            print(acc_list)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if len(acc_list) > 0:
                lfw_score = acc_list[0]
                if lfw_score > highest_acc[0]:
                    highest_acc[0] = lfw_score
                    # lowered the validation threshold to test the save criterion
                    # if lfw_score>=0.998:
                    if lfw_score >= 0.99:
                        do_save = True
                if acc_list[-1] >= highest_acc[-1]:
                    highest_acc[-1] = acc_list[-1]
                    # if lfw_score>=0.99:  # save the model when the LFW accuracy exceeds 0.99
                    if lfw_score >= 0.99:  # save the model when the LFW accuracy exceeds 0.99
                        do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
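        # Anneal the margin-softmax lambda: keep it at args.beta for the first beta_freeze
        # batches, then decay it as beta * (1 + gamma*t)^(-power), floored at beta_min, and
        # export it through the BETA environment variable for the loss symbol to pick up.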
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
        #print('beta', _beta)
        os.environ['BETA'] = str(_beta)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
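    # wrap the iterator so batches are prefetched in a background thread while the GPU trains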
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(
        train_data=train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
Exemplo n.º 27
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in xrange(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    os.environ['BETA'] = str(args.beta)
    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    if args.loss_type == 1 and args.num_classes > 20000:
        args.beta_freeze = 5000
        args.gamma = 0.06

    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
        # if args.finetune:
        #     def get_fine_tune_model(symbol, arg_params, num_classes, layer_name='flatten0'):
        #         """
        #         symbol: the pretrained network symbol
        #         arg_params: the argument parameters of the pretrained model
        #         num_classes: the number of classes for the fine-tune datasets
        #         layer_name: the layer name before the last fully-connected layer
        #         """
        #         all_layers = symbol.get_internals()
        #         # print(all_layers);exit(0)
        #         for k in arg_params:
        #             if k.startswith('fc'):
        #               print(k)
        #         exit(0)
        #         net = all_layers[layer_name + '_output']
        #         net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes, name='fc1')
        #         net = mx.symbol.SoftmaxOutput(data=net, name='softmax')
        #         new_args = dict({k: arg_params[k] for k in arg_params if 'fc1' not in k})
        #         return (net, new_args)
        #     sym, arg_params = get_fine_tune_model(sym, arg_params, args.num_classes)

    if args.network[0] == 's':
        data_shape_dict = {'data': (args.per_batch_size, ) + data_shape}
        spherenet.init_weights(sym, data_shape_dict, args.num_layers)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
    )
    val_dataiter = None

    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=args.rand_mirror,
        mean=mean,
        cutoff=args.cutoff,
    )

    if args.loss_type < 10:
        _metric = AccMetric()
    else:
        _metric = LossValueMetric()
    eval_metrics = [mx.metric.create(_metric)]

    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)  #resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="in",
                                     magnitude=2)  #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr,
                        momentum=base_mom,
                        wd=base_wd,
                        rescale_grad=_rescale)
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in xrange(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [40000, 60000, 80000]
        if args.loss_type >= 1 and args.loss_type <= 7:
            lr_steps = [100000, 140000, 160000]
        p = 512.0 / args.batch_size
        for l in xrange(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l] * p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == args.beta_freeze + _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break

        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)

        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if len(acc_list) > 0:
                lfw_score = acc_list[0]
                if lfw_score > highest_acc[0]:
                    highest_acc[0] = lfw_score
                    if lfw_score >= 0.998:
                        do_save = True
                if acc_list[-1] >= highest_acc[-1]:
                    highest_acc[-1] = acc_list[-1]
                    if lfw_score >= 0.99:
                        do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
        #print('beta', _beta)
        os.environ['BETA'] = str(_beta)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)

    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params   = optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)
Exemplo n.º 28
0
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
      for i in xrange(len(cvd.split(','))):
        ctx.append(mx.gpu(i))
    if len(ctx)==0:
      ctx = [mx.cpu()]
      print('use cpu')
    else:
      print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
      os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size==0:
      args.per_batch_size = 128
    args.batch_size = args.per_batch_size*args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list)==1
    data_dir = data_dir_list[0]
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert(args.num_classes>0)
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args)
    data_shape = (args.image_channel,image_size[0],image_size[1])
    mean = None

    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained)==0:
      arg_params = None
      aux_params = None
      sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
      vec = args.pretrained.split(',')
      print('loading', vec)
      _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
      sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    if args.network[0]=='s':
      data_shape_dict = {'data' : (args.per_batch_size,)+data_shape}
      spherenet.init_weights(sym, data_shape_dict, args.num_layers)

    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(
        context       = ctx,
        symbol        = sym,
    )

    train_dataiter = FaceImageIter(
        batch_size           = args.batch_size,
        data_shape           = data_shape,
        path_imgrec          = path_imgrec,
        shuffle              = True,
        rand_mirror          = args.rand_mirror,
        mean                 = mean,
        cutoff               = args.cutoff,
    )
    val_rec = os.path.join(data_dir, "val.rec")
    val_iter = None
    if os.path.exists(val_rec):
        val_iter = FaceImageIter(
            batch_size           = args.batch_size,
            data_shape           = data_shape,
            path_imgrec          = val_rec,
            shuffle              = False,
            rand_mirror          = False,
            mean                 = mean,
        )

    if args.loss_type<10:
      _metric = AccMetric()
    else:
      _metric = LossValueMetric()
    eval_metrics = []
    if USE_FR:
      _metric = AccMetric(pred_idx=1)
      eval_metrics.append(_metric)
      if USE_GENDER:
          _metric = AccMetric(pred_idx=2, name='gender')
          eval_metrics.append(_metric)
    elif USE_GENDER:
      _metric = AccMetric(pred_idx=1, name='gender')
      eval_metrics.append(_metric)
    if USE_AGE:
      _metric = MAEMetric()
      eval_metrics.append(_metric)
      _metric = CUMMetric()
      eval_metrics.append(_metric)

    if args.network[0]=='r':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    elif args.network[0]=='i' or args.network[0]=='x':
      initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) #inception
    else:
      initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    _rescale = 1.0/args.ctx_num
    opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)

    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
      path = os.path.join(data_dir,name+".bin")
      if os.path.exists(path):
        data_set = verification.load_bin(path, image_size)
        ver_list.append(data_set)
        ver_name_list.append(name)
        print('ver', name)



    def ver_test(nbatch):
      results = []
      for i in xrange(len(ver_list)):
        acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, None)
        print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
        #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
        print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
        results.append(acc2)
      return results

    def val_test():
      _metric = MAEMetric()
      val_metric = mx.metric.create(_metric)
      val_metric.reset()
      _metric2 = CUMMetric()
      val_metric2 = mx.metric.create(_metric2)
      val_metric2.reset()
      val_iter.reset()
      for i, eval_batch in enumerate(val_iter):
        model.forward(eval_batch, is_train=False)
        model.update_metric(val_metric, eval_batch.label)
        model.update_metric(val_metric2, eval_batch.label)
      _value = val_metric.get_name_value()[0][1]
      print('MAE: %f'%(_value))
      _value = val_metric2.get_name_value()[0][1]
      print('CUM: %f'%(_value))


    highest_acc = [0.0, 0.0]  #lfw and target
    #for i in xrange(len(ver_list)):
    #  highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps)==0:
      lr_steps = [40000, 60000, 80000]
      if args.loss_type>=1 and args.loss_type<=7:
        lr_steps = [100000, 140000, 160000]
      p = 512.0/args.batch_size
      for l in xrange(len(lr_steps)):
        lr_steps[l] = int(lr_steps[l]*p)
    else:
      lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)
    def _batch_callback(param):
      #global global_step
      global_step[0]+=1
      mbatch = global_step[0]
      for _lr in lr_steps:
        if mbatch==_lr:
          opt.lr *= 0.1
          print('lr change to', opt.lr)
          break

      _cb(param)
      if mbatch%1000==0:
        print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)

      if mbatch>=0 and mbatch%args.verbose==0:
        if val_iter is not None:
            val_test()
        acc_list = ver_test(mbatch)
        save_step[0]+=1
        msave = save_step[0]
        do_save = False
        if len(acc_list)>0:
          lfw_score = acc_list[0]
          if lfw_score>highest_acc[0]:
            highest_acc[0] = lfw_score
            if lfw_score>=0.998:
              do_save = True
          if acc_list[-1]>=highest_acc[-1]:
            highest_acc[-1] = acc_list[-1]
            if lfw_score>=0.99:
              do_save = True
        if args.ckpt==0:
          do_save = False
        elif args.ckpt>1:
          do_save = True
        if do_save:
          print('saving', msave)
          arg, aux = model.get_params()
          mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
        print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
      if args.max_steps>0 and mbatch>args.max_steps:
        sys.exit(0)

    epoch_cb = None

    model.fit(train_dataiter,
        begin_epoch        = begin_epoch,
        num_epoch          = end_epoch,
        eval_data          = None,
        eval_metric        = eval_metrics,
        kvstore            = 'device',
        optimizer          = opt,
        #optimizer_params   = optimizer_params,
        initializer        = initializer,
        arg_params         = arg_params,
        aux_params         = aux_params,
        allow_missing      = True,
        batch_end_callback = _batch_callback,
        epoch_end_callback = epoch_cb )
Exemplo n.º 29
0
def train_net(args):
    ctx = []
    # configure training on GPU or CPU
    gpu_ids = args.gpu_ids.split(',')
    for gpu_id in gpu_ids:
        ctx.append(mx.gpu(int(gpu_id)))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix_dir = os.path.dirname(args.prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    args.batch_size = args.batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3

    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]

    data_shape = [int(x) for x in args.data_shape.split(',')]
    assert len(data_shape) == 3
    assert data_shape[1] == data_shape[2]
    args.image_h = data_shape[1]
    args.image_w = data_shape[2]
    print('data_shape', data_shape)
    path_imgrec = os.path.join(data_dir, "train.rec")
    path_imgrec_val = os.path.join(data_dir, "val.rec")

    print('Called with argument:', args)
    data_shape = tuple(data_shape)
    mean = None

    begin_epoch = 0
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
        # load the pretrained model
        vec = args.pretrained.split(',')
        print('loading', vec)
        begin_epoch = int(vec[1])
        _, arg_params, aux_params = mx.model.load_checkpoint(
            vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)

    model = mx.mod.Module(context=ctx, symbol=sym)

    train_dataiter = FaceImageIter(batch_size=args.batch_size,
                                   data_shape=data_shape,
                                   path_imgrec=path_imgrec,
                                   shuffle=True,
                                   rand_mirror=args.rand_mirror,
                                   mean=mean,
                                   cutoff=args.cutoff,
                                   color_jittering=args.color)
    val_dataiter = FaceImageIter(batch_size=args.batch_size,
                                 data_shape=data_shape,
                                 path_imgrec=path_imgrec_val,
                                 shuffle=False,
                                 rand_mirror=False,
                                 mean=mean)

    metric = mx.metric.CompositeEvalMetric(
        [AccMetric(), MAEMetric(), CUMMetric()])

    if args.network[0] == 'r':
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    _rescale = 1.0 / args.ctx_num
    opt = optimizer.SGD(learning_rate=args.lr,
                        momentum=0.9,
                        wd=0.0005,
                        rescale_grad=_rescale)
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
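    # unlike the batch-count schedules above, lr_steps here are epoch indices; the decay is
    # applied in the epoch-end callback below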

    def _batch_callback(param):
        _cb(param)

    # epoch-end callback: decay the learning rate and periodically save the model
    def _epoch_callback(epoch, symbol, arg_params, aux_params):
        epoch = epoch + 1
        for _lr in lr_steps:
            if epoch == _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break
        # save the model
        if epoch % 10 == 0 or epoch == args.end_epoch:
            print('lr-epoch:', opt.lr, epoch)
            arg, aux = model.get_params()
            all_layers = model.symbol.get_internals()
            _sym = all_layers['fc1_output']
            mx.model.save_checkpoint(args.prefix, epoch, _sym, arg, aux)

    train_dataiter = mx.io.PrefetchingIter(train_dataiter)
    print('Start training...')

    model.fit(train_dataiter,
              begin_epoch=begin_epoch,
              num_epoch=args.end_epoch,
              eval_data=val_dataiter,
              eval_metric=metric,
              kvstore='device',
              optimizer=opt,
              initializer=initializer,
              arg_params=arg_params,
              aux_params=aux_params,
              allow_missing=True,
              batch_end_callback=_batch_callback,
              epoch_end_callback=_epoch_callback)
Exemplo n.º 30
0
def train_net(args):
    ## =================== parse context ==========================
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx)==0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))


    ## ==================== get model save prefix and log ============
    if len(args.extra_model_name)==0:
        prefix = os.path.join(args.models_root, '%s-%s-%s'%(args.network, args.loss, args.dataset), 'model')
    else:
        prefix = os.path.join(args.models_root, '%s-%s-%s-%s'%(args.network, args.loss, args.dataset, args.extra_model_name), 'model')

    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    filehandler = logging.FileHandler("{}.log".format(prefix))
    streamhandler = logging.StreamHandler()
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    ## ================ parse batch size and class info ======================
    args.ctx_num = len(ctx)
    if args.per_batch_size==0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size*args.ctx_num
    
    global_num_ctx = config.num_workers * args.ctx_num
    if config.num_classes % global_num_ctx == 0:
        args.ctx_num_classes = config.num_classes//global_num_ctx
    else:
        args.ctx_num_classes = config.num_classes//global_num_ctx+1

    args.local_num_classes = args.ctx_num_classes * args.ctx_num
    args.local_class_start = args.local_num_classes * args.worker_id
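    # ctx_num_classes / local_num_classes / local_class_start split the num_classes outputs
    # across devices and workers; the ParallModule below presumably uses this partition for
    # a model-parallel classification layer.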

    logger.info("Train model with argument: {}\nconfig : {}".format(args, config))

    train_dataiter, val_dataiter = get_data_iter(config, args.batch_size)

    ## =============== get train info ============================
    image_size = config.image_shape[0:2]
    if len(args.pretrained) == 0: # train from scratch 
        esym = get_symbol_embedding(config)
        asym = functools.partial(get_symbol_arcface, config=config)
    else:  # load a trained model to continue training
        assert False

    if config.count_flops:
        all_layers = esym.get_internals()
        _sym = all_layers['fc1_output']
        FLOPs = flops_counter.count_flops(_sym, data=(1,3,image_size[0],image_size[1]))
        _str = flops_counter.flops_str(FLOPs)
        print('Network FLOPs: %s'%_str)
        logging.info("Network FLOPs : %s" % _str)

    if config.num_workers==1:
        #from parall_loss_module import ParallLossModule
        from parall_module_local_v1 import ParallModule
    else:  # distributed parallel training loop
        assert False


    model = ParallModule(
        context       = ctx,
        symbol        = esym,
        data_names    = ['data'],
        label_names    = ['softmax_label'],
        asymbol       = asym,
        args = args,
        logger=logger,
    )
    

    ## ============ get optimizer =====================================
    if config.net_name=='fresnet' or config.net_name=='fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)

    opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=1.0/args.batch_size)
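    # note: gradients here are rescaled by the full global batch size (1/args.batch_size),
    # not just by the device count as in the earlier examples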
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)

    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(config.dataset_path, name+".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  #lfw and target
    

    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    ## =============== batch end callback definition ===================================
    def _batch_callback(param):
        #global global_step
        global_step[0]+=1
        mbatch = global_step[0]
        for step in lr_steps:
            if mbatch==step:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                logger.info('lr change to {}'.format(opt.lr))
                break

        _cb(param)
        if mbatch%1000==0:
            print('lr-batch-epoch:',opt.lr,param.nbatch,param.epoch)
            logger.info('lr-batch-epoch: {} {} {}'.format(opt.lr, param.nbatch, param.epoch))

        if mbatch>=0 and mbatch%args.verbose==0:
            acc_list = ver_test(mbatch)
            save_step[0]+=1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list)>0:
                #lfw_score = acc_list[0]
                #if lfw_score>highest_acc[0]:
                #  highest_acc[0] = lfw_score
                #  if lfw_score>=0.998:
                #    do_save = True
                score = sum(acc_list)
                if acc_list[-1]>=highest_acc[-1]:
                    if acc_list[-1]>highest_acc[-1]:
                        is_highest = True
                    else:
                        if score>=highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                    
            if is_highest:
                do_save = True
            if args.ckpt==0:
                do_save = False
            elif args.ckpt==2:
                do_save = True
            elif args.ckpt==3:
                msave = 1

            if do_save:
                print('saving', msave)
                logger.info('saving {}'.format(msave))

                arg, aux = model.get_export_params()
                all_layers = model.symbol.get_internals()
                _sym = all_layers['fc1_output']
                mx.model.save_checkpoint(prefix, msave, _sym, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
            logger.info('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))

        if config.max_steps>0 and mbatch>config.max_steps:
            sys.exit(0)

    model.fit(train_dataiter,
        begin_epoch        = 0,
        num_epoch          = 999999,
        eval_data          = val_dataiter,
        kvstore            = args.kvstore,
        optimizer          = opt,
        initializer        = initializer,
        arg_params         = None,
        aux_params         = None,
        allow_missing      = True,
        batch_end_callback = _batch_callback)