def train(x, lr, iters):
    tic = time()
    v = x.zeros_like()     # first moment (mean)
    sqr = x.zeros_like()   # second moment (variance)
    optim = optimizer.Adam(learning_rate=lr)
    for idx in range(iters):
        with autograd.record():
            loss = go(x)
        loss.backward()
        # Use a fixed parameter index (0) so Adam's per-parameter update count
        # and bias correction accumulate correctly; MXNet's Adam expects the
        # state in (mean, variance) order.
        optim.update(0, x, x.grad, (v, sqr))
        nd.waitall()  # TODO: this synchronization is a costly operation
        sys.stdout.write('\r training..........%s%%' % (100 * idx // iters + 1))
        sys.stdout.flush()
    print(" all_train_time:", time() - tic)
    return x

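# --- Added example (a minimal sketch, not from the original script): driving
# optimizer.Adam by hand is safer via create_state, which returns the
# (mean, variance) pair in the order Adam.update expects, instead of
# hand-allocating the two buffers as train() does above.
import mxnet as mx
from mxnet import nd, autograd, optimizer

x = nd.random.normal(shape=(4,))
x.attach_grad()
opt = optimizer.Adam(learning_rate=0.1)
state = opt.create_state(0, x)       # (mean, variance), keyed by parameter index 0

for _ in range(10):
    with autograd.record():
        loss = (x * x).sum()         # stand-in objective
    loss.backward()
    opt.update(0, x, x.grad, state)  # fixed index per parameter, not a timestep
print(x)
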
def build_optimizer(type, lr, kerasDefaults):
    if type == 'sgd':
        if kerasDefaults['nesterov_sgd']:
            return optimizer.NAG(learning_rate=lr,
                                 momentum=kerasDefaults['momentum_sgd'],
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)
        else:
            return optimizer.SGD(learning_rate=lr,
                                 momentum=kerasDefaults['momentum_sgd'],
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)

    elif type == 'rmsprop':
        return optimizer.RMSProp(learning_rate=lr,
                                 gamma1=kerasDefaults['rho'],
                                 epsilon=kerasDefaults['epsilon'],
                                 centered=False,
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'],
                                 lr_scheduler=None)

    elif type == 'adagrad':
        # MXNet 1.x names AdaGrad's epsilon argument `eps`
        return optimizer.AdaGrad(learning_rate=lr,
                                 eps=kerasDefaults['epsilon'])
                                 #rescale_grad=kerasDefaults['clipnorm'],
                                 #clip_gradient=kerasDefaults['clipvalue'])

    elif type == 'adadelta':
        return optimizer.AdaDelta(epsilon=kerasDefaults['epsilon'],
                                  rho=kerasDefaults['rho'])
                                  #rescale_grad=kerasDefaults['clipnorm'],
                                  #clip_gradient=kerasDefaults['clipvalue'])

    elif type == 'adam':
        # MXNet uses beta1/beta2; Keras-style beta_1/beta_2 kwargs would raise a TypeError
        return optimizer.Adam(learning_rate=lr,
                              beta1=kerasDefaults['beta_1'],
                              beta2=kerasDefaults['beta_2'],
                              epsilon=kerasDefaults['epsilon'])
                              #rescale_grad=kerasDefaults['clipnorm'],
                              #clip_gradient=kerasDefaults['clipvalue'])

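# --- Added usage sketch (hypothetical values, not from the original source):
# the kerasDefaults dict that build_optimizer expects, with Keras-style
# defaults for each key the function reads.
keras_defaults = {
    'nesterov_sgd': False,
    'momentum_sgd': 0.0,
    'rho': 0.9,
    'epsilon': 1e-7,
    'beta_1': 0.9,
    'beta_2': 0.999,
    'clipnorm': None,   # only read by the commented-out rescale_grad lines
    'clipvalue': None,  # only read by the commented-out clip_gradient lines
}
opt = build_optimizer('adam', 1e-3, keras_defaults)
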
def cifar_mxnet_objective(config):
    #net = MXNet_AlexNet(config)
    # NOTE: CIFAR-100 has 100 classes; classes=1000 is kept from the original
    # source but looks like an oversight.
    net = gluoncv.model_zoo.get_model('alexnet', classes=1000, pretrained=False)
    gpus = mx.test_utils.list_gpus()
    ctx = [mx.gpu(0)] if gpus else [mx.cpu(0)]
    net.initialize(mx.init.Uniform(scale=1), ctx=ctx)
    optim = optimizer.Adam(learning_rate=config['learning_rate'])
    trainer = gluon.Trainer(net.collect_params(), optim)
    # NOTE: shuffle=False on the training set is unusual; shuffle=True is the norm
    train_data = gluon.data.DataLoader(
        vision.datasets.CIFAR100(train=True, transform=transform),
        batch_size=config['batch_size'], shuffle=False)
    val_data = gluon.data.DataLoader(
        vision.datasets.CIFAR100(train=False, transform=transform),
        batch_size=config['batch_size'], shuffle=False)
    # Use Accuracy as the evaluation metric.
    #metric = mx.metric.Accuracy()
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    for epoch in tqdm(range(config['epochs'])):
        for data, label in train_data:
            # forward + backward
            with ag.record():
                output = net(data)
                loss = criterion(output, label)
            loss.backward()
            # update parameters
            trainer.step(config['batch_size'])
        # Evaluate on validation data
        name, val_acc = test(ctx, val_data, net)
    return val_acc, net

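# --- Added sketch of the two helpers the objective assumes but does not define
# (assumptions: HWC uint8 CIFAR input, accuracy as the validation metric; the
# real `transform` and `test` may differ).
import mxnet as mx
from mxnet import nd

def transform(data, label):
    # HWC uint8 -> CHW float32 in [0, 1]
    return nd.transpose(data.astype('float32') / 255.0, (2, 0, 1)), label

def test(ctx, val_data, net):
    metric = mx.metric.Accuracy()
    for data, label in val_data:
        data = data.as_in_context(ctx[0])
        label = label.as_in_context(ctx[0])
        metric.update([label], [net(data)])
    return metric.get()  # (name, value), matching `name, val_acc = test(...)`
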
def main(args):
    _seed = 727
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)
    ctx = []
    #cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    #if len(cvd) > 0:
    #    for i in range(len(cvd.split(','))):
    #        ctx.append(mx.gpu(i))
    #if len(ctx) == 0:
    #    ctx = [mx.cpu()]
    #    print('use cpu')
    #else:
    #    print('gpu num:', len(ctx))
    ctx = [mx.cpu()]
    args.ctx_num = len(ctx)
    args.batch_size = args.per_batch_size * args.ctx_num
    config.per_batch_size = args.per_batch_size
    print('Call with', args, config)
    train_iter = FaceSegIter(
        path_imgrec=os.path.join(config.dataset_path, 'train.rec'),
        batch_size=args.batch_size,
        per_batch_size=args.per_batch_size,
        aug_level=1,
        exf=args.exf,
        args=args,
    )
    data_shape = train_iter.get_data_shape()
    #label_shape = train_iter.get_label_shape()
    sym = sym_heatmap.get_symbol(num_classes=config.num_classes)
    if len(args.pretrained) == 0:
        #data_shape_dict = {'data': (args.per_batch_size,) + data_shape,
        #                   'softmax_label': (args.per_batch_size,) + label_shape}
        data_shape_dict = train_iter.get_shape_dict()
        arg_params, aux_params = sym_heatmap.init_weights(sym, data_shape_dict)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
        #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=train_iter.get_label_names(),
    )
    #lr = 1.0e-3
    #lr = 2.5e-4
    _rescale_grad = 1.0 / args.ctx_num
    #_rescale_grad = 1.0 / args.batch_size
    #lr = args.lr
    #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
    if args.optimizer == 'onadam':
        opt = ONadam(learning_rate=args.lr, wd=args.wd,
                     rescale_grad=_rescale_grad, clip_gradient=5.0)
    elif args.optimizer == 'nadam':
        opt = optimizer.Nadam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    elif args.optimizer == 'rmsprop':
        opt = optimizer.RMSProp(learning_rate=args.lr, rescale_grad=_rescale_grad)
    elif args.optimizer == 'adam':
        opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    else:
        opt = optimizer.SGD(learning_rate=args.lr, momentum=0.9,
                            wd=args.wd, rescale_grad=_rescale_grad)
    initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    _metric = LossValueMetric()
    #_metric = NMEMetric()
    #_metric2 = AccMetric()
    #eval_metrics = [_metric, _metric2]
    eval_metrics = [_metric]
    lr_steps = [int(x) for x in args.lr_step.split(',')]
    print('lr-steps', lr_steps)
    global_step = [0]

    def val_test():
        all_layers = sym.get_internals()
        vsym = all_layers['heatmap_output']
        vmodel = mx.mod.Module(symbol=vsym, context=ctx, label_names=None)
        #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))],
        #           label_shapes=[('softmax_label', (args.batch_size,))])
        vmodel.bind(data_shapes=[('data', (args.batch_size,) + data_shape)])
        arg_params, aux_params = model.get_params()
        vmodel.set_params(arg_params, aux_params)
        for target in config.val_targets:
            _file = os.path.join(config.dataset_path, '%s.rec' % target)
            if not os.path.exists(_file):
                continue
            val_iter = FaceSegIter(
                path_imgrec=_file,
                batch_size=args.batch_size,
                #batch_size=4,
                aug_level=0,
                args=args,
            )
            _metric = NMEMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            val_iter.reset()
            for i, eval_batch in enumerate(val_iter):
                #print(eval_batch.data[0].shape, eval_batch.label[0].shape)
                batch_data = mx.io.DataBatch(eval_batch.data)
                # NOTE: the loop drives the training module directly; vmodel is
                # bound and given the trained weights but is otherwise unused
                # (vmodel.forward was likely intended).
                model.forward(batch_data, is_train=False)
                model.update_metric(val_metric, eval_batch.label)
            nme_value = val_metric.get_name_value()[0][1]
            print('[%d][%s]NME: %f' % (global_step[0], target, nme_value))

    def _batch_callback(param):
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.2
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            val_test()
            if args.ckpt == 1:
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, aux)
        if mbatch == lr_steps[-1]:
            if args.ckpt == 2:
                #msave = mbatch // args.verbose
                msave = 1
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, aux)
            sys.exit(0)

    train_iter = mx.io.PrefetchingIter(train_iter)
    model.fit(
        train_iter,
        begin_epoch=0,
        num_epoch=9999,
        #eval_data=val_iter,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=None,
    )

def main(args):
    _seed = 727
    random.seed(_seed)
    np.random.seed(_seed)
    mx.random.seed(_seed)
    ctx = []
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    #ctx = [mx.gpu(0)]
    args.ctx_num = len(ctx)
    args.batch_size = args.per_batch_size * args.ctx_num
    config.per_batch_size = args.per_batch_size
    print('Call with', args, config)
    train_iter = FaceSegIter(
        path_imgrec=os.path.join(config.dataset_path, 'train.rec'),
        batch_size=args.batch_size,
        per_batch_size=args.per_batch_size,
        aug_level=1,
        exf=args.exf,
        args=args,
    )
    data_shape, data_size = train_iter.get_data_shape()
    #label_shape = train_iter.get_label_shape()
    sym = eval(config.network).get_symbol(num_classes=config.num_classes)
    if len(args.pretrained) == 0:
        #data_shape_dict = {'data': (args.per_batch_size,) + data_shape,
        #                   'softmax_label': (args.per_batch_size,) + label_shape}
        data_shape_dict = train_iter.get_shape_dict()
        arg_params, aux_params = init_weights(sym, data_shape_dict)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
        #sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=train_iter.get_label_names(),
    )
    #lr = 1.0e-3
    #lr = 2.5e-4
    _rescale_grad = 1.0 / args.ctx_num
    #_rescale_grad = 1.0 / args.batch_size
    #lr = args.lr
    #opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
    if args.optimizer == 'onadam':
        opt = ONadam(learning_rate=args.lr, wd=args.wd,
                     rescale_grad=_rescale_grad, clip_gradient=5.0)
    elif args.optimizer == 'nadam':
        opt = optimizer.Nadam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    elif args.optimizer == 'rmsprop':
        opt = optimizer.RMSProp(learning_rate=args.lr, rescale_grad=_rescale_grad)
    elif args.optimizer == 'adam':
        opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
    else:
        opt = optimizer.SGD(learning_rate=args.lr, momentum=0.9,
                            wd=args.wd, rescale_grad=_rescale_grad)
    initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    _metric = LossValueMetric()
    #_metric = NMEMetric()
    #_metric2 = AccMetric()
    #eval_metrics = [_metric, _metric2]
    eval_metrics = [_metric]
    lr_epoch_steps = [int(x) for x in args.lr_epoch_step.split(',')]
    print('lr-epoch-steps', lr_epoch_steps)
    global_step = [0]
    # NME is an error, so lower is better; start the running bests at 1.0
    highest_acc = [1.0, 1.0]

    def _batch_callback(param):
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        mepoch = mbatch * args.batch_size // data_size
        pre = mbatch * args.batch_size % data_size
        is_highest = False
        for _lr in lr_epoch_steps[0:-1]:
            if mepoch == _lr and pre < args.batch_size:
                opt.lr *= 0.2
                print('lr change to', opt.lr)
                break
        if mbatch % 1000 == 0:
            print('lr:', opt.lr, 'batch:', param.nbatch, 'epoch:', param.epoch)
        if mbatch > 0 and mbatch % args.verbose == 0:
            acc_list = val_test(sym, model, ctx, data_shape, global_step)
            score = np.mean(acc_list)
            if acc_list[0] < highest_acc[0]:  # ibug
                is_highest = True
                highest_acc[0] = acc_list[0]
            if score < highest_acc[1]:  # mean
                is_highest = True
                highest_acc[1] = score
            if args.ckpt == 1 and is_highest:
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, aux)
        if mepoch == lr_epoch_steps[-1]:
            if args.ckpt == 1:
                acc_list = val_test(sym, model, ctx, data_shape, global_step)
                msave = mbatch // args.verbose
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, aux)
            sys.exit(0)

    train_iter = mx.io.PrefetchingIter(train_iter)
    model.fit(
        train_iter,
        begin_epoch=0,
        num_epoch=9999,
        #eval_data=val_iter,
        eval_data=None,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=None,
    )

def train_net(args):
    # Decide between GPU and CPU
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    # Prefix under which models are saved
    prefix = os.path.join(args.models_root,
                          '%s-%s-%s' % (args.network, args.loss, args.dataset),
                          'model')
    # Directory holding the saved models
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    # Number of GPUs
    args.ctx_num = len(ctx)
    # Total batch size across devices
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    # Per-GPU batch size
    config.per_batch_size = args.per_batch_size
    # Training data directory
    data_dir = config.dataset_path
    path_imgrec = None
    path_imglist = None
    # Image size and sanity checks
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    # Number of identities in the dataset
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")
    print('Called with argument:', args, config)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None
    begin_epoch = 0
    # If no pretrained model is given, initialize the weights from scratch
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym = get_symbol(args)  # build the model symbol
        if config.net_name == 'spherenet':
            data_shape_dict = {'data': (args.per_batch_size,) + data_shape}
            spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:
        # Otherwise load the pretrained checkpoint
        print('loading', args.pretrained, args.pretrained_epoch)
        _, arg_params, aux_params = mx.model.load_checkpoint(
            args.pretrained, args.pretrained_epoch)
        sym = get_symbol(args)
    # Count the network's floating-point operations (FLOPs)
    if config.count_flops:
        all_layers = sym.get_internals()
        _sym = all_layers['fc1_output']
        FLOPs = flops_counter.count_flops(_sym,
                                          data=(1, 3, image_size[0], image_size[1]))
        _str = flops_counter.flops_str(FLOPs)
        print('Network FLOPs: %s' % _str)
    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    # NOTE: binds to a single GPU even though ctx may list several devices
    model = mx.mod.Module(
        context=mx.gpu(),
        symbol=sym,
    )
    val_dataiter = None
    # Build the data iterator; the triplet and softmax losses use different
    # iterators (the differences are analyzed in a later chapter)
    if config.loss_name.find('triplet') >= 0:
        from triplet_image_iter import FaceImageIter
        triplet_params = [
            config.triplet_bag_size, config.triplet_alpha, config.triplet_max_ap
        ]
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            ctx_num=args.ctx_num,
            images_per_identity=config.images_per_identity,
            triplet_params=triplet_params,
            mx_model=model,
        )
        _metric = LossValueMetric()
        eval_metrics = [mx.metric.create(_metric)]
    else:
        from image_iter import FaceImageIter
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            mean=mean,
            cutoff=config.data_cutoff,
            color_jittering=config.data_color,
            images_filter=config.data_images_filter,
        )
        metric1 = AccMetric()
        eval_metrics = [mx.metric.create(metric1)]
        if config.ce_loss:
            metric2 = LossValueMetric()
            eval_metrics.append(mx.metric.create(metric2))
    # Weight initialization
    if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out",
                                     magnitude=2)  # resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in",
                                     magnitude=2)
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2)  # resnet style
    _rescale = 1.0 / args.ctx_num
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    opt = optimizer.Adam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    # Load all verification datasets
    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    # Evaluate on the verification sets
    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    # Highest accuracy so far
    highest_acc = [0.0, 0.0]  # lfw and target
    #for i in range(len(ver_list)):
    #    highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        # Decay the learning rate by 10x at each step boundary
        for step in lr_steps:
            if mbatch == step:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break
        #print(param)
        _cb(param)
        # Log every 1000 batches
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        # Periodically evaluate and maybe save
        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            # If there are verification results
            print('-' * 50)
            print(acc_list)
            if len(acc_list) > 0:
                #lfw_score = acc_list[0]
                #if lfw_score > highest_acc[0]:
                #    highest_acc[0] = lfw_score
                #    if lfw_score >= 0.998:
                #        do_save = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        #print('is_highest = True')
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                #if lfw_score >= 0.99:
                #    do_save = True
            if is_highest:
                do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1
            # Save the model
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                if config.ckpt_embedding:
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):
                            _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix, msave, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if config.max_steps > 0 and mbatch > config.max_steps:
            sys.exit(0)

    epoch_cb = None
    # Wrap train_dataiter in an mx.io.PrefetchingIter
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)
    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=999999,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore=args.kvstore,
        optimizer=opt,
        #optimizer_params=optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)

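# --- Added note (a sketch, not from the original script): the manual lr drops
# in _batch_callback can equivalently be expressed with a MultiFactorScheduler
# attached to the optimizer, assuming the same lr_steps list and 0.1 factor.
lr_sched = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=0.1)
opt = optimizer.Adam(learning_rate=args.lr, wd=args.wd,
                     rescale_grad=_rescale, lr_scheduler=lr_sched)
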
def train(self, model: mdn_rnn):
    """
    Trains a given model on data, applying truncated BPTT.

    :param model: (mdn_rnn) the model to be trained

    The training data is a pair (nd.array(float), nd.array(float)) holding
    hidden states of shape [n_episodes, n_timesteps_per_episode, z_dim] and
    actions of shape [n_episodes, n_timesteps_per_episode, a_dim].

    :return: model: (mdn_rnn) the trained mdn_rnn object
        negative_log_likelihoods: (nd.array(float)) the training losses
    """
    retain_graph = self.args.k1 < self.args.k2
    optim = optimizer.Adam(learning_rate=self.args.rnn_lr)
    trainer = gluon.Trainer(model.collect_params(), optim)
    #losses = np.zeros((self.args.rnn_rounds, 500))
    for epo in range(self.args.rnn_rounds):
        input_data, output_data = self.get_single_rollout()
        observations = input_data.shape[0] - self.args.k2
        hidden_states = [(nd.zeros((1, model.RNN.h_dim)),
                          nd.zeros((1, model.RNN.c_dim)))]
        gc.collect()
        #epo_loss = nd.zeros(observations)
        for t in range(observations):
            print(f"Epoch {epo}, timestep {t}")
            # Re-use previously computed states
            h_cur, c_cur = hidden_states[t]
            za_t = input_data[t]
            z_tplusone = output_data[t]
            with autograd.record():
                # Make the new prediction and get updated hidden and cell states
                pz, h_cur, c_cur = model(za_t[None, :], h_cur, c_cur)
                # Store the detached hidden states to re-use them later
                hidden_states.append((h_cur.detach(), c_cur.detach()))
                # Take k2-1 more steps
                for j in range(self.args.k2 - 1):
                    # Get new input and target
                    za_t = input_data[t + j + 1]
                    z_tplusone = output_data[t + j + 1]
                    # Make new prediction
                    pz, h_cur, c_cur = model(za_t[None, :], h_cur, c_cur)
                neg_log_prob = -pz.log_prob(z_tplusone)
            # Backprop through the current window
            neg_log_prob.backward(retain_graph=retain_graph)
            trainer.step(1, ignore_stale_grad=False)

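# --- Added runnable check (a minimal sketch, not from the original trainer) of
# the detach() trick the truncated-BPTT loop relies on: gradients do not flow
# through a detached value, so each backward window stops at the stored states.
import mxnet as mx
from mxnet import nd, autograd

w = nd.array([2.0])
w.attach_grad()
with autograd.record():
    h = w * 3.0            # h = 6
    h_d = h.detach()       # gradient path cut here
    loss = (h_d * w).sum()
loss.backward()
print(w.grad)              # [6.]: h_d acts as a constant; without detach() it would be [12.]
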
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    #args.rescale_threshold = 0
    args.image_channel = 3
    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    #path_imgrec = None
    #path_imglist = None
    args.num_classes = 0
    image_size = (64, 64)
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    path_imgrec = os.path.join(data_dir, "train.rec")
    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None
    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    else:
        vec = args.pretrained.split(',')
        print('loading', vec)
        _, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
        sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
    #if args.network[0] == 's':
    #    data_shape_dict = {'data': (args.per_batch_size,) + data_shape}
    #    spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    model = mx.mod.Module(context=ctx,
                          symbol=sym,
                          data_names=['data'],
                          label_names=['label_gender'])
    print(data_shape)
    train_dataiter = SSR_ITER(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        mean=mean,
    )
    val_rec = os.path.join(data_dir, "val.rec")
    val_iter = None
    if os.path.exists(val_rec):
        val_iter = SSR_ITER(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=val_rec,
            shuffle=False,
            mean=mean,
        )
    print(train_dataiter.provide_label)
    initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    #initializer = mx.init.Xavier(rnd_type='uniform')
    _rescale = 1.0 / args.ctx_num
    #opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    opt = optimizer.Adam(learning_rate=base_lr, rescale_grad=_rescale)
    #opt = optimizer.SGD(learning_rate=base_lr)
    #opt = optimizer.Nadam(learning_rate=base_lr, wd=base_wd, rescale_grad=_rescale)
    som = 50
    _cb = mx.callback.Speedometer(args.batch_size, som)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [40000, 60000, 80000]
        #if args.loss_type >= 1 and args.loss_type <= 7:
        #    lr_steps = [100000, 140000, 160000]
        #p = 512.0 / args.batch_size
        for l in range(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l])
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break
        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        if mbatch >= 0 and mbatch % args.verbose == 0:
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt > 1:
                do_save = True
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            #print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
    # Render the network graph (written to 'xx') for inspection
    a = mx.viz.plot_network(sym,
                            shape={"data": (1, 3, 64, 64)},
                            node_attrs={"shape": 'rect', "fixedsize": 'false'})
    a.render('xx')
    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_iter,
        eval_metric='mae',
        kvstore='device',
        optimizer=opt,
        #optimizer_params=optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)

def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx), ctx, cvd)
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size == 0:
        args.per_batch_size = 128
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = 3
    os.environ['BETA'] = str(args.beta)
    data_dir_list = args.data_dir.split(',')
    assert len(data_dir_list) == 1
    data_dir = data_dir_list[0]
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    #image_size = prop.image_size
    image_size = [int(x) for x in args.image_size.split(',')]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert (args.num_classes > 0)
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")
    if args.loss_type == 1 and args.num_classes > 20000:
        args.beta_freeze = 5000
        args.gamma = 0.06
    print('Called with argument:', args)
    data_shape = (args.image_channel, image_size[0], image_size[1])
    mean = None
    begin_epoch = 0
    base_lr = args.lr
    base_wd = args.wd
    base_mom = args.mom
    arg_params = None
    aux_params = None
    sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params,
                                             layer_name='ms1m_fc7')
    # Freeze all fc7 arguments during fine-tuning
    fixed_args = [n for n in sym.list_arguments() if 'fc7' in n]
    #sym.get_internals()
    #sym.list_arguments()
    #sym.list_auxiliary_states()
    #sym.list_inputs()
    #sym.list_outputs()
    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    #arg_params['glint_fc7_weight'] = arg_params['fc7_weight'].copy()
    #arg_params['ms1m_fc7_weight'] = arg_params['glint_fc7_weight'].copy()
    assert 'ms1m_fc7_weight' in arg_params
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        fixed_param_names=fixed_args,
    )
    val_dataiter = None
    train_dataiter = FaceImageIter(
        batch_size=args.batch_size,
        data_shape=data_shape,
        path_imgrec=path_imgrec,
        shuffle=True,
        rand_mirror=args.rand_mirror,
        mean=mean,
        cutoff=args.cutoff,
        color_jittering=args.color,
        images_filter=args.images_filter,
    )
    metric1 = AccMetric()
    eval_metrics = [mx.metric.create(metric1)]
    if args.ce_loss:
        metric2 = LossValueMetric()
        eval_metrics.append(mx.metric.create(metric2))
    if args.network[0] == 'r' or args.network[0] == 'y':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out",
                                     magnitude=2)  # resnet style
    elif args.network[0] == 'i' or args.network[0] == 'x':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                                     magnitude=2)  # inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in",
                                     magnitude=2)
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2)  # resnet style
    _rescale = 1.0 / args.ctx_num
    #opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale)
    logging.info(f'base lr {base_lr}')
    opt = optimizer.Adam(
        learning_rate=base_lr,
        wd=base_wd,
        rescale_grad=_rescale,
    )
    som = 20
    _cb = mx.callback.Speedometer(args.batch_size, som)
    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    #ver_test(0)
    highest_acc = [0.0, 0.0]  # lfw and target
    #for i in range(len(ver_list)):
    #    highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    if len(args.lr_steps) == 0:
        lr_steps = [40000, 60000, 80000]
        if args.loss_type >= 1 and args.loss_type <= 7:
            lr_steps = [100000, 140000, 160000]
        p = 512.0 / args.batch_size
        for l in range(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l] * p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for _lr in lr_steps:
            if mbatch == args.beta_freeze + _lr:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break
        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch: lr ', opt.lr, 'nbatch ', param.nbatch,
                  'epoch ', param.epoch, 'mbatch ', mbatch, 'lr_step', lr_steps)
        if mbatch >= 0 and mbatch % args.verbose == 0:
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if len(acc_list) > 0:
                #lfw_score = acc_list[0]
                #if lfw_score > highest_acc[0]:
                #    highest_acc[0] = lfw_score
                #    if lfw_score >= 0.998:
                #        do_save = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                #if lfw_score >= 0.99:
                #    do_save = True
            if is_highest:
                do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3:
                msave = 1
            if do_save:
                print('saving', msave)
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
        # Anneal the margin parameter beta after beta_freeze steps
        if mbatch <= args.beta_freeze:
            _beta = args.beta
        else:
            move = max(0, mbatch - args.beta_freeze)
            _beta = max(
                args.beta_min,
                args.beta * math.pow(1 + args.gamma * move, -1.0 * args.power))
        #print('beta', _beta)
        os.environ['BETA'] = str(_beta)
        if args.max_steps > 0 and mbatch > args.max_steps:
            sys.exit(0)

    epoch_cb = None
    train_dataiter = mx.io.PrefetchingIter(train_dataiter)
    #model.set_params(arg_params, aux_params)
    model.fit(
        train_dataiter,
        begin_epoch=begin_epoch,
        num_epoch=end_epoch,
        eval_data=val_dataiter,
        eval_metric=eval_metrics,
        kvstore='device',
        optimizer=opt,
        #optimizer_params=optimizer_params,
        initializer=initializer,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback=_batch_callback,
        epoch_end_callback=epoch_cb)

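# --- Added note: the _beta annealing in _batch_callback above follows the
# SphereFace-style schedule. With move = max(0, mbatch - beta_freeze),
#
#     beta(move) = max(beta_min, beta * (1 + gamma * move) ** (-power))
#
# so beta holds its initial value for the first beta_freeze batches and then
# decays polynomially toward beta_min; the value reaches the loss symbol
# through the BETA environment variable.
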
def train_net(args):
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        for i in range(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = os.path.join(args.models_root,
                          '%s-%s-%s' % (args.network, args.loss, args.dataset),
                          'model')
    prefix_dir = os.path.dirname(prefix)
    print('prefix', prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    args.ctx_num = len(ctx)  # GPU num
    args.batch_size = args.per_batch_size * args.ctx_num
    args.rescale_threshold = 0
    args.image_channel = config.image_shape[2]
    config.batch_size = args.batch_size
    config.per_batch_size = args.per_batch_size
    data_dir = config.dataset_path
    path_imgrec = None
    path_imglist = None
    image_size = config.image_shape[0:2]
    assert len(image_size) == 2
    assert image_size[0] == image_size[1]
    print('image_size', image_size)
    print('num_classes', config.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")
    print('Called with argument:', args, config)
    data_shape = (args.image_channel, image_size[0], image_size[1])  # chw
    mean = None  # [127.5, 127.5, 127.5]
    begin_epoch = 0
    if len(args.pretrained) == 0:
        arg_params = None
        aux_params = None
        sym = get_symbol(args)
        if config.net_name == 'spherenet':
            data_shape_dict = {'data': (args.per_batch_size,) + data_shape}
            spherenet.init_weights(sym, data_shape_dict, args.num_layers)
    else:
        # Load the pretrained models; two_sym replaces the plain get_symbol(args)
        # path and also returns the teacher symbol and its parameters
        sym, sym_high, arg_params, aux_params, t_arg_params, t_aux_params = two_sym(args)
        d_sym = discriminator(args)
    config.count_flops = False  # me add
    if config.count_flops:  # True
        all_layers = sym.get_internals()
        _sym = all_layers['fc1_output']  # fc1: the 128-d image embedding
        FLOPs = flops_counter.count_flops(_sym,
                                          data=(1, 3, image_size[0], image_size[1]))
        _str = flops_counter.flops_str(FLOPs)
        print('Network FLOPs: %s' % _str)
    #label_name = 'softmax_label'
    #label_shape = (args.batch_size,)
    val_dataiter = None
    if config.loss_name.find('triplet') >= 0:
        from triplet_image_iter import FaceImageIter
        triplet_params = [config.triplet_bag_size, config.triplet_alpha,
                          config.triplet_max_ap]
        # NOTE: `model` is referenced here but is only defined further below
        train_dataiter = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,
            #rand_resize=True,  # me add, to vary image resolution
            mean=mean,
            cutoff=config.data_cutoff,
            ctx_num=args.ctx_num,
            images_per_identity=config.images_per_identity,
            triplet_params=triplet_params,
            mx_model=model,
        )
        _metric = LossValueMetric()
        eval_metrics = [mx.metric.create(_metric)]
    else:
        from distribute_image_iter import FaceImageIter
        # Low-resolution (target-domain) stream: yields batches of images and labels
        train_dataiter_low = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,  # True
            rand_resize=True,  # me add, to vary image resolution
            mean=mean,
            cutoff=config.data_cutoff,  # 0
            color_jittering=config.data_color,  # 0
            images_filter=config.data_images_filter,  # 0
        )
        source_imgrec = os.path.join(
            "/home/svt/mxnet_recognition/dataes/faces_glintasia", "train.rec")
        # High-resolution (source-domain) stream for the teacher
        data2 = FaceImageIter(
            batch_size=args.batch_size,
            data_shape=data_shape,
            path_imgrec=source_imgrec,
            shuffle=True,
            rand_mirror=config.data_rand_mirror,  # True
            rand_resize=False,  # me add
            mean=mean,
            cutoff=config.data_cutoff,  # 0
            color_jittering=config.data_color,  # 0
            images_filter=config.data_images_filter,  # 0
        )
        metric1 = AccMetric()  # accuracy metric
        eval_metrics = [mx.metric.create(metric1)]
        if config.ce_loss:  # True
            metric2 = LossValueMetric()  # loss value metric
            eval_metrics.append(mx.metric.create(metric2))
    if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out",
                                     magnitude=2)  # resnet style
    else:
        initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in",
                                     magnitude=2)
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2)  # resnet style
    _rescale = 1.0 / args.ctx_num
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    opt = optimizer.Adam(learning_rate=0.0001, beta1=0.5, beta2=0.9, epsilon=1e-08)
    _cb = mx.callback.Speedometer(args.batch_size, args.frequent)
    ver_list = []
    ver_name_list = []
    for name in config.val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if os.path.exists(path):
            data_set = verification.load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)
            print('ver', name)

    def ver_test(nbatch):
        results = []
        for i in range(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
                ver_list[i], model, args.batch_size, 10, None, None)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' %
                  (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    highest_acc = [0.0, 0.0]  # lfw and target
    #for i in range(len(ver_list)):
    #    highest_acc.append(0.0)
    global_step = [0]
    save_step = [0]
    lr_steps = [int(x) for x in args.lr_steps.split(',')]
    high_save = 0  # me add
    print('lr_steps', lr_steps)

    # NOTE: this callback is kept from the original script, but the custom
    # training loop below never registers it (model.fit is not called).
    def _batch_callback(param):
        #global global_step
        global_step[0] += 1
        mbatch = global_step[0]
        for step in lr_steps:
            if mbatch == step:
                opt.lr *= 0.1
                print('lr change to', opt.lr)
                break
        _cb(param)
        if mbatch % 1000 == 0:
            print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
        if mbatch % 4000 == 0:  # (fc7_save)
            name = os.path.join(args.models_root,
                                '%s-%s-%s' % (args.network, args.loss, args.dataset),
                                'modelfc7')
            arg, aux = model.get_params()
            mx.model.save_checkpoint(name, param.epoch, model.symbol, arg, aux)
            print('save model include fc7 layer')
            print("mbatch", mbatch)
        # NOTE: me_msave is reset on every call, so it is always 1 when used below
        me_msave = 0
        if mbatch >= 0 and mbatch % args.verbose == 0:  # default.verbose = 2000
            acc_list = ver_test(mbatch)
            save_step[0] += 1
            msave = save_step[0]  # with batch 512, one epoch is ~1300 steps
            me_msave = me_msave + 1
            do_save = False
            is_highest = False  # me add
            save2 = False
            if len(acc_list) > 0:
                lfw_score = acc_list[0]
                if lfw_score > highest_acc[0]:
                    highest_acc[0] = lfw_score
                    if lfw_score >= 0.9960:
                        save2 = True
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest = True
                    else:
                        if score >= highest_acc[0]:
                            is_highest = True
                            highest_acc[0] = score
                    highest_acc[-1] = acc_list[-1]
                #if lfw_score >= 0.99:
                #    do_save = True
            #if is_highest:
            #    do_save = True
            if args.ckpt == 0:
                do_save = False
            elif args.ckpt == 2:
                do_save = True
            elif args.ckpt == 3 and is_highest:  # me add: and is_highest
                # Overwrite the previous best-LFW checkpoint each time.
                # NOTE: assigning high_save here makes it local to the callback,
                # so the do_save branch below can raise UnboundLocalError when
                # args.ckpt == 2.
                high_save = 0
            if do_save:
                print('saving high pretrained-epoch always: ', high_save)
                arg, aux = model.get_params()
                if config.ckpt_embedding:  # True
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):  # skip the fc7 classification layer
                            _arg[k] = arg[k]
                    # save only the backbone up to fc1 (the 128-d embedding)
                    mx.model.save_checkpoint(prefix, high_save, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, high_save, model.symbol, arg, aux)
                print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
            if save2:
                arg, aux = model.get_params()
                if config.ckpt_embedding:  # True
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):  # skip the fc7 classification layer
                            _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix, me_msave, _sym, _arg, aux)
                else:
                    mx.model.save_checkpoint(prefix, me_msave, model.symbol, arg, aux)
                print("save pretrained-epoch: param.epoch + me_msave", param.epoch, me_msave)
                print('[%d]LFW Accuracy>=0.9960: %1.5f' % (mbatch, highest_acc[-1]))
        # mbatch runs from 0 to ~13000 within an epoch, then keeps counting
        if config.max_steps > 0 and mbatch > config.max_steps:
            sys.exit(0)

    ###########################################################################
    epoch_cb = None
    train_dataiter_low = mx.io.PrefetchingIter(train_dataiter_low)  # threaded prefetch
    data2 = mx.io.PrefetchingIter(data2)  # threaded prefetch
    # Build the modules: bind data/label shapes (allocating executor memory),
    # initialize the parameters, then run the custom training loop below.
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(step=[100, 200, 300], factor=0.1)
    optimizer_params = {
        'learning_rate': 0.01,
        'momentum': 0.9,
        'wd': 0.0005,
        #'lr_scheduler': lr_scheduler,
        'rescale_grad': _rescale,  # average the gradient across devices
    }

    ###########################################################################
    # Teacher network: needs data only, no labels
    data_shapes = [('data', (args.batch_size, 3, 112, 112))]
    t_module = mx.module.Module(symbol=sym_high, context=ctx, label_names=[])
    t_module.bind(data_shapes=data_shapes, for_training=False, grad_req='null')
    t_module.set_params(arg_params=t_arg_params, aux_params=t_aux_params)
    t_model = t_module

    ###########################################################################
    # Student network
    label_shapes = [('softmax_label', (args.batch_size,))]
    model = mx.mod.Module(
        context=ctx,
        symbol=sym,
        label_names=[]
        # data_names defaults to 'data'; custom label names would need extra handling
    )
    # The student is trained from data plus the teacher's output features,
    # which take the place of labels; the teacher itself runs without labels.
    #print(train_dataiter_low.provide_data)
    #print(train_dataiter_low.provide_label)
    #opt_d = optimizer.SGD(learning_rate=args.lr*0.01, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)  # lr 1e-5
    opt_d = optimizer.Adam(learning_rate=0.0001, beta1=0.5, beta2=0.9, epsilon=1e-08)
    model.bind(data_shapes=data_shapes, for_training=True)  # bound without label shapes
    # allow_missing=True: parameters absent from the checkpoint are filled in
    # by the initializer
    model.init_params(initializer=initializer, arg_params=arg_params,
                      aux_params=aux_params, allow_missing=True)
    #model.init_optimizer(kvstore=args.kvstore, optimizer='sgd', optimizer_params=optimizer_params)
    model.init_optimizer(kvstore=args.kvstore, optimizer=opt_d)
    #metric = eval_metrics  # metric list

    ###########################################################################
    # Discriminator: binary classifier over concatenated teacher/student features
    model_d = mx.module.Module(symbol=d_sym, context=ctx,
                               data_names=['data'], label_names=['softmax_label'])
    data_shapes = [('data', (args.batch_size * 2, 512))]
    label_shapes = [('softmax_label', (args.batch_size * 2,))]
    # bind can also be deferred and redone with a different batch size;
    # inputs_need_grad=True so get_input_grads() can hand gradients to the student
    model_d.bind(data_shapes=data_shapes, label_shapes=label_shapes,
                 inputs_need_grad=True)
    model_d.init_params(initializer=initializer)
    model_d.init_optimizer(kvstore=args.kvstore, optimizer=opt)  # reuses the Adam above

    metric_d = AccMetric_d()  # accuracy over the discriminator softmax (added in metric.py)
    eval_metrics_d = [mx.metric.create(metric_d)]
    metric2_d = LossValueMetric_d()  # cross-entropy loss value (added in metric.py)
    eval_metrics_d.append(mx.metric.create(metric2_d))
    #metric_d = eval_metrics_d
    #mx.metric.create('acc')  # the discriminator symbol has a single softmax output

    global_step = [0]
    batch_num = [0]
    resize_acc = [0]
    for epoch in range(0, 40):
        #if epoch == 1 or epoch == 2 or epoch == 3:
        #    model.init_optimizer(kvstore=args.kvstore, optimizer='sgd', optimizer_params=optimizer_params)
        if not isinstance(metric_d, mx.metric.EvalMetric):
            metric_d = mx.metric.create(metric_d)
        metric_d.reset()
        train_dataiter_low.reset()
        data2.reset()
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        data_iter = iter(train_dataiter_low)
        data2_iter = iter(data2)
        data_len = 0
        for batch in data_iter:  # batch: target-domain (low-resolution) images
            # 1. Teacher forward (is_train=False) and student forward
            #    (is_train=True); their output features are concatenated and
            #    labeled 1 (teacher) and 0 (student).
            data_len += len(batch.data[0])
            if len(batch.data[0]) < args.batch_size:
                print("last batch is smaller than batch_size, stopping")
                print("data_len:", data_len)
                break
            if data_len >= 2830147:  # 2830147: size of the target dataset
                print("one epoch of batches done")
                break
            batch2 = data2_iter.next()
            # Teacher features from the high-resolution source data
            t_model.forward(batch2, is_train=False)
            t_feat = t_model.get_outputs()  # list of outputs; only fc1 here
            #print(batch.data[0].grad is None)  # not None; a detached copy has no grad
            # Input gradients exist only where bind requested them; detach()
            # excludes a value from gradient computation.
            # Student ("generator") features -- the fakes
            model.forward(batch, is_train=True)  # fc1 output
            g_feat = model.get_outputs()  # final symbol outputs (the fc1 embedding)
            label_t = nd.ones((args.batch_size,))   # teacher = 1
            label_g = nd.zeros((args.batch_size,))  # student = 0
            # Concatenate along the batch dimension
            label_concat = nd.concat(label_t, label_g, dim=0)
            feat_concat = nd.concat(t_feat[0], g_feat[0], dim=0)  # no L2 normalization needed
            # 2.1 Train the discriminator on the concatenated features;
            #     detach() keeps this update from backpropagating into the student.
            feat_data = mx.io.DataBatch([feat_concat.detach()], [label_concat])
            model_d.forward(feat_data, is_train=True)
            model_d.backward()
            #print(feat_data.data[0].grad is None)  # is None
            # keep a copy of the discriminator gradients
            gradD = [[grad.copyto(grad.context) for grad in grads]
                     for grads in model_d._exec_group.grad_arrays]
            model_d.update()
            model_d.update_metric(metric_d, [label_concat])
            # 2.2 Run the student features through the discriminator with the
            #     labels flipped to 1; the resulting input gradient is what the
            #     student is updated with. Note this batch is batch_size, not
            #     2 * batch_size.
            label_g = nd.ones((args.batch_size,))  # labels set to 1
            feat_data = mx.io.DataBatch([g_feat[0]], [label_g])  # has input grad
            model_d.forward(feat_data, is_train=True)  # True: need input gradients
            model_d.backward()
            # gradients are not accumulated; the next forward overwrites this result
            # 3. The generator (student) receives the discriminator's input
            #    gradient and updates its own parameters.
            g_grad = model_d.get_input_grads()
            model.backward(g_grad)
            model.update()
            # Training flow: feed student/teacher features into the discriminator,
            # update it, then pass the cross-entropy gradient w.r.t. the student
            # features back through the student network.
            #gan_label = [nd.empty((args.batch_size*2, 2))]  # (batch*2, 2): outputs with 0/1 labels
            #discrim_data = [nd.empty((args.batch_size*2, 512))]  # (batch*2, 512)
            #print(gan_label[0].shape)
            lr_steps = [int(x) for x in args.lr_steps.split(',')]
            global_step[0] += 1
            batch_num[0] += 1
            mbatch = global_step[0]
            for step in lr_steps:
                if mbatch == step:
                    opt.lr *= 0.1
                    opt_d.lr *= 0.1
                    print('opt.lr, opt_d.lr change to', opt.lr, opt_d.lr)
                    break
            if mbatch % 200 == 0 and mbatch > 0:  # (fc7_save)
                print('mbatch %d, Training %s' % (epoch, metric_d.get()))
            if mbatch % 1000 == 0 and mbatch > 0:
                arg, aux = model.get_params()
                mx.model.save_checkpoint(prefix, epoch, model.symbol, arg, aux)
                arg, aux = model_d.get_params()
                mx.model.save_checkpoint(prefix + "discriminator", epoch,
                                         model_d.symbol, arg, aux)
                top1, top10 = my_top(epoch)
                # fixed: the second name was duplicated as ..._top1 in the original
                yidong_test_top1, yidong_test_top10 = my_top_yidong_test(epoch)
                if top1 >= resize_acc[0]:
                    resize_acc[0] = top1
                    # save the best-scoring parameters (backbone up to fc1 only)
                    arg, aux = model.get_params()
                    all_layers = model.symbol.get_internals()
                    _sym = all_layers['fc1_output']
                    _arg = {}
                    for k in arg:
                        if not k.startswith('fc7'):  # skip the fc7 classification layer
                            _arg[k] = arg[k]
                    mx.model.save_checkpoint(prefix + "_best", 1, _sym, _arg, aux)
                acc_list = ver_test(mbatch)
                if len(acc_list) > 0:
                    print("LFW acc is :", acc_list[0])
        print("batch_num", batch_num[0], "epoch", epoch, "lr ", opt.lr)
        print('mbatch %d, Training %s' % (epoch, metric_d.get()))

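# --- Added sketch (hypothetical toy symbols, not from the script above) of the
# adversarial hand-off the loop uses: bind the critic with inputs_need_grad=True,
# then get_input_grads() returns d(loss)/d(features), which is fed to the
# student's backward().
import mxnet as mx

feat = mx.sym.FullyConnected(mx.sym.Variable('data'), num_hidden=8, name='feat')
student = mx.mod.Module(feat, data_names=['data'], label_names=None)
student.bind(data_shapes=[('data', (2, 4))])
student.init_params()
student.init_optimizer(optimizer='adam')

score = mx.sym.LogisticRegressionOutput(
    mx.sym.FullyConnected(mx.sym.Variable('data'), num_hidden=1),
    mx.sym.Variable('softmax_label'), name='lro')
critic = mx.mod.Module(score, data_names=['data'], label_names=['softmax_label'])
critic.bind(data_shapes=[('data', (2, 8))],
            label_shapes=[('softmax_label', (2, 1))],
            inputs_need_grad=True)  # required for get_input_grads()
critic.init_params()
critic.init_optimizer(optimizer='adam')

student.forward(mx.io.DataBatch([mx.nd.ones((2, 4))]), is_train=True)
g_feat = student.get_outputs()
critic.forward(mx.io.DataBatch(g_feat, [mx.nd.ones((2, 1))]), is_train=True)
critic.backward()
student.backward(critic.get_input_grads())  # gradient flows back into the student
student.update()
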
# <NDArray 1x4 @cpu(0)>

# update
trainer.step(batch_size)
print(net.weight.data())
# [[0.31892323 0.21269077 0.34669656 0.29598683]]
# <NDArray 1x4 @cpu(0)>

print(curr_weight - net.weight.data() * 1 / batch_size)
# [[ 0.02714116 -0.03028122 -0.00145487  0.00512915]]
# <NDArray 1x4 @cpu(0)>

################################################################################
# define an optimizer directly and pass it to the trainer,
# e.g. using Adam: a popular adaptive optimizer for deep learning
optim = optimizer.Adam(learning_rate=1)
trainer = gluon.Trainer(net.collect_params(), optim)

# update network weights
forward_backward()
trainer.step(batch_size)
print(net.weight.data())
# [[-0.6810826  -0.7873151  -0.65330917 -0.7040191 ]]
# <NDArray 1x4 @cpu(0)>

################################################################################
# changing the learning rate
print(trainer.learning_rate)
# 1
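
################################################################################
# Added note (a sketch following the same pattern as above): the learning rate
# can also be changed mid-training through the trainer, which forwards the new
# value to the wrapped optimizer.
trainer.set_learning_rate(0.1)
print(trainer.learning_rate)
# 0.1
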
def do_train(self):
    # Decide between GPU and CPU
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd) > 0:
        gpu_num_list = cvd.split(",")
        gpu_num_list_len = len(gpu_num_list)
        for i in range(gpu_num_list_len):
            ctx.append(mx.gpu(i))
    if len(ctx) == 0:
        ctx = [mx.cpu()]
        print("use cpu")
    else:
        print("gpu num: {}".format(ctx))
    assert self.face_shape[0] == self.face_shape[1], "face_shape[0] neq face_shape[1]"
    # Per-GPU batch size
    per_batch_size = self.net_info["per_batch_size"]
    ctx_count = len(ctx)
    batch_size = per_batch_size * ctx_count
    data_shape = (self.face_shape[2], self.face_shape[0], self.face_shape[1])
    # Load the checkpoint if one exists, otherwise initialize the weights
    if os.path.exists(self.model_path):
        path_info, file_name = os.path.split(self.model_path)
        name_info, suffix_info = os.path.splitext(file_name)
        name_prefix, name_epoch = name_info.split("-")
        model_path_prefix = os.path.join(path_info, name_prefix)
        epoch_num = int(name_epoch)
        symbol, arg_params, aux_params = mx.model.load_checkpoint(
            model_path_prefix, epoch_num)
        # Build the symbol
        symbol_info = self.get_symbol()
    else:
        arg_params = None
        aux_params = None
        # Build the symbol
        symbol_info = self.get_symbol()
    # Count the network's FLOPs
    if self.count_flops_flag:
        all_layers = symbol_info.get_internals()
        fc1_sym = all_layers['fc1_output']
        flops_info = flops_utils.count_flops(
            fc1_sym, data=(1, data_shape[0], data_shape[1], data_shape[2]))
        flops_info_str = flops_utils.flops_str(flops_info)
        print("Network flops_info_str: {}".format(flops_info_str))
    model = mx.mod.Module(
        context=mx.gpu(),
        # context=ctx,
        symbol=symbol_info)
    loss_name = self.loss_info["loss_name"]
    rec_data_path = self.data_info["rec_data_path"]
    idx_data_path = self.data_info["idx_data_path"]
    bin_data_file_path = self.data_info["bin_data_file_path"]
    val_targets_list = self.data_info["val_targets"]
    # Load the .bin verification data
    bin_data_list = get_data_utils.load_bin(
        bin_data_file_path=bin_data_file_path,
        bin_name_list=val_targets_list,
        image_shape=self.face_shape)
    # Build the data iterator; triplet and softmax use different iterators
    # (the differences are analyzed in a later chapter)
    if loss_name.find("triplet") >= 0:
        triplet_bag_size = self.loss_info["triplet_bag_size"]
        triplet_alpha = self.loss_info["triplet_alpha"]
        triplet_max_ap = self.loss_info["triplet_max_ap"]
        images_per_identity = self.loss_info["images_per_identity"]
        triplet_params = [triplet_bag_size, triplet_alpha, triplet_max_ap]
        train_data_iter = TripletFaceImageIter(
            rec_data_path=rec_data_path,
            idx_data_path=idx_data_path,
            batch_size=batch_size,
            data_shape=data_shape,
            shuffle_flag=True,
            rand_mirror=self.data_rand_mirror_flag,
            cutoff=self.data_crop_flag,
            ctx_num=ctx_count,
            images_per_identity=images_per_identity,
            triplet_params=triplet_params,
            mx_model=model)
        metric2 = LossValueMetric()
        eval_metrics = [mx.metric.create(metric2)]
    else:
        train_data_iter = FaceImageIter(
            rec_data_path=rec_data_path,
            idx_data_path=idx_data_path,
            batch_size=batch_size,
            data_shape=data_shape,
            shuffle_flag=True,
            rand_mirror=self.data_rand_mirror_flag,
            cutoff=self.data_crop_flag,
            color_jitter=self.data_color_aug,
            images_filter=self.data_image_filter)
        metric1 = AccMetric()
        eval_metrics = [mx.metric.create(metric1)]
        # Focal loss, an improved cross-entropy loss
        if self.ce_loss:
            metric2 = LossValueMetric()
            eval_metrics.append(mx.metric.create(metric2))
    # Wrap train_data_iter in an mx.io.PrefetchingIter
    train_data_iter = mx.io.PrefetchingIter(train_data_iter)
    net_name = self.net_info["net_name"]
    if net_name == "f_res_net" or net_name == "f_mobile_face_net":
        # resNet style
        initializer = mx.init.Xavier(rnd_type='gaussian',
                                     factor_type="out",
                                     magnitude=2)
    else:
        initializer = mx.init.Xavier(rnd_type='uniform',
                                     factor_type="in",
                                     magnitude=2)
    re_scale = 1.0 / ctx_count
    optimize = optimizer.Adam(learning_rate=self.learning_rate,
                              wd=self.weight_decay,
                              rescale_grad=re_scale)
    callback_speed = mx.callback.Speedometer(batch_size, self.print_step)
    # Highest accuracy so far: lfw and target
    highest_acc = [0.0, 0.0]
    global_step = [0]
    save_step = [0]
    print("learning_rate_step_list: {}".format(self.learning_rate_step_list))

    def batch_callback_fun(param):
        global_step[0] += 1
        m_batch = global_step[0]
        for step in self.learning_rate_step_list:
            if m_batch == step:
                optimize.lr *= 0.1
                print("learning rate change to: {}".format(optimize.lr))
                break
        # print(param)
        callback_speed(param)
        # Log every 1000 batches
        if m_batch % 1000 == 0:
            print("learning_rate: {}\nbatch: {}\n epoch: {}".format(
                optimize.lr, param.nbatch, param.epoch))
        if m_batch >= 0 and m_batch % self.val_step == 0:
            acc_list = self.ver_test(bin_data_list=bin_data_list,
                                     val_targets_list=val_targets_list,
                                     model_net=model,
                                     batch_size=batch_size,
                                     n_batch=m_batch)
            save_step[0] += 1
            m_save = save_step[0]
            do_save_flag = False
            is_highest_flag = False
            print("-" * 100)
            # If there are verification results
            print("acc_list: {}".format(acc_list))
            if len(acc_list) > 0:
                score = sum(acc_list)
                if acc_list[-1] >= highest_acc[-1]:
                    if acc_list[-1] > highest_acc[-1]:
                        is_highest_flag = True
                else:
                    if score >= highest_acc[0]:
                        is_highest_flag = True
                        highest_acc[0] = score
                highest_acc[-1] = acc_list[-1]
            # Decide whether to save the model
            if is_highest_flag:
                do_save_flag = True
            if self.save_model_num == 0:
                do_save_flag = False
            elif self.save_model_num == 2:
                do_save_flag = True
            elif self.save_model_num == 3:
                m_save = 1
            if do_save_flag:
                print("m_save: {}".format(m_save))
                arg, aux = model.get_params()
                if self.check_feature_flag:
                    all_layers = model.symbol.get_internals()
                    fc1_sym = all_layers["fc1_output"]
                    arg_base = {}
                    for key in arg:
                        if not key.startswith("fc7"):
                            arg_base[key] = arg[key]
                    mx.model.save_checkpoint(self.save_model_prefix_path,
                                             m_save, fc1_sym, arg_base, aux)
                else:
                    mx.model.save_checkpoint(self.save_model_prefix_path,
                                             m_save, model.symbol, arg, aux)
            print("highest_acc[m_batch: {}]: {:.5f}".format(m_batch, highest_acc[-1]))
        # Exit once training exceeds max_steps (when max_steps > 0)
        if self.max_steps > 0 and m_batch > self.max_steps:
            sys.exit(0)

    begin_epoch = 0
    # Train the model
    model.fit(train_data=train_data_iter,
              begin_epoch=begin_epoch,
              num_epoch=999999,
              eval_metric=eval_metrics,
              kvstore=self.kv_store,
              optimizer=optimize,
              initializer=initializer,
              arg_params=arg_params,
              aux_params=aux_params,
              allow_missing=True,
              batch_end_callback=batch_callback_fun)

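# --- Added worked example (a minimal sketch, not from the original class):
# why re_scale = 1.0 / ctx_count keeps updates device-count invariant. With a
# device kvstore, gradients from the N device copies are summed, so scaling by
# 1/N turns that sum into a mean before the optimizer applies it.
import mxnet as mx
from mxnet import nd, optimizer

w = nd.array([1.0])
g_sum = nd.array([4.0])  # e.g. per-device gradients of 1.0 summed over 4 devices
opt = optimizer.SGD(learning_rate=0.1, rescale_grad=1.0 / 4)
state = opt.create_state(0, w)
opt.update(0, w, g_sum, state)
print(w)  # [0.9] -- the step uses the mean gradient 1.0, not the sum 4.0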