def train():
    # Dataset folder path
    PATH = 'coco'
    TEST_PATH = "test.jpg"
    # Batch size
    BATCH_SIZE = 128
    # Input image size
    IMG_W, IMG_H = 192, 256
    # Heatmap size
    HEATMAP_W, HEATMAP_H = 48, 64
    # Learning rate schedule
    lr = 0.001
    lr_factor = 0.1
    EPOCH = 140
    lr_steps = [90, 120, 140]
    # Larger values read data faster but use more memory
    num_workers = 2
    # GPU used for training
    context = mx.gpu(0)
    # L2 loss for heatmap regression
    L = gluon.loss.L2Loss()
    # Build the network: MobileNetV2
    net = posenet.mobilenetv2_05(context, IMG_W, IMG_H)
    # Evaluation metric
    metric = HeatmapAccuracy()
    # Load the data
    train_data, val_data = data_loader(PATH, BATCH_SIZE, IMG_W, IMG_H,
                                       HEATMAP_W, HEATMAP_H, num_workers)
    # ADAM optimizer
    trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
    # Training loop
    lr_counter = 0
    for epoch in range(EPOCH):
        metric.reset()
        if epoch == lr_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate * lr_factor)
            lr_counter += 1
        for i, batch in enumerate(train_data):
            tic = time.time()
            data = gluon.utils.split_and_load(batch[0], ctx_list=[context], batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=[context], batch_axis=0)
            weight = gluon.utils.split_and_load(batch[2], ctx_list=[context], batch_axis=0)
            with ag.record():
                outputs = [net(X) for X in data]
                loss = [L(yhat, y, w) for yhat, y, w in zip(outputs, label, weight)]
            for l in loss:
                l.backward()
            trainer.step(BATCH_SIZE)
            train_loss = sum([l.mean().asscalar() for l in loss]) / len(loss)
            print('[Epoch %d] batch_num: %d | learn_rate: %.5f | loss: %.8f | time: %.1f'
                  % (epoch, i, trainer.learning_rate, train_loss, time.time() - tic))
        _, val_acc = test(net, val_data, context)
        mxnet_demo.demo(TEST_PATH, IMG_W, IMG_H, net, context)
        print("============================Val acc: %.5f============================" % val_acc)
        net.export('model/Ultralight-Nano-SimplePose_%.5f' % val_acc, epoch=epoch)
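In the loop above, the third argument to gluon.loss.L2Loss is a per-element sample weight, so joints without annotations can be masked out of the gradient. A minimal sketch of that masking on toy tensors (shapes and values are illustrative assumptions, not the script's real heatmap sizes):

import mxnet as mx
from mxnet import gluon

L = gluon.loss.L2Loss()
# One sample, two joints, 2x2 heatmaps (toy shapes for illustration).
pred = mx.nd.ones((1, 2, 2, 2)) * 0.5
label = mx.nd.ones((1, 2, 2, 2))
# Mask out the second joint, e.g. because it is not annotated.
weight = mx.nd.concat(mx.nd.ones((1, 1, 2, 2)),
                      mx.nd.zeros((1, 1, 2, 2)), dim=1)
print(L(pred, label, weight))  # only the first joint contributes to the loss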
def test(net, val_data, context):
    metric = HeatmapAccuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=[context], batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=[context], batch_axis=0)
        weight = gluon.utils.split_and_load(batch[2], ctx_list=[context], batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()
    best_val_score = 1
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()
        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)
            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)
            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()
        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))
    return net
#############################################################################
#
# For this model we use ``adam`` as the optimizer.

trainer = gluon.Trainer(net.collect_params(), 'adam', {'lr_scheduler': lr_scheduler})

#############################################################################
#
# - Metric
#
# The metric for this model is called heatmap accuracy, i.e. it compares the
# keypoint heatmaps from the prediction and the ground truth, and checks whether
# the centers of the Gaussian distributions are within a certain distance.

metric = HeatmapAccuracy()

#############################################################################
# Training Loop
# -------------
#
# Since we have all the necessary blocks, we can now put them together to start the training.

net.hybridize(static_alloc=True, static_shape=True)
for epoch in range(1):
    metric.reset()
    for i, batch in enumerate(train_data):
        if i > 0:
            break
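As a rough sketch of the idea behind HeatmapAccuracy (not GluonCV's exact implementation), the snippet below takes the peak of each predicted and ground-truth heatmap and counts a joint as correct when the two peaks lie within a normalized distance threshold; the normalization constant is an assumption:

import numpy as np

def heatmap_accuracy_sketch(pred, gt, thr=0.5):
    """PCK-style check on arrays of shape (num_joints, H, W)."""
    num_joints, H, W = pred.shape
    # Normalize pixel distances by a tenth of the heatmap size
    # (a common PCK-style convention, assumed here).
    norm = np.array([W, H], dtype=np.float64) / 10.0
    correct, visible = 0, 0
    for j in range(num_joints):
        if gt[j].max() <= 0:  # joint not labeled in this sample; skip it
            continue
        py, px = np.unravel_index(pred[j].argmax(), (H, W))
        gy, gx = np.unravel_index(gt[j].argmax(), (H, W))
        dist = np.linalg.norm((np.array([px, py]) - np.array([gx, gy])) / norm)
        visible += 1
        correct += dist < thr
    return correct / max(visible, 1)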
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    L = gluon.loss.L2Loss(weight=2.0)
    metric = HeatmapAccuracy()
    best_ap = 0
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()
        train_data_desc = tqdm(train_data, dynamic_ncols=True)
        for i, batch in enumerate(train_data_desc):
            data, label, weight, imgid = train_batch_fn(batch, ctx)
            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)
            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()
        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))
        if (epoch + 1) % 2 == 0:
            res = validate(val_data, val_dataset, net, context, opt)[0]
            logger.info(res)
            if res['AP'] > best_ap:
                best_ap = res['AP']
                net.save_parameters(f'{save_dir}/best-{round(best_ap, 3)}.params')
                # Re-point final.params at the new best checkpoint.
                if os.path.islink(f'{save_dir}/final.params'):
                    os.remove(f'{save_dir}/final.params')
                os.symlink(f'./best-{round(best_ap, 3)}.params', f'{save_dir}/final.params')
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))
    return net
def train(opt):
    batch_size = opt.batch_size
    num_joints = opt.num_joints
    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers
    model_name = opt.model
    kwargs = {'ctx': ctx, 'num_joints': num_joints,
              'pretrained': opt.use_pretrained,
              'pretrained_base': opt.use_pretrained_base,
              'pretrained_ctx': ctx}
    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)
    input_size = [int(i) for i in opt.input_size.split(',')]
    train_dataset, train_data, train_batch_fn = get_data_loader(
        opt, batch_size, num_workers, input_size)
    num_training_samples = len(train_dataset)
    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // batch_size
    lr_scheduler = LRSequential([
        LRScheduler('linear', base_lr=0, target_lr=opt.lr,
                    nepochs=opt.warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay, power=2)
    ])
    # optimizer = 'sgd'
    # optimizer_params = {'wd': opt.wd, 'momentum': 0.9, 'lr_scheduler': lr_scheduler}
    optimizer = 'adam'
    optimizer_params = {'wd': opt.wd, 'lr_scheduler': lr_scheduler}
    if opt.dtype != 'float32':
        optimizer_params['multi_precision'] = True
    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        if model_name.startswith('simple'):
            net.deconv_layers.initialize(ctx=ctx)
            net.final_layer.initialize(ctx=ctx)
        elif model_name.startswith('mobile'):
            net.upsampling.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()
    best_val_score = 1
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()
        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)
            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)
            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()
        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))
    return net
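The LRSequential schedule built in train(opt) above (linear warm-up followed by decay) can be probed outside training, assuming the GluonCV schedulers follow MXNet's callable LRScheduler protocol of mapping a global update count to a learning rate. The hyperparameter values below are made up for illustration, not the script's defaults:

from gluoncv.utils import LRScheduler, LRSequential

num_batches, warmup_epochs, num_epochs, base_lr = 100, 5, 140, 0.001
lr_scheduler = LRSequential([
    LRScheduler('linear', base_lr=0, target_lr=base_lr,
                nepochs=warmup_epochs, iters_per_epoch=num_batches),
    LRScheduler('poly', base_lr=base_lr, target_lr=0,
                nepochs=num_epochs - warmup_epochs,
                iters_per_epoch=num_batches, power=2)
])
# Sample the schedule at a few global update counts: it ramps up during
# warm-up, then decays polynomially toward zero.
for it in [0, 250, 500, 5000, num_epochs * num_batches - 1]:
    print(it, lr_scheduler(it))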