def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if opt.use_pretrained_base:
        # Only the newly added head needs initialization; the base is pretrained.
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            # train_batch_fn splits the batch across the devices in ctx.
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                # Compute the loss in float32 for stability, then cast back.
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))

    return net
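#############################################################################
# The loop above follows Gluon's standard data-parallel pattern: split the
# batch across devices, run the forward pass and loss under ``ag.record()``,
# then call ``ag.backward`` on the list of per-device losses before
# ``trainer.step``. Below is a minimal, self-contained sketch of that same
# cycle on dummy data; the toy network, shapes, and CPU-only context list
# are illustrative assumptions, not values from this script.

import mxnet as mx
from mxnet import autograd as ag, gluon, nd

toy_ctx = [mx.cpu()]                        # stand-in for a list of GPUs
toy_net = gluon.nn.Dense(4)
toy_net.initialize(ctx=toy_ctx)
toy_trainer = gluon.Trainer(toy_net.collect_params(), 'adam', {'learning_rate': 1e-3})
toy_loss_fn = gluon.loss.L2Loss()

x = nd.random.uniform(shape=(8, 16))        # hypothetical batch
y = nd.random.uniform(shape=(8, 4))
data = gluon.utils.split_and_load(x, ctx_list=toy_ctx, batch_axis=0)
label = gluon.utils.split_and_load(y, ctx_list=toy_ctx, batch_axis=0)

with ag.record():
    outputs = [toy_net(X) for X in data]
    losses = [toy_loss_fn(yhat, Y) for yhat, Y in zip(outputs, label)]
ag.backward(losses)                         # one backward over all device losses
toy_trainer.step(x.shape[0])                # gradients normalized by batch size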
def test(net, val_data, context):
    metric = HeatmapAccuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=[context], batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=[context], batch_axis=0)
        weight = gluon.utils.split_and_load(batch[2], ctx_list=[context], batch_axis=0)

        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()
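#############################################################################
# ``test`` is essentially ``HeatmapAccuracy`` driven over model outputs. The
# snippet below exercises the metric in isolation on synthetic heatmaps so
# its output format is easy to see; the batch shape (2 images, 17 joints,
# 64x48 maps) and the near-perfect predictions are illustrative assumptions,
# not values from this script.

from gluoncv.utils.metrics import HeatmapAccuracy
from mxnet import nd

demo_metric = HeatmapAccuracy()
demo_label = nd.random.uniform(shape=(2, 17, 64, 48))          # ground-truth heatmaps
demo_pred = demo_label + nd.random.normal(scale=0.01, shape=demo_label.shape)
demo_metric.update([demo_label], [demo_pred])                  # lists, as in test()
print(demo_metric.get())                                       # (name, score near 1.0)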
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss(weight=2.0)
    metric = HeatmapAccuracy()

    best_ap = 0

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        train_data_desc = tqdm(train_data, dynamic_ncols=True)
        for i, batch in enumerate(train_data_desc):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))

        if (epoch + 1) % 2 == 0:
            res = validate(val_data, val_dataset, net, context, opt)[0]
            logger.info(res)
            if res['AP'] > best_ap:
                # Track the best validation AP under a single consistent name.
                best_ap = res['AP']
                net.save_parameters(f'{save_dir}/best-{round(best_ap, 3)}.params')
                # Keep final.params pointing at the best checkpoint so far.
                if os.path.islink(f'{save_dir}/final.params'):
                    os.remove(f'{save_dir}/final.params')
                os.symlink(f'./best-{round(best_ap, 3)}.params',
                           f'{save_dir}/final.params')

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))

    return net
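#############################################################################
# The checkpoint logic above keeps ``final.params`` as a symlink to the best
# checkpoint so downstream code can always load one fixed path. A minimal,
# POSIX-only sketch of that pattern; the directory name and AP value are
# illustrative assumptions:

import os

demo_dir, demo_ap = '/tmp/pose-ckpt', 0.724
os.makedirs(demo_dir, exist_ok=True)
ckpt = 'best-%s.params' % round(demo_ap, 3)
open(os.path.join(demo_dir, ckpt), 'wb').close()   # stand-in for save_parameters
final = os.path.join(demo_dir, 'final.params')
if os.path.islink(final):
    os.remove(final)                               # drop the stale link first
os.symlink('./' + ckpt, final)                     # relative target survives moves
print(os.readlink(final))                          # -> ./best-0.724.params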
for i, batch in enumerate(train_data):
    if i > 0:
        break

    data = gluon.utils.split_and_load(batch[0], ctx_list=[context], batch_axis=0)
    label = gluon.utils.split_and_load(batch[1], ctx_list=[context], batch_axis=0)
    weight = gluon.utils.split_and_load(batch[2], ctx_list=[context], batch_axis=0)

    with ag.record():
        outputs = [net(X) for X in data]
        loss = [L(yhat, y, w) for yhat, y, w in zip(outputs, label, weight)]
    for l in loss:
        l.backward()
    trainer.step(batch_size)

    metric.update(label, outputs)
    break

#############################################################################
# Due to resource limitations, we only train the model for one batch in this
# tutorial.
#
# Please check out the full :download:`training script
# <../../../scripts/pose/simple_pose/train_simple_pose.py>` to reproduce our
# results.
#
# References
# ----------
#
# .. [1] Xiao, Bin, Haiping Wu, and Yichen Wei. \
#     "Simple baselines for human pose estimation and tracking." \
#     Proceedings of the European Conference on Computer Vision (ECCV). 2018.
def train(opt):
    batch_size = opt.batch_size
    num_joints = opt.num_joints

    num_gpus = opt.num_gpus
    batch_size *= max(1, num_gpus)
    ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    model_name = opt.model

    kwargs = {'ctx': ctx,
              'num_joints': num_joints,
              'pretrained': opt.use_pretrained,
              'pretrained_base': opt.use_pretrained_base,
              'pretrained_ctx': ctx}

    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)

    input_size = [int(i) for i in opt.input_size.split(',')]
    train_dataset, train_data, train_batch_fn = get_data_loader(opt, batch_size,
                                                                num_workers, input_size)

    num_training_samples = len(train_dataset)
    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // batch_size

    # Linear warmup for the first warmup_epochs, then the chosen decay schedule.
    lr_scheduler = LRSequential([
        LRScheduler('linear', base_lr=0, target_lr=opt.lr,
                    nepochs=opt.warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay, power=2)
    ])

    # optimizer = 'sgd'
    # optimizer_params = {'wd': opt.wd, 'momentum': 0.9, 'lr_scheduler': lr_scheduler}
    optimizer = 'adam'
    optimizer_params = {'wd': opt.wd, 'lr_scheduler': lr_scheduler}
    if opt.dtype != 'float32':
        # Keep a float32 master copy of the weights when training in low precision.
        optimizer_params['multi_precision'] = True

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0

    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if opt.use_pretrained_base:
        if model_name.startswith('simple'):
            net.deconv_layers.initialize(ctx=ctx)
            net.final_layer.initialize(ctx=ctx)
        elif model_name.startswith('mobile'):
            net.upsampling.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f' % (
                    epoch, i, batch_size * opt.log_interval / (time.time() - btic),
                    loss_val / (i + 1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' % (
            epoch, int(i * batch_size / time_elapsed), int(time_elapsed), loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, epoch))
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' % (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' % (save_dir, model_name, opt.num_epochs - 1))

    return net
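#############################################################################
# ``train(opt)`` expects an options object exposing the attributes read
# above. A sketch of a minimal invocation via ``argparse.Namespace``; every
# value below is an illustrative assumption, not a recommended
# configuration:

from argparse import Namespace

opt = Namespace(
    batch_size=32, num_joints=17, num_gpus=0, num_workers=4,
    model='simple_pose_resnet18_v1b', use_pretrained=False,
    use_pretrained_base=True, dtype='float32', input_size='256,192',
    lr=1e-3, lr_mode='step', lr_decay=0.1, lr_decay_period=0,
    lr_decay_epoch='90,120', warmup_epochs=0, num_epochs=140, wd=0.0,
    mode='hybrid', log_interval=100, save_frequency=10, save_dir='params')
# net = train(opt)   # also requires the module-level logger, data loader, etc.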