Example #1
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
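    # With a pretrained backbone, only the newly added head (the deconvolution
    # and final layers) needs initialization; otherwise the whole network is
    # initialized with MSRA-PReLU.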
    if opt.use_pretrained_base:
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
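                # Forward in the training dtype (e.g. float16); compute the
                # weighted L2 loss in float32 for numerical stability, then
                # cast it back to the network's dtype.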
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
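            # One backward call per device slice; trainer.step rescales the
            # accumulated gradients by the global batch size.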
            for l in loss:
                l.backward()
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i+1)%opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'%(
                             epoch, i, batch_size*opt.log_interval/(time.time()-btic),
                             loss_val / (i+1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n'%(
                     epoch, int(i*batch_size / time_elapsed), int(time_elapsed), loss_val / (i+1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, epoch))

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
        trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))

    return net
Example #2
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
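            # mx.autograd.backward accepts a list of heads (one loss per
            # device) and backpropagates them in a single call.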
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i+1)%opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'%(
                             epoch, i, batch_size*opt.log_interval/(time.time()-btic),
                             loss_val / (i+1), trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n'%(
                     epoch, int(i*batch_size / time_elapsed), int(time_elapsed), loss_val / (i+1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, epoch))

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
        trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))

    return net
Example #3
def test(net, val_data, context):
    metric = HeatmapAccuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=[context], batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=[context], batch_axis=0)
        weight = gluon.utils.split_and_load(batch[2], ctx_list=[context], batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()
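
A minimal sketch of how these snippets are typically driven; it is not part of the source, and it assumes the module-level globals the functions rely on (net, opt, train_data, val_data, logger, ...) have been set up by the surrounding script.

# Hypothetical driver; every name here is assumed to exist at module scope,
# as in the snippets above.
if __name__ == '__main__':
    context = mx.gpu(0) if opt.num_gpus > 0 else mx.cpu()
    net = train(context)
    metric_name, metric_score = test(net, val_data, context)
    logger.info('validation %s=%.3f' % (metric_name, metric_score))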
Example #4
def train(ctx):
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss(weight=2.0)
    metric = HeatmapAccuracy()

    best_ap = 0

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        train_data_desc = tqdm(train_data, dynamic_ncols=True)
        for i, batch in enumerate(train_data_desc):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [
                    nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                    for yhat, y, w in zip(outputs, label, weight)
                ]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'
                    % (epoch, i, batch_size * opt.log_interval /
                       (time.time() - btic), loss_val / (i + 1),
                       trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info(
            'Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' %
            (epoch, int(i * batch_size / time_elapsed), int(time_elapsed),
             loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' %
                                (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' %
                                (save_dir, model_name, epoch))
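        # Every other epoch, run validation and keep the best checkpoint by
        # AP, with a final.params symlink pointing at it.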
        if (epoch + 1) % 2 == 0:
            res = validate(val_data, val_dataset, net, context, opt)[0]
            logger.info(res)
            if res['AP'] > best_ap:
                best_ap = res['AP']
                net.save_parameters(
                    f'{save_dir}/best-{round(best_ap, 3)}.params')
                if os.path.islink(f'{save_dir}/final.params'):
                    os.remove(f'{save_dir}/final.params')
                os.symlink(f'./best-{round(best_ap, 3)}.params',
                           f'{save_dir}/final.params')

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' %
                            (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' %
                            (save_dir, model_name, opt.num_epochs - 1))

    return net
Example #5
def train(opt):

    batch_size = opt.batch_size
    num_joints = opt.num_joints

    num_gpus = opt.num_gpus
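    # Scale up to the global batch size across all devices; fall back to the
    # CPU when no GPUs are requested.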
    batch_size *= max(1, num_gpus)
    ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    num_workers = opt.num_workers

    model_name = opt.model

    kwargs = {
        'ctx': ctx,
        'num_joints': num_joints,
        'pretrained': opt.use_pretrained,
        'pretrained_base': opt.use_pretrained_base,
        'pretrained_ctx': ctx
    }

    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)

    input_size = [int(i) for i in opt.input_size.split(',')]
    train_dataset, train_data, train_batch_fn = get_data_loader(
        opt, batch_size, num_workers, input_size)

    num_training_samples = len(train_dataset)
    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(
            range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - opt.warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // batch_size
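    # Piecewise schedule: linear warmup from 0 to opt.lr, then decay towards
    # 0 according to opt.lr_mode over the remaining epochs.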
    lr_scheduler = LRSequential([
        LRScheduler('linear',
                    base_lr=0,
                    target_lr=opt.lr,
                    nepochs=opt.warmup_epochs,
                    iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode,
                    base_lr=opt.lr,
                    target_lr=0,
                    nepochs=opt.num_epochs - opt.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay,
                    power=2)
    ])

    # optimizer = 'sgd'
    # optimizer_params = {'wd': opt.wd, 'momentum': 0.9, 'lr_scheduler': lr_scheduler}
    optimizer = 'adam'
    optimizer_params = {'wd': opt.wd, 'lr_scheduler': lr_scheduler}
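    # Keep float32 master weights in the optimizer when training in a
    # reduced-precision dtype such as float16.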
    if opt.dtype != 'float32':
        optimizer_params['multi_precision'] = True

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0

    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if opt.use_pretrained_base:
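        # Pretrained backbone: only the architecture-specific head layers
        # need explicit initialization.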
        if model_name.startswith('simple'):
            net.deconv_layers.initialize(ctx=ctx)
            net.final_layer.initialize(ctx=ctx)
        elif model_name.startswith('mobile'):
            net.upsampling.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()

    best_val_score = 1

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)

    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()

        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)

            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                loss = [
                    nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                    for yhat, y, w in zip(outputs, label, weight)
                ]
            ag.backward(loss)
            trainer.step(batch_size)

            metric.update(label, outputs)

            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i + 1) % opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'
                    % (epoch, i, batch_size * opt.log_interval /
                       (time.time() - btic), loss_val / (i + 1),
                       trainer.learning_rate, metric_name, metric_score))
                btic = time.time()

        time_elapsed = time.time() - tic
        logger.info(
            'Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n' %
            (epoch, int(i * batch_size / time_elapsed), int(time_elapsed),
             loss_val / (i + 1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params' %
                                (save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states' %
                                (save_dir, model_name, epoch))

    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params' %
                            (save_dir, model_name, opt.num_epochs - 1))
        trainer.save_states('%s/%s-%d.states' %
                            (save_dir, model_name, opt.num_epochs - 1))

    return net
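
A hedged sketch of invoking this self-contained variant; the field names below mirror the attributes train(opt) reads above, the values are illustrative assumptions, and helpers such as get_model, get_data_loader, LRSequential, and logger must be provided by the surrounding script.

# Hypothetical options object; names follow the attributes read by
# train(opt) above, values are examples only.
import argparse

opt = argparse.Namespace(
    model='simple_pose_resnet18_v1b', num_joints=17,
    batch_size=32, num_gpus=1, num_workers=4, dtype='float32',
    use_pretrained=False, use_pretrained_base=True,
    input_size='256,192', num_epochs=140, warmup_epochs=0,
    lr=0.001, lr_mode='step', lr_decay=0.1, lr_decay_period=0,
    lr_decay_epoch='90,120', wd=0.0, mode='hybrid',
    log_interval=20, save_frequency=10, save_dir='params')
net = train(opt)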