# Example 1 (score: 0)
def load_distillation_model(model, pretrained_model, load_static_weights):
    """
    Load pretrained weights for both sub-models of a distillation model.

    Args:
        model: distillation model exposing ``teacher``/``student`` either
            directly or via ``_layers`` (DataParallel wrapper).
        pretrained_model (list): two paths, ``[teacher_path, student_path]``.
        load_static_weights (list): two bools, whether each path stores
            static-graph weights.
    """
    logger.info("In distillation mode, teacher model will be "
                "loaded firstly before student model.")
    assert len(pretrained_model) == 2, \
        "pretrained_model length should be 2 but got {}".format(
            len(pretrained_model))
    assert len(load_static_weights) == 2, \
        "load_static_weights length should be 2 but got {}".format(
            len(load_static_weights))
    # When wrapped (e.g. by DataParallel) the real model lives in `_layers`.
    teacher = model.teacher if hasattr(model,
                                       "teacher") else model._layers.teacher
    student = model.student if hasattr(model,
                                       "student") else model._layers.student
    load_dygraph_pretrain(teacher,
                          path=pretrained_model[0],
                          load_static_weights=load_static_weights[0])
    # Fix: log the teacher's own path; the original printed the whole list.
    logger.info(
        logger.coloring(
            "Finish initing teacher model from {}".format(
                pretrained_model[0]), "HEADER"))
    load_dygraph_pretrain(student,
                          path=pretrained_model[1],
                          load_static_weights=load_static_weights[1])
    # Fix: log the student's own path; the original printed the whole list.
    logger.info(
        logger.coloring(
            "Finish initing student model from {}".format(
                pretrained_model[1]), "HEADER"))
# Example 2 (score: 0)
def init_model(config, net, optimizer=None):
    """
    Initialize the dygraph network, either from a checkpoint (to resume
    training, requires an optimizer) or from a pretrained model (to finetune).
    """
    checkpoints = config.get('checkpoints')
    if checkpoints and optimizer is not None:
        params_path = checkpoints + ".pdparams"
        opt_path = checkpoints + ".pdopt"
        assert os.path.exists(params_path), \
            "Given dir {}.pdparams not exist.".format(checkpoints)
        assert os.path.exists(opt_path), \
            "Given dir {}.pdopt not exist.".format(checkpoints)
        net.set_dict(paddle.load(params_path))
        optimizer.set_state_dict(paddle.load(opt_path))
        logger.info("Finish load checkpoints from {}".format(checkpoints))
        return

    pretrained_model = config.get('pretrained_model')
    load_static_weights = config.get('load_static_weights', False)
    use_distillation = config.get('use_distillation', False)
    if not pretrained_model:
        return

    if use_distillation:
        # Distillation needs teacher and student loaded separately.
        load_distillation_model(net, pretrained_model, load_static_weights)
        return

    # common load
    load_dygraph_pretrain(net,
                          path=pretrained_model,
                          load_static_weights=load_static_weights)
    logger.info(
        logger.coloring(
            "Finish load pretrained model from {}".format(
                pretrained_model), "HEADER"))
# Example 3 (score: 0)
def print_dict(d, delimiter=0):
    """
    Recursively log a dict, indenting according to the nesting depth of keys.

    Args:
        d (dict): dict to visualize.
        delimiter (int): number of leading spaces for this nesting level.
    """
    placeholder = "-" * 60
    for k, v in sorted(d.items()):
        if isinstance(v, dict):
            logger.info("{}{} : ".format(delimiter * " ",
                                         logger.coloring(str(k), "HEADER")))
            print_dict(v, delimiter + 4)
        elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
            logger.info("{}{} : ".format(delimiter * " ",
                                         logger.coloring(str(k), "HEADER")))
            for value in v:
                print_dict(value, delimiter + 4)
        else:
            logger.info("{}{} : {}".format(delimiter * " ",
                                           logger.coloring(str(k), "HEADER"),
                                           logger.coloring(v, "OKGREEN")))

        # Draw a separator after all-uppercase section keys; guard with
        # isinstance so non-string keys do not raise AttributeError.
        if isinstance(k, str) and k.isupper():
            logger.info(placeholder)
# Example 4 (score: 0)
def save_model(program, model_path, epoch_id, prefix='ppcls'):
    """
    Save a static-graph program's parameters under
    ``model_path/<epoch_id>/<prefix>``.
    """
    output_dir = os.path.join(model_path, str(epoch_id))
    _mkdir_if_not_exist(output_dir)
    paddle.static.save(program, os.path.join(output_dir, prefix))
    logger.info(
        logger.coloring("Already save model in {}".format(output_dir),
                        "HEADER"))
# Example 5 (score: 0)
def init_model(config, program, exe):
    """
    Initialize a static-graph model from a checkpoint or pretrained weights.
    """
    checkpoints = config.get('checkpoints')
    if checkpoints:
        paddle.static.load(program, checkpoints, exe)
        logger.info(
            logger.coloring("Finish initing model from {}".format(checkpoints),
                            "HEADER"))
        return

    pretrained_model = config.get('pretrained_model')
    if not pretrained_model:
        return

    # Normalize to a list so several pretrained sources can be layered.
    pretrain_list = (pretrained_model if isinstance(pretrained_model, list)
                     else [pretrained_model])
    for pretrain in pretrain_list:
        load_params(exe, program, pretrain)
    logger.info(
        logger.coloring(
            "Finish initing model from {}".format(pretrain_list), "HEADER"))
# Example 6 (score: 0)
def run(dataloader, exe, program, fetchs, epoch=0, mode='train', vdl_writer=None):
    """
    Feed data to the model and fetch the measures and loss.

    Args:
        dataloader(fluid dataloader): yields feed-ready batches when called.
        exe(fluid.Executor): executor that runs the program.
        program: program (or compiled program) to execute.
        fetchs(dict): maps metric name -> (fetch variable, AverageMeter).
        epoch(int): epoch of training or validation.
        mode(str): 'train', 'valid' or 'eval'; affects logging only.
        vdl_writer: optional VisualDL writer for scalar logging.

    Returns:
        Average top-1 accuracy when mode == 'valid', otherwise None.
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [f[1] for f in fetchs.values()]
    # Reset running averages at the start of every epoch.
    for m in metric_list:
        m.reset()
    batch_time = AverageMeter('elapse', '.3f')
    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        metrics = exe.run(program=program, feed=batch, fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        tic = time.time()
        # metrics is parallel to metric_list; weight each update by batch size.
        for i, m in enumerate(metrics):
            metric_list[i].update(m[0], len(batch[0]))
        fetchs_str = ''.join([str(m.value) + ' '
                              for m in metric_list] + [batch_time.value]) + 's'
        if vdl_writer:
            # total_step is a module-level counter shared across epochs.
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'eval':
            # NOTE(review): fetchs_str already ends with 's' and the format
            # string appends another — confirm the trailing 'ss' is intended.
            logger.info("{:s} step:{:<4d} {:s}s".format(mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)

            # Highlight the epoch header only on the first step of the epoch.
            logger.info("{:s} {:s} {:s}".format(
                logger.coloring(epoch_str, "HEADER")
                if idx == 0 else epoch_str,
                logger.coloring(step_str, "PURPLE"),
                logger.coloring(fetchs_str, 'OKGREEN')))

    # Epoch-level summary built from the accumulated means/totals.
    end_str = ''.join([str(m.mean) + ' '
                       for m in metric_list] + [batch_time.total]) + 's'
    if mode == 'eval':
        logger.info("END {:s} {:s}s".format(mode, end_str))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)

        logger.info("{:s} {:s} {:s}".format(
            logger.coloring(end_epoch_str, "RED"),
            logger.coloring(mode, "PURPLE"),
            logger.coloring(end_str, "OKGREEN")))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
# Example 7 (score: 0)
def load_params(exe, prog, path, ignore_params=None):
    """
    Load pretrained parameters from `path` into `prog`.

    Args:
        exe (fluid.Executor): The fluid.Executor object.
        prog (fluid.Program): load weight to which Program object.
        path (string): URL string or local model path.
        ignore_params (list): regex patterns of variables to skip when
            finetuning; can be specified by
            finetune_exclude_pretrained_params, see
            docs/advanced_tutorials/TRANSFER_LEARNING.md
    """
    if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
        raise ValueError("Model pretrain path {} does not "
                         "exists.".format(path))

    logger.info(
        logger.coloring('Loading parameters from {}...'.format(path),
                        'HEADER'))

    state = _load_state(path)

    # Collect parameters whose shape in the program disagrees with the
    # shape stored in the pretrained weights; they must not be loaded.
    shape_by_name = {}
    for block in prog.blocks:
        for param in block.all_parameters():
            shape_by_name[param.name] = param.shape
    ignore_set = {
        name
        for name, shape in shape_by_name.items()
        if name in state and shape != state[name].shape
    }

    # Also exclude any variable matching a user-supplied finetune pattern.
    if ignore_params:
        for var_name in (var.name for var in prog.list_vars()):
            if any(re.match(pattern, var_name) for pattern in ignore_params):
                ignore_set.add(var_name)

    for name in ignore_set:
        if name in state:
            logger.warning(
                'variable {} is already excluded automatically'.format(name))
            del state[name]

    paddle.static.set_program_state(prog, state)
# Example 8 (score: 0)
def save_model(net, optimizer, model_path, epoch_id, prefix='ppcls'):
    """
    Save the dygraph network and optimizer state under
    ``model_path/<epoch_id>``; only rank 0 writes in distributed training.
    """
    if paddle.distributed.get_rank() != 0:
        return
    save_dir = os.path.join(model_path, str(epoch_id))
    _mkdir_if_not_exist(save_dir)
    save_prefix = os.path.join(save_dir, prefix)

    paddle.save(net.state_dict(), save_prefix + ".pdparams")
    paddle.save(optimizer.state_dict(), save_prefix + ".pdopt")
    logger.info(
        logger.coloring("Already save model in {}".format(save_dir),
                        "HEADER"))
# Example 9 (score: 0)
def main(args):
    """Static-graph single-process training entry point (optional EMA)."""
    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0  # best top1 acc record

    # program.build returns an extra EMA object when use_ema is enabled.
    if not config.get('use_ema'):
        train_dataloader, train_fetchs = program.build(config,
                                                       train_prog,
                                                       startup_prog,
                                                       is_train=True,
                                                       is_distributed=False)
    else:
        train_dataloader, train_fetchs, ema = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(config,
                                                       valid_prog,
                                                       startup_prog,
                                                       is_train=False,
                                                       is_distributed=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(places[0])
    # Parameter initialization
    exe.run(startup_prog)

    # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, places)

    compiled_train_prog = program.compile(config, train_prog,
                                          train_fetchs['loss'][0].name)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, places)
        # Share build strategy/memory with the compiled train program.
        compiled_valid_prog = program.compile(config,
                                              valid_prog,
                                              share_prog=compiled_train_prog)

    if args.vdl_dir:
        from visualdl import LogWriter
        vdl_writer = LogWriter(args.vdl_dir)
    else:
        vdl_writer = None

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', vdl_writer)

        # 2. validate with validate dataset
        if config.validate and epoch_id % config.valid_interval == 0:
            if config.get('use_ema'):
                logger.info(logger.coloring("EMA validate start..."))
                with ema.apply(exe):
                    top1_acc = program.run(valid_dataloader, exe,
                                           compiled_valid_prog, valid_fetchs,
                                           epoch_id, 'valid')
                logger.info(logger.coloring("EMA validate over!"))

            # NOTE(review): this run overwrites the EMA top1_acc above, so
            # best-model tracking uses the non-EMA result — confirm intended.
            top1_acc = program.run(valid_dataloader, exe, compiled_valid_prog,
                                   valid_fetchs, epoch_id, 'valid')
            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, epoch_id)
                logger.info("{:s}".format(logger.coloring(message, "RED")))
                if epoch_id % config.save_interval == 0:

                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(train_prog, model_path,
                               "best_model_in_epoch_" + str(epoch_id))

        # 3. save the persistable model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            save_model(train_prog, model_path, epoch_id)
# Example 10 (score: 0)
def main(args):
    """Dygraph training entry point (optionally data-parallel)."""
    # Fix the random seed for reproducibility.
    paddle.seed(12345)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    place = paddle.set_device('gpu' if use_gpu else 'cpu')

    # Data parallelism is implied by launching more than one trainer.
    trainer_num = paddle.distributed.get_world_size()
    use_data_parallel = trainer_num != 1
    config["use_data_parallel"] = use_data_parallel

    if config["use_data_parallel"]:
        paddle.distributed.init_parallel_env()

    net = program.create_model(config.ARCHITECTURE, config.classes_num)
    optimizer, lr_scheduler = program.create_optimizer(
        config, parameter_list=net.parameters())

    if config["use_data_parallel"]:
        net = paddle.DataParallel(net)

    # load model from checkpoint or pretrained model
    init_model(config, net, optimizer)

    train_dataloader = Reader(config, 'train', places=place)()

    if config.validate:
        valid_dataloader = Reader(config, 'valid', places=place)()

    # Resume from last_epoch when configured (-1 means start from scratch).
    last_epoch_id = config.get("last_epoch", -1)
    best_top1_acc = 0.0  # best top1 acc record
    best_top1_epoch = last_epoch_id
    for epoch_id in range(last_epoch_id + 1, config.epochs):
        net.train()
        # 1. train with train dataset
        program.run(train_dataloader, config, net, optimizer, lr_scheduler,
                    epoch_id, 'train')

        # 2. validate with validate dataset
        if config.validate and epoch_id % config.valid_interval == 0:
            net.eval()
            with paddle.no_grad():
                top1_acc = program.run(valid_dataloader, config, net, None,
                                       None, epoch_id, 'valid')
            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                best_top1_epoch = epoch_id
                if epoch_id % config.save_interval == 0:
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(net, optimizer, model_path, "best_model")
            message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                best_top1_acc, best_top1_epoch)
            logger.info("{:s}".format(logger.coloring(message, "RED")))

        # 3. save the persistable model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            save_model(net, optimizer, model_path, epoch_id)
# Example 11 (score: 0)
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None):
    """
    Feed data to the model and fetch the measures and loss.

    Args:
        dataloader(paddle io dataloader): callable dataloader, or a DALI
            iterator when config['use_dali'] is set.
        exe(paddle.static.Executor): executor that runs the program.
        program: program (or compiled program) to execute.
        feeds(dict): maps names to static-graph feed variables.
        fetchs(dict): maps metric name -> (fetch variable, AverageMeter).
        epoch(int): epoch of training or validation.
        mode(str): 'train' or 'valid'; affects lr stepping and logging.
        config: global config (provides topk, use_mix, use_dali, ...).
        vdl_writer: optional VisualDL writer for scalar logging.
        lr_scheduler: scheduler stepped per batch in train mode.

    Returns:
        Average top-1 accuracy when mode == 'valid', otherwise None.
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [
        ("lr", AverageMeter('lr', 'f', postfix=",", need_avg=False)),
        ("batch_time", AverageMeter('batch_cost', '.5f', postfix=" s,")),
        ("reader_time", AverageMeter('reader_cost', '.5f', postfix=" s,")),
    ]
    topk_name = 'top{}'.format(config.topk)
    metric_list.insert(0, ("loss", fetchs["loss"][1]))
    use_mix = config.get("use_mix", False) and mode == "train"
    # Accuracy metrics are not reported when training with mixup.
    if not use_mix:
        metric_list.insert(0, (topk_name, fetchs[topk_name][1]))
        metric_list.insert(0, ("top1", fetchs["top1"][1]))

    metric_list = OrderedDict(metric_list)

    for m in metric_list.values():
        m.reset()

    use_dali = config.get('use_dali', False)
    # A DALI pipeline is already an iterator; a paddle dataloader is called.
    dataloader = dataloader if use_dali else dataloader()
    tic = time.time()

    idx = 0
    batch_size = None
    while True:
        # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG
        try:
            batch = next(dataloader)
        except StopIteration:
            break
        except RuntimeError:
            logger.warning(
                "Except RuntimeError when reading data from dataloader, try to read once again..."
            )
            continue
        idx += 1
        # ignore the warmup iters
        if idx == 5:
            metric_list["batch_time"].reset()
            metric_list["reader_time"].reset()

        metric_list['reader_time'].update(time.time() - tic)

        if use_dali:
            # NOTE(review): .shape is invoked as a method here — confirm the
            # batch tensors expose a callable shape (vs a .shape attribute).
            batch_size = batch[0]["feed_image"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            # Pair each declared feed variable with its slot in the batch.
            feed_dict = {
                key.name: batch[idx]
                for idx, key in enumerate(feeds.values())
            }
        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)

        # fetchs and metrics are parallel; weight updates by batch size.
        for name, m in zip(fetchs.keys(), metrics):
            metric_list[name].update(np.mean(m), batch_size)
        metric_list["batch_time"].update(time.time() - tic)
        if mode == "train":
            metric_list['lr'].update(lr_scheduler.get_lr())

        fetchs_str = ' '.join([
            str(metric_list[key].mean)
            if "time" in key else str(metric_list[key].value)
            for key in metric_list
        ])
        ips_info = " ips: {:.5f} images/sec.".format(
            batch_size / metric_list["batch_time"].avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            if lr_scheduler.update_specified:
                # Step only every update_step_interval batches once past
                # update_start_step (computed on a global batch counter).
                curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                update = max(
                    0, curr_global_counter - lr_scheduler.update_start_step
                ) % lr_scheduler.update_step_interval == 0
                if update:
                    lr_scheduler.step()
            else:
                lr_scheduler.step()

        if vdl_writer:
            # total_step is a module-level counter shared across epochs.
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'valid':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(
                    mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)

            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} {:s} {:s}".format(
                    logger.coloring(epoch_str, "HEADER")
                    if idx == 0 else epoch_str,
                    logger.coloring(step_str, "PURPLE"),
                    logger.coloring(fetchs_str, 'OKGREEN')))

        tic = time.time()

    # Epoch-level summary built from the accumulated means/totals.
    end_str = ' '.join([str(m.mean) for m in metric_list.values()] +
                       [metric_list["batch_time"].total])
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * metric_list["batch_time"].count /
        metric_list["batch_time"].sum)
    if mode == 'valid':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    # DALI iterators must be reset before the next epoch.
    if use_dali:
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
# Example 12 (score: 0)
def main(args):
    """
    Fleet (collective) static-graph training entry point.

    Builds train/valid programs, restores weights, then alternates training,
    validation and checkpointing; only trainer 0 validates and saves.
    """
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    # assign the place
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0  # best top1 acc record

    # program.build returns an extra EMA object when use_ema is enabled.
    if not config.get('use_ema'):
        train_dataloader, train_fetchs = program.build(config,
                                                       train_prog,
                                                       startup_prog,
                                                       is_train=True)
    else:
        train_dataloader, train_fetchs, ema = program.build(config,
                                                            train_prog,
                                                            startup_prog,
                                                            is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(config,
                                                       valid_prog,
                                                       startup_prog,
                                                       is_train=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)

    # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, place)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, place)
        compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program
    vdl_writer = LogWriter(args.vdl_dir) if args.vdl_dir else None

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', vdl_writer)
        if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                if config.get('use_ema'):
                    logger.info(logger.coloring("EMA validate start..."))
                    # BUG FIX: the original used `train_fetchs('ema')`, which
                    # calls a dict and raises TypeError. Use the EMA object
                    # returned by program.build when use_ema is enabled.
                    with ema.apply(exe):
                        top1_acc = program.run(valid_dataloader, exe,
                                               compiled_valid_prog,
                                               valid_fetchs, epoch_id, 'valid')
                    logger.info(logger.coloring("EMA validate over!"))

                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_fetchs,
                                       epoch_id, 'valid')
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:

                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path,
                                   "best_model_in_epoch_" + str(epoch_id))

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)
# Example 13 (score: 0)
def main(args):
    """
    Fleet (collective) static-graph training entry point with optional
    FP16 flags and DALI data loading.
    """
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    config = get_config(args.config, overrides=args.override, show=True)
    use_fp16 = config.get('use_fp16', False)
    if use_fp16:
        # cuDNN/workspace flags commonly set for mixed-precision training.
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
    # assign the place
    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id)

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    best_top1_acc = 0.0  # best top1 acc record

    # program.build returns an extra EMA object when use_ema is enabled.
    if not config.get('use_ema'):
        train_dataloader, train_fetchs = program.build(
            config, train_prog, startup_prog, is_train=True)
    else:
        train_dataloader, train_fetchs, ema = program.build(
            config, train_prog, startup_prog, is_train=True)

    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs = program.build(
            config, valid_prog, startup_prog, is_train=False)
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = fluid.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)

    # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
    init_model(config, train_prog, exe)
    if not config.get('use_dali', False):
        train_reader = Reader(config, 'train')()
        train_dataloader.set_sample_list_generator(train_reader, place)
        if config.validate:
            valid_reader = Reader(config, 'valid')()
            valid_dataloader.set_sample_list_generator(valid_reader, place)
            compiled_valid_prog = program.compile(config, valid_prog)

    else:
        import dali
        train_dataloader = dali.train(config)
        # NOTE(review): this outer condition is truthy only for trainers with
        # a NON-zero PADDLE_TRAINER_ID, yet the inner check requires ID == 0,
        # so valid_dataloader is never built on this path — confirm intended.
        if config.validate and int(os.getenv("PADDLE_TRAINER_ID", 0)):
            if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
                valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)

    compiled_train_prog = fleet.main_program

    vdl_writer = None
    if args.vdl_dir:
        if version_info.major == 2:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )
        else:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', config, vdl_writer)
        # Only trainer 0 validates and saves checkpoints.
        if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                if config.get('use_ema'):
                    logger.info(logger.coloring("EMA validate start..."))
                    with ema.apply(exe):
                        top1_acc = program.run(
                            valid_dataloader, exe, compiled_valid_prog,
                            valid_fetchs, epoch_id, 'valid', config)
                    logger.info(logger.coloring("EMA validate over!"))

                # NOTE(review): overwrites the EMA top1_acc above, so
                # best-model tracking uses the non-EMA result.
                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_fetchs,
                                       epoch_id, 'valid', config)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:

                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path, "best_model")

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)
# Example 14 (score: 0)
def main(args):
    """Static-graph training entry point.

    Builds the train (and optional validation) programs, initializes
    parameters, creates the data loaders (plain Reader or DALI) and runs
    the train / validate / save loop for ``config.epochs`` epochs.

    Args:
        args: parsed command line arguments; ``args.config``,
            ``args.override`` and ``args.vdl_dir`` are used here.
    """
    config = get_config(args.config, overrides=args.override, show=True)
    if config.get("is_distributed", True):
        fleet.init(is_collective=True)
    # assign the place
    use_gpu = config.get("use_gpu", True)
    # amp related config
    use_amp = config.get('use_amp', False)
    use_pure_fp16 = config.get('use_pure_fp16', False)
    if use_amp or use_pure_fp16:
        # cuDNN / workspace flags set before program build for mixed precision
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 4000,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)
    use_xpu = config.get("use_xpu", False)
    # GPU and XPU are mutually exclusive in static mode
    assert (
        use_gpu and use_xpu
    ) is not True, "gpu and xpu can not be true in the same time in static mode!"

    if use_gpu:
        place = paddle.set_device('gpu')
    elif use_xpu:
        place = paddle.set_device('xpu')
    else:
        place = paddle.set_device('cpu')

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = paddle.static.Program()
    train_prog = paddle.static.Program()

    best_top1_acc = 0.0  # best top1 acc record

    train_fetchs, lr_scheduler, train_feeds = program.build(
        config,
        train_prog,
        startup_prog,
        is_train=True,
        is_distributed=config.get("is_distributed", True))

    if config.validate:
        valid_prog = paddle.static.Program()
        valid_fetchs, _, valid_feeds = program.build(config,
                                                     valid_prog,
                                                     startup_prog,
                                                     is_train=False,
                                                     is_distributed=config.get(
                                                         "is_distributed",
                                                         True))
        # clone to prune some content which is irrelevant in valid_prog
        valid_prog = valid_prog.clone(for_test=True)

    # create the "Executor" with the statement of which place
    exe = paddle.static.Executor(place)
    # Parameter initialization
    exe.run(startup_prog)
    if config.get("use_pure_fp16", False):
        # pure-fp16 mode: cast the freshly initialized parameters down to fp16
        cast_parameters_to_fp16(place, train_prog, fluid.global_scope())
    # load pretrained models or checkpoints
    init_model(config, train_prog, exe)

    if not config.get("is_distributed", True):
        # single-card run: compile for graph-level optimization;
        # distributed runs execute the raw program (fleet handles it)
        compiled_train_prog = program.compile(
            config, train_prog, loss_name=train_fetchs["loss"][0].name)
    else:
        compiled_train_prog = train_prog

    if not config.get('use_dali', False):
        train_dataloader = Reader(config, 'train', places=place)()
        # only rank 0 builds the validation pipeline (it alone validates/saves)
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = Reader(config, 'valid', places=place)()
            if use_xpu:
                compiled_valid_prog = valid_prog
            else:
                compiled_valid_prog = program.compile(config, valid_prog)
    else:
        assert use_gpu is True, "DALI only support gpu, please set use_gpu to True!"
        import dali
        train_dataloader = dali.train(config)
        if config.validate and paddle.distributed.get_rank() == 0:
            valid_dataloader = dali.val(config)
            compiled_valid_prog = program.compile(config, valid_prog)

    # VisualDL writer is optional and python3-only
    vdl_writer = None
    if args.vdl_dir:
        if version_info.major == 2:
            logger.info(
                "visualdl is just supported for python3, so it is disabled in python2..."
            )
        else:
            from visualdl import LogWriter
            vdl_writer = LogWriter(args.vdl_dir)

    for epoch_id in range(config.epochs):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                    train_fetchs, epoch_id, 'train', config, vdl_writer,
                    lr_scheduler)
        if paddle.distributed.get_rank() == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                top1_acc = program.run(valid_dataloader, exe,
                                       compiled_valid_prog, valid_feeds,
                                       valid_fetchs, epoch_id, 'valid', config)
                if top1_acc > best_top1_acc:
                    best_top1_acc = top1_acc
                    message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                        best_top1_acc, epoch_id)
                    logger.info("{:s}".format(logger.coloring(message, "RED")))
                    if epoch_id % config.save_interval == 0:

                        model_path = os.path.join(config.model_save_dir,
                                                  config.ARCHITECTURE["name"])
                        save_model(train_prog, model_path, "best_model")

            # 3. save the persistable model
            if epoch_id % config.save_interval == 0:
                model_path = os.path.join(config.model_save_dir,
                                          config.ARCHITECTURE["name"])
                save_model(train_prog, model_path, epoch_id)
# Exemplo n.º 15
# 0
def run(dataloader,
        exe,
        program,
        feeds,
        fetchs,
        epoch=0,
        mode='train',
        config=None,
        vdl_writer=None,
        lr_scheduler=None):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle io dataloader): a DALI iterator when
            ``config.use_dali`` is set, otherwise a callable that
            returns the batch iterator
        exe: executor that runs the program
        program: (compiled) static program to execute
        feeds(dict): feed variables; their names key the feed dict
        fetchs(dict): dict of (variable, AverageMeter) pairs holding the
            measures and the loss
        epoch(int): epoch of training or validation
        mode(str): 'train' or 'valid'; log only, except that 'train'
            also tracks the learning rate
        config: global configuration
        vdl_writer: optional VisualDL writer for per-step loss scalars
        lr_scheduler: optional scheduler, stepped per batch

    Returns:
        top1 average accuracy when mode == 'valid', otherwise None
    """
    fetch_list = [f[0] for f in fetchs.values()]
    metric_list = [f[1] for f in fetchs.values()]
    if mode == "train":
        # track the current learning rate alongside the fetched metrics
        metric_list.append(AverageMeter('lr', 'f', need_avg=False))
    for m in metric_list:
        m.reset()
    batch_time = AverageMeter('elapse', '.3f')
    use_dali = config.get('use_dali', False)
    # DALI iterators are iterated directly; paddle loaders must be called first
    dataloader = dataloader if use_dali else dataloader()
    tic = time.time()
    for idx, batch in enumerate(dataloader):
        # ignore the warmup iters
        if idx == 5:
            batch_time.reset()
        if use_dali:
            # DALI yields [{feed_name: tensor, ...}]
            # NOTE(review): assumes the batch tensors expose a shape()
            # method (LoDTensor-style) rather than a .shape attribute —
            # confirm against the actual loader output
            batch_size = batch[0]["feed_image"].shape()[0]
            feed_dict = batch[0]
        else:
            batch_size = batch[0].shape()[0]
            # pair each feed variable's name with its positional batch entry
            feed_dict = {
                key.name: batch[idx]
                for idx, key in enumerate(feeds.values())
            }
        metrics = exe.run(program=program,
                          feed=feed_dict,
                          fetch_list=fetch_list)
        batch_time.update(time.time() - tic)
        for i, m in enumerate(metrics):
            # fold each fetched value into its meter, averaged over the batch
            metric_list[i].update(np.mean(m), batch_size)

        if mode == "train":
            # last meter is the lr meter appended above
            metric_list[-1].update(lr_scheduler.get_lr())

        fetchs_str = ''.join([str(m.value) + ' '
                              for m in metric_list] + [batch_time.mean]) + 's'
        ips_info = " ips: {:.5f} images/sec.".format(batch_size /
                                                     batch_time.avg)
        fetchs_str += ips_info

        if lr_scheduler is not None:
            if lr_scheduler.update_specified:
                # step only when the global step hits the configured interval
                curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                update = max(
                    0, curr_global_counter - lr_scheduler.update_start_step
                ) % lr_scheduler.update_step_interval == 0
                if update:
                    lr_scheduler.step()
            else:
                lr_scheduler.step()

        if vdl_writer:
            # module-level step counter shared across epochs
            global total_step
            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
            total_step += 1
        if mode == 'valid':
            if idx % config.get('print_interval', 10) == 0:
                logger.info("{:s} step:{:<4d} {:s}".format(
                    mode, idx, fetchs_str))
        else:
            epoch_str = "epoch:{:<3d}".format(epoch)
            step_str = "{:s} step:{:<4d}".format(mode, idx)

            if idx % config.get('print_interval', 10) == 0:
                # highlight the epoch header only on the first step
                logger.info("{:s} {:s} {:s}".format(
                    logger.coloring(epoch_str, "HEADER")
                    if idx == 0 else epoch_str,
                    logger.coloring(step_str, "PURPLE"),
                    logger.coloring(fetchs_str, 'OKGREEN')))

        tic = time.time()

    # epoch summary: per-meter means plus total elapsed time
    end_str = ''.join([str(m.mean) + ' '
                       for m in metric_list] + [batch_time.total]) + 's'
    ips_info = "ips: {:.5f} images/sec.".format(batch_size * batch_time.count /
                                                batch_time.sum)
    if mode == 'valid':
        logger.info("END {:s} {:s}s {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)
        logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str,
                                                 ips_info))
    if use_dali:
        # DALI iterators must be reset before the next epoch can start
        dataloader.reset()

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return fetchs["top1"][1].avg
# Exemplo n.º 16
# 0
def main(args):
    """Training entry point with optional quantization-aware training.

    Runs regular (optionally EMA) training/validation for
    ``config.epochs - 5`` epochs, then — when ``args.use_quant`` is set —
    fine-tunes a quant-aware program for 5 more epochs and exports the
    quantized inference model.

    Args:
        args: parsed command line arguments; uses ``args.config``,
            ``args.override``, ``args.use_quant``, ``args.vdl_dir`` and
            ``args.output_path``.
    """
    config = get_config(args.config, overrides=args.override, show=True)
    # quantization-aware training requires validation to be enabled
    if not config.validate and args.use_quant:
        logger.error("=====>Train quant model must use validate!")
        sys.exit(1)
    if args.use_quant:
        # reserve 5 extra epochs for the quant fine-tuning stage below
        config.epochs = config.epochs + 5
        gpu_count = get_gpu_count()
        # quant training only supports a single GPU
        if gpu_count != 1:
            logger.error(
                "=====>`Train quant model must use only one GPU. "
                "Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` ."
            )
            sys.exit(1)

    # choose whether to use GPU
    use_gpu = config.get("use_gpu", True)
    places = fluid.cuda_places() if use_gpu else fluid.cpu_places()

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # best top1 accuracy record
    best_top1_acc = 0.0

    # build training dataloader, fetch targets and model outputs
    if not config.get('use_ema'):
        train_dataloader, train_fetchs, out, softmax_out = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)
    else:
        # EMA enabled: program.build additionally returns the EMA object
        train_dataloader, train_fetchs, ema, out, softmax_out = program.build(
            config,
            train_prog,
            startup_prog,
            is_train=True,
            is_distributed=False)
    # build validation dataloader and fetch targets
    if config.validate:
        valid_prog = fluid.Program()
        valid_dataloader, valid_fetchs, _, _ = program.build(
            config,
            valid_prog,
            startup_prog,
            is_train=False,
            is_distributed=False)
        # clone to prune computation irrelevant to evaluation
        valid_prog = valid_prog.clone(for_test=True)

    # create the executor
    exe = fluid.Executor(places[0])
    exe.run(startup_prog)

    # load a pretrained model or a checkpoint
    init_model(config, train_prog, exe)

    train_reader = Reader(config, 'train')()
    train_dataloader.set_sample_list_generator(train_reader, places)

    compiled_train_prog = program.compile(config, train_prog,
                                          train_fetchs['loss'][0].name)

    if config.validate:
        valid_reader = Reader(config, 'valid')()
        valid_dataloader.set_sample_list_generator(valid_reader, places)
        # share scope/strategy with the compiled training program
        compiled_valid_prog = program.compile(config,
                                              valid_prog,
                                              share_prog=compiled_train_prog)

    vdl_writer = LogWriter(args.vdl_dir)

    # regular training loop (the last 5 epochs are reserved for quant)
    for epoch_id in range(config.epochs - 5):
        # train one epoch
        program.run(train_dataloader, exe, compiled_train_prog, train_fetchs,
                    epoch_id, 'train', config, vdl_writer)

        # run one validation pass
        if config.validate and epoch_id % config.valid_interval == 0:
            if config.get('use_ema'):
                # validate once with EMA-averaged weights applied
                logger.info(logger.coloring("EMA validate start..."))
                with ema.apply(exe):
                    _ = program.run(valid_dataloader, exe, compiled_valid_prog,
                                    valid_fetchs, epoch_id, 'valid', config)
                logger.info(logger.coloring("EMA validate over!"))

            top1_acc = program.run(valid_dataloader, exe, compiled_valid_prog,
                                   valid_fetchs, epoch_id, 'valid', config)

            if vdl_writer:
                logger.scaler('valid_avg', top1_acc, epoch_id, vdl_writer)

            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, epoch_id)
                logger.info("{:s}".format(logger.coloring(message, "RED")))
                if epoch_id % config.save_interval == 0:
                    model_path = os.path.join(config.model_save_dir,
                                              config.ARCHITECTURE["name"])
                    save_model(train_prog, model_path, "best_model")

        # save the model
        if epoch_id % config.save_interval == 0:
            model_path = os.path.join(config.model_save_dir,
                                      config.ARCHITECTURE["name"])
            # keep a sliding window of checkpoints: drop epoch_id - 3
            if epoch_id >= 3 and os.path.exists(
                    os.path.join(model_path, str(epoch_id - 3))):
                shutil.rmtree(os.path.join(model_path, str(epoch_id - 3)),
                              ignore_errors=True)
            save_model(train_prog, model_path, epoch_id)

    # quantization-aware training
    if args.use_quant and config.validate:
        # build the quant-aware training program
        quant_program = slim.quant.quant_aware(train_prog,
                                               exe.place,
                                               for_test=False)
        # build the quant program for evaluation
        val_quant_program = slim.quant.quant_aware(valid_prog,
                                                   exe.place,
                                                   for_test=True)

        fetch_list = [f[0] for f in train_fetchs.values()]
        metric_list = [f[1] for f in train_fetchs.values()]
        # fine-tune the quantized program for the 5 reserved epochs
        for i in range(5):
            for idx, batch in enumerate(train_dataloader()):
                metrics = exe.run(program=quant_program,
                                  feed=batch,
                                  fetch_list=fetch_list)
                # NOTE(review): this inner 'i' shadows the quant-epoch
                # counter above; harmless as written, but rename before
                # ever using the outer 'i' inside this loop
                for i, m in enumerate(metrics):
                    metric_list[i].update(np.mean(m), len(batch[0]))
                fetchs_str = ''.join([str(m.value) + ' ' for m in metric_list])

                if idx % 10 == 0:
                    logger.info("quant train : " + fetchs_str)

        # evaluate the quantized program
        fetch_list = [f[0] for f in valid_fetchs.values()]
        metric_list = [f[1] for f in valid_fetchs.values()]
        for idx, batch in enumerate(valid_dataloader()):
            metrics = exe.run(program=val_quant_program,
                              feed=batch,
                              fetch_list=fetch_list)
            for i, m in enumerate(metrics):
                metric_list[i].update(np.mean(m), len(batch[0]))
            fetchs_str = ''.join([str(m.value) + ' ' for m in metric_list])

            if idx % 10 == 0:
                logger.info("quant valid: " + fetchs_str)

        # save the quantized inference model
        float_prog, int8_prog = slim.quant.convert(val_quant_program,
                                                   exe.place,
                                                   save_int8=True)
        fluid.io.save_inference_model(dirname=args.output_path,
                                      feeded_var_names=['feed_image'],
                                      target_vars=[softmax_out],
                                      executor=exe,
                                      main_program=float_prog,
                                      model_filename='__model__',
                                      params_filename='__params__')
def run(dataloader,
        config,
        net,
        optimizer=None,
        lr_scheduler=None,
        epoch=0,
        mode='train'):
    """
    Feed data to the model and fetch the measures and loss

    Args:
        dataloader(paddle dataloader): callable returning the batch iterator
        config: global configuration
        net: the dygraph model to run
        optimizer: optimizer; required when mode == 'train'
        lr_scheduler: optional per-batch learning-rate scheduler
        epoch(int): epoch of training or validation
        mode(str): 'train', 'valid' or 'eval'; 'train' enables the
            backward/optimizer-step path

    Returns:
        top1 average accuracy when mode == 'valid', otherwise None
    """
    print_interval = config.get("print_interval", 10)
    # mixed-label (mixup-style) batches have no hard labels, so top-k
    # accuracy meters are skipped in that case
    use_mix = config.get("use_mix", False) and mode == "train"

    metric_list = [
        ("loss", AverageMeter('loss', '7.5f', postfix=",")),
        ("lr", AverageMeter('lr', 'f', postfix=",", need_avg=False)),
        ("batch_time", AverageMeter('batch_cost', '.5f', postfix=" s,")),
        ("reader_time", AverageMeter('reader_cost', '.5f', postfix=" s,")),
    ]
    if not use_mix:
        topk_name = 'top{}'.format(config.topk)
        metric_list.insert(
            0, (topk_name, AverageMeter(topk_name, '.5f', postfix=",")))
        metric_list.insert(0,
                           ("top1", AverageMeter("top1", '.5f', postfix=",")))

    # ordered so log lines always print metrics in the same sequence
    metric_list = OrderedDict(metric_list)

    tic = time.time()
    for idx, batch in enumerate(dataloader()):
        # avoid statistics from warmup time
        if idx == 10:
            metric_list["batch_time"].reset()
            metric_list["reader_time"].reset()

        metric_list['reader_time'].update(time.time() - tic)
        batch_size = len(batch[0])
        feeds = create_feeds(batch, use_mix)
        fetchs = create_fetchs(feeds, net, config, mode)
        if mode == 'train':
            avg_loss = fetchs['loss']
            avg_loss.backward()

            optimizer.step()
            optimizer.clear_grad()
            metric_list['lr'].update(
                optimizer._global_learning_rate().numpy()[0], batch_size)

            if lr_scheduler is not None:
                if lr_scheduler.update_specified:
                    # step only when the global step hits the configured interval
                    curr_global_counter = lr_scheduler.step_each_epoch * epoch + idx
                    update = max(
                        0, curr_global_counter - lr_scheduler.update_start_step
                    ) % lr_scheduler.update_step_interval == 0
                    if update:
                        lr_scheduler.step()
                else:
                    lr_scheduler.step()

        for name, fetch in fetchs.items():
            metric_list[name].update(fetch.numpy()[0], batch_size)
        metric_list["batch_time"].update(time.time() - tic)
        tic = time.time()

        # time meters report the running mean; others report the latest value
        fetchs_str = ' '.join([
            str(metric_list[key].mean)
            if "time" in key else str(metric_list[key].value)
            for key in metric_list
        ])

        if idx % print_interval == 0:
            ips_info = "ips: {:.5f} images/sec.".format(
                batch_size / metric_list["batch_time"].avg)
            # NOTE(review): per-step logging checks mode == 'eval' while the
            # final return below checks mode == 'valid' — confirm which mode
            # string callers actually pass for evaluation
            if mode == 'eval':
                logger.info("{:s} step:{:<4d}, {:s} {:s}".format(
                    mode, idx, fetchs_str, ips_info))
            else:
                epoch_str = "epoch:{:<3d}".format(epoch)
                step_str = "{:s} step:{:<4d}".format(mode, idx)
                logger.info("{:s}, {:s}, {:s} {:s}".format(
                    logger.coloring(epoch_str, "HEADER")
                    if idx == 0 else epoch_str,
                    logger.coloring(step_str, "PURPLE"),
                    logger.coloring(fetchs_str, 'OKGREEN'),
                    logger.coloring(ips_info, 'OKGREEN')))

    # epoch summary: per-meter means plus total batch time
    end_str = ' '.join([str(m.mean) for m in metric_list.values()] +
                       [metric_list['batch_time'].total])
    ips_info = "ips: {:.5f} images/sec.".format(
        batch_size * metric_list["batch_time"].count /
        metric_list["batch_time"].sum)

    if mode == 'eval':
        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
    else:
        end_epoch_str = "END epoch:{:<3d}".format(epoch)

        logger.info("{:s} {:s} {:s} {:s}".format(
            logger.coloring(end_epoch_str, "RED"),
            logger.coloring(mode, "PURPLE"),
            logger.coloring(end_str, "OKGREEN"),
            logger.coloring(ips_info, "OKGREEN"),
        ))

    # return top1_acc in order to save the best model
    if mode == 'valid':
        return metric_list['top1'].avg