示例#1
0
def main():
    """Quantize an exported inference model with post-training static quantization.

    The configuration is loaded through ``conf``. The exported
    ``inference.pdmodel`` / ``inference.pdiparams`` pair must already exist
    under ``Global.save_inference_dir``. Batches from the ``Eval`` dataloader
    (batch size forced to 1, no worker processes) are fed to
    ``paddleslim.quant.quant_post_static`` as calibration samples, and the
    quantized model is written to the ``quant_post_static_model``
    sub-directory of the inference directory.
    """
    cli_args = conf.parse_args()
    config = conf.get_config(
        cli_args.config, overrides=cli_args.override, show=False)

    inference_dir = config["Global"]["save_inference_dir"]
    model_file = os.path.join(inference_dir, 'inference.pdmodel')
    params_file = os.path.join(inference_dir, 'inference.pdiparams')
    assert os.path.exists(model_file) and os.path.exists(params_file)

    # Calibration reads one sample per batch in the main process.
    eval_cfg = config["DataLoader"]["Eval"]
    eval_cfg["sampler"]["batch_size"] = 1
    eval_cfg["loader"]["num_workers"] = 0

    init_logger()
    device = paddle.set_device("cpu")
    calib_loader = build_dataloader(config["DataLoader"], "Eval", device,
                                    False)

    def sample_generator(loader):
        """Wrap ``loader`` in a zero-argument callable yielding image arrays."""

        def reader():
            for batch in loader:
                # Only the first element of each batch is used for calibration.
                yield np.array(batch[0])

        return reader

    paddle.enable_static()
    executor = paddle.static.Executor(paddle.CPUPlace())
    quantized_dir = os.path.join(inference_dir, "quant_post_static_model")
    paddleslim.quant.quant_post_static(
        executor=executor,
        model_dir=inference_dir,
        model_filename='inference.pdmodel',
        params_filename='inference.pdiparams',
        quantize_model_path=quantized_dir,
        sample_generator=sample_generator(calib_loader),
        batch_nums=10)
示例#2
0
def main(args):
    """
    Build the static-graph programs and run the training loop.

    All the config of training paradigm should be in config["Global"].

    Args:
        args: Parsed command-line arguments. ``args.config``,
            ``args.override`` and ``args.profiler_options`` are read here.
    """
    config = get_config(args.config, overrides=args.override, show=False)
    global_config = config["Global"]

    mode = "train"

    log_file = os.path.join(global_config['output_dir'],
                            config["Arch"]["name"], f"{mode}.log")
    init_logger(log_file=log_file)
    print_config(config)

    # Distributed (fleet) training is the default unless the config opts out.
    is_distributed = global_config.get("is_distributed", True)
    if is_distributed:
        fleet.init(is_collective=True)
    # assign the device
    use_gpu = global_config.get("use_gpu", True)
    # amp related config
    if 'AMP' in config:
        AMP_RELATED_FLAGS_SETTING = {
            'FLAGS_cudnn_exhaustive_search': 1,
            'FLAGS_conv_workspace_size_limit': 1500,
            'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
            'FLAGS_max_inplace_grad_add': 8,
        }
        os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    use_xpu = global_config.get("use_xpu", False)
    use_npu = global_config.get("use_npu", False)
    use_mlu = global_config.get("use_mlu", False)
    # At most one device flag may be enabled. Note that `use_gpu` defaults to
    # True, so selecting another device requires setting `use_gpu` to False.
    # (The previous check only failed when all four flags were true at once,
    # so conflicting pairs were silently resolved in favour of the gpu.)
    enabled_devices = sum(
        bool(flag) for flag in (use_gpu, use_xpu, use_npu, use_mlu))
    assert enabled_devices <= 1, "gpu, xpu, npu and mlu can not be true in the same time in static mode!"

    if use_gpu:
        device = paddle.set_device('gpu')
    elif use_xpu:
        device = paddle.set_device('xpu')
    elif use_npu:
        device = paddle.set_device('npu')
    elif use_mlu:
        device = paddle.set_device('mlu')
    else:
        device = paddle.set_device('cpu')

    # visualDL
    vdl_writer = None
    if global_config["use_visualdl"]:
        vdl_dir = os.path.join(global_config["output_dir"], "vdl")
        vdl_writer = LogWriter(vdl_dir)

    # build dataloader
    eval_dataloader = None
    use_dali = global_config.get('use_dali', False)

    class_num = config["Arch"].get("class_num", None)
    config["DataLoader"].update({"class_num": class_num})
    train_dataloader = build_dataloader(config["DataLoader"],
                                        "Train",
                                        device=device,
                                        use_dali=use_dali)
    if global_config["eval_during_train"]:
        eval_dataloader = build_dataloader(config["DataLoader"],
                                           "Eval",
                                           device=device,
                                           use_dali=use_dali)

    step_each_epoch = len(train_dataloader)

    # startup_prog is used to do some parameter init work,
    # and train prog is used to hold the network
    startup_prog = paddle.static.Program()
    train_prog = paddle.static.Program()

    best_top1_acc = 0.0  # best top1 acc record

    train_fetchs, lr_scheduler, train_feeds, optimizer = program.build(
        config,
        train_prog,
        startup_prog,
        class_num,
        step_each_epoch=step_each_epoch,
        is_train=True,
        is_distributed=is_distributed)

    if global_config["eval_during_train"]:
        eval_prog = paddle.static.Program()
        eval_fetchs, _, eval_feeds, _ = program.build(
            config,
            eval_prog,
            startup_prog,
            is_train=False,
            is_distributed=is_distributed)
        # clone to prune some content which is irrelevant in eval_prog
        eval_prog = eval_prog.clone(for_test=True)

    # create the "Executor" with the statement of which device
    exe = paddle.static.Executor(device)
    # Parameter initialization
    exe.run(startup_prog)
    # load pretrained models or checkpoints
    init_model(global_config, train_prog, exe)

    if 'AMP' in config and config.AMP.get("level", "O1") == "O2":
        optimizer.amp_init(device,
                           scope=paddle.static.global_scope(),
                           test_program=eval_prog
                           if global_config["eval_during_train"] else None)

    if not is_distributed:
        compiled_train_prog = program.compile(
            config, train_prog, loss_name=train_fetchs["loss"][0].name)
    else:
        compiled_train_prog = train_prog

    if eval_dataloader is not None:
        compiled_eval_prog = program.compile(config, eval_prog)

    # Every checkpoint (periodic and best) goes under <output_dir>/<arch name>.
    model_path = os.path.join(global_config["output_dir"],
                              config["Arch"]["name"])

    for epoch_id in range(global_config["epochs"]):
        # 1. train with train dataset
        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
                    train_fetchs, epoch_id, 'train', config, vdl_writer,
                    lr_scheduler, args.profiler_options)
        # 2. evaluate with eval dataset
        if global_config["eval_during_train"] and epoch_id % global_config[
                "eval_interval"] == 0:
            top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog,
                                   eval_feeds, eval_fetchs, epoch_id, "eval",
                                   config)
            if top1_acc > best_top1_acc:
                best_top1_acc = top1_acc
                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
                    best_top1_acc, epoch_id)
                logger.info(message)
                # The best model is only written on save-interval epochs.
                if epoch_id % global_config["save_interval"] == 0:
                    save_model(train_prog, model_path, "best_model")

        # 3. save the persistable model
        if epoch_id % global_config["save_interval"] == 0:
            save_model(train_prog, model_path, epoch_id)
示例#3
0
    def __init__(self, config, mode="train"):
        """Set up every component needed to run in the requested mode.

        Depending on ``mode`` this builds the logger, device, dataloaders,
        loss and metric functions, model, optimizer, AMP scaler, the
        data-parallel wrapper and the inference pre/post-processing.

        Args:
            config: Nested dict-like configuration. The sections "Global",
                "Arch", "DataLoader", "Loss", "Metric", "Optimizer", "AMP"
                and "Infer" are read here; some entries are written back
                (``Arch.class_num``, ``DataLoader.class_num``,
                ``AMP.level``, ``Global.distributed``).
            mode: One of "train", "eval", "infer" or "export".

        Raises:
            AssertionError: If ``mode``, ``Global.eval_mode``,
                ``Global.device`` or the type of ``Global.seed`` is invalid.
        """
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        self.is_rec = bool("Head" in self.config["Arch"] or
                           self.config["Arch"].get("is_rec", False))

        # set seed
        seed = self.config["Global"].get("seed", False)
        # `seed == 0` is checked explicitly because 0 is a valid seed but falsy.
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be a integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        assert self.eval_mode in ["classification", "retrieval"], logger.error(
            "Invalid eval mode: {}".format(self.eval_mode))
        self.train_epoch_func = train_epoch
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl
        # Only rank 0 creates a writer, and only when training.
        self.vdl_writer = None
        if self.config['Global'][
                'use_visualdl'] and mode == "train" and dist.get_rank() == 0:
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in [
            "cpu", "gpu", "xpu", "npu", "mlu"
        ]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training
        self.amp = "AMP" in self.config and self.mode == "train"
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_max_inplace_grad_add': 8,
            }
            if paddle.is_compiled_with_cuda():
                AMP_RELATED_FLAGS_SETTING.update(
                    {'FLAGS_cudnn_batchnorm_spatial_persistent': 1})
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        # Backward compatibility: Global.class_num is honoured only when
        # Arch.class_num is not set, and always triggers a deprecation warning.
        if "class_num" in config["Global"]:
            global_class_num = config["Global"]["class_num"]
            if "class_num" not in config["Arch"]:
                config["Arch"]["class_num"] = global_class_num
                msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}."
            else:
                msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored."
            logger.warning(msg)
        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})
        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(self.config["DataLoader"],
                                                     "Train", self.device,
                                                     self.use_dali)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                self.gallery_query_dataloader = None
                # A single entry under DataLoader.Eval is used as one combined
                # gallery/query set; otherwise "Gallery" and "Query" are
                # built separately.
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)
                else:
                    self.eval_loss_func = None
            else:
                self.eval_loss_func = None

        # build metric
        # Default to None up front so the attribute always exists, including
        # when training with no "Metric" section in the config.
        self.train_metric_func = None
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    if hasattr(
                            self.train_dataloader, "collate_fn"
                    ) and self.train_dataloader.collate_fn is not None:
                        for m_idx, m in enumerate(metric_config):
                            if "TopkAcc" in m:
                                msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
                                logger.warning(msg)
                                # Remove only the matching entry. Popping
                                # after the loop would drop the last metric
                                # when nothing matched.
                                metric_config.pop(m_idx)
                                break
                    self.train_metric_func = build_metrics(metric_config)

        # Same defaulting for the eval metric: classification mode without a
        # "Metric"/"Eval" entry leaves it as None.
        self.eval_metric_func = None
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)

        # build model
        self.model = build_model(self.config)
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # load_pretrain
        pretrained_model = self.config["Global"]["pretrained_model"]
        if pretrained_model is not None:
            if pretrained_model.startswith("http"):
                load_dygraph_pretrain_from_url(self.model, pretrained_model)
            else:
                load_dygraph_pretrain(self.model, pretrained_model)

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for amp training
        # self.amp implies mode == "train", so self.optimizer exists here.
        if self.amp:
            self.scaler = paddle.amp.GradScaler(
                init_loss_scaling=self.scale_loss,
                use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
            # The "AMP" key may be present with a None value (handled above
            # for the loss-scaling options); fall back to "O1" in that case.
            amp_config = self.config['AMP']
            if amp_config is None:
                amp_level = "O1"
            else:
                amp_level = amp_config.get("level", "O1")
            if amp_level not in ["O1", "O2"]:
                msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'."
                logger.warning(msg)
                self.config['AMP']["level"] = "O1"
                amp_level = "O1"
            self.model, self.optimizer = paddle.amp.decorate(
                models=self.model,
                optimizers=self.optimizer,
                level=amp_level,
                save_dtype='float32')

        # for distributed
        world_size = dist.get_world_size()
        self.config["Global"]["distributed"] = world_size != 1
        if world_size != 4 and self.mode == "train":
            msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train."
            logger.warning(msg)
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])
示例#4
0
    def __init__(self, config, mode="train"):
        """Set up every component needed to run in the requested mode.

        Depending on ``mode`` this builds the logger, device, dataloaders,
        loss and metric functions, model, pruner/quanter, optimizer, the
        data-parallel wrapper and the inference pre/post-processing.

        Args:
            config: Nested dict-like configuration. The sections "Global",
                "Arch", "DataLoader", "Loss", "Metric", "Optimizer", "AMP"
                and "Infer" are read here; ``DataLoader.class_num`` and
                ``Global.distributed`` are written back.
            mode: One of "train", "eval", "infer" or "export".

        Raises:
            AssertionError: If ``mode``, ``Global.eval_mode``,
                ``Global.device`` or the type of ``Global.seed`` is invalid.
        """
        assert mode in ["train", "eval", "infer", "export"]
        self.mode = mode
        self.config = config
        self.eval_mode = self.config["Global"].get("eval_mode",
                                                   "classification")
        self.is_rec = "Head" in self.config["Arch"]

        # set seed
        seed = self.config["Global"].get("seed", False)
        # `seed == 0` is checked explicitly because 0 is a valid seed but falsy.
        if seed or seed == 0:
            assert isinstance(seed, int), "The 'seed' must be a integer!"
            paddle.seed(seed)
            np.random.seed(seed)
            random.seed(seed)

        # init logger
        self.output_dir = self.config['Global']['output_dir']
        log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                                f"{mode}.log")
        init_logger(name='root', log_file=log_file)
        print_config(config)

        # init train_func and eval_func
        assert self.eval_mode in ["classification", "retrieval"], logger.error(
            "Invalid eval mode: {}".format(self.eval_mode))
        self.train_epoch_func = train_epoch
        self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

        self.use_dali = self.config['Global'].get("use_dali", False)

        # for visualdl
        self.vdl_writer = None
        if self.config['Global']['use_visualdl'] and mode == "train":
            vdl_writer_path = os.path.join(self.output_dir, "vdl")
            if not os.path.exists(vdl_writer_path):
                os.makedirs(vdl_writer_path)
            self.vdl_writer = LogWriter(logdir=vdl_writer_path)

        # set device
        assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"]
        self.device = paddle.set_device(self.config["Global"]["device"])
        logger.info('train with paddle {} and device {}'.format(
            paddle.__version__, self.device))

        # AMP training
        self.amp = "AMP" in self.config
        if self.amp and self.config["AMP"] is not None:
            self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
            self.use_dynamic_loss_scaling = self.config["AMP"].get(
                "use_dynamic_loss_scaling", False)
        else:
            self.scale_loss = 1.0
            self.use_dynamic_loss_scaling = False
        if self.amp:
            AMP_RELATED_FLAGS_SETTING = {
                'FLAGS_cudnn_batchnorm_spatial_persistent': 1,
                'FLAGS_max_inplace_grad_add': 8,
            }
            paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

        #TODO(gaotingquan): support rec
        class_num = config["Arch"].get("class_num", None)
        self.config["DataLoader"].update({"class_num": class_num})
        # build dataloader
        if self.mode == 'train':
            self.train_dataloader = build_dataloader(self.config["DataLoader"],
                                                     "Train", self.device,
                                                     self.use_dali)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            if self.eval_mode == "classification":
                self.eval_dataloader = build_dataloader(
                    self.config["DataLoader"], "Eval", self.device,
                    self.use_dali)
            elif self.eval_mode == "retrieval":
                self.gallery_query_dataloader = None
                # A single entry under DataLoader.Eval is used as one combined
                # gallery/query set; otherwise "Gallery" and "Query" are
                # built separately.
                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
                    self.gallery_query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], key, self.device,
                        self.use_dali)
                else:
                    self.gallery_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Gallery",
                        self.device, self.use_dali)
                    self.query_dataloader = build_dataloader(
                        self.config["DataLoader"]["Eval"], "Query",
                        self.device, self.use_dali)

        # build loss
        if self.mode == "train":
            loss_info = self.config["Loss"]["Train"]
            self.train_loss_func = build_loss(loss_info)
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            loss_config = self.config.get("Loss", None)
            if loss_config is not None:
                loss_config = loss_config.get("Eval")
                if loss_config is not None:
                    self.eval_loss_func = build_loss(loss_config)
                else:
                    self.eval_loss_func = None
            else:
                self.eval_loss_func = None

        # build metric
        # Default to None up front so the attribute always exists, including
        # when training with no "Metric" section in the config.
        self.train_metric_func = None
        if self.mode == 'train':
            metric_config = self.config.get("Metric")
            if metric_config is not None:
                metric_config = metric_config.get("Train")
                if metric_config is not None:
                    self.train_metric_func = build_metrics(metric_config)

        # Same defaulting for the eval metric: classification mode without a
        # "Metric"/"Eval" entry leaves it as None.
        self.eval_metric_func = None
        if self.mode == "eval" or (self.mode == "train" and
                                   self.config["Global"]["eval_during_train"]):
            metric_config = self.config.get("Metric")
            if self.eval_mode == "classification":
                if metric_config is not None:
                    metric_config = metric_config.get("Eval")
                    if metric_config is not None:
                        self.eval_metric_func = build_metrics(metric_config)
            elif self.eval_mode == "retrieval":
                if metric_config is None:
                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
                else:
                    metric_config = metric_config["Eval"]
                self.eval_metric_func = build_metrics(metric_config)

        # build model
        self.model = build_model(self.config["Arch"])
        # set @to_static for benchmark, skip this by default.
        apply_to_static(self.config, self.model)

        # for slim
        self.pruner = get_pruner(self.config, self.model)
        self.quanter = get_quaner(self.config, self.model)

        # load_pretrain
        pretrained_model = self.config["Global"]["pretrained_model"]
        if pretrained_model is not None:
            if pretrained_model.startswith("http"):
                load_dygraph_pretrain_from_url(self.model, pretrained_model)
            else:
                load_dygraph_pretrain(self.model, pretrained_model)

        # build optimizer
        if self.mode == 'train':
            self.optimizer, self.lr_sch = build_optimizer(
                self.config["Optimizer"], self.config["Global"]["epochs"],
                len(self.train_dataloader), [self.model])

        # for distributed
        self.config["Global"][
            "distributed"] = paddle.distributed.get_world_size() != 1
        if self.config["Global"]["distributed"]:
            dist.init_parallel_env()
            self.model = paddle.DataParallel(self.model)

        # build postprocess for infer
        if self.mode == 'infer':
            self.preprocess_func = create_operators(
                self.config["Infer"]["transforms"])
            self.postprocess_func = build_postprocess(
                self.config["Infer"]["PostProcess"])