def main():
    args = conf.parse_args()
    config = conf.get_config(args.config, overrides=args.override, show=False)
    # the exported FP32 inference model must exist before post-training quantization
    assert os.path.exists(
        os.path.join(config["Global"]["save_inference_dir"],
                     'inference.pdmodel')) and os.path.exists(
                         os.path.join(config["Global"]["save_inference_dir"],
                                      'inference.pdiparams')
                     ), "inference.pdmodel / inference.pdiparams not found in Global.save_inference_dir"
    # calibration feeds one image at a time, so force batch_size=1 and no workers
    config["DataLoader"]["Eval"]["sampler"]["batch_size"] = 1
    config["DataLoader"]["Eval"]["loader"]["num_workers"] = 0

    init_logger()
    device = paddle.set_device("cpu")
    # note: calibration samples are drawn from the Eval split
    train_dataloader = build_dataloader(config["DataLoader"], "Eval", device,
                                        False)

    def sample_generator(loader):
        def __reader__():
            for indx, data in enumerate(loader):
                images = np.array(data[0])
                yield images

        return __reader__

    paddle.enable_static()
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    paddleslim.quant.quant_post_static(
        executor=exe,
        model_dir=config["Global"]["save_inference_dir"],
        model_filename='inference.pdmodel',
        params_filename='inference.pdiparams',
        quantize_model_path=os.path.join(
            config["Global"]["save_inference_dir"],
            "quant_post_static_model"),
        sample_generator=sample_generator(train_dataloader),
        batch_nums=10)
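
# Hedged entry-point sketch (assumed; the guard is not part of the snippet
# above): running this module directly calibrates the inference model with
# 10 batches from the Eval dataloader and writes the quantized model to
# Global.save_inference_dir/quant_post_static_model.
if __name__ == "__main__":
    main()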
def main(args): """ all the config of training paradigm should be in config["Global"] """ config = get_config(args.config, overrides=args.override, show=False) global_config = config["Global"] mode = "train" log_file = os.path.join(global_config['output_dir'], config["Arch"]["name"], f"{mode}.log") init_logger(log_file=log_file) print_config(config) if global_config.get("is_distributed", True): fleet.init(is_collective=True) # assign the device use_gpu = global_config.get("use_gpu", True) # amp related config if 'AMP' in config: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_cudnn_exhaustive_search': 1, 'FLAGS_conv_workspace_size_limit': 1500, 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_max_inplace_grad_add': 8, } os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) use_xpu = global_config.get("use_xpu", False) use_npu = global_config.get("use_npu", False) use_mlu = global_config.get("use_mlu", False) assert ( use_gpu and use_xpu and use_npu and use_mlu ) is not True, "gpu, xpu, npu and mlu can not be true in the same time in static mode!" if use_gpu: device = paddle.set_device('gpu') elif use_xpu: device = paddle.set_device('xpu') elif use_npu: device = paddle.set_device('npu') elif use_mlu: device = paddle.set_device('mlu') else: device = paddle.set_device('cpu') # visualDL vdl_writer = None if global_config["use_visualdl"]: vdl_dir = os.path.join(global_config["output_dir"], "vdl") vdl_writer = LogWriter(vdl_dir) # build dataloader eval_dataloader = None use_dali = global_config.get('use_dali', False) class_num = config["Arch"].get("class_num", None) config["DataLoader"].update({"class_num": class_num}) train_dataloader = build_dataloader(config["DataLoader"], "Train", device=device, use_dali=use_dali) if global_config["eval_during_train"]: eval_dataloader = build_dataloader(config["DataLoader"], "Eval", device=device, use_dali=use_dali) step_each_epoch = len(train_dataloader) # startup_prog is used to do some parameter init work, # and train prog is used to hold the network startup_prog = paddle.static.Program() train_prog = paddle.static.Program() best_top1_acc = 0.0 # best top1 acc record train_fetchs, lr_scheduler, train_feeds, optimizer = program.build( config, train_prog, startup_prog, class_num, step_each_epoch=step_each_epoch, is_train=True, is_distributed=global_config.get("is_distributed", True)) if global_config["eval_during_train"]: eval_prog = paddle.static.Program() eval_fetchs, _, eval_feeds, _ = program.build( config, eval_prog, startup_prog, is_train=False, is_distributed=global_config.get("is_distributed", True)) # clone to prune some content which is irrelevant in eval_prog eval_prog = eval_prog.clone(for_test=True) # create the "Executor" with the statement of which device exe = paddle.static.Executor(device) # Parameter initialization exe.run(startup_prog) # load pretrained models or checkpoints init_model(global_config, train_prog, exe) if 'AMP' in config and config.AMP.get("level", "O1") == "O2": optimizer.amp_init(device, scope=paddle.static.global_scope(), test_program=eval_prog if global_config["eval_during_train"] else None) if not global_config.get("is_distributed", True): compiled_train_prog = program.compile( config, train_prog, loss_name=train_fetchs["loss"][0].name) else: compiled_train_prog = train_prog if eval_dataloader is not None: compiled_eval_prog = program.compile(config, eval_prog) for epoch_id in range(global_config["epochs"]): # 1. 
train with train dataset program.run(train_dataloader, exe, compiled_train_prog, train_feeds, train_fetchs, epoch_id, 'train', config, vdl_writer, lr_scheduler, args.profiler_options) # 2. evaate with eval dataset if global_config["eval_during_train"] and epoch_id % global_config[ "eval_interval"] == 0: top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog, eval_feeds, eval_fetchs, epoch_id, "eval", config) if top1_acc > best_top1_acc: best_top1_acc = top1_acc message = "The best top1 acc {:.5f}, in epoch: {:d}".format( best_top1_acc, epoch_id) logger.info(message) if epoch_id % global_config["save_interval"] == 0: model_path = os.path.join(global_config["output_dir"], config["Arch"]["name"]) save_model(train_prog, model_path, "best_model") # 3. save the persistable model if epoch_id % global_config["save_interval"] == 0: model_path = os.path.join(global_config["output_dir"], config["Arch"]["name"]) save_model(train_prog, model_path, epoch_id)
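
# Hedged entry-point sketch for the static-graph trainer (assumed; `parse_args`
# is taken to be the module's own argument helper): static mode must be
# enabled before `main` builds the startup and train Programs.
if __name__ == "__main__":
    paddle.enable_static()
    args = parse_args()
    main(args)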
def __init__(self, config, mode="train"): assert mode in ["train", "eval", "infer", "export"] self.mode = mode self.config = config self.eval_mode = self.config["Global"].get("eval_mode", "classification") if "Head" in self.config["Arch"] or self.config["Arch"].get( "is_rec", False): self.is_rec = True else: self.is_rec = False # set seed seed = self.config["Global"].get("seed", False) if seed or seed == 0: assert isinstance(seed, int), "The 'seed' must be a integer!" paddle.seed(seed) np.random.seed(seed) random.seed(seed) # init logger self.output_dir = self.config['Global']['output_dir'] log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], f"{mode}.log") init_logger(log_file=log_file) print_config(config) # init train_func and eval_func assert self.eval_mode in ["classification", "retrieval"], logger.error( "Invalid eval mode: {}".format(self.eval_mode)) self.train_epoch_func = train_epoch self.eval_func = getattr(evaluation, self.eval_mode + "_eval") self.use_dali = self.config['Global'].get("use_dali", False) # for visualdl self.vdl_writer = None if self.config['Global'][ 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: vdl_writer_path = os.path.join(self.output_dir, "vdl") if not os.path.exists(vdl_writer_path): os.makedirs(vdl_writer_path) self.vdl_writer = LogWriter(logdir=vdl_writer_path) # set device assert self.config["Global"]["device"] in [ "cpu", "gpu", "xpu", "npu", "mlu" ] self.device = paddle.set_device(self.config["Global"]["device"]) logger.info('train with paddle {} and device {}'.format( paddle.__version__, self.device)) # AMP training self.amp = True if "AMP" in self.config and self.mode == "train" else False if self.amp and self.config["AMP"] is not None: self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) self.use_dynamic_loss_scaling = self.config["AMP"].get( "use_dynamic_loss_scaling", False) else: self.scale_loss = 1.0 self.use_dynamic_loss_scaling = False if self.amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_max_inplace_grad_add': 8, } if paddle.is_compiled_with_cuda(): AMP_RELATED_FLAGS_SETTING.update( {'FLAGS_cudnn_batchnorm_spatial_persistent': 1}) paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) if "class_num" in config["Global"]: global_class_num = config["Global"]["class_num"] if "class_num" not in config["Arch"]: config["Arch"]["class_num"] = global_class_num msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." else: msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." 
logger.warning(msg) #TODO(gaotingquan): support rec class_num = config["Arch"].get("class_num", None) self.config["DataLoader"].update({"class_num": class_num}) # build dataloader if self.mode == 'train': self.train_dataloader = build_dataloader(self.config["DataLoader"], "Train", self.device, self.use_dali) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): if self.eval_mode == "classification": self.eval_dataloader = build_dataloader( self.config["DataLoader"], "Eval", self.device, self.use_dali) elif self.eval_mode == "retrieval": self.gallery_query_dataloader = None if len(self.config["DataLoader"]["Eval"].keys()) == 1: key = list(self.config["DataLoader"]["Eval"].keys())[0] self.gallery_query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], key, self.device, self.use_dali) else: self.gallery_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Gallery", self.device, self.use_dali) self.query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Query", self.device, self.use_dali) # build loss if self.mode == "train": loss_info = self.config["Loss"]["Train"] self.train_loss_func = build_loss(loss_info) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): loss_config = self.config.get("Loss", None) if loss_config is not None: loss_config = loss_config.get("Eval") if loss_config is not None: self.eval_loss_func = build_loss(loss_config) else: self.eval_loss_func = None else: self.eval_loss_func = None # build metric if self.mode == 'train': metric_config = self.config.get("Metric") if metric_config is not None: metric_config = metric_config.get("Train") if metric_config is not None: if hasattr( self.train_dataloader, "collate_fn" ) and self.train_dataloader.collate_fn is not None: for m_idx, m in enumerate(metric_config): if "TopkAcc" in m: msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed." logger.warning(msg) break metric_config.pop(m_idx) self.train_metric_func = build_metrics(metric_config) else: self.train_metric_func = None else: self.train_metric_func = None if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): metric_config = self.config.get("Metric") if self.eval_mode == "classification": if metric_config is not None: metric_config = metric_config.get("Eval") if metric_config is not None: self.eval_metric_func = build_metrics(metric_config) elif self.eval_mode == "retrieval": if metric_config is None: metric_config = [{"name": "Recallk", "topk": (1, 5)}] else: metric_config = metric_config["Eval"] self.eval_metric_func = build_metrics(metric_config) else: self.eval_metric_func = None # build model self.model = build_model(self.config) # set @to_static for benchmark, skip this by default. 
apply_to_static(self.config, self.model) # load_pretrain if self.config["Global"]["pretrained_model"] is not None: if self.config["Global"]["pretrained_model"].startswith("http"): load_dygraph_pretrain_from_url( self.model, self.config["Global"]["pretrained_model"]) else: load_dygraph_pretrain( self.model, self.config["Global"]["pretrained_model"]) # build optimizer if self.mode == 'train': self.optimizer, self.lr_sch = build_optimizer( self.config["Optimizer"], self.config["Global"]["epochs"], len(self.train_dataloader), [self.model]) # for amp training if self.amp: self.scaler = paddle.amp.GradScaler( init_loss_scaling=self.scale_loss, use_dynamic_loss_scaling=self.use_dynamic_loss_scaling) amp_level = self.config['AMP'].get("level", "O1") if amp_level not in ["O1", "O2"]: msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." logger.warning(msg) self.config['AMP']["level"] = "O1" amp_level = "O1" self.model, self.optimizer = paddle.amp.decorate( models=self.model, optimizers=self.optimizer, level=amp_level, save_dtype='float32') # for distributed world_size = dist.get_world_size() self.config["Global"]["distributed"] = world_size != 1 if world_size != 4 and self.mode == "train": msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train." logger.warning(msg) if self.config["Global"]["distributed"]: dist.init_parallel_env() self.model = paddle.DataParallel(self.model) # build postprocess for infer if self.mode == 'infer': self.preprocess_func = create_operators( self.config["Infer"]["transforms"]) self.postprocess_func = build_postprocess( self.config["Infer"]["PostProcess"])
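
# Hedged usage sketch (assumed names: the enclosing class is referred to as
# `Engine` and `get_config` is the repo's YAML loader; neither appears in the
# snippet above):
#
#   config = get_config("ppcls/configs/ImageNet/ResNet/ResNet50.yaml",
#                       overrides=["Global.device=gpu"], show=False)
#   engine = Engine(config, mode="train")  # builds dataloaders, loss, model,
#   engine.train()                         # optimizer, AMP and DDP wrappers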
def __init__(self, config, mode="train"): assert mode in ["train", "eval", "infer", "export"] self.mode = mode self.config = config self.eval_mode = self.config["Global"].get("eval_mode", "classification") if "Head" in self.config["Arch"]: self.is_rec = True else: self.is_rec = False # set seed seed = self.config["Global"].get("seed", False) if seed or seed == 0: assert isinstance(seed, int), "The 'seed' must be a integer!" paddle.seed(seed) np.random.seed(seed) random.seed(seed) # init logger self.output_dir = self.config['Global']['output_dir'] log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], f"{mode}.log") init_logger(name='root', log_file=log_file) print_config(config) # init train_func and eval_func assert self.eval_mode in ["classification", "retrieval"], logger.error( "Invalid eval mode: {}".format(self.eval_mode)) self.train_epoch_func = train_epoch self.eval_func = getattr(evaluation, self.eval_mode + "_eval") self.use_dali = self.config['Global'].get("use_dali", False) # for visualdl self.vdl_writer = None if self.config['Global']['use_visualdl'] and mode == "train": vdl_writer_path = os.path.join(self.output_dir, "vdl") if not os.path.exists(vdl_writer_path): os.makedirs(vdl_writer_path) self.vdl_writer = LogWriter(logdir=vdl_writer_path) # set device assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"] self.device = paddle.set_device(self.config["Global"]["device"]) logger.info('train with paddle {} and device {}'.format( paddle.__version__, self.device)) # AMP training self.amp = True if "AMP" in self.config else False if self.amp and self.config["AMP"] is not None: self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) self.use_dynamic_loss_scaling = self.config["AMP"].get( "use_dynamic_loss_scaling", False) else: self.scale_loss = 1.0 self.use_dynamic_loss_scaling = False if self.amp: AMP_RELATED_FLAGS_SETTING = { 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, 'FLAGS_max_inplace_grad_add': 8, } paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) #TODO(gaotingquan): support rec class_num = config["Arch"].get("class_num", None) self.config["DataLoader"].update({"class_num": class_num}) # build dataloader if self.mode == 'train': self.train_dataloader = build_dataloader(self.config["DataLoader"], "Train", self.device, self.use_dali) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): if self.eval_mode == "classification": self.eval_dataloader = build_dataloader( self.config["DataLoader"], "Eval", self.device, self.use_dali) elif self.eval_mode == "retrieval": self.gallery_query_dataloader = None if len(self.config["DataLoader"]["Eval"].keys()) == 1: key = list(self.config["DataLoader"]["Eval"].keys())[0] self.gallery_query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], key, self.device, self.use_dali) else: self.gallery_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Gallery", self.device, self.use_dali) self.query_dataloader = build_dataloader( self.config["DataLoader"]["Eval"], "Query", self.device, self.use_dali) # build loss if self.mode == "train": loss_info = self.config["Loss"]["Train"] self.train_loss_func = build_loss(loss_info) if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): loss_config = self.config.get("Loss", None) if loss_config is not None: loss_config = loss_config.get("Eval") if loss_config is not None: self.eval_loss_func = build_loss(loss_config) else: self.eval_loss_func = None else: 
self.eval_loss_func = None # build metric if self.mode == 'train': metric_config = self.config.get("Metric") if metric_config is not None: metric_config = metric_config.get("Train") if metric_config is not None: self.train_metric_func = build_metrics(metric_config) else: self.train_metric_func = None else: self.train_metric_func = None if self.mode == "eval" or (self.mode == "train" and self.config["Global"]["eval_during_train"]): metric_config = self.config.get("Metric") if self.eval_mode == "classification": if metric_config is not None: metric_config = metric_config.get("Eval") if metric_config is not None: self.eval_metric_func = build_metrics(metric_config) elif self.eval_mode == "retrieval": if metric_config is None: metric_config = [{"name": "Recallk", "topk": (1, 5)}] else: metric_config = metric_config["Eval"] self.eval_metric_func = build_metrics(metric_config) else: self.eval_metric_func = None # build model self.model = build_model(self.config["Arch"]) # set @to_static for benchmark, skip this by default. apply_to_static(self.config, self.model) # for slim self.pruner = get_pruner(self.config, self.model) self.quanter = get_quaner(self.config, self.model) # load_pretrain if self.config["Global"]["pretrained_model"] is not None: if self.config["Global"]["pretrained_model"].startswith("http"): load_dygraph_pretrain_from_url( self.model, self.config["Global"]["pretrained_model"]) else: load_dygraph_pretrain( self.model, self.config["Global"]["pretrained_model"]) # build optimizer if self.mode == 'train': self.optimizer, self.lr_sch = build_optimizer( self.config["Optimizer"], self.config["Global"]["epochs"], len(self.train_dataloader), [self.model]) # for distributed self.config["Global"][ "distributed"] = paddle.distributed.get_world_size() != 1 if self.config["Global"]["distributed"]: dist.init_parallel_env() if self.config["Global"]["distributed"]: self.model = paddle.DataParallel(self.model) # build postprocess for infer if self.mode == 'infer': self.preprocess_func = create_operators( self.config["Infer"]["transforms"]) self.postprocess_func = build_postprocess( self.config["Infer"]["PostProcess"])
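
# Hedged usage sketch for this earlier variant (assumed class name `Engine`):
# in "infer" mode no dataloader, loss, or optimizer is built; only the model
# (plus slim pruner/quanter hooks) and the Infer.transforms /
# Infer.PostProcess operators are created.
#
#   engine = Engine(config, mode="infer")
#   # engine.preprocess_func and engine.postprocess_func wrap the operators
#   # declared under the config's Infer section.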