def __init__(
    self,
    origin_dataset_path: str = "",
    tfrecord_dataset_path: str = "",
    model_save_path: str = "",
    validation_split: float = 0.2,
    batch_size: int = 32,
    epochs: int = 30,
    project_id: int = 0,
    image_size: int = 224,
    do_fine_tune=False,
    with_image_net=True,
    **kwargs,
):
    """Image-classification trainer base initializer.

    Resolves dataset/model paths (falling back to sensible defaults when a
    path is omitted) and creates the project and tfrecord directories.

    Args:
        origin_dataset_path (str): Path to the raw (unprocessed) dataset.
        tfrecord_dataset_path (str): Path for the processed (tfrecord) dataset.
        model_save_path (str): Directory in which to save the trained model.
        validation_split (float): Fraction of data held out for validation.
        batch_size (int): Mini-batch size.
        epochs (int): Number of training epochs.
        project_id (int): Training project number; 0 means "no project id".
        image_size (int): Target image side length.
        do_fine_tune (bool): Whether to fine-tune the pre-trained backbone.
        with_image_net (bool): Whether to initialize data with ImageNet means.
    """
    self._call_code = ""
    self.project_id = project_id
    self.do_fine_tune = do_fine_tune
    self.with_image_net = with_image_net
    origin_dataset_path = file_util.abspath(origin_dataset_path)
    tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
    model_save_path = file_util.abspath(model_save_path)
    self.image_size = image_size
    self.origin_dataset_path = origin_dataset_path
    # When no processed-dataset path is given, default to the raw dataset's path.
    self.tfrecord_dataset_path = tfrecord_dataset_path or origin_dataset_path
    if self.project_id:
        self.project_save_name = f"luwu-classification-project-{self.project_id}"
    else:
        # Fixed: removed the needless f-prefix on a literal with no placeholders.
        self.project_save_name = "luwu-classification-project"
    # When no model save path is given, default to the processed dataset's path.
    if model_save_path:
        self.project_save_path = os.path.join(model_save_path, self.project_save_name)
    else:
        self.project_save_path = os.path.join(
            self.tfrecord_dataset_path, self.project_save_name
        )
    self.model_save_path = os.path.join(self.project_save_path, "best_weights.h5")
    self.validation_split = validation_split
    self.batch_size = batch_size
    self.epochs = epochs
    file_util.mkdirs(self.project_save_path)
    file_util.mkdirs(self.tfrecord_dataset_path)
def download_pre_trained_model(self):
    """Download and unpack the pre-trained weights for the configured model.

    Looks up the URL and in-archive paths for the selected
    ``pre_trained_model_type``/``language`` combination, fetches the zip via
    ``tf.keras.utils.get_file`` (cached under ``~/.luwu/cache_models``), and
    records the extracted config, checkpoint and vocab paths on the instance.
    """
    weights_config = self.model_lang_weights_dict[self.pre_trained_model_type][
        self.language
    ]
    url = weights_config["url"]
    archive_name = url.split("/")[-1]
    cache_subdir = file_util.abspath("~/.luwu/cache_models")
    tf.keras.utils.get_file(
        archive_name,
        url,
        cache_dir=".",
        cache_subdir=cache_subdir,
        extract=True,
        archive_format="zip",
    )
    # The archive extracts into a directory named after the zip (minus extension).
    extracted_dir = os.path.join(cache_subdir, archive_name.split(".")[0])
    self.pre_trained_model_config_path = os.path.join(
        extracted_dir, weights_config["config_path"]
    )
    self.pre_trained_model_checkpoint_path = os.path.join(
        extracted_dir, weights_config["checkpoint_path"]
    )
    self.pre_trained_model_dict_path = os.path.join(
        extracted_dir, weights_config["dict_path"]
    )
def __init__(
    self,
    origin_dataset_path: str = "",
    tfrecord_dataset_path: str = "",
    label_map_path: str = "",
    do_fine_tune: bool = True,
    fine_tune_checkpoint_path: str = "",
    fine_tune_model_name: str = "",
    model_save_path: str = "",
    batch_size: int = 8,
    steps: int = 2000,
    project_id: int = 0,
    **kwargs,
):
    """Luwu object-detection model with fine-tuning support.

    # TODO: validation-set support is not implemented yet.

    Args:
        origin_dataset_path (str): Path to the raw (unprocessed) dataset.
        tfrecord_dataset_path (str): Path to the processed tfrecord dataset.
        label_map_path (str): Detection label map (pbtxt) path.
        do_fine_tune (bool): Whether to fine-tune a pre-trained model.
        fine_tune_checkpoint_path (str): Pre-trained checkpoint path.
        fine_tune_model_name (str): Name of the pre-trained model.
        model_save_path (str): Directory in which to save the model.
        batch_size (int): Mini-batch size.
        steps (int, optional): Number of training steps. Defaults to 2000.
        project_id (int, optional): Project number. Defaults to 0.

    Raises:
        Exception: If ``fine_tune_model_name`` is not a supported model.
    """
    super().__init__(
        origin_dataset_path=origin_dataset_path,
        tfrecord_dataset_path=tfrecord_dataset_path,
        label_map_path=label_map_path,
        model_save_path=model_save_path,
        batch_size=batch_size,
        steps=steps,
        project_id=project_id,
        **kwargs,
    )
    self.do_fine_tune = do_fine_tune
    self.fine_tune_checkpoint_path = file_util.abspath(fine_tune_checkpoint_path)
    self.fine_tune_model_name = fine_tune_model_name
    # Reject model names we have no config for, before any training setup runs.
    if self.fine_tune_model_name not in self.fine_tune_models_config_map:
        raise Exception(
            f"暂不支持的 object detection model! {self.fine_tune_model_name}")
def download_result_from_kaggle(self):
    """Pull the kernel's output from Kaggle and copy it to the target directory.

    Downloads the kernel output into a temp subfolder via the ``kaggle`` CLI,
    then copies everything into ``model_save_path`` (defaulting to
    ``luwu-output`` when unset).
    """
    output_path = os.path.join(self.tmp_dir_path, "kaggle-output")
    logger.info(f"创建文件夹 {output_path} ...")
    file_util.mkdirs(output_path)
    logger.info("从kaggle拉取运行结果...")
    pull_cmd = f"kaggle kernels output {self.kernel_id} -p {output_path}"
    cmd_util.run_cmd(pull_cmd)
    # Fall back to a local "luwu-output" folder when no save path was configured.
    model_save_path = self.kwargs.get("model_save_path", "") or "luwu-output"
    project_path = file_util.abspath(model_save_path)
    file_util.mkdirs(project_path)
    # Glob is expanded by the shell inside run_cmd, copying every result file.
    output_files_path = os.path.join(output_path, "*")
    logger.info(f"将运行结果移动到指定目录 {project_path} ...")
    copy_cmd = f"cp -r {output_files_path} {project_path}"
    cmd_util.run_cmd(copy_cmd)
    logger.info("Done.")
def __init__(
    self,
    origin_dataset_path: str = "",
    tfrecord_dataset_path: str = "",
    label_map_path: str = "",
    model_save_path: str = "",
    batch_size: int = 8,
    steps: int = 2000,
    project_id: int = 0,
    **kwargs,
):
    """Luwu object-detection base model initializer.

    # TODO: validation-set support is not implemented yet.

    Resolves the tfrecord and label-map paths. For each, a given *file* path
    is used as-is; a given *directory* gets a default file name generated
    inside it; an empty path falls back to the raw dataset's directory.

    Args:
        origin_dataset_path (str): Path to the raw (unprocessed) dataset.
        tfrecord_dataset_path (str): Path to the processed tfrecord dataset.
        label_map_path (str): Detection label map (pbtxt) path.
        model_save_path (str): Directory in which to save the model.
        batch_size (int): Mini-batch size.
        steps (int, optional): Number of training steps. Defaults to 2000.
        project_id (int, optional): Project number. Defaults to 0.
    """
    self._call_code = ""
    self.project_id = project_id
    origin_dataset_path = file_util.abspath(origin_dataset_path)
    tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
    label_map_path = file_util.abspath(label_map_path)
    model_save_path = file_util.abspath(model_save_path)
    self.origin_dataset_path = origin_dataset_path
    # tfrecord path: file -> use directly (skip generation); directory ->
    # generate train.tfrecord inside it; empty -> generate next to raw data.
    if tfrecord_dataset_path:
        if os.path.isfile(tfrecord_dataset_path):
            self.tfrecord_dataset_file_path = tfrecord_dataset_path
            self.tfrecord_dataset_dir = os.path.dirname(tfrecord_dataset_path)
            self.need_generate_tfrecord = False
        else:
            self.tfrecord_dataset_dir = tfrecord_dataset_path
            self.tfrecord_dataset_file_path = os.path.join(
                self.tfrecord_dataset_dir, "train.tfrecord"
            )
            self.need_generate_tfrecord = True
    else:
        self.tfrecord_dataset_dir = self.origin_dataset_path
        self.tfrecord_dataset_file_path = os.path.join(
            self.tfrecord_dataset_dir, "train.tfrecord"
        )
        self.need_generate_tfrecord = True
    # label map: same file-vs-directory logic; empty -> tfrecord directory.
    if label_map_path:
        if os.path.isfile(label_map_path):
            self.label_map_file_path = label_map_path
            self.label_map_dir = os.path.dirname(self.label_map_file_path)
            self.need_generate_label_map = False
        else:
            self.label_map_dir = label_map_path
            self.label_map_file_path = os.path.join(
                self.label_map_dir, "label_map.pbtxt"
            )
            self.need_generate_label_map = True
    else:
        self.label_map_dir = self.tfrecord_dataset_dir
        self.label_map_file_path = os.path.join(self.label_map_dir, "label_map.pbtxt")
        self.need_generate_label_map = True
    # When no model save path is given, default to the tfrecord directory.
    if self.project_id:
        self.project_save_name = f"luwu-object-detection-project-{self.project_id}"
    else:
        # Fixed: removed the needless f-prefix on a literal with no placeholders.
        self.project_save_name = "luwu-object-detection-project"
    if model_save_path:
        self.project_save_path = os.path.join(model_save_path, self.project_save_name)
    else:
        self.project_save_path = os.path.join(
            self.tfrecord_dataset_dir, self.project_save_name
        )
    self.batch_size = batch_size
    self.steps = steps
def __init__(
    self,
    origin_dataset_path: str,
    validation_dataset_path: str = "",
    test_dataset_path: str = "",
    model_save_path: str = "",
    validation_split: float = 0.1,
    test_split: float = 0.1,
    batch_size: int = 32,
    epochs: int = 30,
    learning_rate: float = 0.01,
    project_id: int = 0,
    maxlen: int = 128,
    frezee_pre_trained_model=False,
    optimizer: str = "Adam",
    optimize_with_piecewise_linear_lr: bool = False,
    simplified_tokenizer: bool = False,
    pre_trained_model_type: str = "bert_base",
    language: str = "chinese",
    *args,
    **kwargs,
):
    """Text-classification trainer initializer.

    Validates the pre-trained model/language selection, resolves dataset and
    model-save paths, and creates the project directory.

    Args:
        origin_dataset_path (str): Path to the raw (unprocessed) dataset.
        validation_dataset_path (str): Validation dataset path; when empty,
            the validation set is split off ``origin_dataset_path``.
        test_dataset_path (str): Test dataset path; when empty, the test set
            is split off ``origin_dataset_path``.
        model_save_path (str): Directory in which to save the model.
        validation_split (float): Validation split ratio.
        test_split (float): Test split ratio.
        batch_size (int): Mini-batch size.
        epochs (int): Number of training epochs.
        learning_rate (float): Learning rate.
        project_id (int): Training project number.
        maxlen (int, optional): Maximum length of a single text. Defaults to 128.
        frezee_pre_trained_model (bool, optional): Whether to freeze the
            pre-trained weights while training the downstream head ([sic]
            spelling kept — it is part of the public interface). Defaults to False.
        optimizer (str, optional): Optimizer name. Defaults to "Adam".
        optimize_with_piecewise_linear_lr (bool): Whether to optimize with a
            piecewise linear learning rate. Defaults to False.
        simplified_tokenizer (bool): Whether to shrink the tokenizer vocab.
            Defaults to False.
        pre_trained_model_type (str): Which pre-trained model to use.
        language (str): Language of the pre-training corpus.

    Raises:
        Exception: If the model type or language is unsupported.
    """
    self._call_code = ""
    self.project_id = project_id
    self.frezee_pre_trained_model = frezee_pre_trained_model
    self.learning_rate = learning_rate
    self.optimize_with_piecewise_linear_lr = optimize_with_piecewise_linear_lr
    self.optimizer_cls = self.get_optimizer_cls(optimizer)
    origin_dataset_path = file_util.abspath(origin_dataset_path)
    model_save_path = file_util.abspath(model_save_path)
    self.simplified_tokenizer = simplified_tokenizer
    self.pre_trained_model_type = pre_trained_model_type
    self.language = language
    # Validate the model/language selection before doing any path setup.
    if self.pre_trained_model_type not in self.model_lang_weights_dict:
        raise Exception(
            f"指定模型 {self.pre_trained_model_type} 不存在!当前支持的模型为:{list(self.model_lang_weights_dict.keys())}"
        )
    model_languages = self.model_lang_weights_dict[self.pre_trained_model_type]
    if self.language not in model_languages:
        languages = list(model_languages.keys())
        raise Exception(
            f"指定语料 {self.language} 的预训练模型 {self.pre_trained_model_type} 不存在!支持的语料为:{languages}"
        )
    self.maxlen = maxlen
    self.origin_dataset_path = origin_dataset_path
    # Empty paths stay empty (meaning "split from the origin dataset").
    self.validation_dataset_path = (
        file_util.abspath(validation_dataset_path)
        if validation_dataset_path
        else validation_dataset_path
    )
    self.test_dataset_path = (
        file_util.abspath(test_dataset_path)
        if test_dataset_path
        else test_dataset_path
    )
    # Default the project directory next to the origin dataset when no
    # explicit model save path was given.
    self.project_save_name = self.init_project_save_name(project_id)
    if model_save_path:
        self.project_save_path = os.path.join(model_save_path, self.project_save_name)
    else:
        self.project_save_path = os.path.join(
            os.path.dirname(self.origin_dataset_path), self.project_save_name
        )
    self.model_save_path = os.path.join(self.project_save_path, "best_weights.h5")
    self.validation_split = validation_split
    self.test_split = test_split
    self.batch_size = batch_size
    self.epochs = epochs
    file_util.mkdirs(self.project_save_path)
    self.model = None
def __init__(
    self,
    origin_dataset_path: str = "",
    validation_dataset_path: str = "",
    test_dataset_path: str = "",
    tfrecord_dataset_path: str = "",
    model_save_path: str = "",
    validation_split: float = 0.1,
    test_split: float = 0.1,
    batch_size: int = 32,
    epochs: int = 30,
    learning_rate: float = 0.01,
    project_id: int = 0,
    image_size: int = 224,
    do_fine_tune=False,
    with_image_net=True,
    optimizer: str = "Adam",
    freeze_epochs_ratio: float = 0.1,
    image_augmentation_random_flip_horizontal: bool = False,
    image_augmentation_random_flip_vertival: bool = False,
    image_augmentation_random_crop: bool = False,
    image_augmentation_random_brightness: bool = False,
    image_augmentation_random_hue: bool = False,
    **kwargs,
):
    """Image-classification trainer initializer (with augmentation options).

    Args:
        origin_dataset_path (str): Path to the raw (unprocessed) dataset.
        validation_dataset_path (str): Validation dataset path; when empty,
            the validation set is split off ``origin_dataset_path``.
        test_dataset_path (str): Test dataset path; when empty, the test set
            is split off ``origin_dataset_path``.
        tfrecord_dataset_path (str): Path for the processed (tfrecord) dataset.
        model_save_path (str): Directory in which to save the model.
        validation_split (float): Validation split ratio.
        test_split (float): Test split ratio.
        batch_size (int): Mini-batch size.
        learning_rate (float): Learning rate.
        epochs (int): Number of training epochs.
        project_id (int): Training project number.
        image_size (int): Target image side length.
        do_fine_tune (bool): Whether to fine-tune the pre-trained backbone.
        with_image_net (bool): Whether to initialize data with ImageNet means.
        optimizer (str): Optimizer name.
        freeze_epochs_ratio (float): When fine-tuning, the pre-trained model
            is first trained frozen for this fraction of the total epochs,
            then unfrozen for the rest (only effective when
            ``do_fine_tune=True``). Defaults to 0.1 (with epochs > 1 and a
            non-zero ratio, at least one frozen epoch is trained).
        image_augmentation_random_flip_horizontal (bool): Augmentation:
            random horizontal mirroring. Defaults to False.
        image_augmentation_random_flip_vertival (bool): Augmentation: random
            vertical mirroring ([sic] spelling kept — public interface).
            Defaults to False.
        image_augmentation_random_crop (bool): Augmentation: random crop to
            0.9 of the original size. Defaults to False.
        image_augmentation_random_brightness (bool): Augmentation: random
            brightness adjustment. Defaults to False.
        image_augmentation_random_hue (bool): Augmentation: random hue
            adjustment. Defaults to False.
    """
    self._call_code = ""
    self.project_id = project_id
    self.do_fine_tune = do_fine_tune
    self.with_image_net = with_image_net
    self.learning_rate = learning_rate
    self.freeze_epochs_ratio = freeze_epochs_ratio
    self.image_augmentation_random_flip_horizontal = (
        image_augmentation_random_flip_horizontal
    )
    self.image_augmentation_random_flip_vertival = (
        image_augmentation_random_flip_vertival
    )
    self.image_augmentation_random_crop = image_augmentation_random_crop
    self.image_augmentation_random_brightness = image_augmentation_random_brightness
    self.image_augmentation_random_hue = image_augmentation_random_hue
    self.optimizer_cls = self.get_optimizer_cls(optimizer)
    origin_dataset_path = file_util.abspath(origin_dataset_path)
    tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path)
    model_save_path = file_util.abspath(model_save_path)
    self.image_size = image_size
    self.origin_dataset_path = origin_dataset_path
    # Empty paths stay empty (meaning "split from the origin dataset").
    if validation_dataset_path:
        self.validation_dataset_path = file_util.abspath(validation_dataset_path)
    else:
        self.validation_dataset_path = validation_dataset_path
    if test_dataset_path:
        self.test_dataset_path = file_util.abspath(test_dataset_path)
    else:
        self.test_dataset_path = test_dataset_path
    # When no processed-dataset path is given, default to the raw dataset's path.
    if tfrecord_dataset_path:
        self.tfrecord_dataset_path = tfrecord_dataset_path
    else:
        self.tfrecord_dataset_path = origin_dataset_path
    if self.project_id:
        self.project_save_name = f"luwu-classification-project-{self.project_id}"
    else:
        # Fixed: removed the needless f-prefix on a literal with no placeholders.
        self.project_save_name = "luwu-classification-project"
    # When no model save path is given, default to the processed dataset's path.
    if model_save_path:
        self.project_save_path = os.path.join(model_save_path, self.project_save_name)
    else:
        self.project_save_path = os.path.join(
            self.tfrecord_dataset_path, self.project_save_name
        )
    self.model_save_path = os.path.join(self.project_save_path, "best_weights.h5")
    self.validation_split = validation_split
    self.test_split = test_split
    self.batch_size = batch_size
    self.epochs = epochs
    file_util.mkdirs(self.project_save_path)
    file_util.mkdirs(self.tfrecord_dataset_path)