def __init__( self, origin_dataset_path: str = "", tfrecord_dataset_path: str = "", model_save_path: str = "", validation_split: float = 0.2, batch_size: int = 32, epochs: int = 30, project_id: int = 0, image_size: int = 224, do_fine_tune=False, with_image_net=True, **kwargs, ): """ Args: origin_dataset_path (str): 处理前的数据集路径 tfrecord_dataset_path (str): 处理后的数据集路径 model_save_path (str): 模型保存路径 validation_split (float): 验证集切割比例 batch_size (int): mini batch 大小 epochs (int): 训练epoch数 project_id (int): 训练项目编号 with_image_net (bool): 是否使用imagenet的均值初始化数据 """ self._call_code = "" self.project_id = project_id self.do_fine_tune = do_fine_tune self.with_image_net = with_image_net origin_dataset_path = file_util.abspath(origin_dataset_path) tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path) model_save_path = file_util.abspath(model_save_path) self.image_size = image_size self.origin_dataset_path = origin_dataset_path # 当未给定处理后数据集的路径时,默认保存到原始数据集相同路径 if tfrecord_dataset_path: self.tfrecord_dataset_path = tfrecord_dataset_path else: self.tfrecord_dataset_path = origin_dataset_path # 当未给定模型保存路径时,默认保存到处理后数据集相同路径 if self.project_id: self.project_save_name = f"luwu-classification-project-{self.project_id}" else: self.project_save_name = f"luwu-classification-project" if model_save_path: self.project_save_path = os.path.join(model_save_path, self.project_save_name) else: self.project_save_path = os.path.join(self.tfrecord_dataset_path, self.project_save_name) self.model_save_path = os.path.join(self.project_save_path, "best_weights.h5") self.validation_split = validation_split self.batch_size = batch_size self.epochs = epochs file_util.mkdirs(self.project_save_path) file_util.mkdirs(self.tfrecord_dataset_path)
def upload_dataset(self):
    origin_dataset_path = self.kwargs.get("origin_dataset_path", "")
    if os.path.exists(origin_dataset_path):
        # Copy the dataset first
        # Create a folder for it
        dataset_path = os.path.join(self.tmp_dir_path, "kaggle-data")
        copy_path = os.path.join(dataset_path, "data")
        logger.info(f"Creating directory {copy_path} ...")
        file_util.mkdirs(copy_path)
        # Copy the dataset into the temporary directory
        logger.info("Copying the dataset to the temporary directory ...")
        if os.path.isdir(origin_dataset_path):
            cmd = f'cp -r {os.path.join(origin_dataset_path, "*")} {copy_path}'
        else:
            cmd = f"cp -r {origin_dataset_path} {copy_path}"
        cmd_util.run_cmd(cmd)
        # Initialize the dataset with the kaggle api
        logger.info("Initializing the dataset with the kaggle api ...")
        cmd = f"kaggle datasets init -p {dataset_path}"
        cmd_util.run_cmd(cmd)
        # Configure the dataset metadata
        dataset_meta_path = os.path.join(dataset_path, "dataset-metadata.json")
        with open(dataset_meta_path, "r") as f:
            dataset_meta = json.load(f)
        dataset_meta["title"] = f"luwu-dataset-{self.uuid}"
        dataset_meta["id"] = (
            dataset_meta["id"].split("/")[0] + "/" + f"luwu-dataset-{self.uuid}"
        )
        with open(dataset_meta_path, "w") as f:
            json.dump(dataset_meta, f, ensure_ascii=False, indent=2)
        # Upload the dataset
        logger.info("Uploading the dataset ...")
        cmd = f"kaggle datasets create -r zip -p {dataset_path}"
        cmd_util.run_cmd(cmd)
        logger.info("Dataset upload finished!")
        logger.info("Waiting for kaggle to process the dataset; this may take a few minutes ...")
        self.dataset_id = dataset_meta["id"]
        self.dataset_title = dataset_meta["title"]
        cmd = f"kaggle datasets status {self.dataset_id}"
        while True:
            code, output = subprocess.getstatusoutput(cmd)
            if code != 0:
                logger.error(output)
                raise Exception("Failed to query the dataset status!")
            if output:
                if "ready" in output:
                    logger.info("Dataset is ready!")
                else:
                    logger.warning(output)
                break
            else:
                logger.info("No status returned yet, waiting ...")
                time.sleep(10)
    else:
        raise FileNotFoundError(
            f"The given origin_dataset_path does not exist! {origin_dataset_path}"
        )
def download_result_from_kaggle(self):
    output_path = os.path.join(self.tmp_dir_path, "kaggle-output")
    logger.info(f"Creating directory {output_path} ...")
    file_util.mkdirs(output_path)
    logger.info("Fetching the run output from kaggle ...")
    cmd = f"kaggle kernels output {self.kernel_id} -p {output_path}"
    cmd_util.run_cmd(cmd)
    model_save_path = self.kwargs.get("model_save_path", "")
    if not model_save_path:
        model_save_path = "luwu-output"
    project_path = file_util.abspath(model_save_path)
    file_util.mkdirs(project_path)
    output_files_path = os.path.join(output_path, "*")
    logger.info(f"Moving the run output to the target directory {project_path} ...")
    cmd = f"cp -r {output_files_path} {project_path}"
    cmd_util.run_cmd(cmd)
    logger.info("Done.")
def run_project(project):
    params = project["params"]
    model_save_path = params["model_save_path"]
    log_path = os.path.join(
        model_save_path,
        f"luwu-classification-project-{project['id']}",
        "train.log",
    )
    file_util.mkdirs(os.path.dirname(log_path))
    curdir = os.path.abspath(os.path.dirname(__file__))
    cd_path = os.path.abspath(os.path.join(os.path.join(curdir, ".."), ".."))
    py_path = os.path.join("." + curdir[len(cd_path):], "train_project.py")
    cmd = f"""cd {cd_path};python {py_path} {project['id']} > {log_path} 2>&1"""
    st, out = subprocess.getstatusoutput(cmd)
    if st == 0:
        logger.info("Processing succeeded!")
    else:
        logger.info("Processing failed!")
        raise Exception(out)
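# Hedged sketch (not from luwu): the "run a script and capture everything in a log
# file" shell pattern used by run_project above, reduced to a standalone helper.
# The helper name `run_with_log` and any paths passed to it are hypothetical.
import os
import subprocess


def run_with_log(workdir: str, command: str, log_path: str) -> None:
    """Run `command` inside `workdir`, redirecting stdout and stderr to log_path."""
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    # `> log 2>&1` sends both streams of the child process into the log file,
    # so getstatusoutput only sees errors emitted by the shell itself.
    cmd = f"cd {workdir}; {command} > {log_path} 2>&1"
    status, output = subprocess.getstatusoutput(cmd)
    if status != 0:
        # Mirror run_project: surface whatever the shell reported on failure.
        raise RuntimeError(output)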
def __init__(
    self,
    origin_dataset_path: str,
    validation_dataset_path: str = "",
    test_dataset_path: str = "",
    model_save_path: str = "",
    validation_split: float = 0.1,
    test_split: float = 0.1,
    batch_size: int = 32,
    epochs: int = 30,
    learning_rate: float = 0.01,
    project_id: int = 0,
    maxlen: int = 128,
    frezee_pre_trained_model=False,
    optimizer: str = "Adam",
    optimize_with_piecewise_linear_lr: bool = False,
    simplified_tokenizer: bool = False,
    pre_trained_model_type: str = "bert_base",
    language: str = "chinese",
    *args,
    **kwargs,
):
    """
    Args:
        origin_dataset_path (str): Path to the dataset before preprocessing
        validation_dataset_path (str): Path to the validation dataset. If not
            given, it is split from origin_dataset_path.
        test_dataset_path (str): Path to the test dataset. If not given, it is
            split from origin_dataset_path.
        model_save_path (str): Path where the model is saved
        validation_split (float): Fraction of data used for validation
        test_split (float): Fraction of data used for testing
        batch_size (int): Mini-batch size
        learning_rate (float): Learning rate
        epochs (int): Number of training epochs
        project_id (int): Training project ID
        maxlen (int, optional): Maximum length of a single text. Defaults to 128.
        frezee_pre_trained_model (bool, optional): Whether to freeze the
            pre-trained model weights while training the downstream network.
            Defaults to False.
        optimizer (str, optional): Optimizer type. Defaults to "Adam".
        optimize_with_piecewise_linear_lr (bool): Whether to optimize with a
            piecewise linear learning rate. Defaults to False.
        simplified_tokenizer (bool): Whether to simplify the tokenizer's
            vocabulary. Defaults to False.
        pre_trained_model_type (str): Which pre-trained model to use
        language (str): Language of the pre-training corpus
    """
    self._call_code = ""
    self.project_id = project_id
    self.frezee_pre_trained_model = frezee_pre_trained_model
    self.learning_rate = learning_rate
    self.optimize_with_piecewise_linear_lr = optimize_with_piecewise_linear_lr
    self.optimizer_cls = self.get_optimizer_cls(optimizer)
    origin_dataset_path = file_util.abspath(origin_dataset_path)
    model_save_path = file_util.abspath(model_save_path)
    self.simplified_tokenizer = simplified_tokenizer
    self.pre_trained_model_type = pre_trained_model_type
    self.language = language
    if self.pre_trained_model_type not in self.model_lang_weights_dict:
        raise Exception(
            f"The given model {self.pre_trained_model_type} does not exist! "
            f"Supported models: {list(self.model_lang_weights_dict.keys())}"
        )
    if self.language not in self.model_lang_weights_dict[self.pre_trained_model_type]:
        languages = list(
            self.model_lang_weights_dict[self.pre_trained_model_type].keys()
        )
        raise Exception(
            f"No {self.pre_trained_model_type} pre-trained model exists for the "
            f"corpus language {self.language}! Supported languages: {languages}"
        )
    self.maxlen = maxlen
    self.origin_dataset_path = origin_dataset_path
    if validation_dataset_path:
        self.validation_dataset_path = file_util.abspath(validation_dataset_path)
    else:
        self.validation_dataset_path = validation_dataset_path
    if test_dataset_path:
        self.test_dataset_path = file_util.abspath(test_dataset_path)
    else:
        self.test_dataset_path = test_dataset_path
    # When no model save path is given, default to the same path as the
    # original dataset
    self.project_save_name = self.init_project_save_name(project_id)
    if model_save_path:
        self.project_save_path = os.path.join(model_save_path, self.project_save_name)
    else:
        self.project_save_path = os.path.join(
            os.path.dirname(self.origin_dataset_path), self.project_save_name
        )
    self.model_save_path = os.path.join(self.project_save_path, "best_weights.h5")
    self.validation_split = validation_split
    self.test_split = test_split
    self.batch_size = batch_size
    self.epochs = epochs
    file_util.mkdirs(self.project_save_path)
    self.model = None
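# Hedged sketch (not luwu's implementation): one way the validation_split /
# test_split ratios documented above could carve a sample list into train/val/test
# subsets when no dedicated validation or test dataset path is given. The helper
# name `split_samples` is an assumption for illustration only.
def split_samples(samples, validation_split=0.1, test_split=0.1):
    """Slice a list of samples into (train, validation, test) by ratio."""
    n = len(samples)
    n_test = int(n * test_split)
    n_val = int(n * validation_split)
    test = samples[:n_test]
    val = samples[n_test:n_test + n_val]
    train = samples[n_test + n_val:]
    return train, val, test


if __name__ == "__main__":
    train, val, test = split_samples(list(range(100)), 0.1, 0.1)
    print(len(train), len(val), len(test))  # 80 10 10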
def train_on_kaggle(self, task_type):
    # Generate the training code
    # Create a folder for the kernel
    kernel_path = os.path.join(self.tmp_dir_path, "kaggle-kernel")
    logger.info(f"Creating directory {kernel_path} ...")
    file_util.mkdirs(kernel_path)
    # Initialize the kernel
    logger.info("Initializing the kernel with the kaggle api ...")
    cmd = f"kaggle kernels init -p {kernel_path}"
    cmd_util.run_cmd(cmd)
    # Generate the training script
    override_params = {"project_id", "cmd", "luwu_version"}
    train_cmd_params = []
    if task_type == "classification":
        project_name = "luwu-classification-project"
        override_params.update(["net_name", "network_name"])
        # Path of the tfrecord dataset
        tfrecord_dataset_path = "./dataset"
        train_cmd_params.append(f"--tfrecord_dataset_path {tfrecord_dataset_path}")
        override_params.add("tfrecord_dataset_path")
    elif task_type == "detection":
        project_name = "luwu-object-detection-project"
        override_params.update(
            [
                "label_map_path",
                "fine_tune_checkpoint_path",
            ]
        )
        # Path of the tfrecord dataset
        tfrecord_dataset_path = "./dataset"
        train_cmd_params.append(f"--tfrecord_dataset_path {tfrecord_dataset_path}")
        override_params.add("tfrecord_dataset_path")
    elif task_type == "text_classification":
        project_name = "luwu-text-classification-project"
    else:
        raise Exception(f"Unsupported task type! {task_type}")
    # Path of the original dataset
    origin_dataset_path = os.path.join("../input", self.dataset_title)
    if self.kwargs.get("cmd") == "text_classification":
        filename = self.kwargs.get("origin_dataset_path").split("/")[-1]
        origin_dataset_path = os.path.join(origin_dataset_path, filename)
    train_cmd_params.append(f"--origin_dataset_path {origin_dataset_path}")
    override_params.add("origin_dataset_path")
    # Path where the model is saved
    model_save_path = "./project"
    train_cmd_params.append(f"--model_save_path {model_save_path}")
    override_params.add("model_save_path")
    # Other parameters
    for arg_name, arg_value in self.kwargs.items():
        if "kaggle" in arg_name:
            continue
        if arg_name in override_params:
            continue
        # Handle bool-typed parameters
        if arg_value != False:
            train_cmd_params.append(f'--{arg_name} "{arg_value}"')
        # else:
        #     train_cmd_params.append(f"--{arg_name}")
    if task_type == "classification":
        train_cmd = (
            f"!luwu {task_type} {' '.join(train_cmd_params)} "
            f"{self.luwu_model_class.__name__}\n"
        )
    elif task_type == "detection":
        train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)}\n"
    elif task_type == "text_classification":
        train_cmd = f"!luwu {task_type} {' '.join(train_cmd_params)}\n"
    else:
        raise Exception(f"Unsupported task type! {task_type}")
{task_type}") project_path = os.path.join(model_save_path, project_name) if task_type == "classification": zip_cmd = ( f"!mv {project_path} ./ " f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} " f"&& rm -rf {tfrecord_dataset_path} " f"&& rm -rf ./{project_name} " f"&& rm -rf {model_save_path} \n") elif task_type == "detection": zip_cmd = ( f"!mv {project_path} ./ " f'&& rm -rf {os.path.join(project_name,"train_models")} ' f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} " f"&& rm -rf {tfrecord_dataset_path} " f"&& rm -rf ./{project_name} " f"&& rm -rf {model_save_path} \n") elif task_type == "text_classification": zip_cmd = ( f"!mv {project_path} ./ " f"&& zip -r {project_name}-{self.uuid}.zip ./{project_name} " f"&& rm -rf ./{project_name} " f"&& rm -rf {model_save_path} \n") luwu_version = self.kwargs.get("luwu_version") if luwu_version: install_cmd = f"!pip install luwu=={luwu_version}\n" else: install_cmd = "!pip install luwu\n" codes = [ "# 安装 luwu\n", install_cmd, "# 执行训练指令\n", train_cmd, "# 打包待下载文件的指令\n", zip_cmd, " ", ] script_metadata = self.load_notebook_metadata() self.update_notebook_codes(script_metadata, codes) kernel_file_path = os.path.join(kernel_path, f"luwu-kernel-{self.uuid}.ipynb") with open(kernel_file_path, "w") as f: json.dump(script_metadata, f, ensure_ascii=False, indent=2) # 修改 kernel-metadata.json kernel_metadata_path = os.path.join(kernel_path, "kernel-metadata.json") with open(kernel_metadata_path, "r") as f: kernel_metadata = json.load(f) kernel_metadata["id"] = (kernel_metadata["id"].split("/")[0] + "/" + f"luwu-kernel-{self.uuid}") kernel_metadata["title"] = f"luwu-kernel-{self.uuid}" kernel_metadata["code_file"] = kernel_file_path kernel_metadata["language"] = "python" kernel_metadata["kernel_type"] = "notebook" kaggle_accelerator = self.kwargs.get("kaggle_accelerator", False) if kaggle_accelerator: kernel_metadata["enable_gpu"] = "true" else: kernel_metadata["enable_gpu"] = "false" kernel_metadata["dataset_sources"] = [ self.dataset_id, ] with open(kernel_metadata_path, "w") as f: json.dump(kernel_metadata, f, ensure_ascii=False, indent=2) logger.info(f"kernel metadata :{kernel_metadata}") self.kernel_id = kernel_metadata["id"] self.kernel_title = kernel_metadata["title"] # 推送并运行kernel logger.info("将 kernel 推送到 Kaggle 并运行 ...") cmd = f"kaggle kernels push -p {kernel_path}" logger.debug(cmd) cmd_util.run_cmd(cmd) logger.info("推送完成!等待运行中 ...") running = False error_cnt = 0 while True: cmd = f"kaggle kernels status {self.kernel_id}" code, output = subprocess.getstatusoutput(cmd) if code != 0: logger.error(output) raise Exception(output) pattern = 'has status "([^"]*)"' matches = re.findall(pattern, output) if not matches: logger.error(f"未查询到状态!{output}") error_cnt += 1 if error_cnt > 10: raise Exception( f"连续10次未获取到 kernel {self.kernel_id} 的运行状态!") else: status = matches[0] # 运行之前,所有的状态都忽略 if not running: if status == "running": logger.info(f"{self.kernel_id} running ...") running = True else: # 运行之后,找到第一次非 running 状态就退出 if status == "running": logger.info(f"{self.kernel_id} running ...") else: self.kernel_exit_status = status logger.info(output) logger.info( f"{self.kernel_id} 终止状态:{self.kernel_exit_status} . 已退出!" ) break time.sleep(10) logger.info("kernel 运行已结束!")
def __init__( self, origin_dataset_path: str = "", validation_dataset_path: str = "", test_dataset_path: str = "", tfrecord_dataset_path: str = "", model_save_path: str = "", validation_split: float = 0.1, test_split: float = 0.1, batch_size: int = 32, epochs: int = 30, learning_rate: float = 0.01, project_id: int = 0, image_size: int = 224, do_fine_tune=False, with_image_net=True, optimizer: str = "Adam", freeze_epochs_ratio: float = 0.1, image_augmentation_random_flip_horizontal: bool = False, image_augmentation_random_flip_vertival: bool = False, image_augmentation_random_crop: bool = False, image_augmentation_random_brightness: bool = False, image_augmentation_random_hue: bool = False, **kwargs, ): """ Args: origin_dataset_path (str): 处理前的数据集路径 validation_dataset_path (str): 验证数据集路径。如不指定, 则从origin_dataset_path中进行切分。 test_dataset_path (str): 测试数据集路径。如不指定,则从 origin_dataset_path中进行切分。 tfrecord_dataset_path (str): 处理后的数据集路径 model_save_path (str): 模型保存路径 validation_split (float): 验证集切割比例 test_split (float): 测试集切割比例 batch_size (int): mini batch 大小 learning_rate (float): 学习率大小 epochs (int): 训练epoch数 project_id (int): 训练项目编号 with_image_net (bool): 是否使用imagenet的均值初始化数据 optimizer (str): 优化器类别 freeze_epochs_ratio (float): 当进行fine_tune时,会先冻结预训练模型进行训练一定epochs, 再解冻全部参数训练一定epochs,此参数表示冻结训练epochs占 全部epochs的比例(此参数仅当 do_fine_tune = True 时有效)。 默认 0.1(当总epochs>1时,只要设置了比例,至少会训练一个epoch) image_augmentation_random_flip_horizontal (bool): 数据增强选项,是否做随机左右镜像。默认False. image_augmentation_random_flip_vertival (bool): 数据增强选项,是否做随机上下镜像。默认False. image_augmentation_random_crop (bool): 数据增强选项,是否做随机剪裁,剪裁尺寸为原来比例的0.9。默认False. image_augmentation_random_brightness (bool): 数据增强选项,是否做随机饱和度调节。默认False. image_augmentation_random_hue (bool): 数据增强选项,是否做随机色调调节。默认False. """ self._call_code = "" self.project_id = project_id self.do_fine_tune = do_fine_tune self.with_image_net = with_image_net self.learning_rate = learning_rate self.freeze_epochs_ratio = freeze_epochs_ratio self.image_augmentation_random_flip_horizontal = ( image_augmentation_random_flip_horizontal) self.image_augmentation_random_flip_vertival = ( image_augmentation_random_flip_vertival) self.image_augmentation_random_crop = image_augmentation_random_crop self.image_augmentation_random_brightness = image_augmentation_random_brightness self.image_augmentation_random_hue = image_augmentation_random_hue self.optimizer_cls = self.get_optimizer_cls(optimizer) origin_dataset_path = file_util.abspath(origin_dataset_path) tfrecord_dataset_path = file_util.abspath(tfrecord_dataset_path) model_save_path = file_util.abspath(model_save_path) self.image_size = image_size self.origin_dataset_path = origin_dataset_path if validation_dataset_path: self.validation_dataset_path = file_util.abspath( validation_dataset_path) else: self.validation_dataset_path = validation_dataset_path if test_dataset_path: self.test_dataset_path = file_util.abspath(test_dataset_path) else: self.test_dataset_path = test_dataset_path # 当未给定处理后数据集的路径时,默认保存到原始数据集相同路径 if tfrecord_dataset_path: self.tfrecord_dataset_path = tfrecord_dataset_path else: self.tfrecord_dataset_path = origin_dataset_path # 当未给定模型保存路径时,默认保存到处理后数据集相同路径 if self.project_id: self.project_save_name = f"luwu-classification-project-{self.project_id}" else: self.project_save_name = f"luwu-classification-project" if model_save_path: self.project_save_path = os.path.join(model_save_path, self.project_save_name) else: self.project_save_path = os.path.join(self.tfrecord_dataset_path, self.project_save_name) self.model_save_path = 
    self.validation_split = validation_split
    self.test_split = test_split
    self.batch_size = batch_size
    self.epochs = epochs
    file_util.mkdirs(self.project_save_path)
    file_util.mkdirs(self.tfrecord_dataset_path)
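# Hedged sketch (not from luwu): one way the freeze_epochs_ratio documented above
# could translate into a frozen-stage / unfrozen-stage epoch split during
# fine-tuning. The max(1, ...) mirrors the "at least one epoch" note in the
# docstring; the exact formula and the helper name are assumptions.
def split_fine_tune_epochs(epochs: int, freeze_epochs_ratio: float):
    """Return (frozen_epochs, unfrozen_epochs) for a fine-tuning run."""
    freeze_epochs = int(epochs * freeze_epochs_ratio)
    if epochs > 1 and freeze_epochs_ratio > 0:
        freeze_epochs = max(1, freeze_epochs)
    return freeze_epochs, epochs - freeze_epochs


if __name__ == "__main__":
    print(split_fine_tune_epochs(30, 0.1))  # (3, 27)
    print(split_fine_tune_epochs(5, 0.1))   # (1, 4)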