def process_folder(self, in_dir, out_dir, n_prc=40):
    """
    Process every file found under ``in_dir`` with a pool of worker processes.

    The files are listed with ``traverse_dir_files``, shuffled with a fixed
    random seed (47) so the order is reproducible, and each one is submitted
    to ``DataProcessor.process_img(path, name, out_dir)``.

    :param in_dir: input folder
    :param out_dir: output folder (created if it does not exist)
    :param n_prc: number of worker processes, defaults to 40
    :return: None
    """
    print('[Info] in_dir: {}'.format(in_dir))
    print('[Info] out_dir: {}'.format(out_dir))
    mkdir_if_not_exist(out_dir)

    paths_list, names_list = traverse_dir_files(in_dir)
    print('[Info] 待处理文件数量: {}'.format(len(paths_list)))

    random.seed(47)
    paths_list, names_list = shuffle_two_list(paths_list, names_list)

    def _error_reporter(failed_path):
        """Build a callback that reports a worker failure for ``failed_path``."""
        def _report(exc):
            print('[Error] path: {}, error: {}'.format(failed_path, exc))
        return _report

    pool = Pool(processes=n_prc)
    for idx, (path, name) in enumerate(zip(paths_list, names_list)):
        # Without an error_callback, an exception raised inside the worker
        # is stored on the (discarded) AsyncResult and never surfaces.
        pool.apply_async(
            DataProcessor.process_img,
            args=(path, name, out_dir),
            error_callback=_error_reporter(path),
        )
        # This counts submitted tasks, not finished ones.
        if (idx + 1) % 1000 == 0:
            print('[Info] num: {}'.format(idx + 1))

    # No more tasks will be submitted; wait for the workers to drain the queue.
    pool.close()
    pool.join()

    print('[Info] 处理完成! {}'.format(out_dir))
def process(self):
    """
    Convert every file under ``DATA_DIR/biaozhu_csv`` into a ``.txt`` file
    under ``DATA_DIR/biaozhu_csv_out``.

    The output file name is built from the first two space-separated tokens
    of the input file name, joined with an underscore. The conversion itself
    is delegated to ``self.process_path_1``.

    :return: None
    """
    src_dir = os.path.join(DATA_DIR, 'biaozhu_csv')
    dst_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
    mkdir_if_not_exist(dst_dir)

    file_paths, file_names = traverse_dir_files(src_dir)
    for src_path, src_name in zip(file_paths, file_names):
        print('[Info] path: {}'.format(src_path))
        # Keep only the first two space-separated tokens of the name.
        stem = "_".join(src_name.split(' ')[:2])
        dst_path = os.path.join(dst_dir, stem + '.txt')
        self.process_path_1(src_path, dst_path)
def process_folder(self, img_dir, out_dir):
    """
    Process every image in a folder and write the resulting patches.

    Each file returned by ``traverse_dir_files`` is passed to
    ``self.process_img``; every patch it returns is saved as
    ``<stem>.o<idx>.jpg`` in ``out_dir``, where ``<stem>`` is the part of
    the file name before the first dot.

    :param img_dir: input folder
    :param out_dir: output folder (created if it does not exist)
    :return: None
    """
    print('[Info] 处理文件夹: {}'.format(img_dir))
    print('[Info] 输出文件夹: {}'.format(out_dir))
    mkdir_if_not_exist(out_dir)

    paths_list, names_list = traverse_dir_files(img_dir)
    for path, name in zip(paths_list, names_list):
        patch_list = self.process_img(path)
        stem = name.split('.')[0]
        for idx, img_p in enumerate(patch_list):
            # Format only the file name, never the joined path: a '{' or '}'
            # inside out_dir or the stem would otherwise break str.format.
            out_name = "{}.o{}.jpg".format(stem, idx)
            out_path = os.path.join(out_dir, out_name)
            # cv2.imwrite signals failure through its return value.
            if not cv2.imwrite(out_path, img_p):
                print('[Warn] failed to write: {}'.format(out_path))

    print('[Info] 处理完成: {}'.format(out_dir))
def __init__(self):
    """
    Define the input and output locations for the ``train_data_v3`` data.

    Inputs are read from ``DATA_DIR/train_data_v3`` and outputs go to
    ``DATA_DIR/train_data_v3_out``. All output folders are created here.
    """
    join = os.path.join

    # ---- inputs -------------------------------------------------------
    src_root = join(DATA_DIR, 'train_data_v3')
    text_src = join(src_root, 'text_dec')

    self.file_dir = src_root
    self.folder1_dir = join(src_root, 'formula_dec')
    self.folder2_dir = join(src_root, 'wrote_formula_dec')
    self.folder3_dir = join(text_src, 'csv')
    self.folder4_dir = text_src
    self.folder5_dir = join(src_root, 'formula_detection')

    self.file1_path = join(text_src, 'raw_ciyubuquan.txt')
    self.file2_path = join(text_src, 'raw_shouxietouzi.txt')
    self.file3_path = join(text_src, 'text_formula_biaozhu_1.txt')
    self.file4_path = join(text_src, 'yuwen_text_detection_1105_zl_1w_clean.csv')
    self.file5_path = join(text_src, 'wrote_touzi_all.txt')
    self.file6_path = join(text_src, 'jingbiao.txt')

    # ---- outputs ------------------------------------------------------
    dst_root = join(DATA_DIR, 'train_data_v3_out')
    text_dst = join(dst_root, 'text_dec_out')

    self.out_dir = dst_root
    self.out1_dir = join(dst_root, 'formula_dec_out')
    self.out2_dir = join(dst_root, 'wrote_formula_dec_out')
    self.out3_up_dir = text_dst
    self.out3_dir = join(text_dst, 'csv_out')
    self.out5_dir = join(dst_root, 'formula_detection_out')

    # Parents are listed before their children (out_dir, then text_dec_out,
    # then csv_out) so each folder's parent exists when it is created.
    for folder in (
        self.out_dir,
        self.out1_dir,
        self.out2_dir,
        self.out3_up_dir,
        self.out3_dir,
        self.out5_dir,
    ):
        mkdir_if_not_exist(folder)

    self.out1_path = join(text_dst, 'raw_ciyubuquan.out.txt')
    self.out2_path = join(text_dst, 'raw_shouxietouzi.out.txt')
    self.out3_path = join(text_dst, 'text_formula_biaozhu_1.out.txt')
    self.out4_path = join(text_dst, 'yuwen_text_detection_1105_zl_1w_clean.out.txt')
    self.out5_path = join(text_dst, 'wrote_touzi_all.out.txt')
    self.out6_path = join(text_dst, 'jingbiao.out.txt')
def __init__(self):
    """
    Point at the ``sample_complex_formula.txt`` input under ``DATA_DIR`` and
    make sure the ``sample_complex_formula_out`` output folder exists.
    """
    src_file = os.path.join(DATA_DIR, 'sample_complex_formula.txt')
    dst_dir = os.path.join(DATA_DIR, 'sample_complex_formula_out')

    self.file_path, self.out_dir = src_file, dst_dir
    mkdir_if_not_exist(dst_dir)
def __init__(self, model_path, out_dir):
    """
    Store the model path and the output folder, creating the folder if needed.

    :param model_path: path of the model, stored as ``self.model_path``
    :param out_dir: output folder, stored as ``self.out_dir``
    """
    self.model_path, self.out_dir = model_path, out_dir
    mkdir_if_not_exist(out_dir)

    print('[Info] 模型路径: {}'.format(model_path))
    print('[Info] 输出文件夹: {}'.format(out_dir))
def generate_file(file_path, file_idx):
    """
    Build detection training data (images + label files) from one annotation file.

    The first line of ``file_path`` is parsed as JSON with an ``images`` list
    (``id``, ``file_name``, ``height``, ``width``) and an ``annotations`` list
    (``image_id``, ``bbox``). Each ``bbox`` is treated as
    ``[x, y, width, height]``, turned into corner form, and converted with
    ``DatasetGeneratorV2.convert``. Every label line uses class id ``0``.

    Images are downloaded by name, then split: the first ``9 * (n // 10)``
    names go to ``train`` and the rest to ``val``. Files are written under
    ``DATA_DIR/ps_datasets_v2/{images,labels}/{train,val}``.

    :param file_path: path of the annotation file
    :param file_idx: index of this file; zero-padded to 4 digits and embedded
        in every output file name
    :return: None
    """
    file_idx = str(file_idx).zfill(4)
    print('[Info] file_path: {}, file_idx: {}'.format(file_path, file_idx))
    url_format = "http://sm-transfer.oss-cn-hangzhou.aliyuncs.com/zhengsheng.wcl/problems_segmentation/" \
                 "datasets/prelabeled-20201224/{}.jpg"

    out_dataset_dir = os.path.join(DATA_DIR, 'ps_datasets_v2')
    out_images_dir = os.path.join(out_dataset_dir, 'images')
    out_images_train_dir = os.path.join(out_images_dir, 'train')
    out_images_val_dir = os.path.join(out_images_dir, 'val')
    out_labels_dir = os.path.join(out_dataset_dir, 'labels')
    out_labels_train_dir = os.path.join(out_labels_dir, 'train')
    out_labels_val_dir = os.path.join(out_labels_dir, 'val')
    # Parents come before children so each level exists when it is needed.
    for folder in (
        out_dataset_dir,
        out_images_dir,
        out_images_train_dir,
        out_images_val_dir,
        out_labels_dir,
        out_labels_train_dir,
        out_labels_val_dir,
    ):
        mkdir_if_not_exist(folder)

    print('[Info] 处理数据开始: {}'.format(file_path))
    # Only the first line of the file is parsed.
    data_line = read_file(file_path)[0]
    data_dict = json.loads(data_line)
    print('[Info] keys: {}'.format(data_dict.keys()))

    # image id -> [name without extension, height, width]
    id_name_dict = {
        img['id']: [img['file_name'].split('.')[0], img['height'], img['width']]
        for img in data_dict['images']
    }

    # image name -> list of label lines
    image_dict = collections.defaultdict(list)
    for anno in data_dict["annotations"]:
        image_name, ih, iw = id_name_dict[anno['image_id']]
        wh_box = anno['bbox']
        bbox = [wh_box[0], wh_box[1], wh_box[0] + wh_box[2], wh_box[1] + wh_box[3]]
        # Skip degenerate boxes (zero or negative width / height).
        if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
            continue
        bbox_yolo = DatasetGeneratorV2.convert(iw, ih, bbox)
        bbox_yolo = [str(round(i, 6)) for i in bbox_yolo]
        image_dict[image_name].append(" ".join(["0", *bbox_yolo]))
    print('[Info] 样本数: {}'.format(len(image_dict)))

    # 9:1 split; with fewer than 10 images gap is 0 and everything goes to val.
    image_name_list = list(image_dict.keys())
    gap = len(image_name_list) // 10
    image_train_list = image_name_list[:gap * 9]
    image_val_list = image_name_list[gap * 9:]
    print('[Info] 训练: {}, 验证: {}'.format(len(image_train_list), len(image_val_list)))

    def _export_split(split, image_names, images_dir, labels_dir):
        """Download each image of one split and write its image and label file."""
        for idx, image_name in enumerate(image_names):
            print('[Info] idx: {}'.format(idx))
            bbox_yolo_list = image_dict[image_name]
            image_url = url_format.format(image_name)
            is_ok, img_bgr = download_url_img(image_url)
            # NOTE(review): assumes is_ok is falsy (or the image is None) on a
            # failed download - confirm against download_url_img.
            if not is_ok or img_bgr is None:
                # Skip both files so no label is left without its image.
                print('[Warn] download failed, skipped: {}'.format(image_url))
                continue
            out_name = "{}_{}_{}".format(split, file_idx, str(idx).zfill(6))
            img_path = os.path.join(images_dir, '{}.jpg'.format(out_name))
            cv2.imwrite(img_path, img_bgr)
            print('[Info] img_path: {}'.format(img_path))
            lbl_path = os.path.join(labels_dir, '{}.txt'.format(out_name))
            write_list_to_file(lbl_path, bbox_yolo_list)
            print('[Info] lbl_path: {}'.format(lbl_path))
            print('[Info] ' + "-" * 100)

    _export_split('train', image_train_list, out_images_train_dir, out_labels_train_dir)
    _export_split('val', image_val_list, out_images_val_dir, out_labels_val_dir)

    print('[Info] 处理完成! {}'.format(file_path))