def process_folder(self, in_dir, out_dir):
    """
    Process every file in in_dir with a pool of worker processes.

    :param in_dir: input directory
    :param out_dir: output directory (created if missing)
    :return: None
    """
    print('[Info] in_dir: {}'.format(in_dir))
    print('[Info] out_dir: {}'.format(out_dir))
    mkdir_if_not_exist(out_dir)

    paths_list, names_list = traverse_dir_files(in_dir)
    print('[Info] 待处理文件数量: {}'.format(len(paths_list)))

    random.seed(47)  # fixed seed -> reproducible shuffle order
    paths_list, names_list = shuffle_two_list(paths_list, names_list)

    # dispatch each file to the process pool (40 workers)
    pool = Pool(processes=40)
    for idx, (path, name) in enumerate(zip(paths_list, names_list)):
        pool.apply_async(DataProcessor.process_img, args=(path, name, out_dir))
        if (idx + 1) % 1000 == 0:
            print('[Info] num: {}'.format(idx + 1))
    pool.close()
    pool.join()

    print('[Info] 处理完成! {}'.format(out_dir))
    return
def check(self):
    """Strip URL query suffixes ("?...") from dataset file paths by renaming each file."""
    data_dir = os.path.join(ROOT_DIR, '..', 'datasets', 'datasets_v3')
    paths_list, names_list = traverse_dir_files(data_dir, is_sorted=False)
    print('[Info] 文件数量: {}'.format(len(paths_list)))
    for path in paths_list:
        clean_path = path.split("?")[0]  # everything before the first "?"
        shutil.move(path, clean_path)
    print('[Info] 处理完成!')
def get_problems_data(img_dir):
    """
    Split the images under img_dir into train/test path lists.

    :param img_dir: directory containing the images
    :return: (train_filenames, test_filenames) — a 95% / 5% split in traversal order
    """
    image_paths, image_names = traverse_dir_files(img_dir, is_sorted=False)
    # 95% train images and 5% test images
    # (fix: the old comment said 90/10, but the ratio actually used is 0.95)
    n_train_samples = int(len(image_paths) * 0.95)
    train_filenames = image_paths[:n_train_samples]
    test_filenames = image_paths[n_train_samples:]
    return train_filenames, test_filenames
def process(self):
    """Convert every annotation file under biaozhu_csv into a txt file in biaozhu_csv_out."""
    data_dir = os.path.join(DATA_DIR, 'biaozhu_csv')
    out_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
    mkdir_if_not_exist(out_dir)
    paths_list, names_list = traverse_dir_files(data_dir)
    for path, name in zip(paths_list, names_list):
        print('[Info] path: {}'.format(path))
        # output name = first two space-separated tokens of the input name, joined by "_"
        out_name = "_".join(name.split(' ')[:2])
        out_path = os.path.join(out_dir, '{}.txt'.format(out_name))
        self.process_path_1(path, out_path)
def filter_folder(self, in_dir):
    """
    Delete images in in_dir whose height/width ratio exceeds 2.

    :param in_dir: directory of images (over-tall files are removed in place)
    :return: None
    """
    paths_list, names_list = traverse_dir_files(in_dir)
    print('[Info] 样本数: {}'.format(len(paths_list)))
    n_remove = 0
    count = 0
    for path, name in zip(paths_list, names_list):
        img_bgr = cv2.imread(path)
        if img_bgr is None:
            # fix: cv2.imread returns None for unreadable/corrupt files;
            # skip instead of crashing on .shape
            count += 1
            continue
        h, w, _ = img_bgr.shape
        x = safe_div(h, w)
        if x > 2:  # overly tall image -> remove
            print('[Info] 删除: {}'.format(path))
            os.remove(path)
            n_remove += 1
        count += 1
        if count % 100 == 0:  # progress heartbeat
            print(count)
    print('[Info] 删除: {}'.format(n_remove))
    paths_list, names_list = traverse_dir_files(in_dir)
    print('[Info] 处理后, 样本数: {}'.format(len(paths_list)))
def merge_files(self):
    """
    Merge every txt file under biaozhu_csv_out into one file, stripping double quotes.

    Fix: ``str.replace`` returns a new string — the original discarded the
    result, so quotes were never actually removed; assign it back.
    """
    data_dir = os.path.join(DATA_DIR, 'biaozhu_csv_out')
    paths_list, names_list = traverse_dir_files(data_dir)
    out_path = os.path.join(DATA_DIR, 'biaozhu_csv_out.txt')
    all_data_lines = []
    for path, name in zip(paths_list, names_list):
        data_lines = read_file(path)
        for data_line in data_lines:
            data_line = data_line.replace("\"", "")  # strip quotes (reassignment required)
            all_data_lines.append(data_line)
    write_list_to_file(out_path, all_data_lines)
def merge(self):
    """Merge all txt files under self.out_dir, dedupe + sort, and write one combined file."""
    out_format = os.path.join(DATA_DIR, 'train_data_v3_out.{}.txt')
    paths_list, names_list = traverse_dir_files(self.out_dir)
    print('[Info] 总文本数: {}'.format(len(paths_list)))
    all_data_lines = []
    for path in paths_list:
        all_data_lines.extend(read_file(path))
    # drop duplicates, then fix a deterministic (sorted) order
    all_data_lines = sorted(set(all_data_lines))
    out_path = out_format.format(len(all_data_lines))
    print('[Info] 总数据量: {}'.format(len(all_data_lines)))
    write_list_to_file(out_path, all_data_lines)
    print('[Info] 写入数据完成: {}'.format(out_path))
def process():
    """Generate dataset files from every raw file under ps_datasets_v2_raw via a process pool."""
    dir_path = os.path.join(DATA_DIR, 'ps_datasets_v2_raw')
    paths_list, names_list = traverse_dir_files(dir_path)
    pool = Pool(processes=20)  # 20 worker processes
    for file_idx, (path, name) in enumerate(zip(paths_list, names_list)):
        print('[Info] path: {}'.format(path))
        pool.apply_async(DatasetGeneratorV2.generate_file, (path, file_idx))
    pool.close()
    pool.join()
    print('[Info] 全部处理完成: {}'.format(dir_path))
def process_folder(self, folder_dir, out_folder):
    """
    Process every CSV file in a folder.

    :param folder_dir: input folder
    :param out_folder: output folder
    :return: None
    """
    print('[Info] 待处理文件夹: {}'.format(folder_dir))
    paths_list, names_list = traverse_dir_files(folder_dir, ext='csv')
    print('[Info] 文件数量: {}'.format(len(paths_list)))
    for path, name in zip(paths_list, names_list):
        print('[Info] path: {}'.format(path))
        stem = name.split('.')[0]  # file name without extension
        out_path = os.path.join(out_folder, '{}.out.txt'.format(stem))
        create_file(out_path)  # ensure a fresh output file before writing
        self.process_csv(path, out_path)
def process_folder(self, img_dir, out_dir):
    """
    Split every image in img_dir into patches and write them to out_dir.

    :param img_dir: input folder
    :param out_dir: output folder
    :return: None
    """
    print('[Info] 处理文件夹: {}'.format(img_dir))
    print('[Info] 输出文件夹: {}'.format(out_dir))
    mkdir_if_not_exist(out_dir)
    paths_list, names_list = traverse_dir_files(img_dir)
    for path, name in zip(paths_list, names_list):
        patch_list = self.process_img(path)
        # output name template: <stem>.o<idx>.jpg
        out_template = os.path.join(out_dir, name.split('.')[0] + ".o{}.jpg")
        for idx, patch in enumerate(patch_list):
            cv2.imwrite(out_template.format(idx), patch)
    print('[Info] 处理完成: {}'.format(out_dir))