def crate_lmdb_by_aihub():
    """Convert the AI Hub handwriting dataset (JSON annotations) into
    train/val LMDB databases under data_generation/handwritten_sentence.

    NOTE(review): the name looks like a typo for `create_lmdb_by_aihub`;
    kept as-is so existing callers keep working.
    """
    output_path = 'data_generation/handwritten_sentence'
    train_db = MyLMDB(os.path.join(output_path, 'train'))
    val_db = MyLMDB(os.path.join(output_path, 'val'))
    # Character whitelist: samples containing characters outside this
    # dictionary are skipped.
    ch_class = load_class_dictionary('data_generation/kr_labels.txt')
    # Fix: the original literal contained a lone "\O" — an invalid escape
    # sequence (DeprecationWarning); "\\O" produces the identical string.
    text_in_wild_data_path = "D:\\Data\\OCR\\processed\\aihub\\01.손글씨\\"
    json_path = os.path.join(text_in_wild_data_path,
                             'handwriting_data_info1.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        json_obj = json.load(f)
    # Removed unused locals from the original: `type_dict` and the
    # per-annotation `image_type` (read but never used).
    for annotation in json_obj['annotations']:
        text = annotation['text']
        image_id = annotation['image_id']
        if not is_valid_text(ch_class, text):
            print(f'skip {text}')
            continue
        image_path = get_image_path(text_in_wild_data_path, image_id)
        if image_path is None:
            # Bug fix: this message used to print the dataset root path,
            # which is the same for every miss; print the missing id instead.
            print(f'skip {image_id}')
            continue
        # ~0.1% of the samples go to the validation set.
        if random.random() < 0.001:
            val_db.write_image_label(image_path, text)
        else:
            train_db.write_image_label(image_path, text)
    train_db.close()
    val_db.close()
def create_lmdb_by_TRDG(train_lmdb_path, val_lmdb_path, root_folder_path, resize=False):
    """Convert handwriting images generated by TextRecognitionDataGenerator
    (TRDG) into train/val LMDB databases.

    Walks `root_folder_path`; every sub-folder containing a `labels.txt`
    (lines of "<image_name> <label text...>") is imported.  Roughly 0.5% of
    the samples are routed to the validation DB.

    Args:
        train_lmdb_path: output path of the training LMDB.
        val_lmdb_path: output path of the validation LMDB.
        root_folder_path: root folder of the TRDG-generated data.
        resize: forwarded to MyLMDB.write_image_label.
    """
    train_db = MyLMDB(train_lmdb_path)
    val_db = MyLMDB(val_lmdb_path)
    ch_class = load_class_dictionary('data_generation/kr_labels.txt')
    for root_folder, folders, file_names in os.walk(root_folder_path):
        label_path = os.path.join(root_folder, 'labels.txt')
        if not os.path.exists(label_path):
            continue
        # TRDG prefixes generated file names; the original name is the last
        # '_'-separated token.  Map original name -> actual file name.
        file_name_dict = {}
        for file_name in file_names:
            original_name = file_name.split('_')[-1]
            file_name_dict[original_name] = file_name
        print(f'{root_folder} is processing')
        with open(label_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            items = line.strip().split()
            if not items:
                # Robustness: a blank line used to raise IndexError below.
                continue
            image_name = items[0]
            text = ' '.join(items[1:])
            if not is_valid_text(ch_class, text):
                print(f'skip {text} {image_name}')
                continue
            # Length cap depends on the target data type:
            # 39 for invoice, 24 for ui patch, 15 for vertical ui patch.
            if len(text) > 24:  # for ui patch
                continue
            # Bug fix: `file_name_dict[image_name]` raised KeyError and
            # aborted the whole run when labels.txt referenced a file that
            # is absent from the folder; skip such entries instead.
            generated_image_name = file_name_dict.get(image_name)
            if generated_image_name is None:
                print(f'{image_name} is not found')
                continue
            image_path = os.path.join(root_folder, generated_image_name)
            if not os.path.exists(image_path):
                print(f'{image_path} is not found')
                continue
            if random.random() < 0.005:
                val_db.write_image_label(image_path, text, resize)
            else:
                train_db.write_image_label(image_path, text, resize)
    train_db.close()
    val_db.close()
def create_lmdb_by_various_ocr_dataset(lmdb_path, root_folder_path, train_val, category):
    """Convert the "다양한 형태의 한글 문자 OCR" (various-form Korean character
    OCR) dataset into an LMDB database.

    Args:
        lmdb_path: output LMDB path.
        root_folder_path: folder containing numbered sub-folders of JSON labels.
        train_val: split name embedded in the label/source folder names.
        category: 'word' to join per-character annotations from
            obj['text']['word']; anything else uses the single character in
            obj['info']['text'].
    """
    db = MyLMDB(lmdb_path)
    ch_class = load_class_dictionary('data_generation/kr_labels.txt')
    for sub_folder_name in os.listdir(root_folder_path):  # 1, 2, 3 ~~
        for file_name in os.listdir(
                os.path.join(root_folder_path, sub_folder_name)):
            if file_name.split('.')[-1] != 'json':
                continue
            label_file_path = os.path.join(root_folder_path, sub_folder_name,
                                           file_name)
            with open(label_file_path, 'r', encoding='utf-8') as f:
                obj = json.load(f)
            text = ''
            if category == 'word':
                for ch_info in obj['text']['word']:
                    ch = ch_info['value']
                    if ch_class.get(ch) is None:
                        print(f'skip {ch}')
                        text = ''
                        break
                    text += ch
            else:
                ch = obj['info']['text']
                if ch_class.get(ch) is None:
                    print(f'skip {ch}')
                else:
                    text = ch
            if text == '':
                # Bug fix: this used to `break`, abandoning all remaining
                # files of the sub-folder after one invalid sample; only the
                # current file should be skipped.
                continue
            # The image lives in the mirrored "[원천]" (source) tree next to
            # the "[라벨]" (label) tree, with .jpg instead of .json.
            image_path = label_file_path.replace(
                f'[라벨]{train_val}_필기체',
                f'[원천]{train_val}_필기체').replace('.json', '.jpg')
            db.write_image_label(image_path, text)
    db.close()
# src = 'data/train/nullee_train' # target = 'nullee_train_vertical_strip' # garbage_path = 'garbage_vertical_val' # src = './data/train/nullee_train_vertical' # target = 'nullee_synth' # garbage_path = 'garbage_nullee_synth' # src = '/media/data/nullee_data/gen_synth_lmdb' # # src = '/media/data/nullee_data/unity_lmdb/' target = 'nullee_val_vertical' garbage_path = 'garbage_nullee_vertical' src = 'backup/nullee_val_vertical_no_rotate' class_dict = load_class_dictionary('data/train/kr_labels.txt', add_space=True) target_db = MyLMDB(target, mode='w', map_size=6e9) src_db = MyLMDB(src, mode='r', map_size=6e9) strip_db = MyLMDB(garbage_path, mode='w', map_size=6e9) num_of_samples = src_db.num_of_samples for i in range(1, num_of_samples): # if i > 38000: # break im, label = src_db.read_image_label(i) if len(label) > 39 or not is_valid_label( label.upper(), class_dict) or label.find( '#') >= 0 or label == '__#DELETED_LABEL#__)': strip_db.write_im_label(im, label)