예제 #1
0
def crate_lmdb_by_aihub():
    """Convert the AI Hub handwritten-sentence dataset into train/val LMDBs.

    Reads annotation metadata from ``handwriting_data_info1.json``, drops
    samples whose text contains characters outside the class dictionary,
    and writes roughly 0.1% of the remaining samples to the validation DB
    and the rest to the training DB.
    """
    output_path = 'data_generation/handwritten_sentence'
    train_db = MyLMDB(os.path.join(output_path, 'train'))
    val_db = MyLMDB(os.path.join(output_path, 'val'))
    ch_class = load_class_dictionary('data_generation/kr_labels.txt')
    # The original literal used a single backslash before "OCR" ('\O' is an
    # invalid escape sequence and raises SyntaxWarning on modern Python);
    # doubled here — the runtime string value is unchanged.
    text_in_wild_data_path = "D:\\Data\\OCR\\processed\\aihub\\01.손글씨\\"
    json_path = os.path.join(text_in_wild_data_path,
                             'handwriting_data_info1.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        json_obj = json.load(f)

    for annotation in json_obj['annotations']:
        text = annotation['text']
        image_id = annotation['image_id']
        if not is_valid_text(ch_class, text):
            print(f'skip {text}')
            continue

        image_path = get_image_path(text_in_wild_data_path, image_id)
        if image_path is None:
            print(f'skip {text_in_wild_data_path}')
            continue

        # ~0.1% of valid samples go to validation, the rest to training.
        if random.random() < 0.001:
            val_db.write_image_label(image_path, text)
        else:
            train_db.write_image_label(image_path, text)

    train_db.close()
    val_db.close()
예제 #2
0
def create_lmdb_by_TRDG(train_lmdb_path,
                        val_lmdb_path,
                        root_folder_path,
                        resize=False):
    """Convert handwriting data generated by TextRecognitionDataGenerator
    (TRDG) into train/val LMDBs.

    Walks ``root_folder_path`` looking for folders that contain a
    ``labels.txt`` file (one ``<image_name> <text>`` entry per line),
    validates each label against the character class dictionary, and splits
    samples ~0.5% / 99.5% between the validation and training databases.

    Args:
        train_lmdb_path: output path of the training LMDB.
        val_lmdb_path: output path of the validation LMDB.
        root_folder_path: root folder containing the generated data.
        resize: forwarded to ``MyLMDB.write_image_label``.
    """
    train_db = MyLMDB(train_lmdb_path)
    val_db = MyLMDB(val_lmdb_path)

    ch_class = load_class_dictionary('data_generation/kr_labels.txt')
    for root_folder, folders, file_names in os.walk(root_folder_path):
        label_path = os.path.join(root_folder, 'labels.txt')
        if not os.path.exists(label_path):
            continue

        # TRDG prefixes generated file names; the token after the last '_'
        # is the original name used as the key in labels.txt.
        file_name_dict = {}
        for file_name in file_names:
            original_name = file_name.split('_')[-1]
            file_name_dict[original_name] = file_name

        print(f'{root_folder} is processing')
        with open(label_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            items = line.strip().split()
            if not items:  # tolerate blank lines in labels.txt
                continue
            image_name = items[0]
            text = ' '.join(items[1:])
            if not is_valid_text(ch_class, text):
                print(f'skip {text} {image_name}')
                continue
            # Length cap is tuned per dataset: 39 for invoices, 24 for UI
            # patches, 15 for vertical UI patches.
            if len(text) > 24:  # for ui patch
                continue

            # A labels.txt entry may reference an image that was never
            # generated; skip it instead of raising KeyError.
            generated_image_name = file_name_dict.get(image_name)
            if generated_image_name is None:
                print(f'{image_name} has no generated file')
                continue
            image_path = os.path.join(root_folder, generated_image_name)
            if not os.path.exists(image_path):
                print(f'{image_path} is not found')
                continue

            # ~0.5% of valid samples go to validation.
            if random.random() < 0.005:
                val_db.write_image_label(image_path, text, resize)
            else:
                train_db.write_image_label(image_path, text, resize)

    train_db.close()
    val_db.close()
예제 #3
0
def create_lmdb_by_various_ocr_dataset(lmdb_path, root_folder_path, train_val,
                                       category):
    """Convert the AI Hub "various Korean character OCR" dataset into an LMDB.

    Iterates the numbered sub-folders of ``root_folder_path``, reading one
    JSON label file per sample. The matching image path is derived from the
    label path by swapping the '[라벨]' (label) folder segment for '[원천]'
    (source) and the '.json' suffix for '.jpg'.

    Args:
        lmdb_path: output LMDB path.
        root_folder_path: dataset root containing numbered sub-folders.
        train_val: split name embedded in the dataset folder names.
        category: 'word' to join per-character annotations from
            ``obj['text']['word']``; anything else takes the single value
            from ``obj['info']['text']``.
    """

    db = MyLMDB(lmdb_path)
    ch_class = load_class_dictionary('data_generation/kr_labels.txt')
    for sub_folder_name in os.listdir(root_folder_path):  # 1, 2, 3 ~~
        for file_name in os.listdir(
                os.path.join(root_folder_path, sub_folder_name)):
            if file_name.split('.')[-1] != 'json':
                continue
            label_file_path = os.path.join(root_folder_path, sub_folder_name,
                                           file_name)
            with open(label_file_path, 'r', encoding='utf-8') as f:
                obj = json.load(f)
            text = ''
            if category == 'word':
                for ch_info in obj['text']['word']:
                    ch = ch_info['value']
                    if ch_class.get(ch) is None:
                        print(f'skip {ch}')
                        text = ''
                        break
                    text += ch
            else:
                ch = obj['info']['text']
                if ch_class.get(ch) is None:
                    print(f'skip {ch}')
                else:
                    text = ch

            if text == '':
                # BUG FIX: the original used `break` here, which abandoned
                # the whole sub-folder after the first invalid sample; only
                # the current file should be skipped.
                continue
            image_path = label_file_path.replace(
                f'[라벨]{train_val}_필기체',
                f'[원천]{train_val}_필기체').replace('.json', '.jpg')
            db.write_image_label(image_path, text)
    db.close()
예제 #4
0
    # src = 'data/train/nullee_train'

    # target = 'nullee_train_vertical_strip'
    # garbage_path = 'garbage_vertical_val'
    # src = './data/train/nullee_train_vertical'

    # target = 'nullee_synth'
    # garbage_path = 'garbage_nullee_synth'
    # src = '/media/data/nullee_data/gen_synth_lmdb'
    # # src = '/media/data/nullee_data/unity_lmdb/'

    target = 'nullee_val_vertical'
    garbage_path = 'garbage_nullee_vertical'
    src = 'backup/nullee_val_vertical_no_rotate'

    class_dict = load_class_dictionary('data/train/kr_labels.txt',
                                       add_space=True)

    target_db = MyLMDB(target, mode='w', map_size=6e9)
    src_db = MyLMDB(src, mode='r', map_size=6e9)
    strip_db = MyLMDB(garbage_path, mode='w', map_size=6e9)

    num_of_samples = src_db.num_of_samples
    for i in range(1, num_of_samples):
        # if i > 38000:
        #     break

        im, label = src_db.read_image_label(i)
        if len(label) > 39 or not is_valid_label(
                label.upper(), class_dict) or label.find(
                    '#') >= 0 or label == '__#DELETED_LABEL#__)':
            strip_db.write_im_label(im, label)