def words_norm(location, output):
    """Normalize all word images of a dataset and save them to a new folder.

    Every ``*.png`` found in ``<location>/<data_folder>`` (``data_folder`` is
    a name defined elsewhere in this module) is passed through
    ``word_normalization`` and written, under its original file name, to
    ``<location>/<output>``.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, relative to ``location``.

    Returns:
        1 if the output folder already exists (the dataset is skipped),
        otherwise None.
    """
    output = os.path.join(location, output)
    if os.path.exists(output):
        # An existing output folder is treated as "already processed".
        print("THIS DATASET IS BEING SKIPPED")
        print("Output folder already exists:", output)
        return 1
    os.makedirs(output)

    imgs = glob.glob(os.path.join(location, data_folder, '*.png'))
    total = len(imgs)
    for i, img_path in enumerate(imgs):
        image = cv2.imread(img_path)
        # Simple check for invalid images: cv2.imread returns None when the
        # file cannot be decoded, and images of 20 rows or fewer are ignored.
        if image is not None and image.shape[0] > 20:
            cv2.imwrite(
                os.path.join(output, os.path.basename(img_path)),
                word_normalization(
                    image,
                    height=64,
                    border=False,
                    tilt=False,
                    hyst_norm=False))
        print(i)
        print_progress_bar(i, total)
    print("\tNumber of normalized words:", len(os.listdir(output)))
def extract(location, output, number=4):
    """Copy the labelled ORAND-CAR images into a single output folder.

    For both ``ORAND-CAR-2014/CAR-A`` and ``ORAND-CAR-2014/CAR-B`` every
    ``*.txt`` label file is read line by line. Each line holds an image name
    and its label separated by a tab. Non-empty image files are copied to
    ``<location>/<output>`` as ``<label>_<number>_<timestamp>.png``.

    Args:
        location: Root folder containing the ``ORAND-CAR-2014`` directory.
        output: Name of the output folder, relative to ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    for sub in ['ORAND-CAR-2014/CAR-A', 'ORAND-CAR-2014/CAR-B']:
        folder = os.path.join(location, sub)
        l_files = glob.glob(os.path.join(folder, '*.txt'))

        # Total number of label lines, used only for the progress bar.
        length = 0
        for fl in l_files:
            with open(fl) as f:
                length += sum(1 for _ in f)

        itr = 0
        for fl in l_files:
            # The image folder name is the label file path with its last six
            # characters replaced by 'images'.
            im_folder = fl[:-6] + 'images'
            with open(fl) as f:
                for line in f:
                    im, word = line.strip().split('\t')
                    impath = os.path.join(im_folder, im)
                    # Skip zero-byte image files.
                    if os.stat(impath).st_size != 0:
                        outpath = os.path.join(
                            output,
                            '%s_%s_%s.png' % (word, number, time.time()))
                        copyfile(impath, outpath)
                    print_progress_bar(itr, length)
                    itr += 1
    print("\tNumber of words:", len(os.listdir(output)))
def create_csv(datadir):
    """Write one CSV file per split with the word images found in ``datadir``.

    For each of the ``train``, ``dev`` and ``test`` sub-folders all ``*.png``
    files are loaded and written to ``<datadir>/<split>.csv`` with the
    columns ``label``, ``shape``, ``image`` and ``gaplines``:

    * ``label`` is the part of the file name before the first underscore,
    * ``shape`` is the image shape as comma separated numbers,
    * ``image`` holds the flattened pixel values, comma separated,
    * ``gaplines`` is the content of the ``.txt`` file next to the image
      (loaded as JSON, outer brackets stripped) or the string ``'None'``.

    Args:
        datadir: Folder containing the ``train``, ``dev`` and ``test``
            sub-folders; the CSV files are written directly into it.
    """
    print('Converting word images to CSV...')
    fieldnames = ['label', 'shape', 'image', 'gaplines']

    for split in ['train', 'dev', 'test']:
        paths = glob.glob(os.path.join(datadir, split, '*.png'))
        length = len(paths)
        labels = [os.path.basename(name).split('_')[0] for name in paths]

        # Load every image together with its optional gaplines sidecar.
        samples = []
        for i, img in enumerate(paths):
            gaplines = 'None'
            gap_file = img[:-3] + 'txt'
            if os.path.isfile(gap_file):
                with open(gap_file, 'r') as fp:
                    gaplines = str(simplejson.load(fp))[1:-1]
            # Second argument 0 is passed to cv2.imread as the read flag.
            samples.append((cv2.imread(img, 0), gaplines))
            print_progress_bar(i, length)

        with open(os.path.join(datadir, split + '.csv'), 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for label, (image, gaplines) in zip(labels, samples):
                writer.writerow({
                    fieldnames[0]: label,
                    # The shape belongs to the image, not to the
                    # (image, gaplines) pair.
                    fieldnames[1]: str(image.shape)[1:-1],
                    # tolist() yields plain Python ints, so the text does
                    # not depend on how NumPy prints its scalar types.
                    fieldnames[2]: str(image.flatten().tolist())[1:-1],
                    fieldnames[3]: gaplines
                })
    print('\tCSV files created!')
def extract(location, output, number=5):
    """Cut word images out of page scans described by ``.seg`` files.

    For the ``lob`` and ``numbers`` sub-folders of ``location`` every
    ``*.seg`` file is paired with the ``.tiff`` image of the same base name.
    The first line of a ``.seg`` file is read as an integer (used as the
    number of entries for the progress bar). Every following line is split
    on spaces: the first field, up to its first underscore, is the word; the
    remaining integer fields are used as column start, column end, row start
    and row end of the crop. Non-empty crops are saved to
    ``<location>/<output>`` as ``<word>_<number>_<timestamp>.png``.

    Args:
        location: Root folder containing the ``lob`` and ``numbers`` folders.
        output: Name of the output folder, relative to ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    for sub in ['lob', 'numbers']:
        folder = os.path.join(location, sub)
        seg_files = glob.glob(os.path.join(folder, '*.seg'))

        # Sum of the header counts, used only for the progress bar.
        length = 0
        for seg_file in seg_files:
            with open(seg_file, 'r') as f:
                length += int(f.readline())

        itr = 0
        for fl in seg_files:
            image = cv2.imread(fl[:-4] + ".tiff")
            with open(fl) as f:
                f.readline()  # Skip the header line holding the count.
                for line in f:
                    fields = line.strip().split(' ')
                    rect = [int(val) for val in fields[1:]]
                    word = line.split(' ')[0].split('_')[0]
                    im = image[rect[2]:rect[3], rect[0]:rect[1]]

                    # Ignore crops with a zero-sized dimension.
                    if 0 not in im.shape:
                        cv2.imwrite(
                            os.path.join(
                                output,
                                '%s_%s_%s.png' % (word, number, time.time())),
                            im)
                    print_progress_bar(itr, length)
                    itr += 1
    print("\tNumber of words:", len(os.listdir(output)))
def extract(location, output, number=2):
    """Copy the word images listed in ``words.txt`` into flat folders.

    ``<location>/words.txt`` is read line by line; lines starting with ``#``
    and blank lines are ignored. Fields are separated by single spaces: the
    first field is the image id ``a-b-...`` whose file is expected at
    ``<location>/words/a/a-b/<id>.png``, the second field is a status and
    the last field is the word. Images whose status is ``ok`` are copied to
    ``<location>/<output>``, all others to ``<location>/words_with_error``,
    as ``<word>_<number>_<timestamp>.png``.

    Entries are skipped when the image file is empty, when the word is one
    of ``.``, ``-`` or ``'``, or when the word contains any item of
    ``prohibited`` (a name defined elsewhere in this module).

    Args:
        location: Root folder containing ``words.txt`` and ``words``.
        output: Name of the output folder, relative to ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    err_output = os.path.join(location, 'words_with_error')
    for path in (output, err_output):
        if not os.path.exists(path):
            os.makedirs(path)

    folder = os.path.join(location, 'words')
    label_file = os.path.join(location, 'words.txt')

    # Number of lines in the label file, used only for the progress bar.
    with open(label_file) as fp:
        length = sum(1 for _ in fp)

    with open(label_file) as fp:
        for i, line in enumerate(fp):
            # Skip comment lines and blank lines (a blank line has no id to
            # build the image path from).
            if line.strip() and not line.startswith('#'):
                fields = line.strip().split(" ")
                im_id = fields[0]
                id_parts = im_id.split('-')
                impath = os.path.join(
                    folder,
                    id_parts[0],
                    id_parts[0] + '-' + id_parts[1],
                    im_id + '.png')
                word = fields[-1]

                if (os.stat(impath).st_size != 0
                        and word not in ['.', '-', "'"]
                        and not any(ch in word for ch in prohibited)):
                    out = output if fields[1] == 'ok' else err_output
                    outpath = os.path.join(
                        out, "%s_%s_%s.png" % (word, number, time.time()))
                    copyfile(impath, outpath)
            print_progress_bar(i, length)
    print("\tNumber of words:", len(os.listdir(output)))
def extract(location, output, number=1):
    """Re-save the images of four source folders into one output folder.

    Every file in the ``words``, ``archive``, ``cz_raw`` and ``en_raw``
    sub-folders of ``location`` is opened with ``Image.open`` and saved to
    ``<location>/<output>`` as ``<word>_<number>_<suffix>.png``, where
    ``<word>`` is the part of the file name before the first underscore and
    ``<suffix>`` is the part after the last underscore without its final
    four characters.

    Args:
        location: Root folder containing the four source folders.
        output: Name of the output folder, relative to ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    for sub in ('words', 'archive', 'cz_raw', 'en_raw'):
        source_dir = os.path.join(location, sub)
        file_names = os.listdir(source_dir)
        total = len(file_names)
        for idx, file_name in enumerate(file_names):
            name_parts = file_name.split('_')
            target_name = '%s_%s_%s.png' % (
                name_parts[0], number, name_parts[-1][:-4])
            source_path = os.path.join(source_dir, file_name)
            Image.open(source_path).save(os.path.join(output, target_name))
            print_progress_bar(idx, total)
    print("\tNumber of words:", len(os.listdir(output)))
def extract(location, output, number=3):
    """Convert the CVL word images to PNG files in one output folder.

    For ``cvl-database-1-1/testset`` and ``cvl-database-1-1/trainset`` all
    files matching ``words/*/*.tif`` are processed. The word is taken from
    the file name, which must look like ``<n>-<n>-<n>-<n>-<word>.tif``; it
    is transliterated with ``unidecode`` and the image is saved to
    ``<location>/<output>`` as ``<word>_<number>_<timestamp>.png``.
    Zero-byte files and files whose name does not follow the pattern are
    skipped.

    Args:
        location: Root folder containing ``cvl-database-1-1``.
        output: Name of the output folder, relative to ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    # Matched against the base name only, so the path separator of the
    # platform does not matter. The dot is escaped and the pattern anchored
    # so that words containing "tif" are not cut short.
    name_pattern = re.compile(r'^\d+-\d+-\d+-\d+-(.+)\.tif$')

    for sub in ['cvl-database-1-1/testset', 'cvl-database-1-1/trainset']:
        folder = os.path.join(location, sub)
        images = glob.glob(os.path.join(folder, 'words', '*', '*.tif'))
        total = len(images)
        for i, im in enumerate(images):
            match = name_pattern.match(os.path.basename(im))
            if match is not None and os.stat(im).st_size != 0:
                word = unidecode.unidecode(match.group(1))
                outpath = os.path.join(
                    output,
                    '%s_%s_%s.png' % (word, number, time.time()))
                Image.open(im).save(outpath)
            print_progress_bar(i, total)
    print("\tNumber of words:", len(os.listdir(output)))
os.makedirs(output_folder)

# Imported here because the copy step below needs it and the import block of
# this script is not part of this section.
import shutil

# Collect every PNG below the "processed" counterpart of each selected
# dataset folder (the dataset path with "raw" replaced by "processed").
imgs = []
for ds in args.dataset:
    processed_root = datasets[ds][1].replace("raw", "processed")
    for loc, _, _ in os.walk(processed_root):
        imgs += glob.glob(os.path.join(loc, '*.png'))

# Sort before shuffling so the starting order does not depend on the order
# in which the file system lists the files.
imgs.sort()
random.shuffle(imgs)

# 80 % train, 10 % dev, 10 % test.
length = len(imgs)
sp1 = int(0.8 * length)
sp2 = int(0.9 * length)
img_paths = {'train': imgs[:sp1], 'dev': imgs[sp1:sp2], 'test': imgs[sp2:]}

i = 0
for split in ['train', 'dev', 'test']:
    split_output = os.path.join(output_folder, split)
    if not os.path.exists(split_output):
        os.mkdir(split_output)
    for im_path in img_paths[split]:
        # Copy image
        shutil.copyfile(
            im_path,
            os.path.join(split_output, os.path.basename(im_path)))
        # Copy the optional '.txt' file stored next to the image;
        # create_csv looks for it beside each image of the split.
        txt_path = im_path[:-3] + 'txt'
        if os.path.isfile(txt_path):
            shutil.copyfile(
                txt_path,
                os.path.join(split_output, os.path.basename(txt_path)))
        print_progress_bar(i, length)
        i += 1
    print(
        "\tNumber of %s words: %s" % (split, len(os.listdir(split_output))))

if args.csv:
    create_csv(output_folder)