def __init__(self, source, batch_size, charset, max_text_length=128, predict=False):
    self.tokenizer = Tokenizer(charset, max_text_length)
    self.batch_size = batch_size

    self.partitions = ['test'] if predict else ['train', 'valid', 'test']
    self.size = dict()
    self.steps = dict()
    self.index = dict()

    self.dataset = reader.read_from_txt(source)
    self.arange = np.arange(len(self.dataset['train']['gt']))
    np.random.seed(42)

    for pt in self.partitions:
        self.dataset[pt]['dt'] = np.array([pp.text_standardize(x) for x in self.dataset[pt]['dt']])
        self.dataset[pt]['gt'] = np.array([pp.text_standardize(x) for x in self.dataset[pt]['gt']])

        self.size[pt] = len(self.dataset[pt]['gt'])
        self.steps[pt] = int(np.ceil(self.size[pt] / self.batch_size))

    self.one_hot_process = True
    self.noise_process = not bool(max(self.dataset['train']['dt'], default=['']))

    # increase `iterations` parameter if there is noise process in the train data
    if self.noise_process:
        ratio, iterations = pp.add_noise.__defaults__
        pp.add_noise.__defaults__ = (ratio, iterations + 2)
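A minimal instantiation sketch for the constructor above. The class name `DataGenerator` and the `source` path are assumptions for illustration; only the signature and the `steps` attribute come from the snippet itself:

charset_base = "".join([chr(i) for i in range(32, 127)])  # printable ASCII, as used elsewhere in these snippets

dtgen = DataGenerator(source="data/dataset.txt",  # hypothetical text source
                      batch_size=64,
                      charset=charset_base,
                      max_text_length=128)

print(dtgen.steps['train'])  # batches per epoch: ceil(partition size / batch size)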
def preprocess_partitions(self, input_size):
    """Preprocess images and sentences from partitions"""

    for y in self.partitions:
        arange = range(len(self.dataset[y]['gt']))

        for i in reversed(arange):
            text = pp.text_standardize(self.dataset[y]['gt'][i])

            if not self.check_text(text):
                self.dataset[y]['gt'].pop(i)
                self.dataset[y]['dt'].pop(i)
                continue

            self.dataset[y]['gt'][i] = text.encode()

        results = []
        with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
            print(f"Partition: {y}")

            for result in tqdm(pool.imap(partial(pp.preprocess, input_size=input_size),
                                         self.dataset[y]['dt']),
                               total=len(self.dataset[y]['dt'])):
                results.append(result)

            pool.close()
            pool.join()

        self.dataset[y]['dt'] = results
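The `Pool` + `tqdm` + `imap` pattern above is easy to lift out. A self-contained sketch, with a hypothetical `scale` worker standing in for `pp.preprocess`:

import multiprocessing
from functools import partial

from tqdm import tqdm

def scale(x, factor=1):
    # stand-in for pp.preprocess; must be a top-level function so it can be pickled
    return x * factor

if __name__ == "__main__":
    data = list(range(1000))

    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        # imap yields results lazily and in order, so tqdm can show live progress
        results = list(tqdm(pool.imap(partial(scale, factor=2), data), total=len(data)))

    print(results[:5])  # [0, 2, 4, 6, 8]

The `if __name__ == "__main__"` guard matters on platforms that spawn rather than fork worker processes (Windows, recent macOS).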
def read_lines(self, maxlen):
    """Read sentences from dataset and preprocess"""

    name = os.path.basename(self.source)
    print(f"The {name} dataset will be transformed...")

    dataset = getattr(self, f"_{name}")()

    # flatten partitioned datasets into a single list of sentences
    if not isinstance(dataset, list):
        dataset = dataset['train'] + dataset['valid'] + dataset['test']

    dataset = [y for x in dataset for y in pp.generate_multigrams(x)]
    dataset = [y for x in dataset for y in pp.split_by_max_length(x, maxlen)]
    dataset = [pp.text_standardize(x) for x in dataset]
    dataset = [x for x in dataset if self.check_text(x)]
    dataset = list(set(dataset))

    np.random.shuffle(dataset)

    index = int(len(dataset) * 0.1)
    self.dataset['train'] = dataset[index:]
    self.dataset['valid'] = dataset[:index]
    self.dataset['test'] = dataset[:32]  # just a sample

    del dataset

    for pt in self.partitions:
        self.size[pt] = len(self.dataset[pt])
        self.size['total'] += self.size[pt]
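A toy version of the shuffle-and-split at the end of `read_lines`, to make the partition logic concrete: 10% of the deduplicated sentences go to validation, the rest to training, and the test partition is a 32-sentence sample. Since it is sliced from the same shuffled list, it can overlap the other partitions:

import numpy as np

dataset = list(set(f"sentence {i}" for i in range(50)))  # deduplicated toy sentences
np.random.shuffle(dataset)

index = int(len(dataset) * 0.1)
train = dataset[index:]   # 45 sentences
valid = dataset[:index]   # 5 sentences
test = dataset[:32]       # just a sample; overlaps valid and train here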
def decode(self, text):
    """Decode vector to text"""

    decoded = "".join([self.chars[int(x)] for x in text if x > -1])
    decoded = self.remove_tokens(decoded)
    decoded = pp.text_standardize(decoded)

    return decoded
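The decode logic in isolation, on a toy run with a hypothetical vocabulary; the real `self.chars` layout, `remove_tokens`, and `pp.text_standardize` belong to the tokenizer and are omitted here:

chars = [" ", "a", "b", "c"]    # hypothetical vocabulary
indices = [1, 2, 0, 3, -1, -1]  # -1 entries act as padding and are skipped

decoded = "".join([chars[int(x)] for x in indices if x > -1])
print(decoded)  # "ab c"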
def decode(self, text):
    """Decode vector to text"""

    decoded = "".join([self.chars[int(x)] for x in text if x > -1])
    decoded = self.remove_tokens(decoded)
    decoded = pp.text_standardize(decoded)

    return decoded
def preprocess_partitions(self, image_input_size):
    """Preprocess images and sentences from partitions"""

    for i in self.partitions:
        self.dataset[i]["gt"] = [pp.text_standardize(x).encode() for x in self.dataset[i]["gt"]]

        pool = Pool()
        self.dataset[i]["dt"] = pool.map(partial(pp.preproc, img_size=image_input_size),
                                         self.dataset[i]["dt"])
        pool.close()
        pool.join()
def check_text(data):
    """Checks that the text contains more than just punctuation marks"""

    for i in reversed(range(len(data['gt']))):
        text = pp.text_standardize(data['gt'][i])

        strip_punc = text.strip(string.punctuation).strip()
        no_punc = text.translate(str.maketrans("", "", string.punctuation)).strip()

        if len(text) <= 1 or len(strip_punc) <= 1 or len(no_punc) <= 1:
            data['gt'].pop(i)
            data['dt'].pop(i)
            continue

    return data
def check_text(data, max_text_length=128):
    """Checks that the text contains more than just punctuation marks"""

    for i in reversed(range(len(data['gt']))):
        text = pp.text_standardize(data['gt'][i])

        strip_punc = text.strip(string.punctuation).strip()
        no_punc = text.translate(str.maketrans("", "", string.punctuation)).strip()

        length_valid = (len(text) > 1) and (len(text) < max_text_length)
        text_valid = (len(strip_punc) > 1) or (len(no_punc) > 1)

        if (not length_valid) or (not text_valid):
            data['gt'].pop(i)
            data['dt'].pop(i)
            continue

    return data
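A hypothetical walk-through of this variant, assuming `pp.text_standardize` leaves plain ASCII text unchanged: entries that are single-character, punctuation-only, or over the length limit are dropped from both lists in place:

data = {'gt': ["hello world", "!!!", "a", "x" * 200],
        'dt': ["img0.png", "img1.png", "img2.png", "img3.png"]}  # hypothetical pairs

data = check_text(data, max_text_length=128)
print(data['gt'], data['dt'])  # ['hello world'] ['img0.png'] (the rest fail the checks)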
def preprocess_partitions(self, input_size):
    """Preprocess images and sentences from partitions"""

    for y in self.partitions:
        arange = range(len(self.dataset[y]['gt']))

        for i in reversed(arange):
            text = pp.text_standardize(self.dataset[y]['gt'][i])

            if not self.check_text(text):
                self.dataset[y]['gt'].pop(i)
                self.dataset[y]['dt'].pop(i)
                continue

            self.dataset[y]['gt'][i] = text.encode()

        pool = Pool()
        self.dataset[y]['dt'] = pool.map(partial(pp.preprocess, input_size=input_size),
                                         self.dataset[y]['dt'])
        pool.close()
        pool.join()
def preprocess_partitions(self, input_size):
    """Preprocess images and sentences from partitions"""

    for y in self.partitions:
        arange = range(len(self.dataset[y]['gt']))

        for i in reversed(arange):
            text = pp.text_standardize(self.dataset[y]['gt'][i])

            if not self.check_text(text):
                self.dataset[y]['gt'].pop(i)
                self.dataset[y]['dt'].pop(i)
                continue

            self.dataset[y]['gt'][i] = text.encode()

        pool = Pool()
        self.dataset[y]['dt'] = pool.map(partial(pp.preprocess, input_size=input_size),
                                         self.dataset[y]['dt'])
        pool.close()
        pool.join()
        dtgen.dataset['test']['gt'])

    with open(os.path.join(output_path, "corpus.txt"), "w") as lg:
        lg.write(corpus)

elif args.test:
    if args.mode != "kaldi":
        lm.read_corpus(corpus_path=os.path.join(output_path, "corpus.txt"))

    start_time = datetime.datetime.now()

    predicts = lm.autocorrect(sentences=dtgen.dataset['test']['dt'])
    predicts = [pp.text_standardize(x) for x in predicts]

    total_time = datetime.datetime.now() - start_time

    old_metric, new_metric = ev.ocr_metrics(ground_truth=dtgen.dataset['test']['gt'],
                                            data=dtgen.dataset['test']['dt'],
                                            predict=predicts,
                                            norm_accentuation=args.norm_accentuation,
                                            norm_punctuation=args.norm_punctuation)

    p_corpus, e_corpus = report(dtgen=dtgen,
                                predicts=predicts,
                                metrics=[old_metric, new_metric],
                                total_time=total_time,
                                plus=f"N: {args.N}\n")
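`ev.ocr_metrics` is repo-specific; for orientation, the character error rate it presumably reports follows the standard definition, sketched here with the `editdistance` package (a generic illustration, not the repo's implementation):

import editdistance

def cer(ground_truth, predicts):
    # Levenshtein distance over characters, normalized by total ground-truth length
    dist = sum(editdistance.eval(gt, pd) for gt, pd in zip(ground_truth, predicts))
    total = sum(len(gt) for gt in ground_truth)
    return dist / max(total, 1)

print(cer(["hello"], ["hallo"]))  # 0.2: one substitution over five characters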
    predicts = model.predict(x=dtgen.next_test_batch(),
                             steps=dtgen.steps['test'],
                             ctc_decode=False,
                             verbose=1)

    # get data and ground truth lists
    ctc_TK, space_TK = "<ctc>", "<space>"
    multigrams, multigrams_size = dict(), 0
    ground_truth = []

    # generate multigrams to compose the dataset
    for pt in dtgen.partitions:
        multigrams[pt] = [pp.generate_multigrams(x) for x in dtgen.dataset[pt]['gt']]
        multigrams[pt] = list(set([pp.text_standardize(y) for x in multigrams[pt] for y in x]))
        multigrams[pt] = [x for x in multigrams[pt] if Dataset.check_text(x)]
        multigrams_size += len(multigrams[pt])

        for x in multigrams[pt]:
            ground_truth.append([space_TK if y == " " else y for y in list(f" {x} ")])

        for x in dtgen.dataset[pt]['gt']:
            ground_truth.append([space_TK if y == " " else y for y in list(f" {x} ")])
elif args.kaldi_assets:
    predicts = model.predict(x=dtgen.next_test_batch(),
                             steps=dtgen.steps['test'],
                             ctc_decode=False,
                             verbose=1)

    # get data and ground truth lists
    ctc_TK, space_TK = "<ctc>", "<space>"
    multigrams, multigrams_size = dict(), 0
    ground_truth = []

    # generate multigrams to compose the dataset
    for pt in dtgen.partitions:
        multigrams[pt] = [pp.generate_multigrams(x) for x in dtgen.dataset[pt]['gt']]
        multigrams[pt] = list(set([pp.text_standardize(y) for x in multigrams[pt] for y in x]))
        multigrams[pt] = [x for x in multigrams[pt] if Dataset.check_text(x)]
        multigrams_size += len(multigrams[pt])

        for x in multigrams[pt]:
            ground_truth.append([space_TK if y == " " else y for y in list(f" {x} ")])

        for x in dtgen.dataset[pt]['gt']:
            ground_truth.append([space_TK if y == " " else y for y in list(f" {x} ")])

    # define dataset size and default tokens
    train_size = dtgen.size['train'] + dtgen.size['valid'] + multigrams_size

    # get chars list and save with the ctc and space tokens
    chars = list(dtgen.tokenizer.chars) + [ctc_TK]
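The ground-truth token mapping above, on a toy sentence: each string gets a leading and trailing space, and every space becomes the `<space>` token:

space_TK = "<space>"
x = "ab c"

tokens = [space_TK if y == " " else y for y in list(f" {x} ")]
print(tokens)  # ['<space>', 'a', 'b', '<space>', 'c', '<space>']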
max_text_length = 128
charset_base = "".join([chr(i) for i in range(32, 127)])

if args.transform:
    assert os.path.exists(raw_path)
    print(f"The {args.dataset} dataset will be transformed...")

    mod = importlib.import_module(f"transform.{args.dataset}")
    os.makedirs(os.path.dirname(hdf5_src), exist_ok=True)

    dtgen = mod.Dataset(partitions=["train", "valid", "test"])
    dataset = dtgen.get_partitions(source=raw_path)

    for i in dtgen.partitions:
        dataset[i]["gt"] = [pp.text_standardize(x).encode() for x in dataset[i]["gt"]]

        pool = Pool()
        dataset[i]["dt"] = pool.map(partial(pp.preproc, img_size=input_size), dataset[i]["dt"])
        pool.close()
        pool.join()

        with h5py.File(hdf5_src, "a") as hf:
            hf.create_dataset(f"{i}/dt", data=dataset[i]["dt"],
                              compression="gzip", compression_opts=9)
            hf.create_dataset(f"{i}/gt", data=dataset[i]["gt"],
                              compression="gzip", compression_opts=9)
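Reading a partition back is the mirror image of the write above; h5py returns the encoded ground truth as bytes, so it is decoded on the way out (`hdf5_src` is the path from the snippet):

import h5py

with h5py.File(hdf5_src, "r") as hf:
    images = hf["train/dt"][:]                        # preprocessed image arrays
    labels = [x.decode() for x in hf["train/gt"][:]]  # ground-truth sentences

print(len(images), labels[0])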