def load_samples(self, processes=1, progress_bar=False):
    """Load all samples into memory, dropping empty images if configured."""
    # Already materialized: hand back the cached list.
    if self.loaded:
        return self._samples

    results = parallel_map(self._load_sample, self._samples,
                           desc="Loading Dataset", processes=processes,
                           progress_bar=progress_bar)

    bad_indices = []
    for idx, ((image, txt), sample) in enumerate(zip(results, self._samples)):
        sample["image"] = image
        sample["text"] = txt
        # A zero-size or all-constant image carries no usable content.
        # NOTE(review): a None image is intentionally not flagged here.
        if image is not None and (image.size == 0 or np.amax(image) == np.amin(image)):
            if self.skip_invalid:
                bad_indices.append(idx)
                print("Empty data: Image at '{}' is empty".format(
                    sample['id']))
            else:
                raise Exception(
                    "Empty data: Image at '{}' is empty".format(
                        sample['id']))

    if self.remove_invalid:
        # delete from the back so earlier indices stay valid
        for idx in sorted(bad_indices, reverse=True):
            del self._samples[idx]

    self.loaded = True
    return self._samples
def evaluate(_sentinel=None, gt_data=None, pred_data=None, processes=1, progress_bar=False):
    """ evaluate on the given raw data

    Parameters
    ----------
    _sentinel : do not use
        Forces the use of the keyword arguments `gt_data` and `pred_data` for safety
    gt_data : Dataset, optional
        the ground truth
    pred_data : Dataset
        the prediction dataset
    processes : int, optional
        the processes to use for preprocessing and evaluation
    progress_bar : bool, optional
        show a progress bar

    Returns
    -------
    evaluation dictionary
    """
    # FIX: the sentinel was documented but never enforced, so calling
    # evaluate(gt, pred) positionally silently shifted all arguments.
    if _sentinel is not None:
        raise Exception("Call evaluate() with keyword arguments only "
                        "(gt_data=..., pred_data=...)")
    if len(gt_data) != len(pred_data):
        raise Exception("Mismatch in gt and pred files count: {} vs {}".format(
            len(gt_data), len(pred_data)))

    # evaluate single lines in parallel
    out = parallel_map(Evaluator.evaluate_single_args,
                       list(zip(gt_data, pred_data)),
                       processes=processes, progress_bar=progress_bar,
                       desc="Evaluation")

    return Evaluator.evaluate_single_list(out, True)
def augment_datas(self, datas, gt_txts, n_augmentations, processes=1, progress_bar=False):
    """Create n_augmentations variants of each (data, text) pair and append them.

    Parameters
    ----------
    datas : list
        the input samples
    gt_txts : list
        the matching ground-truth texts
    n_augmentations : int
        number of augmented copies per sample (must be >= 0)
    processes : int
        worker processes for the parallel map
    progress_bar : bool
        show a progress bar

    Returns
    -------
    (datas + augmented datas, gt_txts + augmented texts)

    Raises
    ------
    ValueError
        if n_augmentations is not a non-negative integer
    """
    # FIX: check isinstance first — comparing e.g. a str against 0 would
    # raise a TypeError instead of the intended ValueError.
    if not isinstance(n_augmentations, int) or n_augmentations < 0:
        raise ValueError("Number of augmentation must be an integer >= 0")

    if n_augmentations == 0:
        return datas, gt_txts

    out = parallel_map(self.augment_data_tuple,
                       list(zip(datas, gt_txts, [n_augmentations] * len(datas))),
                       desc="Augmentation", processes=processes,
                       progress_bar=progress_bar)

    # flatten the per-sample augmentation lists
    out_d, out_t = [], []
    for d, t in out:
        out_d += d
        out_t += t

    return datas + out_d, gt_txts + out_t
def apply(self, txts, processes=1, progress_bar=False):
    """Preprocess a single string, or every string in a list in parallel."""
    if isinstance(txts, str):
        # single text: no need for a worker pool
        return self._apply_single(txts)

    if isinstance(txts, list):
        if not txts:
            return []
        return parallel_map(self._apply_single, txts,
                            desc="Text Preprocessing",
                            processes=processes,
                            progress_bar=progress_bar)

    raise Exception("Unknown instance of txts: {}. Supported list and str".format(type(txts)))
def apply(self, data, processes=1, progress_bar=False):
    """Preprocess a single numpy array, or every array in a list in parallel.

    Parameters
    ----------
    data : np.ndarray or list
        a single image or a list of images
    processes : int
        worker processes for the parallel map
    progress_bar : bool
        show a progress bar

    Returns
    -------
    the preprocessed data (single result, or list of results)

    Raises
    ------
    Exception
        if data is neither an np.ndarray nor a list
    """
    if isinstance(data, np.ndarray):
        return self._apply_single(data)
    elif isinstance(data, list):
        if len(data) == 0:
            return []
        return parallel_map(self._apply_single, data,
                            desc="Data Preprocessing", processes=processes,
                            progress_bar=progress_bar)
    else:
        # FIX: message was copy-pasted from the text preprocessor — it said
        # 'txts' although the parameter is 'data', and claimed str support.
        raise Exception("Unknown instance of data: {}. Supported list and np.ndarray".format(type(data)))
def load_samples(self, processes=1, progress_bar=False):
    """ Load the samples into the memory

    This is useful if a FileDataset shall load its files.

    Parameters
    ----------
    processes : int
        number of processes to use for loading
    progress_bar : bool
        show a progress bar of the progress

    Returns
    -------
    list of samples
    """
    # Already materialized: hand back the cached list.
    if self.loaded:
        return self._samples

    results = parallel_map(self._load_sample, self._samples,
                           desc="Loading Dataset", processes=processes,
                           progress_bar=progress_bar)

    to_drop = []
    for idx, ((image, txt), sample) in enumerate(zip(results, self._samples)):
        sample["image"] = image
        sample["text"] = txt

        # validation only applies to modes that actually consume images
        if self.mode not in (DataSetMode.PREDICT, DataSetMode.TRAIN):
            continue

        # skip invalid images (e. g. corrupted or empty files)
        invalid = image is None or image.size == 0 or np.amax(image) == np.amin(image)
        if not invalid:
            continue

        if not self.skip_invalid:
            raise Exception(
                "Empty data: Image at '{}' is empty".format(
                    sample['id']))

        to_drop.append(idx)
        if image is None:
            print(
                "Empty data: Image at '{}' is None (possibly corrupted)"
                .format(sample['id']))
        else:
            print("Empty data: Image at '{}' is empty".format(
                sample['id']))

    if self.remove_invalid:
        # delete highest indices first so the remaining indices stay valid
        for idx in sorted(to_drop, reverse=True):
            del self._samples[idx]

    self.loaded = True
    return self._samples
def evaluate(_sentinel=None, gt_data=None, pred_data=None, processes=1, progress_bar=False):
    """ evaluate on the given raw data

    Parameters
    ----------
    _sentinel : do not use
        Forces the use of the keyword arguments `gt_data` and `pred_data` for safety
    gt_data : Dataset, optional
        the ground truth
    pred_data : Dataset
        the prediction dataset
    processes : int, optional
        the processes to use for preprocessing and evaluation
    progress_bar : bool, optional
        show a progress bar

    Returns
    -------
    evaluation dictionary
    """
    # FIX: the sentinel was documented but never enforced, so calling
    # evaluate(gt, pred) positionally silently shifted all arguments.
    if _sentinel is not None:
        raise Exception("Call evaluate() with keyword arguments only "
                        "(gt_data=..., pred_data=...)")
    if len(gt_data) != len(pred_data):
        raise Exception("Mismatch in gt and pred files count: {} vs {}".format(
            len(gt_data), len(pred_data)))

    # evaluate single lines
    out = parallel_map(Evaluator.evaluate_single, list(zip(gt_data, pred_data)),
                       processes=processes, progress_bar=progress_bar,
                       desc="Evaluation")

    # sum all errors up
    total_chars = 0
    total_char_errs = 0
    confusion = {}
    total_sync_errs = 0
    for chars, char_errs, sync_errs, conf in out:
        total_chars += chars
        total_char_errs += char_errs
        total_sync_errs += sync_errs
        for key, value in conf.items():
            if key not in confusion:
                confusion[key] = value
            else:
                confusion[key] += value

    # Note the sync errs can be higher than the true edit distance because
    # replacements are counted as 1
    # e.g. ed(in ewych, ierg ch) = 5
    #      sync(in ewych, ierg ch) = [{i: i}, {n: erg}, {ewy: }, {ch: ch}] = 6
    return {
        "single": out,
        # FIX: guard the average against an empty ground truth
        # (division by zero when total_chars == 0)
        "avg_ler": total_char_errs / total_chars if total_chars else 0,
        "total_chars": total_chars,
        "total_char_errs": total_char_errs,
        "total_sync_errs": total_sync_errs,
        "confusion": confusion,
    }
def apply(self, data, processes=1, progress_bar=False, max_tasks_per_child=100):
    """Preprocess one array directly, or a list/tuple of arrays in parallel."""
    if isinstance(data, np.ndarray):
        # a single image bypasses the worker pool entirely
        return self._apply_single(data)

    if isinstance(data, (list, tuple)):
        if not data:
            return []
        return parallel_map(self._apply_single, data,
                            desc="Data Preprocessing",
                            processes=processes,
                            progress_bar=progress_bar,
                            max_tasks_per_child=max_tasks_per_child)

    raise Exception("Unknown instance of data: {}. Supported list and str".format(type(data)))
def augment_datas(self, datas, gt_txts, n_augmentations, processes=1, progress_bar=False):
    """Create n_augmentations variants per sample and append them to the originals."""
    # non-positive counts mean "no augmentation": return the input untouched
    if n_augmentations <= 0:
        return datas, gt_txts

    work = list(zip(datas, gt_txts, [n_augmentations] * len(datas)))
    augmented = parallel_map(self.augment_data_tuple, work,
                             desc="Augmentation", processes=processes,
                             progress_bar=progress_bar)

    # flatten the per-sample augmentation lists
    extra_data = []
    extra_txts = []
    for d, t in augmented:
        extra_data.extend(d)
        extra_txts.extend(t)

    return datas + extra_data, gt_txts + extra_txts
def augment_datas(self, datas, gt_txts, n_augmentations, processes=1, progress_bar=False):
    """Augment every (data, gt) pair n_augmentations times; originals come first."""
    if n_augmentations <= 0:
        # nothing requested: hand the inputs straight back
        return datas, gt_txts

    tasks = [(d, t, n_augmentations) for d, t in zip(datas, gt_txts)]
    results = parallel_map(self.augment_data_tuple, tasks,
                           desc="Augmentation", processes=processes,
                           progress_bar=progress_bar)

    # results holds one (augmented datas, augmented texts) pair per sample
    new_d = [item for d, _ in results for item in d]
    new_t = [item for _, t in results for item in t]

    return datas + new_d, gt_txts + new_t
def evaluate(_sentinel=None, gt_data=None, pred_data=None, processes=1, progress_bar=False):
    """Evaluate predictions against ground truth line by line.

    Parameters
    ----------
    _sentinel : do not use
        Forces the use of the keyword arguments `gt_data` and `pred_data`
    gt_data : Dataset
        the ground truth
    pred_data : Dataset
        the prediction dataset
    processes : int, optional
        the processes to use for evaluation
    progress_bar : bool, optional
        show a progress bar

    Returns
    -------
    evaluation dictionary with per-line results and aggregated error counts
    """
    # FIX: the sentinel was never checked, so positional misuse
    # (evaluate(gt, pred)) silently shifted all arguments.
    if _sentinel is not None:
        raise Exception("Call evaluate() with keyword arguments only "
                        "(gt_data=..., pred_data=...)")
    if len(gt_data) != len(pred_data):
        raise Exception(
            "Mismatch in gt and pred files count: {} vs {}".format(
                len(gt_data), len(pred_data)))

    # evaluate single lines
    out = parallel_map(Evaluator.evaluate_single,
                       list(zip(gt_data, pred_data)),
                       processes=processes, progress_bar=progress_bar,
                       desc="Evaluation")

    # sum all errors up
    total_chars = 0
    total_char_errs = 0
    confusion = {}
    total_sync_errs = 0
    for chars, char_errs, sync_errs, conf in out:
        total_chars += chars
        total_char_errs += char_errs
        total_sync_errs += sync_errs
        for key, value in conf.items():
            if key not in confusion:
                confusion[key] = value
            else:
                confusion[key] += value

    # Note the sync errs can be higher than the true edit distance because
    # replacements are counted as 1
    # e.g. ed(in ewych, ierg ch) = 5
    #      sync(in ewych, ierg ch) = [{i: i}, {n: erg}, {ewy: }, {ch: ch}] = 6
    return {
        "single": out,
        # FIX: guard against division by zero on an empty ground truth
        "avg_ler": total_char_errs / total_chars if total_chars else 0,
        "total_chars": total_chars,
        "total_char_errs": total_char_errs,
        "total_sync_errs": total_sync_errs,
        "confusion": confusion,
    }
def apply(self, data, processes=1, progress_bar=False, max_tasks_per_child=100):
    """Run the preprocessor on a single array, or element-wise on a sequence.

    A bare numpy array is processed directly; a list or tuple is fanned out
    over parallel_map. Any other type is rejected.
    """
    if isinstance(data, np.ndarray):
        return self._apply_single(data)

    supported_sequence = isinstance(data, list) or isinstance(data, tuple)
    if not supported_sequence:
        raise Exception(
            "Unknown instance of data: {}. Supported list and str".format(
                type(data)))

    if len(data) == 0:
        return []

    return parallel_map(self._apply_single, data,
                        desc="Data Preprocessing",
                        processes=processes,
                        progress_bar=progress_bar,
                        max_tasks_per_child=max_tasks_per_child)