Пример #1
0
    def to_raw_input_dataset(self,
                             processes=1,
                             progress_bar=False,
                             text_only=False) -> RawInputDataset:
        """Preload one epoch of samples and wrap them in a RawInputDataset.

        The generator is run for a single epoch while
        ``self._generate_only_non_augmented.value`` is temporarily forced to
        ``True``; the previous value is restored afterwards, even if the
        generator raises.

        When ``self.dataset.mode`` is ``DataSetMode.TRAIN`` and
        ``self.data_augmentation_amount`` is positive, the preloaded data and
        texts are passed through ``self.data_augmenter.augment_datas``. An
        amount ``>= 1`` is used as an absolute count (truncated to int); an
        amount below 1 is multiplied by ``len(self)``.

        Args:
            processes: forwarded to ``augment_datas``.
            progress_bar: forwarded to ``tqdm_wrapper`` and ``augment_datas``.
            text_only: forwarded to ``self.generator``.

        Returns:
            ``RawInputDataset(self.mode, datas, texts, params)``.
        """
        print("Preloading dataset type {} with size {}".format(
            self.dataset.mode, len(self)))

        prev = self._generate_only_non_augmented.value
        self._generate_only_non_augmented.value = True
        try:
            # Each generated item is unpacked into (data, text, params);
            # zip(*...) transposes the samples into three parallel tuples.
            datas, texts, params = zip(*tqdm_wrapper(
                self.generator(epochs=1, text_only=text_only),
                desc="Preloading data",
                total=len(self.dataset),
                progress_bar=progress_bar))
        finally:
            # Always restore the flag so a failing generator cannot leave
            # the object permanently stuck in "non augmented only" mode.
            self._generate_only_non_augmented.value = prev

        preloaded_datas, preloaded_texts, preloaded_params = datas, texts, params

        if self.dataset.mode == DataSetMode.TRAIN and self.data_augmentation_amount > 0:
            if self.data_augmentation_amount >= 1:
                abs_n_augs = int(self.data_augmentation_amount)
            else:
                abs_n_augs = int(self.data_augmentation_amount * len(self))
            # NOTE(review): only datas and texts are replaced by the
            # augmenter; params are passed on unchanged. If augment_datas
            # returns more samples than it was given, the lengths will
            # differ - confirm this is intended.
            preloaded_datas, preloaded_texts \
                = self.data_augmenter.augment_datas(list(datas), list(texts), n_augmentations=abs_n_augs,
                                                    processes=processes, progress_bar=progress_bar)

        return RawInputDataset(self.mode, preloaded_datas, preloaded_texts,
                               preloaded_params)
Пример #2
0
    def _compute_current_cer_on_validation_set(self, count):
        """Average the per-batch score of up to ``count`` batches of ``self.data_gen``.

        For every batch, ``self.predict_func(batch)[0]`` is reduced with
        ``np.mean``; non-finite batch values are discarded and the mean of
        the remaining values is returned. (Presumably the value is a
        character error rate, judging by the method name - confirm against
        ``predict_func``.)

        Args:
            count: maximum number of batches to draw from ``self.data_gen``.

        Returns:
            The mean of all finite per-batch values, or NaN when there is
            none (``count`` is 0, the generator is empty, or every batch
            value is non-finite).
        """
        from itertools import islice

        # islice stops cleanly when data_gen runs out early. Calling next()
        # inside a generator would leak StopIteration, which Python 3.7+
        # converts into a RuntimeError (PEP 479).
        batch_cers = (
            np.mean(self.predict_func(batch)[0])
            for batch in islice(self.data_gen, count)
        )
        finite_cers = [
            cer
            for cer in tqdm_wrapper(batch_cers, total=count, progress_bar=self.progress_bar, desc="Early stopping")
            if np.isfinite(cer)
        ]

        if not finite_cers:
            # np.mean([]) also yields NaN but emits a RuntimeWarning;
            # return the same value explicitly and quietly.
            return np.nan
        return np.mean(finite_cers)
Пример #3
0
    def from_input_dataset(input_dataset: List[InputDataset], whitelist=None, progress_bar=False):
        """Build a Codec from every character found in the given datasets.

        Args:
            input_dataset: datasets to scan; falsy entries are skipped.
            whitelist: optional iterable of characters that are always
                included. ``None`` (the default) means no extra characters.
            progress_bar: forwarded to ``tqdm_wrapper``.

        Returns:
            ``Codec`` constructed from the sorted list of all collected
            characters.
        """
        # A None default avoids sharing one mutable set() object across
        # calls; the whitelist is copied so the caller's object is never
        # modified.
        chars = set() if whitelist is None else set(whitelist)
        for ds in input_dataset:
            if not ds:
                continue
            for text in tqdm_wrapper(ds.text_generator(), total=len(ds), desc="Computing codec", progress_bar=progress_bar):
                chars.update(text)

        return Codec(sorted(chars))
Пример #4
0
    def from_input_dataset(input_dataset: List[InputDataset],
                           whitelist=None,
                           progress_bar=False):
        """Collect the character set of the given datasets into a Codec.

        Args:
            input_dataset: datasets to scan; falsy entries are ignored.
            whitelist: optional iterable of characters to include
                unconditionally.
            progress_bar: forwarded to ``tqdm_wrapper``.

        Returns:
            ``Codec`` built from the sorted list of every character seen in
            the texts plus the whitelist.
        """
        alphabet = set(whitelist) if whitelist is not None else set()

        for dataset in input_dataset:
            if dataset:
                lines = tqdm_wrapper(dataset.text_generator(),
                                     total=len(dataset),
                                     desc="Computing codec",
                                     progress_bar=progress_bar)
                for line in lines:
                    # Adds every character of the line in one call.
                    alphabet.update(line)

        ordered = list(alphabet)
        ordered.sort()
        return Codec(ordered)