Example #1
    def cross_validation(self, k: int = 10):
        """
        Run k-fold cross validation based on the training data (excluding the dataset's test data)
        :param k: Number of folds to use for the cross-validation.
        """
        self._experiment_folder /= f"{k}-fold-cross-validation"

        # get cross validation data (training data only, no test set)
        cross_validation_data: Iterable[Tuple[str, int]] = self._get_data(test_set=False)

        # prepare the encoders
        self._prepare_or_load_encoders(
            training_data=cross_validation_data,
            initialized_text_enc=TokenSequenceEncoder(
                limit_vocabulary=self._vocabulary_size,
                default_length=self._max_text_length),
        )
        # prepare training labels
        y_class_labels: np.ndarray = self._label_enc.integer_class_labels(
            labeled_data=ProgressIterator(cross_validation_data, "Extracting labels ..."))

        # extract data vectors (from cross-validation data)
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in cross_validation_data),
                                    length=FixedLengthIterable.try_get_len(cross_validation_data))
        x: np.ndarray = self._text_enc.encode(texts=texts)

        # cleanup memory
        gc.collect()
        self._cross_validation(x=x, y_class_labels=y_class_labels, k=k)
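
Note that the texts are wrapped in a `FixedLengthIterable` built from a generator *factory* (`lambda: (tex for tex, lab in ...)`) rather than a bare generator: the encoder may need to pass over the texts more than once, and a plain generator is exhausted after a single pass. A standalone illustration of why the factory matters (not code from this project):

# A plain generator is exhausted after a single pass ...
gen = (x for x in range(3))
assert list(gen) == [0, 1, 2]
assert list(gen) == []  # second pass yields nothing

# ... so the examples pass a factory that builds a fresh generator per pass.
def make_gen():
    return (x for x in range(3))

assert list(make_gen()) == [0, 1, 2]
assert list(make_gen()) == [0, 1, 2]  # every pass starts from the beginning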
Example #2
    def _prepare_or_load_encoders(self,
                                  training_data: Iterable[Tuple[str, int]],
                                  initialized_text_enc: AbstractTokenEncoder,
                                  ) -> None:
        self._encoder_folder.mkdir(parents=True, exist_ok=True)

        text_encoder_file = self._encoder_folder / "text-encoder.pickle"
        label_encoder_file = self._encoder_folder / "label-encoder.pickle"

        text_enc: Optional[AbstractTokenEncoder] = None
        label_enc: Optional[LabelEncoder] = None
        if text_encoder_file.exists() and label_encoder_file.exists():
            logging.info(f"Loading encoders from files: {text_encoder_file}, {label_encoder_file}")
            with open(str(text_encoder_file), "rb") as pickle_file:
                text_enc = pickle.load(pickle_file)
            with open(str(label_encoder_file), "rb") as pickle_file:
                label_enc = pickle.load(pickle_file)

        if not text_enc or not label_enc:
            # create label encoder based on training data
            label_enc = LabelEncoder(labeled_data=ProgressIterator(training_data, "Collecting labels ..."))

            # extract vocab (from training data)
            texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in training_data),
                                        length=FixedLengthIterable.try_get_len(training_data))
            text_enc = initialized_text_enc
            text_enc.prepare(texts=texts, show_progress=True)

            #
            # serialize data for next time
            #
            with open(str(text_encoder_file), 'wb') as pickle_file:
                pickle.dump(text_enc, pickle_file)
            with open(str(label_encoder_file), 'wb') as pickle_file:
                pickle.dump(label_enc, pickle_file)

            # cleanup memory
            gc.collect()

        self._text_enc, self._label_enc = text_enc, label_enc
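
Example #2 follows a load-or-create caching pattern: both encoders are unpickled if their files exist, otherwise they are built from the training data and persisted for the next run. The same pattern in a generic, self-contained form (the `load_or_create` helper below is illustrative, not part of the codebase):

import pickle
from pathlib import Path
from typing import Callable, TypeVar

T = TypeVar("T")

def load_or_create(cache_file: Path, factory: Callable[[], T]) -> T:
    """Return the object pickled in cache_file; on a miss, build and persist it."""
    if cache_file.exists():
        with open(cache_file, "rb") as fh:
            return pickle.load(fh)
    obj = factory()
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_file, "wb") as fh:
        pickle.dump(obj, fh)
    return obj

One caveat of pickle-based caching: the pickled classes must still be importable (and structurally compatible) when the cache is read back, so stale caches should be deleted after the encoder code changes.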
Example #3
    def train_and_test(self, validation_split: float = .05):
        """
        Train a model (using epoch validation based on `validation_split`) and test its performance on the
        independent test set.
        :param validation_split: Float between 0 and 1. Fraction of the training data to be used as validation data. The
        model will set apart this fraction of the training data, will not train on it, and will evaluate the loss and
        any model metrics on this data at the end of each epoch. The validation data is selected from the last samples
        in the `x` and `y` data provided, before shuffling.
        """
        logging.debug(f"validation-split {validation_split}")
        if validation_split:
            self._experiment_folder /= f"validation-split-{validation_split}"
        #
        # training
        #
        # get training data
        training_data: Iterable[Tuple[str, int]] = self._get_data(test_set=False)

        # prepare the encoders
        self._prepare_or_load_encoders(
            training_data=training_data,
            initialized_text_enc=TokenSequenceEncoder(
                limit_vocabulary=self._vocabulary_size,
                default_length=self._max_text_length,
                pad_beginning=self._pad_beginning,
                add_start_end_indicators=self._use_start_end_indicators,
            ),
        )

        # prepare training labels
        y_train: np.ndarray = self._label_enc.make_categorical(
            labeled_data=ProgressIterator(training_data, "Extracting training labels ..."))

        # extract data vectors (from training data)
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in training_data),
                                    length=FixedLengthIterable.try_get_len(training_data))
        x_train: np.ndarray = self._text_enc.encode(texts=texts)

        # cleanup
        gc.collect()

        # load or train model
        self._train_or_load_model(x_train, y_train, validation_split=validation_split)

        # cleanup memory
        del x_train, y_train
        gc.collect()

        #
        # testing / evaluate the performance of the model based on the test set
        #

        # get test data
        test_data: Iterable[Tuple[str, int]] = self._get_data(test_set=True)

        # extract label vectors (from test data)
        y_test_categories: np.ndarray = self._label_enc.make_categorical(
            labeled_data=ProgressIterator(test_data, "Extracting test labels ..."))

        # extract data vectors (from test data)
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in test_data),
                                    length=FixedLengthIterable.try_get_len(test_data))
        x_test: np.ndarray = self._text_enc.encode(texts=texts)

        gc.collect()

        self._validate_model(x=x_test, y=y_test_categories, validation_file_name="test.json")

        gc.collect()
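
The `validation_split` semantics described in the docstring match Keras' `Model.fit`: the held-out fraction is taken from the tail of the training arrays before any shuffling. A quick standalone illustration of that slicing:

import numpy as np

x = np.arange(100)
validation_split = 0.05

# Keras-style split: everything before split_at is trained on;
# the tail is used only for epoch-end validation.
split_at = int(len(x) * (1 - validation_split))
x_fit, x_val = x[:split_at], x[split_at:]

assert len(x_val) == 5
assert (x_val == np.arange(95, 100)).all()  # the *last* samples are held out
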
def test_try_get_len():
    assert 3 == FixedLengthIterable.try_get_len([1, 2, 3])

    # make sure the estimated length is None for a normal generator (w/o __len__())
    assert None is FixedLengthIterable.try_get_len(x for x in [1, 2, 3])
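
This test pins down the contract of `try_get_len`: it returns `len(...)` for sized iterables and `None` for plain generators. Combined with how the class is constructed in the examples above (a `gen_source` factory plus an optional `length`), a plausible minimal sketch of `FixedLengthIterable` might look like this; it is a reconstruction under those assumptions, not the project's actual implementation:

from typing import Callable, Generic, Iterator, Optional, TypeVar

T = TypeVar("T")

class FixedLengthIterable(Generic[T]):
    """Re-iterable view over a generator factory with an optionally known length."""

    def __init__(self, gen_source: Callable[[], Iterator[T]], length: Optional[int] = None):
        self._gen_source = gen_source
        self._length = length

    def __iter__(self) -> Iterator[T]:
        # A fresh generator per call keeps the iterable reusable across passes.
        return self._gen_source()

    def __len__(self) -> int:
        if self._length is None:
            raise TypeError("length of this iterable is unknown")
        return self._length

    @staticmethod
    def try_get_len(iterable) -> Optional[int]:
        # Sized containers support len(); plain generators raise TypeError.
        try:
            return len(iterable)
        except TypeError:
            return None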