Example #1
    def cross_validation(self, k: int = 10):
        """
        Run k-fold cross-validation on the training data (excluding the dataset's test data).
        :param k: Number of folds to use for the cross-validation.
        """
        self._experiment_folder /= f"{k}-fold-cross-validation"

        # get cross validation data (training data only, no test set)
        cross_validation_data: Iterable[Tuple[str, int]] = self._get_data(test_set=False)

        # prepare the encoders
        self._prepare_or_load_encoders(
            training_data=cross_validation_data,
            initialized_text_enc=TokenSequenceEncoder(
                limit_vocabulary=self._vocabulary_size,
                default_length=self._max_text_length),
        )
        # prepare training labels
        y_class_labels: np.ndarray = self._label_enc.integer_class_labels(
            labeled_data=ProgressIterator(cross_validation_data, "Extracting labels ..."))

        # extract data vectors (from cross-validation data)
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in cross_validation_data),
                                    length=FixedLengthIterable.try_get_len(cross_validation_data))
        x: np.ndarray = self._text_enc.encode(texts=texts)

        # cleanup memory
        gc.collect()
        self._cross_validation(x=x, y_class_labels=y_class_labels, k=k)
Example #2
def test_fixed_length_iterable_greedy():
    data = [1, 2, 3, 4, 5]

    iterable = FixedLengthIterable(iterable=data, lazy=False)

    # while accessing len(), we will iterate the source the first time
    assert len(iterable) == 5

    assert sum(iterable) == 15
Example #3
def test_fixed_length_generator_source_greedy():
    def gen_source():
        return (x for x in [1, 2, 3, 4, 5])

    iterable = FixedLengthIterable(gen_source=gen_source, lazy=False)

    # while accessing len(), we will iterate the source the first time
    assert len(iterable) == 5

    assert sum(iterable) == 15
Example #4
    def _get_data(self, test_set: bool) -> Iterable[Tuple[str, int]]:
        def data_gen_source():
            if not test_set:
                return skip(amazon_binary_review_generator(self._data_file),
                            at_start=self._test_set_skip)

            return skip(amazon_binary_review_generator(self._data_file),
                        at_start=-self._test_set_skip)

        return FixedLengthIterable(gen_source=data_gen_source)
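
The skip() helper itself is not part of this listing. A plausible reading of the calls above is that a positive at_start drops the first n records (so the training data excludes the held-out head of the file), while a negative at_start keeps only the first |n| records (the test split). A minimal sketch under that assumption; the real helper in tongr/TextNN may behave differently:

from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def skip_sketch(iterable: Iterable[T], at_start: int = 0) -> Iterator[T]:
    # Positive at_start: drop the first at_start items and yield the rest.
    # Negative at_start: yield only the first |at_start| items.
    if at_start >= 0:
        return islice(iterable, at_start, None)
    return islice(iterable, -at_start)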
Example #5
    def _prepare_or_load_encoders(self,
                                  training_data: Iterable[Tuple[str, int]],
                                  initialized_text_enc: AbstractTokenEncoder,
                                  ) -> None:
        if not self._encoder_folder.exists():
            self._encoder_folder.mkdir(parents=True, exist_ok=True)

        text_encoder_file = self._encoder_folder / "text-encoder.pickle"
        label_encoder_file = self._encoder_folder / "label-encoder.pickle"

        text_enc: Optional[AbstractTokenEncoder] = None
        label_enc: Optional[LabelEncoder] = None
        if text_encoder_file.exists() and label_encoder_file.exists():
            logging.info(f"Loading encoders from files: {text_encoder_file}, {label_encoder_file}")
            with open(str(text_encoder_file), "rb") as pickle_file:
                text_enc: AbstractTokenEncoder = pickle.load(pickle_file)
            with open(str(label_encoder_file), "rb") as pickle_file:
                label_enc: LabelEncoder = pickle.load(pickle_file)

        if not text_enc or not label_enc:
            # create label encoder based on training data
            label_enc = LabelEncoder(labeled_data=ProgressIterator(training_data, "Collecting labels ..."))

            # extract vocab (from training data)
            texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in training_data),
                                        length=FixedLengthIterable.try_get_len(training_data))
            text_enc = initialized_text_enc
            text_enc.prepare(texts=texts, show_progress=True)

            #
            # serialize data for next time
            #
            with open(str(text_encoder_file), 'wb') as pickle_file:
                pickle.dump(text_enc, pickle_file)
            with open(str(label_encoder_file), 'wb') as pickle_file:
                pickle.dump(label_enc, pickle_file)

            # cleanup memory
            gc.collect()

        self._text_enc, self._label_enc = text_enc, label_enc
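
Example #5 boils down to a load-or-build-and-cache pattern: deserialize the encoders when pickle files exist, otherwise fit them on the training data and pickle the result for the next run. The same idea as a generic, stand-alone helper (a sketch, not code from the project):

import pickle
from pathlib import Path
from typing import Callable, TypeVar

T = TypeVar("T")


def load_or_build(cache_file: Path, build: Callable[[], T]) -> T:
    # Return the unpickled object if a cache file exists; otherwise build
    # the object and serialize it so the next run can skip the work.
    if cache_file.exists():
        with open(cache_file, "rb") as pickle_file:
            return pickle.load(pickle_file)
    obj = build()
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_file, "wb") as pickle_file:
        pickle.dump(obj, pickle_file)
    return obj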
Example #6
File: yelp.py Project: tongr/TextNN
    def _get_data(self, test_set: bool) -> Iterable[Tuple[str, int]]:
        def gen_source():
            if not test_set:
                start_at = self._test_set_skip
            else:
                start_at = -self._test_set_skip

            return skip(
                yelp_binary_review_generator(self._data_file,
                                             trim_text=self._max_text_length * 10),
                at_start=start_at)

        return FixedLengthIterable(gen_source=gen_source)
Example #7
def test_fixed_length_iterable_known_length():
    data = [1, 2, 3, 4, 5]

    iterable = FixedLengthIterable(iterable=data, length=3)

    # the length is predefined
    assert len(iterable) == 3

    assert sum(iterable) == 15

    # even though we just iterated over 5 elements, the length stays at the predefined value
    assert len(iterable) == 3
Example #8
def test_fixed_length_iterable_lazy():
    data = [1, 2, 3, 4, 5]

    iterable = FixedLengthIterable(iterable=data)

    # make sure the lazy form does not provide iterable length (yet)
    with raises(TypeError) as e_info:
        _ = len(iterable)

    assert sum(iterable) == 15

    # once consumed, the length should be available
    assert len(iterable) == 5

    assert sum(iterable) == 15
Example #9
def test_fixed_length_generator_source_lazy():
    def gen_source():
        return (x for x in [1, 2, 3, 4, 5])

    iterable = FixedLengthIterable(gen_source=gen_source)

    # make sure the lazy form does not provide iterable length (yet)
    with raises(TypeError) as e_info:
        _ = len(iterable)

    assert sum(iterable) == 15

    # once consumed, the length should be available
    assert len(iterable) == 5

    assert sum(iterable) == 15
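
Examples #2, #3, #7, #8, and #9 together pin down the length contract of FixedLengthIterable: a predefined length always wins, lazy=False spends one pass over the source up front so len() works immediately, and the lazy default raises TypeError from len() until one complete iteration has fixed the count. A minimal re-implementation consistent with these tests (the real class in tongr/TextNN may differ in detail):

from typing import Callable, Iterable, Iterator, Optional


class FixedLengthIterableSketch:
    def __init__(self, iterable: Optional[Iterable] = None,
                 gen_source: Optional[Callable[[], Iterable]] = None,
                 length: Optional[int] = None, lazy: bool = True):
        # Accept either a re-iterable object or a factory that produces a
        # fresh iterator per pass (needed to support one-shot generators).
        self._gen_source = gen_source if gen_source is not None else (lambda: iter(iterable))
        self._length = length
        if not lazy and self._length is None:
            # Greedy mode: count the elements now so len() is available.
            self._length = sum(1 for _ in self._gen_source())

    def __iter__(self) -> Iterator:
        count = 0
        for item in self._gen_source():
            count += 1
            yield item
        # A completed pass determines the length, unless it was predefined.
        if self._length is None:
            self._length = count

    def __len__(self) -> int:
        if self._length is None:
            raise TypeError("length is unknown until the iterable is fully consumed")
        return self._length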
Example #10
    def _get_data(self, test_set: bool) -> Iterable[Tuple[str, int]]:
        def gen_source():
            return imdb_data_generator(base_folder=self._base_folder,
                                       train_only=not test_set)

        return FixedLengthIterable(gen_source=gen_source)
Example #11
    def train_and_test(self, validation_split: float = .05):
        """
        Train a model (using epoch validation based on `validation_split`) and test its performance on the independent
        data test set.
        :param validation_split: Float between 0 and 1. Fraction of the training data to be used as validation data. The
        model will set apart this fraction of the training data, will not train on it, and will evaluate the loss and
        any model metrics on this data at the end of each epoch. The validation data is selected from the last samples
        in the `x` and `y` data provided, before shuffling.
        """
        logging.debug(f"validation-split {validation_split}")
        if validation_split:
            self._experiment_folder /= f"validation-split-{validation_split}"
        #
        # training
        #
        # get training data
        training_data: Iterable[Tuple[str, int]] = self._get_data(test_set=False)

        # prepare the encoders
        self._prepare_or_load_encoders(
            training_data=training_data,
            initialized_text_enc=TokenSequenceEncoder(
                limit_vocabulary=self._vocabulary_size,
                default_length=self._max_text_length,
                pad_beginning=self._pad_beginning,
                add_start_end_indicators=self._use_start_end_indicators,
            ),
        )

        # prepare training labels
        y_train: np.ndarray = self._label_enc.make_categorical(
            labeled_data=ProgressIterator(training_data, "Extracting training labels ..."))

        # extract data vectors (from training data)
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in training_data),
                                    length=FixedLengthIterable.try_get_len(training_data))
        x_train: np.ndarray = self._text_enc.encode(texts=texts)

        # cleanup
        gc.collect()

        # load or train model
        self._train_or_load_model(x_train, y_train, validation_split=validation_split)

        # cleanup memory
        del x_train, y_train
        gc.collect()

        #
        # testing / evaluate the performance of the model based on the test set
        #

        # get test data
        test_data: Iterable[Tuple[str, int]] = self._get_data(test_set=True)

        # extract label vectors (from test data)
        y_test_categories: np.ndarray = self._label_enc.make_categorical(
            labeled_data=ProgressIterator(test_data, "Extracting test labels ..."))

        # extract data vectors (from test data)
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in test_data),
                                    length=FixedLengthIterable.try_get_len(test_data))
        x_test: np.ndarray = self._text_enc.encode(texts=texts)

        gc.collect()

        self._validate_model(x=x_test, y=y_test_categories, validation_file_name="text.json")

        gc.collect()
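
make_categorical is not shown in this listing; since the model is trained and evaluated on categorical targets, it presumably one-hot encodes the integer class labels. The assumed core transformation as a stand-alone sketch (LabelEncoder.make_categorical actually consumes (text, label) pairs, so the real signature differs):

import numpy as np
from typing import Sequence


def make_categorical_sketch(labels: Sequence[int], num_classes: int) -> np.ndarray:
    # One-hot encode integer class labels into an (n, num_classes) matrix.
    y = np.zeros((len(labels), num_classes), dtype=np.float32)
    y[np.arange(len(labels)), list(labels)] = 1.0
    return y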
Example #12
def test_try_get_len():
    assert 3 == FixedLengthIterable.try_get_len([1, 2, 3])

    # make sure the estimated length is None for a normal generator (w/o __len__())
    assert None is FixedLengthIterable.try_get_len(x for x in [1, 2, 3])
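
The test above implies that try_get_len is just a guarded len() call. A sketch consistent with it:

from typing import Iterable, Optional


def try_get_len_sketch(iterable: Iterable) -> Optional[int]:
    # Return len(iterable) when the object supports it, and None for
    # plain generators and other iterables without __len__().
    try:
        return len(iterable)
    except TypeError:
        return None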