def cross_validation(self, k: int = 10):
    """
    Run k-fold cross validation based on the training data (excluding the dataset's test data).

    Results are written below a `{k}-fold-cross-validation` sub-folder of the
    experiment folder.

    :param k: Number of folds to use for the cross-validation.
    """
    self._experiment_folder /= f"{k}-fold-cross-validation"

    # get cross validation data (training data only, no test set)
    cross_validation_data: Iterable[Tuple[str, int]] = self._get_data(test_set=False)

    # prepare the encoders
    # NOTE: the encoder is configured with the same padding / start-end
    # indicator settings as in `train_and_test`, so the cross-validation
    # results are comparable to the final train/test run (previously these
    # two settings were silently left at their defaults here)
    self._prepare_or_load_encoders(
        training_data=cross_validation_data,
        initialized_text_enc=TokenSequenceEncoder(
            limit_vocabulary=self._vocabulary_size,
            default_length=self._max_text_length,
            pad_beginning=self._pad_beginning,
            add_start_end_indicators=self._use_start_end_indicators,
        ),
    )

    # prepare training labels (integer class ids, one per sample)
    y_class_labels: np.ndarray = self._label_enc.integer_class_labels(
        labeled_data=ProgressIterator(cross_validation_data, "Extracting labels ..."))

    # extract data vectors (from cross-validation data); the lambda re-creates
    # the generator so the underlying data can be iterated more than once
    texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in cross_validation_data),
                                length=FixedLengthIterable.try_get_len(cross_validation_data))
    x: np.ndarray = self._text_enc.encode(texts=texts)

    # cleanup memory
    gc.collect()

    self._cross_validation(x=x, y_class_labels=y_class_labels, k=k)
def _prepare_or_load_encoders(self,
                              training_data: Iterable[Tuple[str, int]],
                              initialized_text_enc: AbstractTokenEncoder,
                              ) -> None:
    """
    Load the text/label encoders from pickle files if both exist; otherwise
    build them from `training_data`, persist them for the next run, and store
    the result in `self._text_enc` / `self._label_enc`.

    :param training_data: Iterable of (text, label) pairs used to fit the
        encoders when no pickled encoders are available.
    :param initialized_text_enc: A constructed-but-unfitted text encoder that
        will be fitted (via `prepare`) when no pickled text encoder exists.
    """
    # mkdir(exist_ok=True) already tolerates an existing folder, so the
    # previous `if not exists(): mkdir(...)` guard was redundant (and racy)
    self._encoder_folder.mkdir(parents=True, exist_ok=True)
    text_encoder_file = self._encoder_folder / "text-encoder.pickle"
    label_encoder_file = self._encoder_folder / "label-encoder.pickle"

    text_enc: Optional[AbstractTokenEncoder] = None
    label_enc: Optional[LabelEncoder] = None
    if text_encoder_file.exists() and label_encoder_file.exists():
        logging.info(f"Loading encoders from files: {text_encoder_file}, {label_encoder_file}")
        # NOTE: pickle.load is only safe on trusted files — these are files
        # this class itself wrote in a previous run
        with open(str(text_encoder_file), "rb") as pickle_file:
            text_enc = pickle.load(pickle_file)
        with open(str(label_encoder_file), "rb") as pickle_file:
            label_enc = pickle.load(pickle_file)

    # explicit `is None` checks: a successfully loaded encoder that happens to
    # be falsy (e.g. defines an empty __len__) must not trigger a rebuild
    if text_enc is None or label_enc is None:
        # create label encoder based on training data
        label_enc = LabelEncoder(labeled_data=ProgressIterator(training_data, "Collecting labels ..."))

        # extract vocab (from training data); the lambda re-creates the
        # generator so the data can be iterated again later
        texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in training_data),
                                    length=FixedLengthIterable.try_get_len(training_data))
        text_enc = initialized_text_enc
        text_enc.prepare(texts=texts, show_progress=True)

        #
        # serialize data for next time
        #
        with open(str(text_encoder_file), 'wb') as pickle_file:
            pickle.dump(text_enc, pickle_file)
        with open(str(label_encoder_file), 'wb') as pickle_file:
            pickle.dump(label_enc, pickle_file)

    # cleanup memory
    gc.collect()
    self._text_enc, self._label_enc = text_enc, label_enc
def train_and_test(self, validation_split: float = .05):
    """
    Train a model (using epoch validation based on `validation_split`) and test its
    performance on the independent data test set.

    :param validation_split: Float between 0 and 1. Fraction of the training data to be used
        as validation data. The model will set apart this fraction of the training data,
        will not train on it, and will evaluate the loss and any model metrics on this data
        at the end of each epoch. The validation data is selected from the last samples in
        the `x` and `y` data provided, before shuffling.
    """
    logging.debug(f"validation-split {validation_split}")
    if validation_split:
        self._experiment_folder /= f"validation-split-{validation_split}"

    #
    # training
    #

    # get training data
    training_data: Iterable[Tuple[str, int]] = self._get_data(test_set=False)

    # prepare the encoders
    self._prepare_or_load_encoders(
        training_data=training_data,
        initialized_text_enc=TokenSequenceEncoder(
            limit_vocabulary=self._vocabulary_size,
            default_length=self._max_text_length,
            pad_beginning=self._pad_beginning,
            add_start_end_indicators=self._use_start_end_indicators,
        ),
    )

    # prepare training labels (one-hot / categorical matrix)
    y_train: np.ndarray = self._label_enc.make_categorical(
        labeled_data=ProgressIterator(training_data, "Extracting training labels ..."))

    # extract data vectors (from training data); the lambda re-creates the
    # generator so the underlying data can be iterated more than once
    texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in training_data),
                                length=FixedLengthIterable.try_get_len(training_data))
    x_train: np.ndarray = self._text_enc.encode(texts=texts)

    # cleanup
    gc.collect()

    # load or train model
    self._train_or_load_model(x_train, y_train, validation_split=validation_split)

    # cleanup memory: release the training matrices before building the test ones
    del x_train, y_train
    gc.collect()

    #
    # testing / evaluate the performance of the model based on the test set
    #

    # get test data
    test_data: Iterable[Tuple[str, int]] = self._get_data(test_set=True)

    # extract label vectors (from test data)
    y_test_categories: np.ndarray = self._label_enc.make_categorical(
        labeled_data=ProgressIterator(test_data, "Extracting test labels ..."))

    # extract data vectors (from test data)
    texts = FixedLengthIterable(gen_source=lambda: (tex for tex, lab in test_data),
                                length=FixedLengthIterable.try_get_len(test_data))
    x_test: np.ndarray = self._text_enc.encode(texts=texts)

    gc.collect()

    # fixed typo in the results file name: this is the *test*-set validation
    # report (was "text.json")
    self._validate_model(x=x_test, y=y_test_categories, validation_file_name="test.json")
    gc.collect()
def test_try_get_len():
    """try_get_len returns len() for sized iterables and None for plain generators."""
    sized = [1, 2, 3]
    assert FixedLengthIterable.try_get_len(sized) == 3

    # a generator object provides no __len__(), so no length can be estimated
    unsized = (item for item in [1, 2, 3])
    assert FixedLengthIterable.try_get_len(unsized) is None