def _assess_on_test_dataset(self, encoded_train_dataset, encoding_train_results, method, split_index) -> HPItem:
    """Evaluate the trained method on the test dataset (if any) and package everything into an HPItem.

    When no test dataset is present (or it is empty), an HPItem describing only the
    training run is returned instead.
    """
    # Fields common to both the with-test and the training-only HPItem.
    shared_kwargs = dict(
        method=method,
        hp_setting=self.hp_setting,
        train_predictions_path=self.train_predictions_path,
        ml_details_path=self.ml_details_path,
        train_dataset=self.train_dataset,
        split_index=split_index,
        encoding_train_results=encoding_train_results,
        encoder=self.hp_setting.encoder)

    has_test_data = self.test_dataset is not None and self.test_dataset.get_example_count() > 0
    if not has_test_data:
        # No held-out data for this split: record the training-only item.
        return HPItem(test_predictions_path=None, **shared_kwargs)

    # Run the same preprocessing/encoding pipeline as for training, but without re-learning
    # any encoder parameters (learn_model=False).
    processed_test_dataset = HPUtil.preprocess_dataset(
        self.test_dataset, self.hp_setting.preproc_sequence, self.path / "preprocessed_test_dataset")
    encoded_test_dataset = HPUtil.encode_dataset(
        processed_test_dataset, self.hp_setting, self.path / "encoded_datasets", learn_model=False,
        context=self.report_context, number_of_processes=self.number_of_processes,
        label_configuration=self.label_config, store_encoded_data=self.store_encoded_data)

    performance = HPUtil.assess_performance(
        method, self.metrics, self.optimization_metric, encoded_test_dataset, split_index,
        self.path, self.test_predictions_path, self.label, self.ml_score_path)

    encoding_test_results = ReportUtil.run_encoding_reports(
        encoded_test_dataset, self.encoding_reports, self.report_path / "encoding_test")
    model_report_results = ReportUtil.run_ML_reports(
        encoded_train_dataset, encoded_test_dataset, method, self.ml_reports,
        self.report_path / "ml_method", self.hp_setting, self.label, self.report_context)

    return HPItem(
        test_predictions_path=self.test_predictions_path,
        test_dataset=self.test_dataset,
        model_report_results=model_report_results,
        encoding_test_results=encoding_test_results,
        performance=performance,
        **shared_kwargs)
def run_assessment_split(state, train_val_dataset, test_dataset, split_index: int, n_splits):
    """run inner CV loop (selection) and retrain on the full train_val_dataset after optimal model is chosen"""
    print(
        f'{datetime.datetime.now()}: Training ML model: running outer CV loop: started split {split_index + 1}/{n_splits}.\n',
        flush=True)

    split_path = HPAssessment.create_assessment_path(state, split_index)
    assessment_state = HPAssessmentState(split_index, train_val_dataset, test_dataset, split_path,
                                         state.label_configuration)
    # Register the split's state before selection/assessment so downstream steps can find it.
    state.assessment_states.append(assessment_state)

    # Inner (selection) CV loop first, then the per-label assessment for this outer split.
    state = HPSelection.run_selection(state, train_val_dataset, split_path, split_index)
    state = HPAssessment.run_assessment_split_per_label(state, split_index)

    # Data reports are run on both partitions with the same set of configured reports.
    split_reports = state.assessment.reports.data_split_reports.values()
    assessment_state.train_val_data_reports = ReportUtil.run_data_reports(
        train_val_dataset, split_reports, split_path / "data_report_train", state.context)
    assessment_state.test_data_reports = ReportUtil.run_data_reports(
        test_dataset, split_reports, split_path / "data_report_test", state.context)

    print(
        f'{datetime.datetime.now()}: Training ML model: running outer CV loop: finished split {split_index + 1}/{n_splits}.\n',
        flush=True)

    return state
def run_selection_reports(state: TrainMLModelState, dataset, train_datasets: list, val_datasets: list, selection_state: HPSelectionState):
    """Run the configured data reports for each inner-CV split and for the whole selection dataset.

    Per-split reports are written under ``split_<k>/data_reports_train`` and
    ``split_<k>/data_reports_test`` and accumulated on ``selection_state``; dataset-level
    reports go under ``<path>/reports``.

    NOTE(review): train_datasets and val_datasets are assumed to be parallel lists (one
    (train, val) pair per split) — confirm against the caller.
    """
    path = selection_state.path
    data_split_reports = state.selection.reports.data_split_reports.values()

    # Idiomatic pairing instead of the original range(len(...)) index loop.
    for split_number, (train_dataset, val_dataset) in enumerate(zip(train_datasets, val_datasets), start=1):
        split_reports_path = path / f"split_{split_number}"
        selection_state.train_data_reports += ReportUtil.run_data_reports(
            train_dataset, data_split_reports, split_reports_path / "data_reports_train", state.context)
        selection_state.val_data_reports += ReportUtil.run_data_reports(
            val_dataset, data_split_reports, split_reports_path / "data_reports_test", state.context)

    data_reports = state.selection.reports.data_reports.values()
    selection_state.data_reports = ReportUtil.run_data_reports(dataset, data_reports, path / "reports", state.context)
def run(self, split_index: int) -> HPItem:
    """Train and evaluate one hyperparameter setting for the given CV split.

    Preprocesses and encodes the training data, trains the ML method, runs the
    encoding reports on the training data, then delegates test-set assessment to
    ``_assess_on_test_dataset``. Returns the resulting HPItem.
    """
    print(f"{datetime.datetime.now()}: Evaluating hyperparameter setting: {self.hp_setting}...", flush=True)

    PathBuilder.build(self.path)
    self._set_paths()

    preprocessed_train = HPUtil.preprocess_dataset(self.train_dataset, self.hp_setting.preproc_sequence,
                                                   self.path / "preprocessed_train_dataset")
    # learn_model=True: encoder parameters are fitted on the training data here and
    # reused (without re-fitting) when the test data is encoded later.
    encoded_train_dataset = HPUtil.encode_dataset(preprocessed_train, self.hp_setting,
                                                  self.path / "encoded_datasets", learn_model=True,
                                                  context=self.report_context,
                                                  number_of_processes=self.number_of_processes,
                                                  label_configuration=self.label_config,
                                                  store_encoded_data=self.store_encoded_data)

    method = HPUtil.train_method(self.label, encoded_train_dataset, self.hp_setting, self.path,
                                 self.train_predictions_path, self.ml_details_path,
                                 self.number_of_processes, self.optimization_metric)

    encoding_train_results = ReportUtil.run_encoding_reports(encoded_train_dataset, self.encoding_reports,
                                                             self.report_path / "encoding_train")

    hp_item = self._assess_on_test_dataset(encoded_train_dataset, encoding_train_results, method, split_index)

    print(f"{datetime.datetime.now()}: Completed hyperparameter setting {self.hp_setting}.\n", flush=True)
    return hp_item