def test_scoring_logreg_tune_correct(data_fixture, request):
    train_data, test_data = request.getfixturevalue(data_fixture)
    train_data.features = Scaling().fit(train_data.features).apply(
        train_data.features)
    test_data.features = Scaling().fit(test_data.features).apply(
        test_data.features)

    logreg = Model(model_type='logit')

    model, _ = logreg.fit(train_data)
    test_predicted = logreg.predict(fitted_model=model, data=test_data)
    test_roc_auc = roc_auc(y_true=test_data.target,
                           y_score=test_predicted)

    logreg_for_tune = Model(model_type='logit')
    model_tuned, _ = logreg_for_tune.fine_tune(
        train_data, iterations=50, max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = logreg_for_tune.predict(fitted_model=model_tuned,
                                                   data=test_data)
    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.6
    assert round(test_roc_auc_tuned, 2) >= round(test_roc_auc, 2) > roc_threshold
def test_arima_tune_correct():
    data = get_synthetic_ts_data()
    train_data, test_data = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    model, _ = arima_for_tune.fine_tune(data=train_data,
                                        iterations=5,
                                        max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = arima_for_tune.predict(fitted_model=model,
                                                  data=test_data)

    rmse_on_test_tuned = mse(y_true=test_data.target,
                             y_pred=test_predicted_tuned,
                             squared=False)

    rmse_threshold = np.std(test_data.target)

    assert rmse_on_test_tuned < rmse_threshold
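# Sketch (assumed rationale, not part of the original tests): the RMSE threshold
# above is the target's standard deviation, which equals the RMSE of a trivial
# forecaster that always predicts the mean of the target. Any useful tuned model
# should therefore beat it. The array below is illustrative data only.
import numpy as np

_y = np.array([1.0, 2.0, 4.0, 7.0])
_baseline_rmse = np.sqrt(np.mean((_y - _y.mean()) ** 2))
# with the default ddof=0, np.std is exactly the RMSE of the mean predictor
assert np.isclose(_baseline_rmse, np.std(_y))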
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)
    roc_on_test = roc_auc(y_true=test_data.target,
                          y_score=test_predicted)

    knn_for_tune = Model(model_type='knn')
    model_tuned, _ = knn_for_tune.fine_tune(data=train_data,
                                            iterations=10,
                                            max_lead_time=timedelta(minutes=1))
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model_tuned,
                                                data=test_data)
    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)

    roc_threshold = 0.6
    assert roc_on_test_tuned > roc_on_test > roc_threshold
def test_max_lead_time_in_tune_process(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    start = datetime.now()

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      max_lead_time=timedelta(minutes=0.05),
                                      iterations=100)
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)
    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)

    roc_threshold = 0.6
    spent_time = (datetime.now() - start).seconds

    assert roc_on_test_tuned > roc_threshold
    # tuning must respect max_lead_time: 0.05 min = 3 s, so the run should fit in that budget
    assert spent_time <= 3
def test_rf_class_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    rf = Model(model_type='rf')

    model, _ = rf.fit(train_data)
    test_predicted = rf.predict(fitted_model=model, data=test_data)
    test_roc_auc = roc_auc(y_true=test_data.target,
                           y_score=test_predicted)

    model_tuned, _ = rf.fine_tune(data=train_data, iterations=12,
                                  max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = rf.predict(fitted_model=model_tuned, data=test_data)
    test_roc_auc_tuned = roc_auc(y_true=test_data.target,
                                 y_score=test_predicted_tuned)

    roc_threshold = 0.7
    assert test_roc_auc_tuned != test_roc_auc
    assert test_roc_auc_tuned > roc_threshold
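# Sketch (assumed test wiring, not from the original module): the tuning tests
# above receive a fixture *name* and resolve it with request.getfixturevalue,
# which presumes pytest parametrization of roughly this shape. The fixture name
# 'classification_dataset' and the loader 'load_classification_input_data' are
# hypothetical placeholders for whatever the repository actually provides.
import pytest


@pytest.fixture()
def classification_dataset():
    # hypothetical loader returning a prepared InputData instance for a binary task
    return load_classification_input_data()


@pytest.mark.parametrize('data_fixture', ['classification_dataset'])
def test_fixture_name_is_resolved(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    assert data is not None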
class Node(ABC):
    def __init__(self, nodes_from: Optional[List['Node']],
                 model_type: str,
                 manual_preprocessing_func: Optional[Callable] = None):
        self.nodes_from = nodes_from
        self.model = Model(model_type=model_type)
        self.cache = FittedModelCache(self)
        self.manual_preprocessing_func = manual_preprocessing_func

    @property
    def descriptive_id(self):
        return self._descriptive_id_recursive(visited_nodes=[])

    def _descriptive_id_recursive(self, visited_nodes):
        node_label = self.model.description
        if self.manual_preprocessing_func:
            node_label = f'{node_label}_custom_preprocessing={self.manual_preprocessing_func.__name__}'
        full_path = ''
        if self in visited_nodes:
            return 'ID_CYCLED'
        visited_nodes.append(self)
        if self.nodes_from:
            previous_items = []
            for parent_node in self.nodes_from:
                previous_items.append(f'{parent_node._descriptive_id_recursive(copy(visited_nodes))};')
            previous_items.sort()
            previous_items_str = ';'.join(previous_items)
            full_path += f'({previous_items_str})'
        full_path += f'/{node_label}'
        return full_path

    @property
    def model_tags(self) -> List[str]:
        return self.model.metadata.tags

    def output_from_prediction(self, input_data, prediction):
        return OutputData(idx=input_data.idx,
                          features=input_data.features,
                          predict=prediction,
                          task=input_data.task,
                          data_type=self.model.output_datatype(input_data.data_type))

    def _transform(self, input_data: InputData):
        transformed_data = transformation_function_for_data(
            input_data_type=input_data.data_type,
            required_data_types=self.model.metadata.input_types)(input_data)
        return transformed_data

    def _preprocess(self, data: InputData):
        preprocessing_func = preprocessing_func_for_data(data, self)

        if not self.cache.actual_cached_state:
            # if fitted preprocessor not found in cache
            preprocessing_strategy = \
                preprocessing_func().fit(data.features)
        else:
            # if fitted preprocessor already exists
            preprocessing_strategy = self.cache.actual_cached_state.preprocessor

        data.features = preprocessing_strategy.apply(data.features)

        return data, preprocessing_strategy

    def fit(self, input_data: InputData, verbose=False) -> OutputData:
        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        if not self.cache.actual_cached_state:
            if verbose:
                print('Cache is not actual')

            cached_model, model_predict = self.model.fit(data=preprocessed_data)
            self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                          model=cached_model))
        else:
            if verbose:
                print('Model was obtained from cache')

            model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                               data=preprocessed_data)

        return self.output_from_prediction(input_data, model_predict)

    def predict(self, input_data: InputData, verbose=False) -> OutputData:
        transformed = self._transform(input_data)
        preprocessed_data, _ = self._preprocess(transformed)

        if not self.cache.actual_cached_state:
            raise ValueError('Model must be fitted before predict')

        model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                           data=preprocessed_data)

        return self.output_from_prediction(input_data, model_predict)

    def fine_tune(self, input_data: InputData,
                  max_lead_time: timedelta = timedelta(minutes=5),
                  iterations: int = 30):
        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)
        fitted_model, _ = self.model.fine_tune(preprocessed_data,
                                               max_lead_time=max_lead_time,
                                               iterations=iterations)
        self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                      model=fitted_model))

    def __str__(self):
        model = f'{self.model}'
        return model
    @property
    def ordered_subnodes_hierarchy(self) -> List['Node']:
        nodes = [self]
        if self.nodes_from:
            for parent in self.nodes_from:
                nodes += parent.ordered_subnodes_hierarchy
        return nodes

    @property
    def custom_params(self) -> dict:
        return self.model.params

    @custom_params.setter
    def custom_params(self, params):
        self.model.params = params
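# Illustrative usage sketch (assumptions, not library code): a minimal concrete
# subclass is defined only to show how custom_params and fine_tune interact with
# the node-level cache. '_DemoNode' and 'train_data' are hypothetical names; real
# pipelines use the library's own concrete node classes and prepared InputData.
# The sketch reuses the imports already present in this module (timedelta, etc.).
class _DemoNode(Node):
    """Concrete stand-in; __init__, fine_tune and predict are inherited as-is."""


def _demo_node_tuning(train_data: InputData):
    node = _DemoNode(nodes_from=None, model_type='knn')
    node.custom_params = {'n_neighbors': 5}  # forwarded to the wrapped Model's params
    node.fine_tune(train_data,
                   iterations=20,
                   max_lead_time=timedelta(minutes=1))
    # fine_tune stores the tuned model in the node cache, so predict reuses it
    return node.predict(train_data)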