def test_pca_model_removes_redunant_features_correct():
    n_informative = 5
    data = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    _, train_predicted = pca.fit(data=train_data)

    assert train_predicted.shape[1] < data.features.shape[1]
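# Hedged sketch (illustrative, not from the original source) of how a fixture like
# classification_dataset_with_redunant_features could be built: a table-typed InputData
# whose non-informative columns are redundant, generated with sklearn's make_classification.
# The actual fixture used by the test above is defined elsewhere in the repository.
from sklearn.datasets import make_classification

def make_dataset_with_redundant_features(n_samples, n_features, n_informative):
    features, target = make_classification(n_samples=n_samples, n_features=n_features,
                                           n_informative=n_informative,
                                           n_redundant=n_features - n_informative,
                                           random_state=42)
    return InputData(idx=np.arange(0, n_samples), features=features,
                     target=np.expand_dims(target, axis=1),
                     data_type=DataTypesEnum.table,
                     task=Task(TaskTypesEnum.classification))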
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    kmeans = Model(model_type='kmeans')
    _, train_predicted = kmeans.fit(data=train_data)

    assert all(np.unique(train_predicted) == [0, 1])
def test_random_forest_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    random_forest = Model(model_type='rf')
    _, train_predicted = random_forest.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_log_regression_fit_correct(classification_dataset):
    data = classification_dataset
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    log_reg = Model(model_type='logit')
    _, train_predicted = log_reg.fit(data=train_data)

    roc_on_train = get_roc_auc(train_data, train_predicted)
    roc_threshold = 0.95
    assert roc_on_train >= roc_threshold
def test_knn_classification_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)
    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    roc_on_test_tuned_list = []
    for _ in range(3):
        knn_for_tune = Model(model_type='knn')
        model, _ = knn_for_tune.fine_tune(data=train_data, iterations=10,
                                          max_lead_time=timedelta(minutes=1))
        test_predicted_tuned = knn_for_tune.predict(fitted_model=model, data=test_data)
        roc_on_test_tuned = roc_auc(y_true=test_data.target, y_score=test_predicted_tuned)
        roc_on_test_tuned_list.append(roc_on_test_tuned)

    roc_threshold = 0.6
    # At least one tuned run should match or beat the baseline model,
    # and the baseline itself should exceed the threshold.
    assert (np.array(roc_on_test_tuned_list) >= roc_on_test).any()
    assert roc_on_test > roc_threshold
def test_scoring_logreg_tune_correct(data_fixture, request):
    train_data, test_data = request.getfixturevalue(data_fixture)
    train_data.features = ScalingWithImputation().fit(train_data.features).apply(train_data.features)
    test_data.features = ScalingWithImputation().fit(test_data.features).apply(test_data.features)

    logreg = Model(model_type='logit')
    model, _ = logreg.fit(train_data)
    test_predicted = logreg.predict(fitted_model=model, data=test_data)
    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    logreg_for_tune = Model(model_type='logit')
    model_tuned, _ = logreg_for_tune.fine_tune(data=train_data, iterations=50,
                                               max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = logreg_for_tune.predict(fitted_model=model_tuned, data=test_data)
    test_roc_auc_tuned = roc_auc(y_true=test_data.target, y_score=test_predicted_tuned)

    roc_threshold = 0.6
    assert round(test_roc_auc_tuned, 2) >= round(test_roc_auc, 2) > roc_threshold
def test_node_factory_log_reg_correct(data_setup):
    model_type = 'logit'
    node = PrimaryNode(model_type=model_type)

    expected_model = Model(model_type=model_type).__class__
    actual_model = node.model.__class__

    assert node.__class__ == PrimaryNode
    assert expected_model == actual_model
def test_arima_tune_correct():
    data = get_synthetic_ts_data_period()
    train_data, test_data = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    model, _ = arima_for_tune.fine_tune(data=train_data, iterations=5,
                                        max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = arima_for_tune.predict(fitted_model=model, data=test_data)

    rmse_on_test_tuned = mse(y_true=test_data.target, y_pred=test_predicted_tuned,
                             squared=False)
    rmse_threshold = np.std(test_data.target)

    assert rmse_on_test_tuned < rmse_threshold
def test_classification_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    knn_for_tune = Model(model_type='knn')
    knn_for_tune.params = {'n_neighbors': 1}
    model, _ = knn_for_tune.fit(data=train_data)
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model, data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
def test_logger_write_logs_correctly():
    test_file_path = str(os.path.dirname(__file__))
    test_log_file = os.path.join(test_file_path, 'test_log.log')
    test_log = default_log('test_log', log_file=test_log_file)

    # Model data preparation
    file = os.path.join('../data', 'advanced_classification.csv')
    data = InputData.from_csv(os.path.join(test_file_path, file))
    train_data, test_data = train_test_data_setup(data=data)

    try:
        knn = Model(model_type='knnreg', log=test_log)
        model, _ = knn.fit(data=train_data)
    except Exception:
        print('Captured error')

    if os.path.exists(test_log_file):
        with open(test_log_file, 'r') as file:
            content = file.readlines()

    release_log(logger=test_log, log_file=test_log_file)

    assert 'Can not find evaluation strategy' in content[0]
def fit_template(chain_template, classes, with_gaussian=False, skip_fit=False):
    templates_by_models = []
    for model_template in itertools.chain.from_iterable(chain_template):
        model_instance = Model(model_type=model_template.model_type)
        model_template.model_instance = model_instance
        templates_by_models.append((model_template, model_instance))
    if skip_fit:
        return

    for template, instance in templates_by_models:
        samples, features_amount = template.input_shape

        if with_gaussian:
            features, target = gauss_quantiles(samples_amount=samples,
                                               features_amount=features_amount,
                                               classes_amount=classes)
        else:
            options = {
                'informative': features_amount,
                'redundant': 0,
                'repeated': 0,
                'clusters_per_class': 1
            }
            features, target = synthetic_dataset(samples_amount=samples,
                                                 features_amount=features_amount,
                                                 classes_amount=classes,
                                                 features_options=options)
        target = np.expand_dims(target, axis=1)
        data_train = InputData(idx=np.arange(0, samples),
                               features=features, target=target,
                               data_type=DataTypesEnum.table,
                               task=Task(TaskTypesEnum.classification))

        preproc_data = copy(data_train)
        preprocessor = Normalization().fit(preproc_data.features)
        preproc_data.features = preprocessor.apply(preproc_data.features)
        print(f'Fit {instance}')
        fitted_model, predictions = instance.fit(data=preproc_data)

        template.fitted_model = fitted_model
        template.data_fit = preproc_data
        template.preprocessor = preprocessor
def test_pca_manual_tuning_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    pca = Model(model_type='pca_data_model')
    model, _ = pca.fit(data=train_data)
    test_predicted = pca.predict(fitted_model=model, data=test_data)

    pca_for_tune = Model(model_type='pca_data_model')
    pca_for_tune.params = {'svd_solver': 'randomized',
                           'iterated_power': 'auto',
                           'dim_reduction_expl_thr': 0.7,
                           'dim_reduction_min_expl': 0.001}
    model, _ = pca_for_tune.fit(data=train_data)
    test_predicted_tuned = pca_for_tune.predict(fitted_model=model, data=test_data)

    assert not np.array_equal(test_predicted, test_predicted_tuned)
def test_rf_class_tune_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    data.features = ScalingWithImputation().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    rf = Model(model_type='rf')
    model, _ = rf.fit(train_data)
    test_predicted = rf.predict(fitted_model=model, data=test_data)
    test_roc_auc = roc_auc(y_true=test_data.target, y_score=test_predicted)

    model_tuned, _ = rf.fine_tune(data=train_data, iterations=12,
                                  max_lead_time=timedelta(minutes=0.1))
    test_predicted_tuned = rf.predict(fitted_model=model_tuned, data=test_data)
    test_roc_auc_tuned = roc_auc(y_true=test_data.target, y_score=test_predicted_tuned)

    roc_threshold = 0.7
    assert test_roc_auc_tuned >= test_roc_auc
    assert test_roc_auc_tuned > roc_threshold
class Node(ABC):
    """
    Base class for Node definition in Chain structure

    :param nodes_from: parent nodes from which the information comes
    :param model_type: str type of the model defined in the model repository
    :param manual_preprocessing_func: optional function for data preprocessing.
    If not defined, one of the available preprocessing strategies is used. \
    See the `preprocessors <https://github.com/nccr-itmo/FEDOT/blob/master/core/models/preprocessing.py>`__
    :param log: Log object to record messages
    """

    def __init__(self, nodes_from: Optional[List['Node']],
                 model_type: [str, 'Model'],
                 manual_preprocessing_func: Optional[Callable] = None,
                 log=None):
        self.nodes_from = nodes_from
        self.cache = FittedModelCache(self)
        self.manual_preprocessing_func = manual_preprocessing_func

        if not log:
            self.log = default_log(__name__)
        else:
            self.log = log

        if not isinstance(model_type, str):
            self.model = model_type
        else:
            self.model = Model(model_type=model_type)

    @property
    def descriptive_id(self):
        return self._descriptive_id_recursive(visited_nodes=[])

    def _descriptive_id_recursive(self, visited_nodes):
        node_label = self.model.description
        if self.manual_preprocessing_func:
            node_label = f'{node_label}_custom_preprocessing={self.manual_preprocessing_func.__name__}'
        full_path = ''
        if self in visited_nodes:
            return 'ID_CYCLED'
        visited_nodes.append(self)
        if self.nodes_from:
            previous_items = []
            for parent_node in self.nodes_from:
                previous_items.append(f'{parent_node._descriptive_id_recursive(copy(visited_nodes))};')
            previous_items.sort()
            previous_items_str = ';'.join(previous_items)
            full_path += f'({previous_items_str})'
        full_path += f'/{node_label}'
        return full_path

    @property
    def model_tags(self) -> List[str]:
        return self.model.metadata.tags

    def output_from_prediction(self, input_data, prediction):
        return OutputData(idx=input_data.idx,
                          features=input_data.features,
                          predict=prediction, task=input_data.task,
                          data_type=self.model.output_datatype(input_data.data_type))

    def _transform(self, input_data: InputData):
        transformed_data = transformation_function_for_data(
            input_data_type=input_data.data_type,
            required_data_types=self.model.metadata.input_types)(input_data)
        return transformed_data

    def _preprocess(self, data: InputData):
        preprocessing_func = preprocessing_func_for_data(data, self)

        if not self.cache.actual_cached_state:
            # if a fitted preprocessor is not found in the cache
            preprocessing_strategy = preprocessing_func().fit(data.features)
        else:
            # if a fitted preprocessor already exists
            preprocessing_strategy = self.cache.actual_cached_state.preprocessor

        data.features = preprocessing_strategy.apply(data.features)

        return data, preprocessing_strategy

    def fit(self, input_data: InputData, verbose=False) -> OutputData:
        """
        Run training process in the node

        :param input_data: data used for model training
        :param verbose: flag used for status printing to console, default False
        """
        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        if not self.cache.actual_cached_state:
            if verbose:
                print('Cache is not actual')

            cached_model, model_predict = self.model.fit(data=preprocessed_data)
            self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                          model=cached_model))
        else:
            if verbose:
                print('Model was obtained from cache')

            model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                               data=preprocessed_data)

        return self.output_from_prediction(input_data, model_predict)

    def predict(self, input_data: InputData, output_mode: str = 'default',
                verbose=False) -> OutputData:
        """
        Run prediction process in the node

        :param input_data: data used for prediction
        :param output_mode: desired output for models (e.g. labels, probs, full_probs)
        :param verbose: flag used for status printing to console, default False
        """
        transformed = self._transform(input_data)
        preprocessed_data, _ = self._preprocess(transformed)

        if not self.cache:
            raise ValueError('Model must be fitted before predict')

        model_predict = self.model.predict(fitted_model=self.cache.actual_cached_state.model,
                                           data=preprocessed_data, output_mode=output_mode)

        return self.output_from_prediction(input_data, model_predict)

    def fine_tune(self, input_data: InputData,
                  max_lead_time: timedelta = timedelta(minutes=5), iterations: int = 30):
        """
        Run the process of hyperparameter optimization for the node

        :param input_data: data used for tuning
        :param iterations: max number of iterations
        :param max_lead_time: max time available for the tuning process
        """
        transformed = self._transform(input_data)
        preprocessed_data, preproc_strategy = self._preprocess(transformed)

        fitted_model, _ = self.model.fine_tune(preprocessed_data,
                                               max_lead_time=max_lead_time,
                                               iterations=iterations)
        self.cache.append(CachedState(preprocessor=copy(preproc_strategy),
                                      model=fitted_model))

    def __str__(self):
        model = f'{self.model}'
        return model

    @property
    def ordered_subnodes_hierarchy(self) -> List['Node']:
        nodes = [self]
        if self.nodes_from:
            for parent in self.nodes_from:
                nodes += parent.ordered_subnodes_hierarchy
        return nodes

    @property
    def custom_params(self) -> dict:
        return self.model.params

    @custom_params.setter
    def custom_params(self, params):
        if params:
            self.model.params = params
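# Hedged usage sketch (illustrative, not from the original source): a minimal example
# of how a node wrapping a Model might be fitted and then queried for predictions.
# It assumes PrimaryNode accepts a string model_type, as in
# test_node_factory_log_reg_correct above, and that train_test_data_setup and a
# classification InputData object are available as in the tests above.
def example_node_fit_and_predict(data):
    train_data, test_data = train_test_data_setup(data=data)

    node = PrimaryNode(model_type='logit')
    train_prediction = node.fit(input_data=train_data)    # fits the model (or reuses the cached one)
    test_prediction = node.predict(input_data=test_data)  # requires a fitted/cached model

    return train_prediction.predict, test_prediction.predict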
def __init__(self, nodes_from: Optional[List['Node']],
             model_type: ModelTypesIdsEnum):
    model = Model(model_type=model_type)
    nodes_from = [] if nodes_from is None else nodes_from
    super().__init__(nodes_from=nodes_from, model=model)
def __init__(self, model_type: ModelTypesIdsEnum):
    model = Model(model_type=model_type)
    super().__init__(nodes_from=None, model=model)
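# Hedged usage sketch for the enum-based constructors above (illustrative only).
# It assumes ModelTypesIdsEnum exposes members such as 'logit' and 'xgboost', and that
# the first __init__ belongs to a secondary-node class (it accepts nodes_from) while the
# second belongs to a primary-node class; both class names below are therefore assumptions.
primary = PrimaryNode(model_type=ModelTypesIdsEnum.logit)
secondary = SecondaryNode(nodes_from=[primary], model_type=ModelTypesIdsEnum.xgboost)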