def test_fine_tune_primary_nodes(data_fixture, request):
    # TODO: tuning is still stochastic — the test retries 3 times and passes
    # if tuning did not degrade MSE in at least one run.
    result_list = []
    for _ in range(3):
        data = request.getfixturevalue(data_fixture)
        train_data, test_data = train_test_data_setup(data=data)

        # Chain composition
        chain = get_regr_chain()

        # Before tuning prediction
        chain.fit(train_data, use_cache=False)
        before_tuning_predicted = chain.predict(test_data)

        # Chain tuning
        chain.fine_tune_primary_nodes(train_data,
                                      max_lead_time=timedelta(minutes=1),
                                      iterations=10)

        # After tuning prediction (re-fit so tuned params take effect)
        chain.fit(train_data)
        after_tuning_predicted = chain.predict(test_data)

        # Metrics
        bfr_tun_mse = mse(y_true=test_data.target,
                          y_pred=before_tuning_predicted.predict)
        aft_tun_mse = mse(y_true=test_data.target,
                          y_pred=after_tuning_predicted.predict)

        print(f'Before tune test {bfr_tun_mse}')
        print(f'After tune test {aft_tun_mse}', '\n')
        result_list.append(aft_tun_mse <= bfr_tun_mse)
    assert any(result_list)
def test_fine_tune_all_nodes(data_fixture, request):
    """Fine-tuning every node of a chain must not worsen the test MSE."""
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    chain = get_class_chain()

    # Baseline prediction before any tuning.
    chain.fit(train_data, use_cache=False)
    baseline_prediction = chain.predict(test_data)

    # Tune every node, then predict again.
    chain.fine_tune_all_nodes(train_data, max_lead_time=timedelta(minutes=1),
                              iterations=30)
    tuned_prediction = chain.predict(test_data)

    mse_before_tuning = round(mse(y_true=test_data.target,
                                  y_pred=baseline_prediction.predict), 2)
    mse_after_tuning = round(mse(y_true=test_data.target,
                                 y_pred=tuned_prediction.predict), 2)

    print(f'Before tune test {mse_before_tuning}')
    print(f'After tune test {mse_after_tuning}', '\n')
    assert mse_after_tuning <= mse_before_tuning
def test_chain_hierarchy_fit_correct(data_setup):
    """A diamond-shaped logit chain fits and reports correct structure metadata."""
    train, _ = train_test_data_setup(data_setup)

    # One primary node feeding two secondaries, merged by a final node.
    entry = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.logit)
    left = NodeGenerator.secondary_node(model_type=ModelTypesIdsEnum.logit,
                                        nodes_from=[entry])
    right = NodeGenerator.secondary_node(model_type=ModelTypesIdsEnum.logit,
                                         nodes_from=[entry])
    head = NodeGenerator.secondary_node(model_type=ModelTypesIdsEnum.logit,
                                        nodes_from=[left, right])

    chain = Chain()
    for node in (entry, left, right, head):
        chain.add_node(node)

    train_predicted = chain.fit(input_data=train, use_cache=False)

    expected_id = ('((/n_ModelTypesIdsEnum.logit_defaultparams;)/'
                   'n_ModelTypesIdsEnum.logit_defaultparams;;(/'
                   'n_ModelTypesIdsEnum.logit_defaultparams;)/'
                   'n_ModelTypesIdsEnum.logit_defaultparams;)/'
                   'n_ModelTypesIdsEnum.logit_defaultparams')
    assert chain.root_node.descriptive_id == expected_id
    assert chain.length == 4
    assert chain.depth == 3
    assert train_predicted.predict.shape == train.target.shape
def tune(self, fit: Callable, predict: Callable, tune_data: InputData,
         params_range: dict, default_params: dict,
         iterations: int) -> dict:
    """Randomly search ``params_range`` for parameters that beat the defaults.

    :param fit: callable that trains a model given (data, params)
    :param predict: callable that applies a trained model to data
    :param tune_data: data to split 50/50 into tuning train/test partitions
    :param params_range: per-parameter ranges to sample candidates from
    :param default_params: baseline parameters to beat
    :param iterations: number of random candidates to evaluate
    :return: best parameter dict found (the defaults if nothing beats them)
    """
    tune_train_data, tune_test_data = train_test_data_setup(tune_data, 0.5)

    # Fit the baseline on the *train* partition so it is scored the same
    # way as the candidates (previously it was fitted on the test partition,
    # giving the default an unfair in-sample advantage).
    trained_model_default = fit(tune_train_data, default_params)
    prediction_default = predict(trained_model_default, tune_test_data)
    best_quality_metric = _regression_prediction_quality(
        prediction=prediction_default, real=tune_test_data.target)
    best_params = default_params

    for _ in range(iterations):
        random_params = get_random_params(params_range)
        try:
            trained_model_candidate = fit(tune_train_data, random_params)
            prediction_candidate = predict(trained_model_candidate,
                                           tune_test_data)
            quality_metric = _regression_prediction_quality(
                prediction=prediction_candidate, real=tune_test_data.target)
            # Lower is better; advance the threshold so later candidates must
            # beat the best seen so far, not merely the default (the previous
            # version never updated best_quality_metric, so the *last*
            # better-than-default candidate won rather than the best one).
            if quality_metric < best_quality_metric:
                best_quality_metric = quality_metric
                best_params = random_params
        except ValueError:
            # Best-effort: skip parameter combinations the model rejects.
            pass
    return best_params
def test_pca_model_removes_redunant_features_correct():
    """PCA data model should shrink a feature space dominated by noise."""
    n_informative = 5
    dataset = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=n_informative)
    fit_part, _holdout = train_test_data_setup(data=dataset)

    pca = Model(model_type='pca_data_model')
    _, transformed = pca.fit(data=fit_part)

    # The transformed feature count must be strictly below the original one.
    assert transformed.shape[1] < dataset.features.shape[1]
def test_log_clustering_fit_correct(data_fixture, request):
    """K-means fitted on scaled features should yield exactly the labels {0, 1}."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    fit_part, _holdout = train_test_data_setup(data=data)

    clusterer = Model(model_type=ModelTypesIdsEnum.kmeans)
    _, labels = clusterer.fit(data=fit_part)

    assert all(np.unique(labels) == [0, 1])
def test_regression_chain_fit_correct():
    """Composed regression chain keeps test RMSE below 5% of the target std."""
    data = get_synthetic_regression_data()
    regr_chain = compose_chain(data=data)
    fit_part, holdout = train_test_data_setup(data)

    regr_chain.fit(input_data=fit_part)
    _, rmse_on_test = get_rmse_value(regr_chain, fit_part, holdout)

    assert rmse_on_test < np.std(data.target) * 0.05
def test_log_regression_fit_correct(classification_dataset):
    """Logistic regression should reach ROC AUC >= 0.95 on its own train set."""
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    fit_part, _holdout = train_test_data_setup(data=data)

    classifier = Model(model_type=ModelTypesIdsEnum.logit)
    _, fitted_scores = classifier.fit(data=fit_part)

    assert roc_auc(y_true=fit_part.target, y_score=fitted_scores) >= 0.95
def test_qda_fit_correct(data_fixture, request):
    """QDA should reach ROC AUC >= 0.95 on its own train set."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    fit_part, _holdout = train_test_data_setup(data=data)

    classifier = Model(model_type=ModelTypesIdsEnum.qda)
    _, fitted_scores = classifier.fit(data=fit_part)

    assert roc_auc(y_true=fit_part.target, y_score=fitted_scores) >= 0.95
def test_log_regression_fit_correct(classification_dataset):
    """Logit model should score a train-set ROC AUC of at least 0.95."""
    data = classification_dataset
    data.features = Scaling().fit(data.features).apply(data.features)
    fit_part, _holdout = train_test_data_setup(data=data)

    classifier = Model(model_type='logit')
    _, fitted_scores = classifier.fit(data=fit_part)

    assert get_roc_auc(fit_part, fitted_scores) >= 0.95
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Fit/predict results must not depend on the order of a secondary node's
    parents — neither at construction time nor after reordering the parents
    of an already fitted chain."""
    data = data_setup
    train, test = train_test_data_setup(data)
    first = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.logit)
    second = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.lda)
    third = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.knn)
    final = NodeGenerator.secondary_node(model_type=ModelTypesIdsEnum.xgboost,
                                         nodes_from=[first, second, third])

    chain = Chain()
    for node in [first, second, third, final]:
        chain.add_node(node)

    # Deep copies so the shuffled chain owns independent node objects.
    first = deepcopy(first)
    second = deepcopy(second)
    third = deepcopy(third)
    final_shuffled = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[third, first, second])

    chain_shuffled = Chain()
    # change order of nodes in list
    for node in [final_shuffled, third, first, second]:
        chain_shuffled.add_node(node)

    train_predicted = chain.fit(input_data=train)
    train_predicted_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert all(
        np.equal(train_predicted.predict, train_predicted_shuffled.predict))

    test_predicted = chain.predict(input_data=test)
    test_predicted_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert all(
        np.equal(test_predicted.predict, test_predicted_shuffled.predict))

    # change parents order for the nodes fitted chain
    nodes_for_change = chain.nodes[3].nodes_from
    chain.nodes[3].nodes_from = [
        nodes_for_change[2], nodes_for_change[0], nodes_for_change[1]
    ]
    # Cache is cleared so the node is actually re-fitted with the new order.
    chain.nodes[3].cache.clear()
    chain.fit(train)
    test_predicted_re_shuffled = chain.predict(input_data=test)

    # predict results should be invariant
    assert all(
        np.equal(test_predicted.predict, test_predicted_re_shuffled.predict))
def compose_chain(
        self, data: InputData, initial_chain: Optional[NASChain],
        composer_requirements: Optional[GPNNComposerRequirements],
        metrics: Optional[Callable],
        optimiser_parameters: GPChainOptimiserParameters = None,
        is_visualise: bool = False) -> NASChain:
    """Run GP-based neural architecture search and return the best chain.

    :param data: input data; split 80/20 into train/test for fitness scoring
    :param initial_chain: optional chain to seed the optimiser population
    :param composer_requirements: NAS constraints (image size, channels,
        class count, batch size, epochs, population size)
    :param metrics: quality metric(s) used as the optimisation objective
    :param optimiser_parameters: GP operator configuration; a default
        CNN-oriented configuration is used when omitted
    :param is_visualise: when True, render the optimisation history
    :return: the best chain discovered by the optimiser
    """
    train_data, test_data = train_test_data_setup(data, 0.8)
    # Network input shape: (height, width, channels).
    self.input_shape = [size for size in composer_requirements.image_size]
    self.input_shape.append(composer_requirements.channels_num)
    self.input_shape = tuple(self.input_shape)
    if not optimiser_parameters:
        # Default GP operators for CNN chain evolution.
        self.optimiser_parameters = GPChainOptimiserParameters(
            chain_generation_function=random_cnn_chain,
            crossover_types=[CrossoverTypesEnum.subtree],
            crossover_types_dict=crossover_by_type,
            mutation_types=[MutationTypesEnum.simple],
            mutation_types_dict=mutation_by_type,
            selection_types=[SelectionTypesEnum.tournament])
    else:
        self.optimiser_parameters = optimiser_parameters
    # Bind everything except the candidate chain itself, so the optimiser
    # can score candidates with a single-argument callable.
    metric_function_for_nodes = partial(
        self.metric_for_nodes, metrics, train_data, test_data,
        self.input_shape, composer_requirements.num_of_classes,
        composer_requirements.batch_size,
        composer_requirements.train_epochs_num)
    optimiser = GPChainOptimiser(
        initial_chain=initial_chain,
        requirements=composer_requirements,
        primary_node_func=NNNodeGenerator.primary_node,
        secondary_node_func=NNNodeGenerator.secondary_node,
        chain_class=NASChain,
        parameters=self.optimiser_parameters)

    best_chain, self.history = optimiser.optimise(
        metric_function_for_nodes)

    historical_fitness = [chain.fitness for chain in self.history]

    if is_visualise:
        ComposerVisualiser.visualise_history(self.history,
                                             historical_fitness)

    # Persist the optimisation trace regardless of visualisation.
    write_composer_history_to_csv(historical_fitness=historical_fitness,
                                  historical_chains=self.history,
                                  pop_size=composer_requirements.pop_size)

    print('GP composition finished')
    return best_chain
def test_composite_lstm_chain_fit_correct():
    """Decomposed time-series chain should beat the naive std-of-target RMSE."""
    data = get_synthetic_ts_data()
    ts_chain = get_decomposed_chain()
    fit_part, holdout = train_test_data_setup(data)

    ts_chain.fit(input_data=fit_part)
    _, rmse_on_test = get_rmse_value(ts_chain, fit_part, holdout)

    assert rmse_on_test < np.std(data.target)
def test_lda_fit_correct(data_fixture, request):
    """LDA should score a train-set ROC AUC of at least 0.95."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    fit_part, _holdout = train_test_data_setup(data=data)

    classifier = Model(model_type='lda')
    _, fitted_scores = classifier.fit(data=fit_part)

    assert get_roc_auc(fit_part, fitted_scores) >= 0.95
def compose_chain(self, data: InputData, initial_chain: Optional[Chain],
                  composer_requirements: Optional[GPComposerRequirements],
                  metrics: Optional[Callable],
                  optimiser_parameters: GPChainOptimiserParameters = None,
                  is_visualise: bool = False) -> Chain:
    """Run GP-based chain composition and return the best chain found.

    :param data: input data; split 80/20 into train/test for fitness scoring
    :param initial_chain: optional chain to seed the optimiser population
    :param composer_requirements: composition constraints (depth, arity,
        population size, time budget, ...)
    :param metrics: quality metric(s) used as the optimisation objective
    :param optimiser_parameters: GP operator configuration; a default ML
        configuration is used when omitted
    :param is_visualise: when True, render the optimisation history
    :return: the best chain discovered by the optimiser
    """
    train_data, test_data = train_test_data_setup(data, 0.8)
    # Drop fitted-model cache shared with previous composition runs.
    self.shared_cache.clear()
    if not optimiser_parameters:
        # Default GP operators for ML chain evolution.
        self.optimiser_parameters = GPChainOptimiserParameters(
            chain_generation_function=random_ml_chain,
            crossover_types=[
                CrossoverTypesEnum.subtree, CrossoverTypesEnum.onepoint
            ],
            crossover_types_dict=crossover_by_type,
            mutation_types=[
                MutationTypesEnum.simple, MutationTypesEnum.local_growth,
                MutationTypesEnum.reduce
            ],
            mutation_types_dict=mutation_by_type,
            selection_types=[SelectionTypesEnum.tournament])
    else:
        self.optimiser_parameters = optimiser_parameters

    # Bind everything except the candidate chain itself, so the optimiser
    # can score candidates with a single-argument callable.
    metric_function_for_nodes = partial(self.metric_for_nodes, metrics,
                                        train_data, test_data, True)
    optimiser = GPChainOptimiser(
        initial_chain=initial_chain,
        requirements=composer_requirements,
        primary_node_func=NodeGenerator.primary_node,
        secondary_node_func=NodeGenerator.secondary_node,
        chain_class=Chain,
        parameters=self.optimiser_parameters)

    best_chain, self.history = optimiser.optimise(
        metric_function_for_nodes)

    historical_fitness = [chain.fitness for chain in self.history]

    if is_visualise:
        ComposerVisualiser.visualise_history(self.history,
                                             historical_fitness)

    # Persist the optimisation trace regardless of visualisation.
    write_composer_history_to_csv(historical_fitness=historical_fitness,
                                  historical_chains=self.history,
                                  pop_size=composer_requirements.pop_size)

    print('GP composition finished')
    return best_chain
def test_eval_strategy_logreg(data_setup):
    """A logit PrimaryNode yields one prediction per sklearn-baseline row."""
    train, test = train_test_data_setup(data=data_setup)

    # Reference predictions from a plain sklearn logistic regression.
    reference_model = LogisticRegression(C=10., random_state=1,
                                         solver='liblinear', max_iter=10000,
                                         verbose=0)
    reference_model.fit(train.features, train.target)
    expected_result = reference_model.predict(test.features)

    node_under_test = PrimaryNode(model_type='logit')
    node_under_test.fit(input_data=train)
    actual_result = node_under_test.predict(input_data=test)

    assert len(actual_result.predict) == len(expected_result)
def test_multiclassification_chain_fit_correct():
    """Multiclass chain on iris should reach a macro OvO ROC AUC above 0.95."""
    iris = get_iris_data()
    chain = compose_chain()
    fit_part, holdout = train_test_data_setup(iris, shuffle_flag=True)

    chain.fit(input_data=fit_part)
    holdout_results = chain.predict(input_data=holdout)

    score = roc_auc(y_true=holdout.target,
                    y_score=holdout_results.predict,
                    multi_class='ovo', average='macro')
    assert score > 0.95
def test_model_fit_and_predict_correctly():
    """Checks whether the model fits and predict correctly on the synthetic dataset"""
    data = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)
    chain = compose_chain(data=data)
    fit_part, holdout = train_test_data_setup(data)

    chain.fit(input_data=fit_part)
    auc_train, auc_test = get_roc_auc_value(chain, fit_part, holdout)

    # Both partitions must clear the configured quality bar.
    assert get_auc_threshold(auc_train) >= CORRECT_MODEL_AUC_THR
    assert get_auc_threshold(auc_test) >= CORRECT_MODEL_AUC_THR
def test_regression_composite_fit_correct():
    """Linear trend/residual decomposition keeps test RMSE under 1.5 * std."""
    data = get_synthetic_ts_data()
    composite = get_decomposed_chain(model_trend='linear',
                                     model_residual='linear')
    fit_part, holdout = train_test_data_setup(data)

    composite.fit(input_data=fit_part)
    _, rmse_on_test = get_rmse_value(composite, fit_part, holdout)
    print(rmse_on_test)

    assert rmse_on_test < np.std(data.target) * 1.5
def test_regression_chain_fit_correct():
    """A single random-forest-regressor chain keeps test RMSE under 1.5 * std."""
    data = get_synthetic_ts_data()

    single_node_chain = Chain()
    single_node_chain.add_node(PrimaryNode('rfr'))

    fit_part, holdout = train_test_data_setup(data)
    single_node_chain.fit(input_data=fit_part)
    _, rmse_on_test = get_rmse_value(single_node_chain, fit_part, holdout)

    assert rmse_on_test < np.std(data.target) * 1.5
def test_regression_chain_with_datamodel_fit_correct():
    """A chain mixing a data pass-through node with models predicts
    target-shaped output."""
    data = get_synthetic_regression_data()
    fit_part, holdout = train_test_data_setup(data)

    passthrough = PrimaryNode('direct_data_model')
    ridge_node = PrimaryNode('ridge')
    lasso_node = SecondaryNode('lasso')
    lasso_node.nodes_from = [ridge_node, passthrough]

    chain = Chain(lasso_node)
    chain.fit(fit_part)
    results = chain.predict(holdout)

    assert results.predict.shape == holdout.target.shape
def test_chain_with_clusters_fit_correct():
    """A chain with clustering nodes reaches mean test ROC AUC above 0.6.

    Clustering is stochastic, so the score is averaged over several
    independent runs instead of being asserted on a single run.
    """
    roc_scores = []
    for _ in range(15):  # mean ROC AUC analysed because of stochastic clustering
        data = get_synthetic_input_data(n_samples=10000)
        chain = compose_chain(data=data)
        train_data, test_data = train_test_data_setup(data)

        chain.fit(input_data=train_data)
        _, roc_on_test = get_roc_auc_value(chain, train_data, test_data)
        roc_scores.append(roc_on_test)

    # The previous version folded each score into a running pairwise mean
    # seeded with 0, which weights early runs exponentially less and biases
    # the result downward; compute the true mean of all runs instead.
    mean_roc_on_test = np.mean(roc_scores)
    roc_threshold = 0.6
    assert mean_roc_on_test > roc_threshold
def test_classification_manual_tuning_correct(data_fixture, request):
    """Manually overriding KNN params must change the resulting predictions."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    fit_part, holdout = train_test_data_setup(data=data)

    default_knn = Model(model_type='knn')
    fitted, _ = default_knn.fit(data=fit_part)
    default_predictions = default_knn.predict(fitted_model=fitted,
                                              data=holdout)

    tweaked_knn = Model(model_type='knn')
    tweaked_knn.params = {'n_neighbors': 1}
    fitted, _ = tweaked_knn.fit(data=fit_part)
    tweaked_predictions = tweaked_knn.predict(fitted_model=fitted,
                                              data=holdout)

    assert not np.array_equal(default_predictions, tweaked_predictions)
def synthetic_benchmark_composing_example():
    """Fit a simple two-level chain on a synthetic benchmark and report ROC."""
    pretrained = separately_fit_chain(samples=5000, features_amount=10,
                                      classes=2)
    data = synthetic_benchmark_dataset(samples_amount=5000,
                                       features_amount=10,
                                       fitted_chain=pretrained)

    print(f'Synthetic features: {data.features[:10]}')
    print(f'Synthetic target: {data.target[:10]}')

    train, test = train_test_data_setup(data)

    simple_chain = two_level_chain()
    simple_chain.fit(input_data=train, use_cache=False)

    print(f'ROC score on train: {roc_value(simple_chain, train)}')
    print(f'ROC score on test {roc_value(simple_chain, test)}')
def test_nodes_sequence_fit_correct(data_fixture, request):
    """A diamond of secondary nodes fits and exposes a correct descriptive id."""
    data = request.getfixturevalue(data_fixture)
    train, _ = train_test_data_setup(data)

    entry = PrimaryNode(model_type='logit')
    branch_a = SecondaryNode(model_type='lda', nodes_from=[entry])
    branch_b = SecondaryNode(model_type='qda', nodes_from=[entry])
    final = SecondaryNode(model_type='knn', nodes_from=[branch_a, branch_b])

    train_predicted = final.fit(input_data=train)

    expected_id = ('((/n_logit_default_params;)/'
                   'n_lda_default_params;;(/'
                   'n_logit_default_params;)/'
                   'n_qda_default_params;)/'
                   'n_knn_default_params')
    assert final.descriptive_id == expected_id
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.cache.actual_cached_state is not None
def test_chain_with_datamodel_fit_correct(data_setup):
    """A chain combining a direct data node with models yields
    target-shaped class labels."""
    fit_part, holdout = train_test_data_setup(data_setup)

    passthrough = PrimaryNode('direct_data_model')
    bernoulli_node = PrimaryNode('bernb')
    forest_node = SecondaryNode('rf')
    forest_node.nodes_from = [bernoulli_node, passthrough]

    chain = Chain()
    for node in (passthrough, bernoulli_node, forest_node):
        chain.add_node(node)

    chain.fit(fit_part)
    labels = np.asarray(probs_to_labels(chain.predict(holdout).predict))

    assert labels.shape == holdout.target.shape
def test_arima_tune_correct():
    """Tuned ARIMA should keep the test RMSE below the target's std."""
    data = get_synthetic_ts_data()
    fit_part, holdout = train_test_data_setup(data=data)

    arima_for_tune = Model(model_type='arima')
    fitted, _ = arima_for_tune.fine_tune(data=fit_part, iterations=5,
                                         max_lead_time=timedelta(minutes=0.1))
    tuned_forecast = arima_for_tune.predict(fitted_model=fitted, data=holdout)

    rmse_on_test_tuned = mse(y_true=holdout.target, y_pred=tuned_forecast,
                             squared=False)
    assert rmse_on_test_tuned < np.std(holdout.target)
def test_model_predictions_on_train_test_random():
    """Checks that model can't predict correctly on random train and test
    datasets and the roc_auc_scores is close to 0.5. Both train and test
    data have no relations between features and target."""
    data = get_random_target_data(
        get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1))
    fit_part, holdout = train_test_data_setup(data)

    chain = compose_chain(data=fit_part)
    chain.fit(input_data=fit_part)
    auc_train, auc_test = get_roc_auc_value(chain, fit_part, holdout)

    train_thr = get_auc_threshold(auc_train)
    test_thr = get_auc_threshold(auc_test)
    print(auc_train)
    print(auc_test)

    assert test_thr <= CORRECT_MODEL_AUC_THR
    assert train_thr <= CORRECT_MODEL_AUC_THR
def compose_chain(self, data: InputData, initial_chain: Optional[Chain],
                  composer_requirements: Optional[GPComposerRequirements],
                  metrics: Optional[Callable],
                  optimiser_parameters: GPChainOptimiserParameters = None,
                  is_visualise: bool = False,
                  is_tune: bool = False) -> Chain:
    """Run GP-based chain composition; optionally tune the winning chain.

    :param data: input data; split 80/20 into train/test for fitness scoring
    :param initial_chain: optional chain to seed the optimiser population
    :param composer_requirements: composition constraints (depth, arity,
        population size, time budget, ...)
    :param metrics: quality metric(s) used as the optimisation objective
    :param optimiser_parameters: GP operator configuration passed through
        to the optimiser as-is (no default is substituted here)
    :param is_visualise: when True, render the optimisation history
    :param is_tune: when True, tune the best chain on the full data after
        composition finishes
    :return: the best chain discovered by the optimiser
    """
    train_data, test_data = train_test_data_setup(data, 0.8)
    # Drop fitted-model cache shared with previous composition runs.
    self.shared_cache.clear()

    # Bind everything except the candidate chain itself, so the optimiser
    # can score candidates with a single-argument callable.
    metric_function_for_nodes = partial(self.metric_for_nodes, metrics,
                                        train_data, test_data, True)
    optimiser = GPChainOptimiser(initial_chain=initial_chain,
                                 requirements=composer_requirements,
                                 primary_node_func=PrimaryNode,
                                 secondary_node_func=SecondaryNode,
                                 chain_class=Chain,
                                 parameters=optimiser_parameters)

    best_chain, self.history = optimiser.optimise(
        metric_function_for_nodes)

    historical_fitness = [chain.fitness for chain in self.history]

    if is_visualise:
        ComposerVisualiser.visualise_history(self.history,
                                             historical_fitness)

    # Persist the optimisation trace regardless of visualisation.
    write_composer_history_to_csv(historical_fitness=historical_fitness,
                                  historical_chains=self.history,
                                  pop_size=composer_requirements.pop_size)

    print('GP composition finished')

    if is_tune:
        # Post-composition tuning on the full dataset, bounded by the
        # composer's time budget.
        self.tune_chain(best_chain, data,
                        composer_requirements.max_lead_time)
    return best_chain
def test_knn_classification_tune_correct(data_fixture, request):
    """Fine-tuned KNN should outperform the default KNN on test ROC AUC."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    # Baseline: default KNN.
    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)
    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    # Tuned KNN.
    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data, iterations=10,
                                      max_lead_time=timedelta(minutes=1))
    # Predict through the tuned wrapper (the previous version mistakenly
    # routed the tuned fitted model through the untuned `knn` instance).
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)
    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)

    roc_threshold = 0.6
    assert roc_on_test_tuned > roc_on_test > roc_threshold