def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Score an exported sklearn pipeline and a FEDOT chain on the same data.

    Both models are fitted on the training CSV and evaluated with ROC AUC
    on the test CSV; each score is printed.

    :param train_file_path: path to the CSV file used for fitting
    :param test_file_path: path to the CSV file used for evaluation
    :return: ROC AUC of the FEDOT chain on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    x_train, x_test = train_data.features, test_data.features
    y_train, y_test = train_data.target, test_data.target

    # Average CV score on the training set was: 0.93755
    tpot_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(tpot_pipeline.steps, 'random_state', 1)
    tpot_pipeline.fit(x_train, y_train)

    # Probability of the positive class is used as the score
    tpot_scores = tpot_pipeline.predict_proba(x_test)[:, 1]
    tpot_roc_auc = roc_auc(y_true=y_test, y_score=tpot_scores)
    print(f'ROC AUC for TPOT: {tpot_roc_auc}')

    # FEDOT chain: scaling feeds both bernb and rf, bernb also feeds rf
    scaling = PrimaryNode('scaling')
    bernb = SecondaryNode('bernb', nodes_from=[scaling])
    forest = SecondaryNode('rf', nodes_from=[bernb, scaling])
    fedot_chain = Chain(forest)
    fedot_chain.fit(train_data)

    fedot_output = fedot_chain.predict(test_data)
    fedot_roc_auc = roc_auc(y_true=y_test, y_score=fedot_output.predict)
    print(f'ROC AUC for FEDOT: {fedot_roc_auc}')

    return fedot_roc_auc
def test_forecast_with_exog():
    """Check that a lagged + exog chain with a linear root reproduces the expected series."""
    (train_source_ts, predict_source_ts,
     train_exog_ts, predict_exog_ts, ts_test) = synthetic_with_exogenous_ts()

    # Source data for lagged node
    lagged_data = {'fit': train_source_ts, 'predict': predict_source_ts}
    lagged = PrimaryNode('lagged', node_data=lagged_data)
    # Set window size for lagged transformation
    lagged.custom_params = {'window_size': window_size}

    # Exogenous variable for exog node
    exog_data = {'fit': train_exog_ts, 'predict': predict_exog_ts}
    exog = PrimaryNode('exog', node_data=exog_data)

    root = SecondaryNode('linear', nodes_from=[lagged, exog])
    chain = Chain(root)

    # The data is attached to the primary nodes, so no arguments are passed
    chain.fit()
    forecast = chain.predict()

    flat_forecast = np.ravel(np.array(forecast.predict))
    assert tuple(flat_forecast) == tuple(ts_test)
def test_chain_hierarchy_fit_correct(data_setup):
    """Fit a diamond-shaped chain of four logit nodes and check its structure and output."""
    train, _ = train_test_data_setup(data_setup)

    base = PrimaryNode(model_type='logit')
    left = SecondaryNode(model_type='logit', nodes_from=[base])
    right = SecondaryNode(model_type='logit', nodes_from=[base])
    top = SecondaryNode(model_type='logit', nodes_from=[left, right])

    chain = Chain()
    for item in (base, left, right, top):
        chain.add_node(item)

    train_predicted = chain.fit(input_data=train, use_cache=False)

    expected_id = ('((/n_logit_default_params;)/'
                   'n_logit_default_params;;(/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params;)/'
                   'n_logit_default_params')
    assert chain.root_node.descriptive_id == expected_id

    assert chain.length == 4
    assert chain.depth == 3
    # One prediction row is expected per training target row
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert top.cache.actual_cached_state is not None
def create_chain_with_several_nested_atomized_model() -> Chain:
    """Build a chain that mixes atomized-model nodes with two knn nodes.

    Only the final node is added to the chain explicitly; the other nodes
    are reachable from it through ``nodes_from``.

    :return: the assembled chain
    """
    chain = Chain()

    atomized_primary = PrimaryNode(
        model_type=create_atomized_model_with_several_atomized_models())

    atomized_secondary = SecondaryNode(model_type=create_atomized_model())
    atomized_secondary.nodes_from = [atomized_primary]

    knn_first = SecondaryNode('knn')
    knn_first.custom_params = {'n_neighbors': 9}
    knn_first.nodes_from = [atomized_primary]

    knn_second = SecondaryNode('knn')
    knn_second.custom_params = {'n_neighbors': 5}
    knn_second.nodes_from = [atomized_primary, atomized_secondary, knn_first]

    atomized_root = SecondaryNode(
        model_type=create_atomized_model_with_several_atomized_models())
    atomized_root.nodes_from = [knn_second]

    chain.add_node(atomized_root)
    return chain
def chain_tuning(nodes_to_tune: str, chain: Chain, train_data: InputData,
                 test_data: InputData, local_iter: int,
                 tuner_iter_num: int = 50) -> (float, list):
    """Repeatedly tune a chain and collect its ROC AUC on the test data.

    On every local iteration the selected tuning method of the chain is
    called, the chain is refitted on ``train_data`` and scored on
    ``test_data``.

    :param nodes_to_tune: ``'primary'`` selects ``chain.fine_tune_primary_nodes``,
        ``'root'`` selects ``chain.fine_tune_all_nodes``
    :param chain: chain to tune; it is refitted on every iteration
    :param train_data: data used for tuning and fitting
    :param test_data: data used for scoring
    :param local_iter: number of tune/fit/score rounds
    :param tuner_iter_num: value passed to the tuning method as ``iterations``
    :return: mean ROC AUC over the rounds and the list of per-round scores
    :raises ValueError: if ``nodes_to_tune`` is neither ``'primary'`` nor ``'root'``
    """
    # Resolve the strategy first so an invalid argument fails before any work
    if nodes_to_tune == 'primary':
        print('primary_node_tuning')
        chain_tune_strategy = chain.fine_tune_primary_nodes
    elif nodes_to_tune == 'root':
        print('root_node_tuning')
        chain_tune_strategy = chain.fine_tune_all_nodes
    else:
        raise ValueError('Invalid type of nodes. Nodes must be primary or root, '
                         f'got {nodes_to_tune!r}')

    several_iter_scores_test = []
    for iteration in range(local_iter):
        print(f'current local iteration {iteration}')

        # Chain tuning
        chain_tune_strategy(train_data, iterations=tuner_iter_num)

        # After tuning prediction
        chain.fit(train_data)
        after_tuning_predicted = chain.predict(test_data)

        # Metrics
        aft_tun_roc_auc = roc_auc(y_true=test_data.target,
                                  y_score=after_tuning_predicted.predict)
        several_iter_scores_test.append(aft_tun_roc_auc)

    # NOTE: with local_iter == 0 the mean is taken over an empty list
    return float(np.mean(several_iter_scores_test)), several_iter_scores_test
def create_chain():
    """Return a chain where logit and lda nodes feed a knn root; its fitness is set to 1."""
    logit_node = PrimaryNode(operation_type='logit')
    lda_node = PrimaryNode(operation_type='lda')
    knn_root = SecondaryNode(operation_type='knn',
                             nodes_from=[logit_node, lda_node])

    result = Chain(knn_root)
    result.fitness = 1
    return result
def get_rmse_value(chain: Chain, train_data: InputData,
                   test_data: InputData) -> tuple:
    """Predict on train and test data and score both predictions with ``ts_mse``.

    :param chain: chain used for prediction; it is not fitted here
    :param train_data: data for the train-side prediction and score
    :param test_data: data for the test-side prediction and score
    :return: 4-tuple ``(train score, test score, train prediction, test prediction)``
    """
    train_pred = chain.predict(input_data=train_data)
    test_pred = chain.predict(input_data=test_data)

    # NOTE(review): the values are named "rmse" but come from ts_mse --
    # confirm which metric that helper actually returns.
    rmse_value_test = ts_mse(obs=test_data.target, pred=test_pred.predict)
    rmse_value_train = ts_mse(obs=train_data.target, pred=train_pred.predict)

    return rmse_value_train, rmse_value_test, train_pred, test_pred
def chain_with_multiple_roots():
    """Return a chain in which two secondary logit nodes share one primary parent.

    Neither secondary node is consumed by another node.
    """
    shared_parent = PrimaryNode(operation_type='logit')
    root_a = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])
    root_b = SecondaryNode(operation_type='logit', nodes_from=[shared_parent])

    chain = Chain()
    chain.add_node(shared_parent)
    chain.add_node(root_a)
    chain.add_node(root_b)
    return chain
def chain_with_cycle():
    """Return a chain of three logit nodes where two nodes list each other as parents."""
    source = PrimaryNode(operation_type='logit')
    middle = SecondaryNode(operation_type='logit', nodes_from=[source])
    closing = SecondaryNode(operation_type='logit',
                            nodes_from=[middle, source])
    # Close the loop: `middle` now depends on `closing`, which depends on `middle`
    middle.nodes_from.append(closing)

    chain = Chain()
    for item in (source, middle, closing):
        chain.add_node(item)
    return chain
def get_roc_auc_value(chain: Chain, train_data: InputData, test_data: InputData) -> (float, float):
    """Predict on train and test data and return the ROC AUC of each prediction.

    :param chain: chain used for prediction; it is not fitted here
    :param train_data: data for the train-side prediction and score
    :param test_data: data for the test-side prediction and score
    :return: pair ``(train ROC AUC, test ROC AUC)``
    """
    # Keep the prediction order: train first, then test
    train_output = chain.predict(input_data=train_data)
    test_output = chain.predict(input_data=test_data)

    test_score = roc_auc(y_true=test_data.target,
                         y_score=test_output.predict)
    train_score = roc_auc(y_true=train_data.target,
                          y_score=train_output.predict)

    return train_score, test_score
def two_level_chain():
    """Return a chain where logit and knn primary nodes feed an xgboost node."""
    logit_node = PrimaryNode(model_type='logit')
    knn_node = PrimaryNode(model_type='knn')
    xgb_node = SecondaryNode(model_type='xgboost',
                             nodes_from=[logit_node, knn_node])

    chain = Chain()
    chain.add_node(logit_node)
    chain.add_node(knn_node)
    chain.add_node(xgb_node)
    return chain
def test_import_json_to_fitted_chain_correctly():
    """Load a fitted chain from JSON, save it again and compare with the source file content."""
    source_path = create_correct_path('test_fitted_chain_convert_to_json')

    chain = Chain()
    chain.load(source_path)
    saved_json = chain.save('test_import_json_to_fitted_chain_correctly')

    with open(source_path, 'r') as source_file:
        expected_content = json.load(source_file)

    # The file content is re-serialised so both sides are compared as JSON strings
    assert saved_json == json.dumps(expected_content)
def chain_with_isolated_components():
    """Return a chain made of two node pairs that are not linked to each other.

    One pair is a primary node with its child; the other is a secondary node
    without parents together with its child.
    """
    linked_parent = PrimaryNode(operation_type='logit')
    linked_child = SecondaryNode(operation_type='logit',
                                 nodes_from=[linked_parent])

    # Second component: a secondary node created with an empty parent list
    detached_parent = SecondaryNode(operation_type='logit', nodes_from=[])
    detached_child = SecondaryNode(operation_type='logit',
                                   nodes_from=[detached_parent])

    chain = Chain()
    for item in (linked_parent, linked_child, detached_parent, detached_child):
        chain.add_node(item)
    return chain
def valid_chain():
    """Return a linear chain of four logit nodes, each fed by the previous one."""
    head = PrimaryNode(operation_type='logit')
    step_two = SecondaryNode(operation_type='logit', nodes_from=[head])
    step_three = SecondaryNode(operation_type='logit', nodes_from=[step_two])
    tail = SecondaryNode(operation_type='logit', nodes_from=[step_three])

    chain = Chain()
    for item in (head, step_two, step_three, tail):
        chain.add_node(item)
    return chain
def chain_with_isolated_nodes():
    """Return a three-node linear logit chain plus one secondary node with no parents."""
    head = PrimaryNode(model_type='logit')
    middle = SecondaryNode(model_type='logit', nodes_from=[head])
    tail = SecondaryNode(model_type='logit', nodes_from=[middle])
    # Created with an empty parent list and never referenced by another node
    lonely = SecondaryNode(model_type='logit', nodes_from=[])

    chain = Chain()
    for item in (head, middle, tail, lonely):
        chain.add_node(item)
    return chain
def get_composite_multiscale_chain():
    """Return a chain with trend and residual branches merged by a linear node.

    Each branch is a primary data-model node followed by a ridge node. Only
    the final node is added to the chain explicitly.
    """
    chain = Chain()

    trend_source = PrimaryNode('trend_data_model')
    trend_ridge = SecondaryNode('ridge', nodes_from=[trend_source])

    residual_source = PrimaryNode('residual_data_model')
    residual_ridge = SecondaryNode('ridge', nodes_from=[residual_source])

    merger = SecondaryNode('linear',
                           nodes_from=[residual_ridge, trend_ridge])
    chain.add_node(merger)
    return chain
def chain_third():
    """Return a chain with a qda node fed by two rf primary nodes.

    The qda node is added first, followed by its two parents.
    """
    #    QDA
    #  |     \
    # RF     RF
    chain = Chain()
    new_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        new_node.nodes_from.append(PrimaryNode(model_type))
    chain.add_node(new_node)
    # A plain loop: add_node is called for its side effect only
    for node_from in new_node.nodes_from:
        chain.add_node(node_from)
    return chain
def test_import_json_to_chain_correctly():
    """Compare the JSON of a chain loaded from file with the JSON of a freshly created chain."""
    source_path = create_correct_path('test_chain_convert_to_json')

    loaded_chain = Chain()
    loaded_chain.load_chain(source_path)
    actual = loaded_chain.save_chain('test_import_json_to_chain_correctly_1')

    reference_chain = create_chain()
    expected = reference_chain.save_chain('test_import_json_to_chain_correctly_2')

    actual_dump = json.dumps(actual)
    expected_dump = json.dumps(expected)
    assert actual_dump == expected_dump
def test_ts_forecasting_lagged_data_operation():
    """Check that a lagged -> ridge chain returns as many values as the expected series holds."""
    train_input, predict_input, y_test = get_time_series()

    lagged = PrimaryNode('lagged')
    ridge = SecondaryNode('ridge', nodes_from=[lagged])
    chain = Chain(ridge)

    chain.fit_from_scratch(train_input)
    output = chain.predict(predict_input)

    flat_prediction = np.ravel(output.predict)
    flat_expected = np.ravel(y_test)
    assert len(flat_prediction) == len(flat_expected)
def metric_for_nodes(self, metric_function, train_data: InputData, test_data: InputData,
                     is_chain_shared: bool, chain: Chain) -> float:
    """Validate, fit and score a chain; return ``max_int_value`` if any step fails.

    :param metric_function: callable invoked as ``metric_function(chain, test_data)``
    :param train_data: data the chain is fitted on
    :param test_data: data passed to the metric function
    :param is_chain_shared: when true, the chain is wrapped in ``SharedChain``
        together with ``self.shared_cache`` before fitting
    :param chain: chain to assess
    :return: value of the metric function, or ``max_int_value`` on any exception
    """
    try:
        validate(chain)
        candidate = (SharedChain(base_chain=chain, shared_cache=self.shared_cache)
                     if is_chain_shared else chain)
        candidate.fit(input_data=train_data)
        return metric_function(candidate, test_data)
    except Exception as error:
        # Best-effort: a failing chain is logged and given the fallback score
        self.log.info(f'Error in chain assessment during composition: {error}. Continue.')
        return max_int_value
def test_arima_chain_fit_correct():
    """Fit a single-node arima chain and check the train-side score against a threshold."""
    train_data, test_data = get_synthetic_ts_data_linear(forecast_length=12)

    arima_chain = Chain(PrimaryNode('arima'))
    arima_chain.fit(input_data=train_data)

    # Only the train-side score is checked here
    rmse_on_train, _, _, _ = get_rmse_value(arima_chain, train_data, test_data)

    threshold = _max_rmse_threshold_by_std(test_data.target)
    assert rmse_on_train < threshold
def get_rmse_value(chain: Chain, train_data: InputData, test_data: InputData) -> (float, float):
    """Predict on train and test data and score both with ``mse(..., squared=False)``.

    :param chain: chain used for prediction; it is not fitted here
    :param train_data: data for the train-side prediction and score
    :param test_data: data for the test-side prediction and score
    :return: pair ``(train score, test score)``
    """
    # Keep the prediction order: train first, then test
    train_output = chain.predict(input_data=train_data)
    test_output = chain.predict(input_data=test_data)

    test_score = mse(y_true=test_data.target,
                     y_pred=test_output.predict,
                     squared=False)
    train_score = mse(y_true=train_data.target,
                      y_pred=train_output.predict,
                      squared=False)

    return train_score, test_score
def test_chain_repr():
    """Check ``repr`` of a chain with three primary nodes feeding an xgboost root."""
    parents = [PrimaryNode(operation_type='logit'),
               PrimaryNode(operation_type='lda'),
               PrimaryNode(operation_type='knn')]
    root = SecondaryNode(operation_type='xgboost', nodes_from=parents)

    chain = Chain()
    # Only the root is added explicitly
    chain.add_node(root)

    expected = "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"
    assert repr(chain) == expected
def create_json_models_files():
    """Save the unfitted, fitted and empty chains used by the JSON tests."""
    create_chain().save('test_chain_convert_to_json')
    create_fitted_chain().save('test_fitted_chain_convert_to_json')
    Chain().save('test_empty_chain_convert_to_json')
def test_log_clustering_fit_correct(data_fixture, request):
    """Fit kmeans on normalized data and check that exactly the labels 0 and 1 are produced.

    :param data_fixture: name of the fixture that provides the input data
    :param request: pytest ``request`` object used to resolve the fixture by name
    """
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Scaling chain. Fit predict it
    scaling_chain = Chain(PrimaryNode('normalization'))
    scaling_chain.fit(train_data)
    scaled_data = scaling_chain.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    # array_equal also handles a label count other than two: it returns False,
    # whereas `all(unique == [0, 1])` fails with a non-assertion error.
    assert np.array_equal(np.unique(train_predicted.predict), [0, 1])
def execute_chain_for_text_problem(train_data, test_data):
    """Fit a text_clean -> tfidf -> multinb chain and return its ROC AUC on the test data.

    :param train_data: data the chain is fitted on
    :param test_data: data the chain is scored on
    :return: ROC AUC of the prediction against ``test_data.target``
    """
    cleaner = PrimaryNode('text_clean')
    vectorizer = SecondaryNode('tfidf', nodes_from=[cleaner])
    classifier = SecondaryNode('multinb', nodes_from=[vectorizer])

    text_chain = Chain(classifier)
    text_chain.fit(train_data)

    output = text_chain.predict(test_data)
    return roc_auc(y_true=test_data.target, y_score=output.predict)
def test_regression_chain_period_exog_forecast_multistep_correct():
    """Fit a single-node linear chain on periodic synthetic data and bound both scores by 1.5."""
    train_data, test_data = get_synthetic_ts_data_period(forecast_length=2,
                                                         max_window_size=3)

    linear_chain = Chain(PrimaryNode('linear'))
    linear_chain.fit(input_data=train_data)

    scores = get_rmse_value(linear_chain, train_data, test_data)
    rmse_on_train, rmse_on_test, _, _ = scores

    threshold = 1.5
    assert rmse_on_train < threshold
    assert rmse_on_test < threshold
def test_regression_chain_forecast_multistep_correct():
    """Fit a single-node ridge chain on periodic synthetic data and bound the test-side score."""
    train_data, test_data = get_synthetic_ts_data_period(forecast_length=20,
                                                         max_window_size=30)

    ridge_chain = Chain(PrimaryNode('ridge'))
    ridge_chain.fit(input_data=train_data)

    # Only the test-side score is checked here
    scores = get_rmse_value(ridge_chain, train_data, test_data)
    _, rmse_on_test, _, _ = scores

    threshold = _max_rmse_threshold_by_std(test_data.target, is_strict=False)
    assert rmse_on_test < threshold
def test_chain_with_wrong_data():
    """Check that fitting raises ValueError for a 10-point series with forecast_length=10."""
    linear_chain = Chain(PrimaryNode('linear'))

    series = np.arange(0, 10)
    forecasting_task = Task(TaskTypesEnum.ts_forecasting,
                            TsForecastingParams(forecast_length=10))
    wrong_data = InputData(idx=series,
                           features=series,
                           target=series,
                           data_type=DataTypesEnum.ts,
                           task=forecasting_task)

    with pytest.raises(ValueError):
        linear_chain.fit(wrong_data)
def test_ts_forecasting_smoothing_data_operation():
    """Check the forecast length of a smoothing -> lagged -> ridge chain for each smoothing operation."""
    train_input, predict_input, y_test = get_time_series()
    expected_length = len(np.ravel(y_test))

    for operation_name in ('smoothing', 'gaussian_filter'):
        smoother = PrimaryNode(operation_name)
        lagged = SecondaryNode('lagged', nodes_from=[smoother])
        ridge = SecondaryNode('ridge', nodes_from=[lagged])
        chain = Chain(ridge)

        chain.fit_from_scratch(train_input)
        output = chain.predict(predict_input)
        flat_prediction = np.ravel(output.predict)

        # NOTE(review): checked once per smoothing operation -- confirm the
        # assertion is meant to sit inside the loop.
        assert len(flat_prediction) == expected_length