def get_rmse_value(chain: Chain, train_data: InputData,
                   test_data: InputData) -> (float, float):
    """Score ``chain`` on both samples using ``mse(..., squared=False)``.

    :param chain: chain whose ``predict`` is called on each sample.
    :param train_data: sample whose score comes first in the result.
    :param test_data: sample whose score comes second in the result.
    :return: ``(train_score, test_score)``.
    """
    predicted_on_train = chain.predict(input_data=train_data)
    predicted_on_test = chain.predict(input_data=test_data)

    test_score = mse(y_true=test_data.target,
                     y_pred=predicted_on_test.predict,
                     squared=False)
    train_score = mse(y_true=train_data.target,
                      y_pred=predicted_on_train.predict,
                      squared=False)

    return train_score, test_score
def get_roc_auc_value(chain: Chain, train_data: InputData,
                      test_data: InputData) -> (float, float):
    """Score ``chain`` on both samples using ``roc_auc``.

    :param chain: chain whose ``predict`` is called on each sample.
    :param train_data: sample whose score comes first in the result.
    :param test_data: sample whose score comes second in the result.
    :return: ``(train_score, test_score)``.
    """
    predicted_on_train = chain.predict(input_data=train_data)
    predicted_on_test = chain.predict(input_data=test_data)

    test_score = roc_auc(y_true=test_data.target,
                         y_score=predicted_on_test.predict)
    train_score = roc_auc(y_true=train_data.target,
                          y_score=predicted_on_train.predict)

    return train_score, test_score
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Check that the order of a secondary node's parents does not change results.

    Two chains with the same models are built: one with parents listed as
    (logit, lda, knn) and one with deep-copied parents listed as
    (knn, logit, lda). Their fit/predict outputs are compared, then the
    parents of the already fitted first chain are permuted and it is refitted.
    """
    train, test = train_test_data_setup(data_setup)

    # reference chain: parents in the "natural" order
    first = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.logit)
    second = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.lda)
    third = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.knn)
    final = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[first, second, third])

    chain = Chain()
    for reference_node in (first, second, third, final):
        chain.add_node(reference_node)

    # independent copies of the primary nodes for the shuffled chain
    first, second, third = (deepcopy(primary)
                            for primary in (first, second, third))

    final_shuffled = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[third, first, second])

    # the nodes are also added to the chain in a different order
    chain_shuffled = Chain()
    for shuffled_node in (final_shuffled, third, first, second):
        chain_shuffled.add_node(shuffled_node)

    train_predicted = chain.fit(input_data=train)
    train_predicted_shuffled = chain_shuffled.fit(input_data=train)

    # train results should be invariant
    assert chain.root_node.descriptive_id == chain_shuffled.root_node.descriptive_id
    assert all(np.equal(train_predicted.predict,
                        train_predicted_shuffled.predict))

    test_predicted = chain.predict(input_data=test)
    test_predicted_shuffled = chain_shuffled.predict(input_data=test)

    # predict results should be invariant
    assert all(np.equal(test_predicted.predict,
                        test_predicted_shuffled.predict))

    # permute the parents of the secondary node in the already fitted chain
    merging_node = chain.nodes[3]
    old_parents = merging_node.nodes_from
    merging_node.nodes_from = [old_parents[2], old_parents[0], old_parents[1]]
    merging_node.cache.clear()

    chain.fit(train)
    test_predicted_re_shuffled = chain.predict(input_data=test)

    # predict results should be invariant
    assert all(np.equal(test_predicted.predict,
                        test_predicted_re_shuffled.predict))
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     is_visualise=False):
    """Fit three forecasting chains and report their validation metric.

    A composite chain (from ``get_composite_lstm_chain``), a single-'lstm'
    chain and a single-'ridge' chain are fitted on the train file and
    evaluated on the test file via ``calculate_validation_metric``.

    :param train_file_path: train csv path, relative to ``project_root()``.
    :param test_file_path: validation csv path, relative to ``project_root()``.
    :param forecast_length: passed to ``TsForecastingParams``.
    :param max_window_size: passed to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :return: the metric obtained for the single-'ridge' chain.
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain = get_composite_lstm_chain()

    chain_simple = Chain()
    chain_simple.add_node(PrimaryNode('ridge'))

    chain_lstm = Chain()
    chain_lstm.add_node(PrimaryNode('lstm'))

    # composite chain
    chain.fit(input_data=dataset_to_train, verbose=False)
    composite_prediction = chain.predict(dataset_to_validate)
    rmse_on_valid = calculate_validation_metric(
        composite_prediction, dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise)

    # LSTM-only chain
    chain_lstm.fit(input_data=dataset_to_train, verbose=False)
    lstm_prediction = chain_lstm.predict(dataset_to_validate)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        lstm_prediction, dataset_to_validate,
        f'full-lstm-only_{forecast_length}', is_visualise)

    # ridge-only chain
    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    simple_prediction = chain_simple.predict(dataset_to_validate)
    rmse_on_valid_simple = calculate_validation_metric(
        simple_prediction, dataset_to_validate,
        f'full-simple_{forecast_length}', is_visualise)

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Fit a 'tpot' + 'lda' -> 'rf' chain and print/return its ROC AUC.

    :param train_file_path: csv file used to fit the chain.
    :param test_file_path: csv file used to score the chain.
    :param max_run_time: time budget handed to the 'tpot' node as
        ``max_run_time_sec``.
    :return: value of ``roc_auc`` on the test data.
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # ``timedelta.seconds`` is only the seconds *component* (0..86399), so a
    # budget of a day or more would be silently truncated (1 day -> 0 sec).
    # ``total_seconds()`` gives the whole duration.
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')
    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def chain_tuning(nodes_to_tune: str, chain: Chain, train_data: InputData,
                 test_data: InputData, local_iter: int,
                 tuner_iter_num: int = 50) -> (float, list):
    """Repeatedly tune ``chain`` and collect its ROC AUC on the test data.

    :param nodes_to_tune: ``'primary'`` or ``'root'``; selects the tuning
        method of the chain.
    :param chain: chain to tune, refit and score.
    :param train_data: data used for tuning and refitting.
    :param test_data: data used for scoring after each tuning round.
    :param local_iter: number of tune/fit/score rounds.
    :param tuner_iter_num: ``iterations`` argument passed to the tuner.
    :return: ``(mean of the scores as float, list of per-round scores)``.
    :raises ValueError: if ``nodes_to_tune`` is neither of the two modes.
    """
    if nodes_to_tune not in ('primary', 'root'):
        raise ValueError(
            f'Invalid type of nodes. Nodes must be primary or root')

    if nodes_to_tune == 'primary':
        print('primary_node_tuning')
        tune = chain.fine_tune_primary_nodes
    else:
        print('root_node_tuning')
        # NOTE(review): the 'root' mode is wired to fine_tune_all_nodes -
        # confirm this is the intended tuner.
        tune = chain.fine_tune_all_nodes

    scores_per_round = []
    for iteration in range(local_iter):
        print(f'current local iteration {iteration}')

        # tune, then refit and predict with the tuned chain
        tune(train_data, iterations=tuner_iter_num)
        chain.fit(train_data)
        tuned_prediction = chain.predict(test_data)

        scores_per_round.append(
            roc_auc(y_true=test_data.target,
                    y_score=tuned_prediction.predict))

    return float(np.mean(scores_per_round)), scores_per_round
def apply_model_to_data(model: Chain, data_path: str):
    """Predict with ``model`` on the data at ``data_path`` and attach the labels.

    :param model: chain used for prediction.
    :param data_path: path passed to ``create_multi_clf_examples_from_excel``.
    :return: the data frame returned by that helper, with a new ``'forecast'``
        column holding ``probs_to_labels`` of the chain output.
    """
    frame, csv_path = create_multi_clf_examples_from_excel(data_path,
                                                           return_df=True)

    # the file is loaded without a target column
    unlabelled_data = InputData.from_csv(csv_path, with_target=False)
    prediction = model.predict(unlabelled_data)

    frame['forecast'] = probs_to_labels(prediction.predict)
    return frame
def calculate_validation_metric(chain: Chain,
                                dataset_to_validate: InputData) -> float:
    """Return ``roc_auc`` of the chain prediction on ``dataset_to_validate``.

    :param chain: chain whose ``predict`` produces the scores.
    :param dataset_to_validate: data providing both the input and the target.
    :return: the value computed by ``roc_auc``.
    """
    # run the composite model on the validation data
    prediction = chain.predict(dataset_to_validate)

    # assess the quality of the obtained prediction
    return roc_auc(y_true=dataset_to_validate.target,
                   y_score=prediction.predict)
def get_value(cls, chain: Chain, reference_data: InputData) -> float:
    """Evaluate ``cls.metric`` for the chain, falling back to the default.

    :param chain: chain whose ``predict`` is called on ``reference_data``.
    :param reference_data: data passed to the chain and to ``cls.metric``.
    :return: the metric value, or ``cls.default_value`` if prediction or
        metric evaluation raised (the error is printed, not re-raised).
    """
    fallback = cls.default_value
    try:
        prediction = chain.predict(reference_data)
        return cls.metric(reference_data, prediction)
    except Exception as ex:
        # best-effort evaluation: report the problem and use the default
        print(f'Metric evaluation error: {ex}')
        return fallback
def validate_model_quality(model: Chain, data_path: str):
    """Return the macro one-vs-one ROC AUC of ``model`` on the data at ``data_path``.

    :param model: chain used for prediction.
    :param data_path: csv file loaded with ``InputData.from_csv``.
    :return: ``roc_auc`` value rounded to 3 decimal places.
    """
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    # The target must come from the dataset the prediction was made on.
    # The previous code read ``test_data.target``, a name that is not defined
    # in this function.
    roc_auc_valid = round(
        roc_auc(y_true=dataset_to_validate.target,
                y_score=predicted_labels,
                multi_class='ovo',
                average='macro'),
        3)
    return roc_auc_valid
def get_value(chain: Chain, reference_data: InputData) -> float:
    """Return the accuracy of the chain on ``reference_data``, rounded to 3 digits.

    The first element of every predicted row is rounded and used as the label.

    :param chain: chain whose ``predict`` is called on ``reference_data``.
    :param reference_data: data providing both the input and the target.
    :return: rounded ``accuracy_score``, or ``0.5`` if anything raised
        (the error is printed, not re-raised).
    """
    try:
        prediction = chain.predict(reference_data)
        rounded_labels = [round(row[0]) for row in prediction.predict]
        return round(
            accuracy_score(y_true=reference_data.target,
                           y_pred=rounded_labels),
            3)
    except Exception as ex:
        print(ex)
        return 0.5
def get_value(chain: Chain, reference_data: InputData) -> float:
    """Return ``roc_auc_score`` of the chain on ``reference_data``, rounded to 3 digits.

    :param chain: chain whose ``predict`` is called on ``reference_data``.
    :param reference_data: data providing both the input and the target.
    :return: rounded score, or ``0.5`` if anything raised
        (the error is printed, not re-raised).
    """
    try:
        prediction = chain.predict(reference_data)
        return round(
            roc_auc_score(y_score=prediction.predict,
                          y_true=reference_data.target),
            3)
    except Exception as ex:
        print(ex)
        return 0.5
def calculate_validation_metric(chain: Chain,
                                dataset_to_validate: InputData) -> float:
    """Plot the chain prediction and return ``mse(..., squared=False)`` for it.

    :param chain: chain whose ``predict`` is called on the validation data.
    :param dataset_to_validate: data providing both the input and the target.
    :return: the value computed by ``mse`` with ``squared=False``.
    """
    # run the composite model on the validation data
    prediction = chain.predict(dataset_to_validate)

    # plot results
    compare_plot(prediction, dataset_to_validate)

    # assess the quality of the obtained prediction
    rmse_value = mse(y_true=dataset_to_validate.target,
                     y_pred=prediction.predict,
                     squared=False)
    return rmse_value
def test_regression_chain_with_datamodel_fit_correct():
    """A 'ridge' + 'direct_data_model' -> 'lasso' chain predicts target-shaped output."""
    train_data, test_data = train_test_data_setup(
        get_synthetic_regression_data())

    data_node = PrimaryNode('direct_data_model')
    ridge_node = PrimaryNode('ridge')

    lasso_node = SecondaryNode('lasso')
    lasso_node.nodes_from = [ridge_node, data_node]

    chain = Chain(lasso_node)
    chain.fit(train_data)

    prediction = chain.predict(test_data)
    assert prediction.predict.shape == test_data.target.shape
def test_chain_with_datamodel_fit_correct(data_setup):
    """A 'bernb' + 'direct_data_model' -> 'rf' chain yields target-shaped labels."""
    train_data, test_data = train_test_data_setup(data_setup)

    chain = Chain()
    data_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')

    rf_node = SecondaryNode('rf')
    rf_node.nodes_from = [bernb_node, data_node]

    for chain_node in (data_node, bernb_node, rf_node):
        chain.add_node(chain_node)

    chain.fit(train_data)

    predicted_probs = chain.predict(test_data).predict
    predicted_labels = np.asarray(probs_to_labels(predicted_probs))

    assert predicted_labels.shape == test_data.target.shape
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Score an exported sklearn pipeline and an equivalent chain on the same data.

    Both ROC AUC values are printed; only the chain's value is returned.

    :param train_file_path: csv file used for fitting.
    :param test_file_path: csv file used for scoring.
    :return: ``roc_auc`` of the chain on the test data.
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features, training_target = train_data.features, train_data.target
    testing_features, testing_target = test_data.features, test_data.target

    # --- exported pipeline ---
    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    # second column of predict_proba is used as the score
    pipeline_scores = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=pipeline_scores)
    print(roc_auc_value)

    # --- chain with the same models ---
    chain = Chain()
    data_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')
    rf_node = SecondaryNode('rf')

    rf_node.nodes_from.append(data_node)
    rf_node.nodes_from.append(bernb_node)

    chain.add_node(rf_node)
    chain.fit(train_data)
    chain_prediction = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target,
                            y_score=chain_prediction.predict)
    print(roc_auc_value)

    return roc_auc_value
def test_chain_with_custom_params_for_model(data_setup):
    """Custom params on the root 'knn' node must change the chain prediction."""
    data = data_setup
    custom_params = {'n_neighbors': 1, 'weights': 'uniform', 'p': 1}

    logit_node = PrimaryNode(model_type='logit')
    lda_node = PrimaryNode(model_type='lda')
    knn_root = SecondaryNode(model_type='knn',
                             nodes_from=[logit_node, lda_node])

    chain = Chain()
    chain.add_node(knn_root)

    # the copy is taken before the custom params are set on the original
    chain_default_params = deepcopy(chain)
    chain.root_node.custom_params = custom_params

    chain_default_params.fit(data)
    chain.fit(data)

    custom_params_prediction = chain.predict(data).predict
    default_params_prediction = chain_default_params.predict(data).predict

    assert not np.array_equal(custom_params_prediction,
                              default_params_prediction)
def run_oil_forecasting_problem(train_file_path, train_file_path_crm,
                                forecast_length, max_window_size,
                                is_visualise=False, well_id='Unknown'):
    """Run a sliding-window forecasting experiment and print its metrics.

    For each of 4 window shifts three chains are fitted from scratch
    (a single 'lstm' on the plain data, a single 'lstm' on the CRM data and
    the chain from ``get_comp_chain`` on the CRM data); their predictions on
    the window shifted by ``depth`` are accumulated with ``merge_datasets``
    and finally passed to ``calculate_validation_metric``.

    :param train_file_path: csv path relative to ``project_root()``; used for
        both the train and the validation dataset.
    :param train_file_path_crm: CRM csv path relative to ``project_root()``.
    :param forecast_length: passed to ``TsForecastingParams``.
    :param max_window_size: passed to ``TsForecastingParams``.
    :param is_visualise: forwarded to ``calculate_validation_metric``.
    :param well_id: printed and forwarded to ``calculate_validation_metric``.
    :return: the sequence returned by ``calculate_validation_metric``.
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(
        forecast_length=forecast_length,
        max_window_size=max_window_size,
        return_all_steps=False,
        make_future_prediction=False)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts,
                                          delimiter=',')

    # a dataset for a final validation of the composed model
    # (read from the same file as the train dataset)
    full_path_test = os.path.join(str(project_root()), train_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts,
                                             delimiter=',')

    full_path_train_crm = os.path.join(str(project_root()),
                                       train_file_path_crm)
    dataset_to_train_crm = InputData.from_csv(full_path_train_crm,
                                              task=task_to_solve,
                                              data_type=DataTypesEnum.ts,
                                              delimiter=',')
    dataset_to_validate_crm = copy(dataset_to_train_crm)

    prediction_full = None
    prediction_full_crm = None
    prediction_full_crm_opt = None

    forecast_window_shift_num = 4
    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        # train window; the validation window is the same one shifted by depth
        start = depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_train_local = dataset_to_train.subset(start, end)
        dataset_to_train_local_crm = dataset_to_train_crm.subset(start, end)

        dataset_to_validate_local = dataset_to_validate.subset(
            start + depth, end + depth)
        dataset_to_validate_local_crm = dataset_to_validate_crm.subset(
            start + depth, end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=dataset_to_train_local,
                                      verbose=False)
        chain_simple_crm.fit_from_scratch(
            input_data=dataset_to_train_local_crm, verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=dataset_to_train_local_crm,
                                       verbose=False)

        prediction = chain_simple.predict(dataset_to_validate_local)
        prediction_crm = chain_simple_crm.predict(
            dataset_to_validate_local_crm)
        prediction_crm_opt = chain_crm_opt.predict(
            dataset_to_validate_local_crm)

        prediction_full = merge_datasets(prediction_full, prediction,
                                         forecasting_step)
        prediction_full_crm = merge_datasets(prediction_full_crm,
                                             prediction_crm,
                                             forecasting_step)
        prediction_full_crm_opt = merge_datasets(prediction_full_crm_opt,
                                                 prediction_crm_opt,
                                                 forecasting_step)

    metrics = calculate_validation_metric(
        prediction_full, prediction_full_crm, prediction_full_crm_opt,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(metrics[0])}')
    print(f'RMSE ML: {round(metrics[1])}')
    print(f'RMSE ML with CRM: {round(metrics[2])}')
    print(f'Evo RMSE ML with CRM: {round(metrics[3])}')
    print(f'DTW CRM: {round(metrics[4])}')
    print(f'DTW ML: {round(metrics[5])}')
    print(f'DTW ML with CRM: {round(metrics[6])}')
    print(f'DTW RMSE ML with CRM: {round(metrics[7])}')

    return metrics
from core.models.model import *
from benchmark.benchmark_utils import get_scoring_case_data_paths

# Script: fit a 'tpot' + 'lda' -> 'rf' chain on the scoring case data and
# print its ROC AUC on the test part.

# --- data ---
train_file_path, test_file_path = get_scoring_case_data_paths()

train_data = InputData.from_csv(train_file_path)
test_data = InputData.from_csv(test_file_path)

training_features, training_target = train_data.features, train_data.target
testing_features, testing_target = test_data.features, test_data.target

# --- chain: two primary nodes feeding one secondary node ---
chain = Chain()
node0 = NodeGenerator.primary_node(ModelTypesIdsEnum.tpot)
node1 = NodeGenerator.primary_node(ModelTypesIdsEnum.lda)
node2 = NodeGenerator.secondary_node(ModelTypesIdsEnum.rf)

node2.nodes_from.append(node0)
node2.nodes_from.append(node1)

chain.add_node(node0)
chain.add_node(node1)
chain.add_node(node2)

# --- fit, predict, score ---
chain.fit(train_data)
results = chain.predict(test_data)

roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
print(roc_auc_value)
def roc_value(chain: Chain, dataset_to_validate) -> float:
    """Return ``roc_auc`` of the chain prediction on ``dataset_to_validate``.

    :param chain: chain whose ``predict`` produces the scores.
    :param dataset_to_validate: data providing both the input and ``target``.
    :return: the value computed by ``roc_auc``.
    """
    prediction = chain.predict(dataset_to_validate)
    return roc_auc(y_true=dataset_to_validate.target,
                   y_score=prediction.predict)
def get_value(chain: Chain, reference_data: InputData) -> float:
    """Return ``mean_squared_error`` of the chain prediction on ``reference_data``.

    :param chain: chain whose ``predict`` is called on ``reference_data``.
    :param reference_data: data providing both the input and the target.
    :return: the value computed by ``mean_squared_error``.
    """
    prediction = chain.predict(reference_data)
    error = mean_squared_error(y_true=reference_data.target,
                               y_pred=prediction.predict)
    return error
def synthetic_benchmark_dataset(samples_amount: int, features_amount: int,
                                classes_amount: int = 2,
                                features_options: Dict = DEFAULT_OPTIONS,
                                fitted_chain: Chain = None) -> InputData:
    """
    Generates a binary classification benchmark dataset that was obtained using
    the (TODO: add. reference) proposed fitting schema.

    The schema is: generate a sample, relabel it with the chain prediction,
    refit the chain from scratch on the relabelled sample, then generate a
    second sample and relabel it with the refitted chain.

    :param samples_amount: Total amount of samples in the resulted dataset.
    :param features_amount: Total amount of features per sample.
    :param classes_amount: The amount of classes in the dataset.
    :param features_options: features options in key-value suitable for
        classification_dataset.
    :param fitted_chain: Chain with separately fitted models. If None, the
        chain returned by ``_default_chain`` is used. Note that the chain is
        refitted from scratch inside this function.
    :return: Benchmark dataset that is ready to be used by Chain.
    :raises NotImplementedError: if ``classes_amount`` is not 2.
    """
    # Fail fast: reject unsupported input before the default chain is built,
    # so no fitting work is wasted on a call that is going to be refused.
    if classes_amount != 2:
        raise NotImplementedError(
            'Only binary classification tasks are supported')

    if fitted_chain is None:
        fitted_chain = _default_chain(samples_amount=samples_amount,
                                      features_amount=features_amount,
                                      classes_amount=classes_amount)

    task = Task(TaskTypesEnum.classification)
    samples_idxs = np.arange(0, samples_amount)

    def _relabelled_sample() -> InputData:
        # Generate a fresh sample and replace its target with the labels
        # derived from the current chain prediction.
        features, target = classification_dataset(
            samples_amount=samples_amount,
            features_amount=features_amount,
            classes_amount=classes_amount,
            features_options=features_options)
        target = np.expand_dims(target, axis=1)

        generated = InputData(idx=samples_idxs, features=features,
                              target=target, task=task,
                              data_type=DataTypesEnum.table)
        synth_target = fitted_chain.predict(input_data=generated).predict
        synth_labels = _to_labels(synth_target)

        return InputData(idx=samples_idxs, features=features,
                         target=synth_labels, task=task,
                         data_type=DataTypesEnum.table)

    data_synth_train = _relabelled_sample()

    # TODO: fix preproc issues
    fitted_chain.fit_from_scratch(input_data=data_synth_train)

    # the final dataset is labelled by the refitted chain
    data_synth_final = _relabelled_sample()

    return data_synth_final