def run_autokeras(train_file_path: str, test_file_path: str,
                  task: MachineLearningTasksEnum, case_name: str = 'default'):
    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task is MachineLearningTasksEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)
    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    if task is MachineLearningTasksEnum.classification:
        result_metric = {'autokeras_roc_auc': round(roc_auc_score(test_data.target, predicted), 3)}
    else:
        result_metric = {'MSE': round(mse(test_data.target, predicted), 3)}

    return result_metric
def run_h2o(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model, path=CURRENT_PATH)
        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predicted = predict_h2o(imported_model, test_frame)

    h2o.shutdown(prompt=False)

    return true_target, predicted
def run_autokeras(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    config_data = get_models_hyperparameters()['autokeras']
    max_trial = config_data['MAX_TRIAL']
    epoch = config_data['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    # TODO Save model to file

    if task is TaskTypesEnum.classification:
        estimator = ak.StructuredDataClassifier
    else:
        estimator = ak.StructuredDataRegressor

    model = estimator(max_trials=max_trial)
    model.fit(train_data.features, train_data.target, epochs=epoch)

    predicted = model.predict(test_data.features)

    return test_data.target, predicted
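# A minimal usage sketch for run_autokeras. The CSV paths are hypothetical
# placeholders, and the ExecutionParams keyword constructor is an assumption
# (any simple container exposing train_file/test_file/case_label/task works).
def _demo_run_autokeras():
    params = ExecutionParams(train_file='data/scoring_train.csv',  # hypothetical path
                             test_file='data/scoring_test.csv',  # hypothetical path
                             case_label='scoring',
                             task=TaskTypesEnum.classification)
    target, predicted = run_autokeras(params)
    print(f'autokeras ROC AUC: {round(roc_auc_score(target, predicted), 3)}')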
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    node_tpot.model.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
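# Usage sketch for run_chain_from_automl: a short smoke run with a one-minute
# TPOT budget; the CSV paths are hypothetical placeholders.
def _demo_chain_from_automl():
    run_chain_from_automl(train_file_path='data/scoring_train.csv',  # hypothetical
                          test_file_path='data/scoring_test.csv',  # hypothetical
                          max_run_time=timedelta(minutes=1))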
def run_h2o(train_file_path: str, test_file_path: str, task: MachineLearningTasksEnum,
            case_name='h2o_default'):
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model, path=CURRENT_PATH)
        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predictions = predict_h2o(imported_model, test_frame)

    if task is MachineLearningTasksEnum.classification:
        train_roc_auc_value = round(imported_model.auc(train=True), 3)
        valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
        test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)

        metrics = {'H2O_ROC_AUC_train': train_roc_auc_value,
                   'H2O_ROC_AUC_valid': valid_roc_auc_value,
                   'H2O_ROC_AUC_test': test_roc_auc_value}

        print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
        print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
        print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
    else:
        mse_train = imported_model.mse()
        rmse_train = imported_model.rmse()

        metrics = {'H2O_MSE_train': mse_train,
                   'H2O_RMSE_train': rmse_train}

        print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
        print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")

    h2o.shutdown(prompt=False)

    return metrics
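# Usage sketch for the path-based run_h2o variant above. The paths are
# hypothetical; an H2O server reachable via get_h2o_connect_config() is assumed.
def _demo_run_h2o():
    metrics = run_h2o(train_file_path='data/scoring_train.csv',  # hypothetical
                      test_file_path='data/scoring_test.csv',  # hypothetical
                      task=MachineLearningTasksEnum.classification,
                      case_name='h2o_demo')
    print(metrics)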
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               is_visualise=False):
    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    # the search of the models provided by the framework that can be used as nodes in a chain for the selected task
    available_model_types, _ = ModelTypesRepository().suitable_model(task_type=task.task_type)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    # the choice and initialisation of the GP search
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=3, max_depth=3, pop_size=20, num_of_generations=20,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function,
                                                is_visualise=False)

    chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose, iterations=50)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        ComposerVisualiser.visualise(chain_evo_composed)

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
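# Usage sketch: a cheap composing run for the credit scoring case. The paths
# are hypothetical, and the one-minute lead time keeps the GP search short.
def _demo_credit_scoring():
    roc = run_credit_scoring_problem('data/scoring_train.csv',  # hypothetical
                                     'data/scoring_test.csv',  # hypothetical
                                     max_lead_time=datetime.timedelta(minutes=1))
    assert roc > 0.5  # the composed chain should beat random guessing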
def run_xgb_classifier(train_file: str, test_file: str):
    train_data = InputData.from_csv(train_file)
    test_data = InputData.from_csv(test_file)

    model = XGBClassifier()
    model.fit(train_data.features, train_data.target)
    predicted = model.predict_proba(test_data.features)[:, 1]

    roc_auc_value = round(roc_auc_score(test_data.target, predicted), 3)
    return roc_auc_value
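# Usage sketch: the plain XGBoost baseline on the same split, so its ROC AUC is
# directly comparable with the composed chains (hypothetical paths).
def _demo_xgb_baseline():
    baseline_roc_auc = run_xgb_classifier('data/scoring_train.csv',  # hypothetical
                                          'data/scoring_test.csv')  # hypothetical
    print(f'XGBoost baseline ROC AUC: {baseline_roc_auc}')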
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     is_visualise=False):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train, task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test, task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    chain = get_composite_lstm_chain()

    chain_simple = Chain()
    node_single = PrimaryNode('ridge')
    chain_simple.add_node(node_single)

    chain_lstm = Chain()
    node_lstm = PrimaryNode('lstm')
    chain_lstm.add_node(node_lstm)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise)

    chain_lstm.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        chain_lstm.predict(dataset_to_validate), dataset_to_validate,
        f'full-lstm-only_{forecast_length}', is_visualise)

    chain_simple.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        chain_simple.predict(dataset_to_validate), dataset_to_validate,
        f'full-simple_{forecast_length}', is_visualise)

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
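# Usage sketch for the metocean comparison; both file locations are
# hypothetical placeholders resolved relative to project_root().
def _demo_metocean_forecasting():
    rmse = run_metocean_forecasting_problem(
        train_file_path='data/metocean_train.csv',  # hypothetical
        test_file_path='data/metocean_test.csv',  # hypothetical
        forecast_length=2, max_window_size=64)
    print(f'simple chain RMSE: {rmse}')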
def get_model(train_file_path: str, cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # the search of the models provided by the framework
    # that can be used as nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.suitable_model(task_type=task.task_type)

    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC_penalty)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Create the genetic programming-based composer that allows finding
    # the optimal structure of the composite model
    composer = GPComposer()

    # run the search of the best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
def apply_model_to_data(model: Chain, data_path: str):
    df, file_path = create_multi_clf_examples_from_excel(data_path, return_df=True)
    dataset_to_apply = InputData.from_csv(file_path, with_target=False)
    evo_predicted = model.predict(dataset_to_apply)
    df['forecast'] = probs_to_labels(evo_predicted.predict)
    return df
def test_string_features_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/scoring_train_cat.csv'
    expected_features = InputData.from_csv(os.path.join(test_file_path, file)).features

    assert expected_features.dtype == float
    assert np.isfinite(expected_features).all()
def run_tpot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    models_hyperparameters = get_models_hyperparameters()['TPOT']
    generations = models_hyperparameters['GENERATIONS']
    population_size = models_hyperparameters['POPULATION_SIZE']

    result_model_filename = f'{case_label}_g{generations}' \
                            f'_p{population_size}_{task.name}.pkl'
    current_file_path = str(os.path.dirname(__file__))
    result_file_path = os.path.join(current_file_path, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(current_file_path):
        # TODO change hyperparameters to actual from variable
        model = fit_tpot(train_data, models_hyperparameters['MAX_RUNTIME_MINS'])

        model.export(output_file_name=f'{result_model_filename[:-4]}_pipeline.py')

        # sklearn pipeline object
        fitted_model_config = model.fitted_pipeline_
        joblib.dump(fitted_model_config, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target

    if task == TaskTypesEnum.regression:
        predicted = predict_tpot_reg(imported_model, predict_data)
    elif task == TaskTypesEnum.classification:
        predicted = predict_tpot_class(imported_model, predict_data)
    else:
        print('Incorrect type of ml task')
        raise NotImplementedError()

    print(f'BEST_model: {imported_model}')

    return true_target, predicted
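# Usage sketch for run_tpot (hypothetical paths; the ExecutionParams keyword
# constructor is an assumption). The returned pair can be scored with any
# sklearn metric.
def _demo_run_tpot():
    params = ExecutionParams(train_file='data/scoring_train.csv',  # hypothetical
                             test_file='data/scoring_test.csv',  # hypothetical
                             case_label='scoring',
                             task=TaskTypesEnum.classification)
    target, predicted = run_tpot(params)
    print(f'TPOT ROC AUC: {round(roc_auc_score(target, predicted), 3)}')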
def validate_model_quality(model: Chain, data_path: str):
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    # score against the validation set's own target
    roc_auc_valid = round(roc_auc(y_true=dataset_to_validate.target,
                                  y_score=predicted_labels,
                                  multi_class='ovo',
                                  average='macro'), 3)
    return roc_auc_valid
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)
    print(roc_auc_value)

    chain = Chain()
    node_first = PrimaryNode('direct_data_model')
    node_second = PrimaryNode('bernb')
    node_third = SecondaryNode('rf')

    node_third.nodes_from.append(node_first)
    node_third.nodes_from.append(node_second)

    chain.add_node(node_third)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_xgboost(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    task = params.task

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    if task is TaskTypesEnum.classification:
        model = xgb.XGBClassifier(max_depth=2, learning_rate=1.0,
                                  objective='binary:logistic')
        model.fit(train_data.features, train_data.target)
        predicted = model.predict_proba(test_data.features)[:, 1]
    elif task is TaskTypesEnum.regression:
        xgbr = xgb.XGBRegressor(max_depth=3, learning_rate=0.3, n_estimators=300,
                                objective='reg:squarederror')
        xgbr.fit(train_data.features, train_data.target)
        predicted = xgbr.predict(test_data.features)
    else:
        raise NotImplementedError()

    return test_data.target, predicted
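# run_xgboost, run_tpot, run_autokeras and run_fedot all return a
# (true_target, predicted) pair, so a small task-aware scorer keeps the
# comparison uniform. This helper is a sketch, not part of the original
# benchmark code.
def score_predictions(task: TaskTypesEnum, target, predicted) -> dict:
    if task is TaskTypesEnum.classification:
        # predicted is expected to hold class-1 probabilities here
        return {'roc_auc': round(roc_auc_score(target, predicted), 3)}
    return {'mse': round(mse(target, predicted), 3)}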
def test_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/test_dataset.csv'
    task_type = MachineLearningTasksEnum.classification
    df = pd.read_csv(os.path.join(test_file_path, file))
    data_array = np.array(df).T
    features = data_array[1:-1].T
    target = data_array[-1]
    idx = data_array[0]
    expected_features = InputData(features=features, target=target,
                                  idx=idx, task_type=task_type).features.all()
    actual_features = InputData.from_csv(os.path.join(test_file_path, file)).features.all()
    assert expected_features == actual_features
def test_data_from_csv():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/simple_classification.csv'
    task = Task(TaskTypesEnum.classification)
    df = pd.read_csv(os.path.join(test_file_path, file))
    data_array = np.array(df).T
    features = data_array[1:-1].T
    target = data_array[-1]
    idx = data_array[0]
    expected_features = InputData(features=features, target=target,
                                  idx=idx, task=task,
                                  data_type=DataTypesEnum.table).features.all()
    actual_features = InputData.from_csv(os.path.join(test_file_path, file)).features.all()
    assert expected_features == actual_features
def classification_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = os.path.join('data', 'advanced_classification.csv')
    return InputData.from_csv(os.path.join(test_file_path, file),
                              task=Task(TaskTypesEnum.classification))
def scoring_dataset():
    train_file_path, test_file_path = get_scoring_case_data_paths()
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    return train_data, test_data
def regression_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/advanced_regression.csv'
    data = InputData.from_csv(os.path.join(test_file_path, file))
    data.task = Task(TaskTypesEnum.regression)
    return data
def classification_dataset():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/advanced_classification.csv'
    return InputData.from_csv(os.path.join(test_file_path, file))
def run_oil_forecasting_problem(train_file_path, train_file_path_crm,
                                forecast_length, max_window_size,
                                is_visualise=False, well_id='Unknown'):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size,
                                             return_all_steps=False,
                                             make_future_prediction=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train, task=task_to_solve,
                                          data_type=DataTypesEnum.ts, delimiter=',')

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), train_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test, task=task_to_solve,
                                             data_type=DataTypesEnum.ts, delimiter=',')

    full_path_train_crm = os.path.join(str(project_root()), train_file_path_crm)
    dataset_to_train_crm = InputData.from_csv(full_path_train_crm, task=task_to_solve,
                                              data_type=DataTypesEnum.ts, delimiter=',')

    dataset_to_validate_crm = copy(dataset_to_train_crm)

    prediction_full = None
    prediction_full_crm = None
    prediction_full_crm_opt = None

    forecast_window_shift_num = 4
    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        start = 0 + depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_train_local = dataset_to_train.subset(start, end)
        dataset_to_train_local_crm = dataset_to_train_crm.subset(start, end)

        start = 0 + depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_validate_local = dataset_to_validate.subset(start + depth, end + depth)
        dataset_to_validate_local_crm = dataset_to_validate_crm.subset(start + depth, end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=dataset_to_train_local, verbose=False)
        chain_simple_crm.fit_from_scratch(input_data=dataset_to_train_local_crm, verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=dataset_to_train_local_crm, verbose=False)

        prediction = chain_simple.predict(dataset_to_validate_local)
        prediction_crm = chain_simple_crm.predict(dataset_to_validate_local_crm)
        prediction_crm_opt = chain_crm_opt.predict(dataset_to_validate_local_crm)

        prediction_full = merge_datasets(prediction_full, prediction, forecasting_step)
        prediction_full_crm = merge_datasets(prediction_full_crm, prediction_crm, forecasting_step)
        prediction_full_crm_opt = merge_datasets(prediction_full_crm_opt, prediction_crm_opt,
                                                 forecasting_step)

    rmse_on_valid_simple = calculate_validation_metric(
        prediction_full, prediction_full_crm, prediction_full_crm_opt,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}')
    print(f'RMSE ML: {round(rmse_on_valid_simple[1])}')
    print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}')
    print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}')
    print(f'DTW CRM: {round(rmse_on_valid_simple[4])}')
    print(f'DTW ML: {round(rmse_on_valid_simple[5])}')
    print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}')
    print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}')

    return rmse_on_valid_simple
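# Usage sketch for the oil forecasting case; the well id and both CSV
# locations are hypothetical placeholders. forecast_length and max_window_size
# are set to match the fixed window depth of 100 used inside the function.
def _demo_oil_forecasting():
    run_oil_forecasting_problem(
        train_file_path='data/oil_production.csv',  # hypothetical
        train_file_path_crm='data/oil_production_crm.csv',  # hypothetical
        forecast_length=100, max_window_size=100,
        is_visualise=False, well_id='well_0')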
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     with_visualisation=True):
    # specify the task to solve
    task_to_solve = Task(TaskTypesEnum.ts_forecasting,
                         TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train, task=task_to_solve,
                                          data_type=DataTypesEnum.ts)

    # a dataset for a final validation of the composed model
    full_path_test = os.path.join(str(project_root()), test_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test, task=task_to_solve,
                                             data_type=DataTypesEnum.ts)

    metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)

    ref_chain = get_composite_lstm_chain()

    available_model_types_primary = ['trend_data_model', 'residual_data_model']
    available_model_types_secondary = ['rfr', 'linear', 'ridge', 'lasso',
                                       'additive_data_model']

    composer = FixedStructureComposer()

    composer_requirements = GPComposerRequirements(
        primary=available_model_types_primary,
        secondary=available_model_types_secondary,
        max_arity=2, max_depth=4, pop_size=10, num_of_generations=10,
        crossover_prob=0, mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20))

    chain = composer.compose_chain(data=dataset_to_train,
                                   initial_chain=ref_chain,
                                   composer_requirements=composer_requirements,
                                   metrics=metric_function,
                                   is_visualise=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    chain.fit(input_data=dataset_to_train, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        chain.predict(dataset_to_validate), dataset_to_validate,
        f'full-composite_{forecast_length}', is_visualise=with_visualisation)

    print(f'RMSE composite: {rmse_on_valid}')

    return rmse_on_valid
if __name__ == '__main__':
    file_path_first = r'./data/example1.xlsx'
    file_path_second = r'./data/example2.xlsx'
    file_path_third = r'./data/example3.xlsx'

    train_file_path, test_file_path = create_multi_clf_examples_from_excel(file_path_first)
    test_data = InputData.from_csv(test_file_path)

    fitted_model = get_model(train_file_path)

    ComposerVisualiser.visualise(fitted_model)

    roc_auc = validate_model_quality(fitted_model, test_file_path)
    print(f'ROC AUC metric is {roc_auc}')

    final_prediction_first = apply_model_to_data(fitted_model, file_path_second)
    print(final_prediction_first['forecast'])

    final_prediction_second = apply_model_to_data(fitted_model, file_path_third)
    print(final_prediction_second['forecast'])
def file_data_setup():
    test_file_path = str(os.path.dirname(__file__))
    file = 'data/simple_classification.csv'
    input_data = InputData.from_csv(os.path.join(test_file_path, file))
    input_data.idx = _to_numerical(categorical_ids=input_data.idx)
    return input_data
def run_credit_scoring_problem(train_file_path, test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
                               gp_optimiser_params: Optional[GPChainOptimiserParameters] = None,
                               pop_size=None, generations=None):
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    available_model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    if gp_optimiser_params:
        optimiser_parameters = gp_optimiser_params
    else:
        selection_types = [SelectionTypesEnum.tournament]
        crossover_types = [CrossoverTypesEnum.subtree]
        mutation_types = [MutationTypesEnum.simple, MutationTypesEnum.growth,
                          MutationTypesEnum.reduce]
        regularization_type = RegularizationTypesEnum.decremental
        optimiser_parameters = GPChainOptimiserParameters(
            selection_types=selection_types,
            crossover_types=crossover_types,
            mutation_types=mutation_types,
            regularization_type=regularization_type)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types,
        max_arity=4, max_depth=3, pop_size=pop_size, num_of_generations=generations,
        crossover_prob=0.8, mutation_prob=0.8, max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                initial_chain=None,
                                                composer_requirements=composer_requirements,
                                                metrics=metric_function,
                                                optimiser_parameters=optimiser_parameters,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed,
                                                            dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed, chain_evo_composed, composer
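# Usage sketch: the same scoring case, but with the optimiser setup passed in
# explicitly instead of the in-function defaults (hypothetical paths; the enum
# values reused here are the ones referenced above).
def _demo_scoring_with_custom_optimiser():
    custom_optimiser_params = GPChainOptimiserParameters(
        selection_types=[SelectionTypesEnum.tournament],
        crossover_types=[CrossoverTypesEnum.subtree],
        mutation_types=[MutationTypesEnum.simple],
        regularization_type=RegularizationTypesEnum.decremental)
    roc, chain, composer = run_credit_scoring_problem(
        'data/scoring_train.csv', 'data/scoring_test.csv',  # hypothetical
        gp_optimiser_params=custom_optimiser_params,
        pop_size=10, generations=5)
    print(f'custom-optimiser ROC AUC: {round(roc, 3)}')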
def run_fedot(params: 'ExecutionParams'):
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task_type = params.task

    if task_type == TaskTypesEnum.classification:
        metric = ClassificationMetricsEnum.ROCAUC
    elif task_type == TaskTypesEnum.regression:
        metric = RegressionMetricsEnum.RMSE
    else:
        raise NotImplementedError()

    task = Task(task_type)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)
    dataset_to_validate = InputData.from_csv(test_file_path, task=task)

    models_hyperparameters = get_models_hyperparameters()['FEDOT']
    cur_lead_time = models_hyperparameters['MAX_RUNTIME_MINS']

    saved_model_name = f'fedot_{case_label}_{task_type}_{cur_lead_time}_{metric}'
    loaded_model = load_fedot_model(saved_model_name)

    if not loaded_model:
        generations = models_hyperparameters['GENERATIONS']
        population_size = models_hyperparameters['POPULATION_SIZE']

        # the search of the models provided by the framework that can be used as nodes in a chain
        models_repo = ModelTypesRepository()
        available_model_types, _ = models_repo.suitable_model(task.task_type)

        metric_function = MetricsRepository().metric_by_id(metric)

        composer_requirements = GPComposerRequirements(
            primary=available_model_types, secondary=available_model_types,
            max_arity=3, max_depth=3, pop_size=population_size,
            num_of_generations=generations,
            crossover_prob=0.8, mutation_prob=0.8,
            max_lead_time=datetime.timedelta(minutes=cur_lead_time))

        # Create GP-based composer
        composer = GPComposer()

        # the optimal chain generation by composition - the most time-consuming task
        chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                    initial_chain=None,
                                                    composer_requirements=composer_requirements,
                                                    metrics=metric_function,
                                                    is_visualise=False)
        chain_evo_composed.fine_tune_primary_nodes(input_data=dataset_to_compose, iterations=50)
        chain_evo_composed.fit(input_data=dataset_to_compose, verbose=False)
        save_fedot_model(chain_evo_composed, saved_model_name)
    else:
        chain_evo_composed = loaded_model

    evo_predicted = chain_evo_composed.predict(dataset_to_validate)

    return dataset_to_validate.target, evo_predicted.predict
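# Usage sketch for run_fedot, mirroring the other runners (hypothetical paths;
# the ExecutionParams keyword constructor is an assumption).
def _demo_run_fedot():
    params = ExecutionParams(train_file='data/scoring_train.csv',  # hypothetical
                             test_file='data/scoring_test.csv',  # hypothetical
                             case_label='scoring',
                             task=TaskTypesEnum.classification)
    target, predicted = run_fedot(params)
    print(f'FEDOT ROC AUC: {round(roc_auc_score(target, predicted), 3)}')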