def run_autokeras(params: 'ExecutionParams'):
    """Fit an AutoKeras structured-data model and predict on the test file.

    The trial and epoch budgets are read from the 'autokeras' section of
    ``get_models_hyperparameters()``.

    :param params: object exposing ``train_file``, ``test_file`` and ``task``
    :return: tuple (target of the test data, predictions for the test features)
    """
    train_csv = params.train_file
    test_csv = params.test_file
    task = params.task

    settings = get_models_hyperparameters()['autokeras']
    trials_limit = settings['MAX_TRIAL']
    epochs_count = settings['EPOCH']

    train_data = InputData.from_csv(train_csv)
    test_data = InputData.from_csv(test_csv)

    # TODO Save model to file
    # Any task other than classification falls back to the regressor.
    estimator_cls = (ak.StructuredDataClassifier
                     if task is TaskTypesEnum.classification
                     else ak.StructuredDataRegressor)

    fitted = estimator_cls(max_trials=trials_limit)
    fitted.fit(train_data.features, train_data.target, epochs=epochs_count)

    return test_data.target, fitted.predict(test_data.features)
def from_json(file_path,
              task_type: MachineLearningTasksEnum = MachineLearningTasksEnum.classification,
              train_size=0.75):
    """Load the iceberg JSON dataset and split it into train/test InputData.

    Rows whose ``inc_angle`` is 'na' (replaced by 0) or otherwise not positive
    are dropped. The remaining samples are split with a fixed
    ``random_state=1`` and the training part is augmented with
    ``get_more_images``.

    :param file_path: path to a JSON file readable by ``pd.read_json``
    :param task_type: task type stored in both resulting InputData objects
    :param train_size: share of samples assigned to the training part
    :return: tuple (train_input_data, test_input_data)
    """
    df_train = pd.read_json(file_path)

    images = get_scaled_imgs(df_train)
    labels = np.array(df_train['is_iceberg'])

    # Missing angles are encoded as 'na'; map them to 0 so the filter below
    # discards them together with any non-positive angle.
    df_train.inc_angle = df_train.inc_angle.replace('na', 0)
    valid_rows = np.where(df_train.inc_angle > 0)[0]

    labels = labels[valid_rows]
    images = images[valid_rows, ...]

    # The caller-supplied train_size is honoured here (it used to be ignored
    # in favour of a hard-coded 0.75).
    x_train, x_test, y_train, y_test = train_test_split(
        images, labels, random_state=1, train_size=train_size)

    x_train_augmented = get_more_images(x_train)
    # NOTE(review): the target is repeated three times, which presumes that
    # get_more_images returns three variants per input image - confirm.
    y_train_augmented = np.concatenate((y_train, y_train, y_train))

    train_input_data = InputData(idx=np.arange(0, len(x_train_augmented)),
                                 features=x_train_augmented,
                                 target=np.array(y_train_augmented),
                                 task_type=task_type)
    test_input_data = InputData(idx=np.arange(0, len(x_test)),
                                features=x_test,
                                target=np.array(y_test),
                                task_type=task_type)
    return train_input_data, test_input_data
def run_autokeras(train_file_path: str, test_file_path: str,
                  task: MachineLearningTasksEnum, case_name: str = 'default'):
    """Fit an AutoKeras structured-data model and score it on the test file.

    :param train_file_path: CSV file used for fitting
    :param test_file_path: CSV file used for scoring
    :param task: classification selects the classifier and ROC AUC; any other
        value selects the regressor and MSE
    :param case_name: not used by this function
    :return: dict with a single metric rounded to three digits
    """
    settings = get_models_hyperparameters()['autokeras']
    trials_limit = settings['MAX_TRIAL']
    epochs_count = settings['EPOCH']

    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    is_classification = task is MachineLearningTasksEnum.classification

    # TODO Save model to file
    estimator_cls = (ak.StructuredDataClassifier if is_classification
                     else ak.StructuredDataRegressor)

    fitted = estimator_cls(max_trials=trials_limit)
    fitted.fit(train_data.features, train_data.target, epochs=epochs_count)
    predicted = fitted.predict(test_data.features)

    if is_classification:
        return {
            'autokeras_roc_auc': round(roc_auc_score(test_data.target, predicted), 3)
        }
    return {'MSE': round(mse(test_data.target, predicted), 3)}
def run_chain_from_automl(train_file_path: str, test_file_path: str,
                          max_run_time: timedelta = timedelta(minutes=10)):
    """Fit a chain that feeds 'tpot' and 'lda' nodes into an 'rf' node.

    :param train_file_path: CSV file used for fitting the chain
    :param test_file_path: CSV file used for the ROC AUC evaluation
    :param max_run_time: time budget handed to the 'tpot' node
    :return: ROC AUC of the chain prediction on the test data
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    chain = Chain()
    node_tpot = PrimaryNode('tpot')

    # total_seconds() covers the whole duration; the ``seconds`` attribute
    # only holds the sub-day remainder (timedelta(days=1).seconds == 0).
    node_tpot.model.params = {
        'max_run_time_sec': int(max_run_time.total_seconds())
    }

    node_lda = PrimaryNode('lda')
    node_rf = SecondaryNode('rf')

    node_rf.nodes_from = [node_tpot, node_lda]

    chain.add_node(node_rf)

    chain.fit(train_data)
    results = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
def run_h2o(params: 'ExecutionParams'):
    """Fit (or reuse) an H2O model and predict on the test file.

    The exported model is stored in ``CURRENT_PATH`` under a name built from
    the case label, the configured limits and the task name. Fitting is
    skipped when a file with that name already exists there.

    :param params: object exposing ``train_file``, ``test_file``,
        ``case_label`` and ``task``
    :return: tuple (target of the test data, H2O predictions)
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_label}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        # The configured limit is in seconds; fit_h2o receives it divided by 60.
        best_model = fit_h2o(train_data, round(max_runtime_secs / 60))
        temp_exported_model_path = h2o.save_model(model=best_model, path=CURRENT_PATH)

        # h2o.save_model picks its own file name; move it to the cache name.
        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    try:
        imported_model = h2o.load_model(exported_model_path)

        test_frame = InputData.from_csv(test_file_path)
        true_target = test_frame.target

        predicted = predict_h2o(imported_model, test_frame)
    finally:
        # Shut the server down even when loading or prediction fails.
        h2o.shutdown(prompt=False)

    return true_target, predicted
def run_h2o(train_file_path: str, test_file_path: str,
            task: MachineLearningTasksEnum, case_name='h2o_default'):
    """Fit (or reuse) an H2O model and report its quality metrics.

    The exported model is stored in ``CURRENT_PATH`` under a name built from
    the case name, the configured limits and the task name. Fitting is
    skipped when a file with that name already exists there.

    :param train_file_path: CSV file used for fitting
    :param test_file_path: CSV file used for prediction
    :param task: classification yields train/valid/test ROC AUC; any other
        value yields the MSE and RMSE reported by the model
    :param case_name: prefix of the exported model file name
    :return: dict with the computed metrics
    """
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model, path=CURRENT_PATH)

        # h2o.save_model picks its own file name; move it to the cache name.
        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    try:
        imported_model = h2o.load_model(exported_model_path)

        test_frame = InputData.from_csv(test_file_path)
        true_target = test_frame.target

        predictions = predict_h2o(imported_model, test_frame)

        if task is MachineLearningTasksEnum.classification:
            train_roc_auc_value = round(imported_model.auc(train=True), 3)
            valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
            test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)

            metrics = {
                'H2O_ROC_AUC_train': train_roc_auc_value,
                'H2O_ROC_AUC_valid': valid_roc_auc_value,
                'H2O_ROC_AUC_test': test_roc_auc_value
            }

            print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
            print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
            print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
        else:
            mse_train = imported_model.mse()
            rmse_train = imported_model.rmse()

            metrics = {'H2O_MSE_train': mse_train, 'H2O_RMSE_train': rmse_train}

            print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
            print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")
    finally:
        # Shut the server down even when loading, prediction or scoring fails.
        h2o.shutdown(prompt=False)

    return metrics
def run_credit_scoring_problem(
        train_file_path, test_file_path,
        max_lead_time: datetime.timedelta = datetime.timedelta(minutes=5),
        is_visualise=False):
    """Compose, tune and fit a classification chain, then validate it.

    :param train_file_path: CSV file used for composition, tuning and fitting
    :param test_file_path: CSV file used for validation
    :param max_lead_time: time limit passed to the composer requirements
    :param is_visualise: when true, the resulting chain is visualised
    :return: validation metric of the composed chain (printed as ROC AUC)
    """
    classification_task = Task(TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_file_path, task=classification_task)
    validation_dataset = InputData.from_csv(test_file_path, task=classification_task)

    # models provided by the framework that can be used as chain nodes
    # for the selected task
    model_types, _ = ModelTypesRepository().suitable_model(
        task_type=classification_task.task_type)

    # metric used to assess chain quality during composition
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    # settings of the GP search
    requirements = GPComposerRequirements(
        primary=model_types,
        secondary=model_types,
        max_arity=3,
        max_depth=3,
        pop_size=20,
        num_of_generations=20,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # GP-based composer
    gp_composer = GPComposer()

    # chain generation by composition - the most time-consuming step
    # NOTE(review): composition always runs with is_visualise=False; the
    # is_visualise argument only controls the chain plot below - confirm.
    composed_chain = gp_composer.compose_chain(
        data=train_dataset,
        initial_chain=None,
        composer_requirements=requirements,
        metrics=quality_metric,
        is_visualise=False)

    composed_chain.fine_tune_primary_nodes(input_data=train_dataset,
                                           iterations=50)
    composed_chain.fit(input_data=train_dataset, verbose=True)

    if is_visualise:
        ComposerVisualiser.visualise(composed_chain)

    # quality assessment of the obtained composite model
    roc_on_valid_evo_composed = calculate_validation_metric(
        composed_chain, validation_dataset)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed
def run_xgb_classifier(train_file: str, test_file: str):
    """Fit a default XGBClassifier and return its ROC AUC on the test file.

    :param train_file: CSV file used for fitting
    :param test_file: CSV file used for scoring
    :return: ROC AUC rounded to three digits
    """
    train_data = InputData.from_csv(train_file)
    test_data = InputData.from_csv(test_file)

    classifier = XGBClassifier()
    classifier.fit(train_data.features, train_data.target)

    # Column 1 of predict_proba is used as the score.
    scores = classifier.predict_proba(test_data.features)[:, 1]
    return round(roc_auc_score(test_data.target, scores), 3)
def run_metocean_forecasting_problem(train_file_path, test_file_path,
                                     forecast_length=1, max_window_size=64,
                                     is_visualise=False):
    """Compare a composite LSTM chain, an LSTM-only chain and a ridge chain.

    Both file paths are resolved relative to ``project_root()``.

    :param train_file_path: relative path of the training CSV
    :param test_file_path: relative path of the validation CSV
    :param forecast_length: forecast length of the forecasting task
    :param max_window_size: maximal window size of the forecasting task
    :param is_visualise: forwarded to ``calculate_validation_metric``
    :return: validation metric of the single-node 'ridge' chain only
    """
    # the task to solve
    forecasting_task = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size))

    train_path = os.path.join(str(project_root()), train_file_path)
    train_dataset = InputData.from_csv(train_path, task=forecasting_task,
                                       data_type=DataTypesEnum.ts)

    # a dataset for the final validation of the models
    validation_path = os.path.join(str(project_root()), test_file_path)
    validation_dataset = InputData.from_csv(validation_path,
                                            task=forecasting_task,
                                            data_type=DataTypesEnum.ts)

    composite_chain = get_composite_lstm_chain()

    ridge_chain = Chain()
    ridge_chain.add_node(PrimaryNode('ridge'))

    lstm_chain = Chain()
    lstm_chain.add_node(PrimaryNode('lstm'))

    composite_chain.fit(input_data=train_dataset, verbose=False)
    rmse_on_valid = calculate_validation_metric(
        composite_chain.predict(validation_dataset), validation_dataset,
        f'full-composite_{forecast_length}', is_visualise)

    lstm_chain.fit(input_data=train_dataset, verbose=False)
    rmse_on_valid_lstm_only = calculate_validation_metric(
        lstm_chain.predict(validation_dataset), validation_dataset,
        f'full-lstm-only_{forecast_length}', is_visualise)

    ridge_chain.fit(input_data=train_dataset, verbose=False)
    rmse_on_valid_simple = calculate_validation_metric(
        ridge_chain.predict(validation_dataset), validation_dataset,
        f'full-simple_{forecast_length}', is_visualise)

    print(f'RMSE composite: {rmse_on_valid}')
    print(f'RMSE simple: {rmse_on_valid_simple}')
    print(f'RMSE LSTM only: {rmse_on_valid_lstm_only}')

    return rmse_on_valid_simple
def test_data_from_csv():
    """Check InputData.from_csv against an InputData built by hand.

    The first CSV column is used as the index, the last one as the target and
    everything in between as the features.
    """
    csv_path = os.path.join(str(os.path.dirname(__file__)),
                            'data/test_dataset.csv')

    columns = np.array(pd.read_csv(csv_path)).T
    manual_data = InputData(features=columns[1:-1].T,
                            target=columns[-1],
                            idx=columns[0],
                            task_type=MachineLearningTasksEnum.classification)

    # NOTE(review): ``.all()`` collapses each array to a single boolean, so
    # only the two truthiness values are compared, not the contents.
    expected_features = manual_data.features.all()
    actual_features = InputData.from_csv(csv_path).features.all()

    assert expected_features == actual_features
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    """Compose and fit a classification chain with the GP composer.

    :param train_file_path: CSV file used for composition and fitting
    :param cur_lead_time: time limit passed to the composer requirements
    :return: the fitted composed chain
    """
    classification_task = Task(task_type=TaskTypesEnum.classification)
    train_dataset = InputData.from_csv(train_file_path,
                                       task=classification_task)

    # models provided by the framework that can be used as chain nodes
    # for the selected task
    repository = ModelTypesRepository()
    model_types, _ = repository.suitable_model(
        task_type=classification_task.task_type)

    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC_penalty)

    requirements = GPComposerRequirements(
        primary=model_types,
        secondary=model_types,
        max_lead_time=cur_lead_time)

    # genetic programming-based composer that searches for the structure
    # of the composite model
    gp_composer = GPComposer()

    # run the search for the most suitable model
    composed_chain = gp_composer.compose_chain(
        data=train_dataset,
        initial_chain=None,
        composer_requirements=requirements,
        metrics=quality_metric,
        is_visualise=False)
    composed_chain.fit(input_data=train_dataset)

    return composed_chain
def get_synthetic_regression_data(n_samples=10000, n_features=10,
                                  random_state=None) -> InputData:
    """Wrap the output of ``make_regression`` into a regression InputData.

    :param n_samples: number of generated samples
    :param n_features: number of generated features
    :param random_state: forwarded to ``make_regression``
    :return: InputData whose index enumerates the generated samples
    """
    features, target = make_regression(n_samples=n_samples,
                                       n_features=n_features,
                                       random_state=random_state)
    return InputData(idx=np.arange(0, len(target)),
                     features=features,
                     target=target,
                     task_type=MachineLearningTasksEnum.regression)
def apply_model_to_data(model: Chain, data_path: str):
    """Predict labels for the data at ``data_path`` with a fitted chain.

    :param model: chain used for the prediction
    :param data_path: path handed to ``create_multi_clf_examples_from_excel``
    :return: the dataframe returned by that helper, extended with a
        'forecast' column holding the predicted labels
    """
    frame, csv_path = create_multi_clf_examples_from_excel(data_path,
                                                           return_df=True)

    # The converted file is loaded without a target column.
    unlabeled_data = InputData.from_csv(csv_path, with_target=False)

    prediction = model.predict(unlabeled_data)
    frame['forecast'] = probs_to_labels(prediction.predict)
    return frame
def data_setup():
    """Build small train/test classification sets from breast-cancer data.

    :return: tuple (train_data, test_data) built from the first 100 shuffled
        samples and split with ``split_train_test``
    """
    features, target = load_breast_cancer(return_X_y=True)

    np.random.seed(1)
    # NOTE(review): features and target are shuffled by two separate calls,
    # so rows and labels are no longer aligned - confirm this is intended.
    np.random.shuffle(features)
    np.random.shuffle(target)

    features, target = features[:100], target[:100]

    train_x, test_x = split_train_test(features)
    train_y, test_y = split_train_test(target)

    train_data = InputData(features=train_x, target=train_y,
                           idx=np.arange(0, len(train_y)),
                           task_type=MachineLearningTasksEnum.classification)
    test_data = InputData(features=test_x, target=test_y,
                          idx=np.arange(0, len(test_y)),
                          task_type=MachineLearningTasksEnum.classification)
    return train_data, test_data
def _input_from_parents(self, input_data: InputData,
                        parent_operation: str,
                        max_tune_time: Optional[timedelta] = None,
                        verbose=False) -> InputData:
    """Build the input of this node from the outputs of its parent nodes.

    :param input_data: data passed to every parent node
    :param parent_operation: name of the operation forwarded to the
        parent-combining helpers
    :param max_tune_time: forwarded to ``_combine_parents_simple`` only
    :param verbose: when true, a progress message is printed
    :return: InputData created from the parents' predictions
    :raises ValueError: if the node has no parent nodes
    """
    # An empty or missing parents list is reported with an explicit message
    # instead of a bare ValueError.
    if not self.nodes_from:
        raise ValueError(
            'Cannot build the input from parents: the node has no parent nodes')

    if verbose:
        print(f'Fit all parent nodes in secondary node with model: {self.model}')

    parent_nodes = self._nodes_from_with_fixed_order()

    affects_target_flags = ['affects_target' in parent_node.model_tags
                            for parent_node in parent_nodes]
    if any(affects_target_flags):
        # at least one of the parent models changes the target
        parent_results, target = _combine_parents_that_affects_target(
            parent_nodes, input_data, parent_operation)
    else:
        parent_results, target = _combine_parents_simple(
            parent_nodes, input_data, parent_operation, max_tune_time)

    secondary_input = InputData.from_predictions(outputs=parent_results,
                                                 target=target)

    return secondary_input
def test_dummy_composer_flat_chain_build_correct():
    """A flat dummy composer must build a three-node sequential chain."""
    placeholder_data = InputData(idx=np.zeros(1), features=np.zeros(1),
                                 target=np.zeros(1),
                                 task=Task(TaskTypesEnum.classification),
                                 data_type=DataTypesEnum.table)
    requirements = ComposerRequirements(primary=['logit'],
                                        secondary=['logit', 'xgboost'])

    flat_composer = DummyComposer(DummyChainTypeEnum.flat)
    new_chain = flat_composer.compose_chain(
        data=placeholder_data,
        initial_chain=None,
        composer_requirements=requirements,
        metrics=None)

    assert len(new_chain.nodes) == 3

    # one primary node followed by two secondary nodes
    expected_types = (PrimaryNode, SecondaryNode, SecondaryNode)
    for position, node_type in enumerate(expected_types):
        assert isinstance(new_chain.nodes[position], node_type)

    # every secondary node takes the previous node as its first parent
    for position in (1, 2):
        assert new_chain.nodes[position].nodes_from[0] is new_chain.nodes[position - 1]

    assert new_chain.nodes[0].nodes_from is None
def test_string_features_from_csv():
    """Features loaded from 'scoring_train_cat.csv' must be finite floats."""
    csv_path = os.path.join(str(os.path.dirname(__file__)),
                            'data/scoring_train_cat.csv')
    loaded_features = InputData.from_csv(csv_path).features

    assert loaded_features.dtype == float
    assert np.isfinite(loaded_features).all()
def get_iris_data() -> InputData:
    """Wrap the dataset returned by ``load_iris`` into a tabular InputData.

    :return: classification InputData whose index enumerates the samples
    """
    iris = load_iris()
    return InputData(idx=np.arange(0, len(iris.target)),
                     features=iris.data,
                     target=iris.target,
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
def test_data_from_csv():
    """Check InputData.from_csv against an InputData built by hand.

    The first CSV column is used as the index, the last one as the target and
    everything in between as the features.
    """
    csv_path = os.path.join(str(os.path.dirname(__file__)),
                            'data/simple_classification.csv')
    classification_task = Task(TaskTypesEnum.classification)

    columns = np.array(pd.read_csv(csv_path)).T
    manual_data = InputData(features=columns[1:-1].T,
                            target=columns[-1],
                            idx=columns[0],
                            task=classification_task,
                            data_type=DataTypesEnum.table)

    # NOTE(review): ``.all()`` collapses each array to a single boolean, so
    # only the two truthiness values are compared, not the contents.
    expected_features = manual_data.features.all()
    actual_features = InputData.from_csv(csv_path).features.all()

    assert expected_features == actual_features
def test_data_from_predictions(output_dataset):
    """Check InputData.from_predictions on three copies of one output.

    :param output_dataset: object with a ``predict`` attribute (presumably a
        pytest fixture - confirm)
    """
    parent_outputs = [output_dataset] * 3

    new_input_data = InputData.from_predictions(
        outputs=parent_outputs, target=output_dataset.predict)

    # NOTE(review): both sides are reduced with ``.all()``, so only two
    # booleans are compared, not the array contents.
    stacked_predictions = np.array([output.predict for output in parent_outputs])
    assert new_input_data.features.all() == stacked_predictions.all()
def run_tpot(params: 'ExecutionParams'):
    """Fit (or reuse) a TPOT pipeline and predict on the test file.

    The fitted pipeline is cached as a ``.pkl`` file next to this module; the
    file name is built from the case label, the configured generations and
    population size, and the task name.

    :param params: object exposing ``train_file``, ``test_file``,
        ``case_label`` and ``task``
    :return: tuple (target of the test data, predictions)
    :raises NotImplementedError: if the task is neither regression nor
        classification
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    case_label = params.case_label
    task = params.task

    tpot_settings = get_models_hyperparameters()['TPOT']
    generations = tpot_settings['GENERATIONS']
    population_size = tpot_settings['POPULATION_SIZE']

    model_basename = f'{case_label}_g{generations}_p{population_size}_{task.name}'
    result_model_filename = f'{model_basename}.pkl'
    module_dir = str(os.path.dirname(__file__))
    result_file_path = os.path.join(module_dir, result_model_filename)

    train_data = InputData.from_csv(train_file_path, task=Task(task))

    if result_model_filename not in os.listdir(module_dir):
        # TODO change hyperparameters to actual from variable
        fitted_tpot = fit_tpot(train_data, tpot_settings['MAX_RUNTIME_MINS'])

        fitted_tpot.export(output_file_name=f'{model_basename}_pipeline.py')

        # sklearn pipeline object
        joblib.dump(fitted_tpot.fitted_pipeline_, result_file_path, compress=1)

    imported_model = joblib.load(result_file_path)

    predict_data = InputData.from_csv(test_file_path, task=Task(task))
    true_target = predict_data.target

    if task == TaskTypesEnum.regression:
        predict_func = predict_tpot_reg
    elif task == TaskTypesEnum.classification:
        predict_func = predict_tpot_class
    else:
        print('Incorrect type of ml task')
        raise NotImplementedError()

    predicted = predict_func(imported_model, predict_data)

    print(f'BEST_model: {imported_model}')

    return true_target, predicted
def validate_model_quality(model: Chain, data_path: str):
    """Compute the macro-averaged one-vs-one ROC AUC of a chain on a CSV file.

    :param model: chain used for the prediction
    :param data_path: CSV file with the validation data
    :return: ROC AUC rounded to three digits
    """
    dataset_to_validate = InputData.from_csv(data_path)
    predicted_labels = model.predict(dataset_to_validate).predict

    # Score against the target of the dataset loaded above; the previous
    # code read an undefined ``test_data`` name here.
    roc_auc_valid = round(
        roc_auc(y_true=dataset_to_validate.target,
                y_score=predicted_labels,
                multi_class='ovo',
                average='macro'), 3)
    return roc_auc_valid
def data_setup():
    """Build a 100-sample classification InputData from shuffled iris data.

    No random seed is set here, so the result depends on the global NumPy
    random state.

    :return: InputData with the first 100 shuffled samples
    """
    features, target = load_iris(return_X_y=True)

    # NOTE(review): features and target are shuffled by two separate calls,
    # so rows and labels are no longer aligned - confirm this is intended.
    np.random.shuffle(features)
    np.random.shuffle(target)

    sample_count = 100
    return InputData(features=features[:sample_count],
                     target=target[:sample_count],
                     idx=np.arange(0, 100),
                     task_type=MachineLearningTasksEnum.classification)
def data_setup() -> InputData:
    """Build a 100-sample tabular InputData from shuffled iris data.

    The global NumPy seed is fixed to 1 before shuffling.

    :return: classification InputData with the first 100 shuffled samples
    """
    features, target = load_iris(return_X_y=True)

    np.random.seed(1)
    # NOTE(review): features and target are shuffled by two separate calls,
    # so rows and labels are no longer aligned - confirm this is intended.
    np.random.shuffle(features)
    np.random.shuffle(target)

    sample_count = 100
    return InputData(features=features[:sample_count],
                     target=target[:sample_count],
                     idx=np.arange(0, 100),
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
def get_synthetic_input_data(n_samples=10000, n_features=10,
                             random_state=None) -> InputData:
    """Wrap the output of ``make_classification`` into a tabular InputData.

    :param n_samples: number of generated samples
    :param n_features: number of generated features
    :param random_state: forwarded to ``make_classification``
    :return: classification InputData whose index enumerates the samples
    """
    features, target = make_classification(n_samples=n_samples,
                                           n_features=n_features,
                                           random_state=random_state)
    return InputData(idx=np.arange(0, len(target)),
                     features=features,
                     target=target,
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
def get_synthetic_ts_data(n_steps=10000) -> InputData:
    """Generate an ARMA sample with linear terms as auto-regression data.

    :param n_steps: number of generated time steps
    :return: InputData with two index-based feature columns and the
        simulated series as the target
    """
    steps = np.arange(0, n_steps)
    shifted_steps = np.arange(0, n_steps) + 1

    arma_sample = ArmaProcess().generate_sample(nsample=n_steps)
    # add two linear terms built from the step indices to the ARMA sample
    series = arma_sample + steps * 0.0005 - shifted_steps * 0.0001

    return InputData(idx=np.arange(0, n_steps),
                     features=np.asarray([steps, shifted_steps]).T,
                     target=series,
                     task_type=MachineLearningTasksEnum.auto_regression)
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    """Score an exported sklearn pipeline and an equivalent-looking chain.

    Both ROC AUC values are printed.

    :param train_file_path: CSV file used for fitting
    :param test_file_path: CSV file used for scoring
    :return: ROC AUC of the chain (the second printed value)
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    sklearn_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()),
        RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(sklearn_pipeline.steps, 'random_state', 1)

    sklearn_pipeline.fit(train_data.features, train_data.target)
    pipeline_scores = sklearn_pipeline.predict_proba(test_data.features)[:, 1]

    pipeline_roc_auc = roc_auc(y_true=testing_target, y_score=pipeline_scores)
    print(pipeline_roc_auc)

    # chain with 'direct_data_model' and 'bernb' nodes feeding an 'rf' node
    chain = Chain()
    final_node = SecondaryNode('rf')
    for parent in (PrimaryNode('direct_data_model'), PrimaryNode('bernb')):
        final_node.nodes_from.append(parent)
    chain.add_node(final_node)

    chain.fit(train_data)
    chain_output = chain.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=chain_output.predict)
    print(roc_auc_value)

    return roc_auc_value
def _preprocess(self, data: InputData):
    """Apply a preprocessing strategy to ``data.features`` in place.

    The preprocessor stored in the cached state is reused when a cached state
    exists; otherwise a new one is fitted on the given features.

    :param data: input data whose ``features`` attribute is overwritten
    :return: tuple (data, preprocessing strategy that was applied)
    """
    preprocessing_func = preprocessing_func_for_data(data, self)

    if self.cache.actual_cached_state:
        # a fitted preprocessor already exists
        strategy = self.cache.actual_cached_state.preprocessor
    else:
        # no fitted preprocessor in the cache: fit a fresh one
        strategy = preprocessing_func().fit(data.features)

    data.features = strategy.apply(data.features)
    return data, strategy
def classification_dataset_with_redunant_features(n_samples=1000,
                                                  n_features=100,
                                                  n_informative=5
                                                  ) -> InputData:
    """Wrap the output of ``make_classification`` into a tabular InputData.

    :param n_samples: number of generated samples
    :param n_features: total number of generated features
    :param n_informative: forwarded to ``make_classification``
    :return: classification InputData whose index enumerates the samples
    """
    features, target = make_classification(n_samples=n_samples,
                                           n_features=n_features,
                                           n_informative=n_informative)
    return InputData(idx=np.arange(0, len(target)),
                     features=features,
                     target=target,
                     task=Task(TaskTypesEnum.classification),
                     data_type=DataTypesEnum.table)
def classification_dataset():
    """Generate a one-feature binary classification InputData.

    The feature is drawn uniformly from [-5, 5); the class is 0.0 where
    ``1 / (1 + exp(1 / x))`` does not exceed 0.5 and 1.0 otherwise.

    :return: InputData with 1000 samples, features and target as columns
    """
    samples = 1000
    threshold = 0.5

    x = np.expand_dims(10.0 * np.random.rand(samples, ) - 5.0, axis=1)
    y = 1.0 / (1.0 + np.exp(np.power(x, -1.0)))

    # y has a single column, so thresholding that column labels each sample
    labels = np.where(y[:, 0] <= threshold, 0.0, 1.0)
    classes = np.expand_dims(labels, axis=1)

    return InputData(features=x, target=classes, idx=np.arange(0, len(x)),
                     task_type=MachineLearningTasksEnum.classification)