def test_random_composer(data_fixture, request):
    """Smoke-test for RandomSearchComposer on a classification fixture.

    Composes a chain from the models suitable for classification, fits it
    on the fixture data and checks the ROC AUC on the same data is above 0.6.
    """
    # Fixed seeds keep the random search reproducible across runs.
    random.seed(1)
    np.random.seed(1)

    dataset = request.getfixturevalue(data_fixture)
    # The same dataset is deliberately used for composing and validation here.
    train_data, validation_data = dataset, dataset

    model_types, _ = ModelTypesRepository().suitable_model(
        task_type=TaskTypesEnum.classification)
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    requirements = ComposerRequirements(primary=model_types,
                                        secondary=model_types)
    composer = RandomSearchComposer(iter_num=1)
    composed_chain = composer.compose_chain(data=train_data,
                                            initial_chain=None,
                                            composer_requirements=requirements,
                                            metrics=quality_metric)

    composed_chain.fit_from_scratch(input_data=train_data)
    prediction = composed_chain.predict(validation_data)
    roc_value = roc_auc(y_true=validation_data.target,
                        y_score=prediction.predict)
    assert roc_value > 0.6
def test_dummy_composer_flat_chain_build_correct():
    """Check that the flat DummyComposer builds a 3-node linear chain.

    Expected structure: one PrimaryNode followed by two SecondaryNodes,
    each linked to the previous node only.
    """
    flat_composer = DummyComposer(DummyChainTypeEnum.flat)
    # Minimal stub data: the dummy composer only needs a valid InputData shell.
    stub_data = InputData(idx=np.zeros(1),
                          features=np.zeros(1),
                          target=np.zeros(1),
                          task=Task(TaskTypesEnum.classification),
                          data_type=DataTypesEnum.table)
    requirements = ComposerRequirements(primary=['logit'],
                                        secondary=['logit', 'xgboost'])

    chain = flat_composer.compose_chain(data=stub_data,
                                        initial_chain=None,
                                        composer_requirements=requirements,
                                        metrics=None)

    assert len(chain.nodes) == 3
    # Node types follow the flat layout: primary root, then secondaries.
    for node, expected_type in zip(chain.nodes,
                                   (PrimaryNode, SecondaryNode, SecondaryNode)):
        assert isinstance(node, expected_type)
    # Each secondary node hangs off exactly the previous node in the list.
    assert chain.nodes[1].nodes_from[0] is chain.nodes[0]
    assert chain.nodes[2].nodes_from[0] is chain.nodes[1]
    # The primary node has no parents.
    assert chain.nodes[0].nodes_from is None
def compose_chain(data: InputData) -> Chain:
    """Build a fixed hierarchical regression chain via the dummy composer.

    Primary level: lasso and ridge; secondary level: linear regression.
    RMSE is passed as the composition metric.

    :param data: input data the chain is composed for
    :return: the composed (unfitted) chain
    """
    hierarchical_composer = DummyComposer(DummyChainTypeEnum.hierarchical)
    requirements = ComposerRequirements(
        primary=[ModelTypesIdsEnum.lasso, ModelTypesIdsEnum.ridge],
        secondary=[ModelTypesIdsEnum.linear])
    rmse_metric = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE)
    return hierarchical_composer.compose_chain(
        data=data,
        initial_chain=None,
        composer_requirements=requirements,
        metrics=rmse_metric,
        is_visualise=False)
def compose_chain(data: InputData) -> Chain:
    """Build a fixed hierarchical classification chain via the dummy composer.

    Primary level: two kmeans nodes; secondary level: logistic regression.
    ROC AUC is passed as the composition metric.

    :param data: input data the chain is composed for
    :return: the composed (unfitted) chain
    """
    hierarchical_composer = DummyComposer(DummyChainTypeEnum.hierarchical)
    # NOTE(review): the two identical kmeans entries produce two primary
    # kmeans nodes — presumably intentional for this example.
    requirements = ComposerRequirements(
        primary=[ModelTypesIdsEnum.kmeans, ModelTypesIdsEnum.kmeans],
        secondary=[ModelTypesIdsEnum.logit])
    roc_metric = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)
    return hierarchical_composer.compose_chain(
        data=data,
        initial_chain=None,
        composer_requirements=requirements,
        metrics=roc_metric,
        is_visualise=False)
def test_dummy_composer_hierarchical_chain_build_correct():
    """Check that the hierarchical DummyComposer builds a 3-node tree.

    Expected structure: two PrimaryNodes feeding one SecondaryNode.
    """
    hierarchical_composer = DummyComposer(DummyChainTypeEnum.hierarchical)
    # Minimal stub data — only the container matters for the dummy composer.
    # NOTE(review): this uses the older InputData(task_type=...) signature;
    # confirm it matches the framework version under test.
    stub_data = InputData(idx=np.zeros(1),
                          features=np.zeros(1),
                          target=np.zeros(1),
                          task_type=MachineLearningTasksEnum.classification)
    requirements = ComposerRequirements(
        primary=[ModelTypesIdsEnum.logit, ModelTypesIdsEnum.xgboost],
        secondary=[ModelTypesIdsEnum.logit])

    chain = hierarchical_composer.compose_chain(
        data=stub_data,
        initial_chain=None,
        composer_requirements=requirements,
        metrics=None)

    assert len(chain.nodes) == 3
    # Two primary roots and a single secondary node joining them.
    for node, expected_type in zip(chain.nodes,
                                   (PrimaryNode, PrimaryNode, SecondaryNode)):
        assert isinstance(node, expected_type)
    assert chain.nodes[2].nodes_from[0] is chain.nodes[0]
    assert chain.nodes[2].nodes_from[1] is chain.nodes[1]
    # Primary nodes have no parents.
    assert chain.nodes[1].nodes_from is None
def test_random_composer(data_fixture, request):
    """Smoke-test for RandomSearchComposer using the model-search repository API.

    Looks up classification models via search_models, composes a chain, fits it
    on the fixture data and checks the ROC AUC on the same data exceeds 0.6.
    """
    # Fixed seeds keep the random search reproducible across runs.
    random.seed(1)
    np.random.seed(1)

    dataset = request.getfixturevalue(data_fixture)
    # The same dataset is deliberately used for composing and validation here.
    train_data, validation_data = dataset, dataset

    repository = ModelTypesRepository()
    model_types, _ = repository.search_models(
        desired_metainfo=ModelMetaInfoTemplate(
            input_type=NumericalDataTypesEnum.table,
            output_type=CategoricalDataTypesEnum.vector,
            task_type=MachineLearningTasksEnum.classification,
            can_be_initial=True,
            can_be_secondary=True))
    quality_metric = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    requirements = ComposerRequirements(primary=model_types,
                                        secondary=model_types)
    composer = RandomSearchComposer(iter_num=1)
    composed_chain = composer.compose_chain(data=train_data,
                                            initial_chain=None,
                                            composer_requirements=requirements,
                                            metrics=quality_metric)

    composed_chain.fit_from_scratch(input_data=train_data)
    prediction = composed_chain.predict(validation_data)
    roc_value = roc_auc(y_true=validation_data.target,
                        y_score=prediction.predict)
    assert roc_value > 0.6
# the search of the models provided by the framework that can be used as nodes in a chain for the selected task models_repo = ModelTypesRepository() available_model_types, _ = models_repo.search_models( desired_metainfo=ModelMetaInfoTemplate( input_type=NumericalDataTypesEnum.table, output_type=CategoricalDataTypesEnum.vector, task_type=problem_class, can_be_initial=True, can_be_secondary=True)) # the choice of the metric for the chain quality assessment during composition metric_function = MetricsRepository().metric_by_id(RegressionMetricsEnum.RMSE) # the choice and initialisation single_composer_requirements = ComposerRequirements( primary=[ModelTypesIdsEnum.ar], secondary=[]) chain_single = DummyComposer(DummyChainTypeEnum.flat).compose_chain( data=dataset_to_compose, initial_chain=None, composer_requirements=single_composer_requirements, metrics=metric_function) train_prediction = chain_single.fit(input_data=dataset_to_compose, verbose=True) print("Composition finished") compare_plot(train_prediction, dataset_to_compose) # the quality assessment for the obtained composite models rmse_on_valid_single = calculate_validation_metric(chain_single, dataset_to_validate)
# the choice and initialisation composer_requirements = GPComposerRequirements( primary=available_model_types, secondary=available_model_types, max_arity=2, max_depth=2, pop_size=10, num_of_generations=10, crossover_prob=0.8, mutation_prob=0.8, max_lead_time=datetime.timedelta(minutes=3)) single_composer_requirements = ComposerRequirements( primary=[ModelTypesIdsEnum.lasso, ModelTypesIdsEnum.ridge], secondary=[ModelTypesIdsEnum.linear]) chain_static = DummyComposer(DummyChainTypeEnum.hierarchical).compose_chain( data=dataset_to_compose, initial_chain=None, composer_requirements=single_composer_requirements, metrics=metric_function) chain_static.fit(input_data=dataset_to_compose, verbose=False) # Create GP-based composer composer = GPComposer() # the optimal chain generation by composition - the most time-consuming task chain_evo_composed = composer.compose_chain( data=dataset_to_compose, initial_chain=None,
def run_credit_scoring_problem(train_file_path,
                               test_file_path,
                               max_lead_time: datetime.timedelta = datetime.timedelta(minutes=20),
                               gp_optimiser_params: Optional[GPChainOptimiserParameters] = None):
    """Compose, fit and compare three chains on a credit-scoring dataset.

    Builds (1) a GP-evolved chain, (2) a static hierarchical dummy chain and
    (3) a single-xgboost flat chain, then scores each on the validation CSV
    with `calculate_validation_metric` (presumably ROC AUC, given the prints
    — confirm against its definition elsewhere in the file).

    :param train_file_path: CSV with the data used for composition/fitting
    :param test_file_path: CSV with the hold-out validation data
    :param max_lead_time: wall-clock budget for the GP composition
    :param gp_optimiser_params: optional pre-built optimiser parameters;
        when None, a default tournament/subtree/growth configuration is used
    :return: ((evo_metric, evo_chain), (static_chain, static_metric),
        (single_chain, single_metric))
        NOTE(review): the first pair is ordered (metric, chain) while the other
        two are (chain, metric) — inconsistent, but callers may rely on it.
    """
    dataset_to_compose = InputData.from_csv(train_file_path)
    dataset_to_validate = InputData.from_csv(test_file_path)

    # the search of the models provided by the framework that can be used as
    # nodes in a chain for the selected task
    models_repo = ModelTypesRepository()
    available_model_types, _ = models_repo.search_models(
        desired_metainfo=ModelMetaInfoTemplate(
            input_type=NumericalDataTypesEnum.table,
            output_type=CategoricalDataTypesEnum.vector,
            # both classification and clustering models are allowed as nodes
            task_type=[MachineLearningTasksEnum.classification,
                       MachineLearningTasksEnum.clustering],
            can_be_initial=True,
            can_be_secondary=True))

    # the choice of the metric for the chain quality assessment during composition
    metric_function = MetricsRepository().metric_by_id(ClassificationMetricsEnum.ROCAUC)

    # use caller-supplied optimiser parameters when given, else the defaults
    if gp_optimiser_params:
        optimiser_parameters = gp_optimiser_params
    else:
        optimiser_parameters = GPChainOptimiserParameters(
            selection_types=[SelectionTypesEnum.tournament],
            crossover_types=[CrossoverTypesEnum.subtree],
            mutation_types=[MutationTypesEnum.growth],
            regularization_type=RegularizationTypesEnum.decremental,
            chain_generation_function=random_ml_chain,
            crossover_types_dict=crossover_by_type,
            mutation_types_dict=mutation_by_type)

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_arity=4,
        max_depth=3,
        # small population/generation counts keep the example quick
        pop_size=5,
        num_of_generations=5,
        crossover_prob=0.8,
        mutation_prob=0.8,
        max_lead_time=max_lead_time)

    # Create GP-based composer
    composer = GPComposer()

    # the optimal chain generation by composition - the most time-consuming task
    chain_evo_composed = composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        optimiser_parameters=optimiser_parameters,
        is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    # the choice and initialisation of the dummy_composer: a static
    # hierarchical chain built from the same requirements as the GP run
    dummy_composer = DummyComposer(DummyChainTypeEnum.hierarchical)
    chain_static = dummy_composer.compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=composer_requirements,
        metrics=metric_function,
        is_visualise=True)
    chain_static.fit(input_data=dataset_to_compose, verbose=True)

    # the single-model variant of optimal chain
    single_composer_requirements = ComposerRequirements(
        primary=[ModelTypesIdsEnum.xgboost],
        secondary=[])
    chain_single = DummyComposer(DummyChainTypeEnum.flat).compose_chain(
        data=dataset_to_compose,
        initial_chain=None,
        composer_requirements=single_composer_requirements,
        metrics=metric_function)
    chain_single.fit(input_data=dataset_to_compose, verbose=True)
    print("Composition finished")

    ComposerVisualiser.visualise(chain_static)
    ComposerVisualiser.visualise(chain_evo_composed)

    # the quality assessment for the obtained composite models
    roc_on_valid_static = calculate_validation_metric(chain_static, dataset_to_validate)
    roc_on_valid_single = calculate_validation_metric(chain_single, dataset_to_validate)
    roc_on_valid_evo_composed = calculate_validation_metric(chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')
    print(f'Static ROC AUC is {round(roc_on_valid_static, 3)}')
    print(f'Single-model ROC AUC is {round(roc_on_valid_single, 3)}')

    return (roc_on_valid_evo_composed, chain_evo_composed), (chain_static, roc_on_valid_static), (
        chain_single, roc_on_valid_single)