Пример #1
0
    def fit(self, train_data: InputData):
        """
        This method is used for operation training with the data provided

        :param InputData train_data: data used for operation training
        :return: trained Sklearn operation
        """

        # sklearn internals may emit RuntimeWarnings that are not actionable here
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        if self.params_for_fit:
            operation_implementation = self.operation_impl(
                **self.params_for_fit)
        else:
            operation_implementation = self.operation_impl()

        # If model doesn't support multi-output and current task is ts_forecasting
        current_task = train_data.task.task_type
        models_repo = OperationTypesRepository()
        non_multi_models, _ = models_repo.suitable_operation(
            task_type=current_task, tags=['non_multi'])
        is_model_not_support_multi = self.operation_type in non_multi_models
        if is_model_not_support_multi and current_task == TaskTypesEnum.ts_forecasting:
            # Manually wrap the regressor into multi-output model
            operation_implementation = convert_to_multivariate_model(
                operation_implementation, train_data)
        else:
            operation_implementation.fit(train_data.features,
                                         train_data.target)
        return operation_implementation
Пример #2
0
def has_correct_data_connections(pipeline: 'Pipeline'):
    """ Check if the pipeline contains incorrect connections between operation for different data types """
    operation_repo = OperationTypesRepository(operation_type='data_operation')
    models_repo = OperationTypesRepository(operation_type='model')

    for node in pipeline.nodes:
        parents = node.nodes_from
        if not parents:
            continue

        for parent in parents:
            node_types = get_supported_data_types(node, operation_repo, models_repo)
            parent_types = get_supported_data_types(parent, operation_repo, models_repo)

            # Atomic models expose no metadata - nothing to validate
            if node_types is None:
                return True

            accepted = set(node_types.input_types)
            produced = set(parent_types.output_types) if parent_types else accepted
            # Parent must be able to produce at least one type the node accepts
            if not accepted & produced:
                raise ValueError(
                    f'{ERROR_PREFIX} Pipeline has incorrect data connections'
                )

    return True
Пример #3
0
def test_composer_with_cv_optimization_correct():
    """Composer with 3-fold CV should produce a working classification pipeline."""
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose, dataset_to_validate = get_data(task)

    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type, tags=['simple'])

    metrics = [ClassificationMetricsEnum.ROCAUC_penalty,
               ClassificationMetricsEnum.accuracy,
               ClassificationMetricsEnum.logloss]

    requirements = GPComposerRequirements(primary=available_model_types,
                                          secondary=available_model_types,
                                          timeout=timedelta(minutes=1),
                                          num_of_generations=2,
                                          cv_folds=3)

    composer = GPComposerBuilder(task).with_requirements(
        requirements).with_metrics(metrics).build()

    pipeline_evo_composed = composer.compose_pipeline(
        data=dataset_to_compose, is_visualise=False)[0]

    assert isinstance(pipeline_evo_composed, Pipeline)

    pipeline_evo_composed.fit(input_data=dataset_to_compose)
    predicted = pipeline_evo_composed.predict(dataset_to_validate)
    roc_on_valid = roc_auc(y_score=predicted.predict,
                           y_true=dataset_to_validate.target)

    assert roc_on_valid > 0
Пример #4
0
    def fit(self, train_data: InputData):
        """
        This method is used for operation training with the data provided

        :param InputData train_data: data used for operation training
        :return: trained cuML operation
        """
        # GPU backend may emit RuntimeWarnings that are not actionable here
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        if self.params_for_fit:
            operation_implementation = self.operation_impl(
                **self.params_for_fit)
        else:
            operation_implementation = self.operation_impl()

        # If model doesn't support multi-output and current task is ts_forecasting
        current_task = train_data.task.task_type
        models_repo = OperationTypesRepository()
        non_multi_models, _ = models_repo.suitable_operation(
            task_type=current_task, tags=['non_multi'])
        is_model_not_support_multi = self.operation_type in non_multi_models
        # Move data to GPU as float32 (cuDF structures)
        features = cudf.DataFrame(train_data.features.astype('float32'))
        target = cudf.Series(train_data.target.flatten().astype('float32'))

        if is_model_not_support_multi and current_task == TaskTypesEnum.ts_forecasting:
            raise NotImplementedError('Not supported for GPU yet')
            # TODO Manually wrap the regressor into multi-output model
        else:
            operation_implementation.fit(features, target)
        return operation_implementation
Пример #5
0
def run_pipeline_from_automl(train_file_path: str,
                             test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)
    testing_target = test_data.target

    # Primary nodes: scaling preprocessing and an AutoML (TPOT) model
    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')
    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    # Secondary nodes combine the outputs of the primary ones
    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])

    OperationTypesRepository.assign_repo('model', 'automl_repository.json')

    pipeline = Pipeline(node_rf)
    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Пример #6
0
def get_model(train_file_path: str,
              cur_lead_time: datetime.timedelta = timedelta(seconds=60)):
    """Compose and fit a classification chain for the given training csv file."""
    task = Task(task_type=TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(train_file_path, task=task)

    # Models provided by the framework that can serve as chain nodes
    # for the selected task
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type, tags=['simple'])

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty

    composer_requirements = GPComposerRequirements(
        primary=available_model_types,
        secondary=available_model_types,
        max_lead_time=cur_lead_time)

    # Genetic-programming composer searches for the optimal
    # structure of the composite model
    composer = GPComposerBuilder(task).with_requirements(
        composer_requirements).with_metrics(metric_function).build()

    # Run the search of best suitable model
    chain_evo_composed = composer.compose_chain(data=dataset_to_compose,
                                                is_visualise=False)
    chain_evo_composed.fit(input_data=dataset_to_compose)

    return chain_evo_composed
Пример #7
0
def filter_operations_by_preset(task: Task, preset: str):
    """ Function filter operations by preset, remove "heavy" operations and save
    appropriate ones

    :param task: task the operations must be suitable for
    :param preset: preset name ('light', 'light_tun', 'ultra_light',
        'ultra_light_tun' or 'gpu')
    :return: list with names of appropriate operations
    """
    excluded_models_dict = {'light': ['mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean'],
                            'light_tun': ['mlp', 'svc', 'arima', 'exog_ts_data_source', 'text_clean']}

    # Get data operations and models
    available_operations = get_operations_for_task(task, mode='all')
    available_data_operation = get_operations_for_task(task, mode='data_operation')

    # Exclude "heavy" operations if necessary
    if preset in excluded_models_dict:
        excluded_operations = excluded_models_dict[preset]
        available_operations = [operation for operation in available_operations
                                if operation not in excluded_operations]

    # Save only "light" operations
    if preset in ['ultra_light', 'ultra_light_tun']:
        light_models = ['dt', 'dtreg', 'logit', 'linear', 'lasso', 'ridge', 'knn', 'ar']
        included_operations = light_models + available_data_operation
        available_operations = [operation for operation in available_operations
                                if operation in included_operations]

    if preset == 'gpu':
        repository = OperationTypesRepository().assign_repo('model', 'gpu_models_repository.json')
        # suitable_operation returns a (names, metadata) pair everywhere else;
        # keep only the names so all presets return the same list-of-str shape
        available_operations, _ = repository.suitable_operation(task_type=task.task_type)
    return available_operations
Пример #8
0
def test_cv_min_kfolds_raise():
    """Requirements with fewer than 2 CV folds must be rejected."""
    task = Task(task_type=TaskTypesEnum.classification)
    available, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type, tags=['simple'])

    with pytest.raises(ValueError):
        GPComposerRequirements(primary=available,
                               secondary=available,
                               cv_folds=1)
Пример #9
0
def test_pipeline_from_automl_example():
    """Smoke-test the AutoML pipeline example against a small dataset."""
    project_root_path = str(fedot_project_root())
    # Temporarily switch the model repository to the AutoML-enabled one
    with OperationTypesRepository().assign_repo('model', 'model_repository_with_automl.json') as _:
        file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
        file_path_test = file_path_train

        auc = run_pipeline_from_automl(file_path_train, file_path_test, max_run_time=timedelta(seconds=1))
    # Restore the default model repository after the experiment
    # NOTE(review): if the block above raises, this restore is skipped - confirm
    # whether the repository context manager already restores it on exit
    OperationTypesRepository.assign_repo('model', 'model_repository.json')

    assert auc > 0.5
Пример #10
0
def boosting_mutation(pipeline: Pipeline, requirements, params,
                      **kwargs) -> Any:
    """
    This type of mutation adds the additional 'boosting' cascade to the existing pipeline.

    :param pipeline: pipeline to mutate (extended in place and returned)
    :param requirements: composer requirements; a secondary operation is
        sampled from them for the final node
    :param params: graph generation parameters providing the task advisor
    :return: the mutated pipeline
    """

    task_type = params.advisor.task.task_type
    # Take the first decompose-tagged data operation suitable for the task
    decompose_operations, _ = OperationTypesRepository(
        'data_operation').suitable_operation(task_type=task_type,
                                             tags=['decompose'])
    decompose_operation = decompose_operations[0]

    existing_pipeline = pipeline

    if len(pipeline.nodes) == 1:
        # to deal with single-node pipeline
        data_source = pipeline.nodes[0]
    else:
        # NOTE(review): this freshly created 'scaling' node is never appended
        # to pipeline.nodes below - confirm it is registered elsewhere
        data_source = PrimaryNode('scaling')

    decompose_parents = [existing_pipeline.root_node, data_source]

    # Cascade: decompose -> linear booster -> final secondary operation
    node_decompose = SecondaryNode(decompose_operation,
                                   nodes_from=decompose_parents)
    node_boost = SecondaryNode('linear', nodes_from=[node_decompose])
    node_final = SecondaryNode(
        choice(requirements.secondary),
        nodes_from=[node_boost, existing_pipeline.root_node])
    pipeline.nodes.extend([node_decompose, node_final, node_boost])
    return pipeline
Пример #11
0
def test_classification_models_fit_correct(data_fixture, request):
    """Every suitable ML classifier should fit and reach its ROC AUC threshold."""
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)
    roc_threshold = 0.95
    logger = default_log('default_test_logger')

    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(
            task_type=TaskTypesEnum.classification,
            data_type=data.data_type,
            tags=['ml'])

    for model_name in model_names:
        logger.info(f"Test classification model: {model_name}.")
        model = Model(operation_type=model_name)
        fitted_model, train_predicted = model.fit(data=train_data)
        test_pred = model.predict(fitted_operation=fitted_model,
                                  data=test_data,
                                  is_fit_pipeline_stage=False)
        roc_on_test = get_roc_auc(valid_data=test_data,
                                  predicted_data=test_pred)
        # Naive Bayes models are weaker - only require better-than-random
        if model_name in ['bernb', 'multinb']:
            assert roc_on_test >= 0.5
        else:
            assert roc_on_test >= roc_threshold
Пример #12
0
def test_search_in_repository_by_tag_and_metainfo_correct():
    """Repository search by task type plus tag returns the expected model set."""
    with OperationTypesRepository() as repo:
        model_names, _ = repo.suitable_operation(
            task_type=TaskTypesEnum.regression, tags=['ml'])

        # 12 regression ML models are expected, 'linear' among them
        assert 'linear' in model_names
        assert len(model_names) == 12
Пример #13
0
def test_gp_composer_build_pipeline_correct(data_fixture, request):
    """GP composer should build a pipeline whose ROC AUC beats 0.6."""
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = dataset_to_validate = data
    task = Task(TaskTypesEnum.classification)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)

    gp_composer = GPComposerBuilder(task).with_requirements(req).with_metrics(
        ClassificationMetricsEnum.ROCAUC).build()
    pipeline_gp_composed = gp_composer.compose_pipeline(
        data=dataset_to_compose)

    pipeline_gp_composed.fit_from_scratch(input_data=dataset_to_compose)
    predicted_gp_composed = pipeline_gp_composed.predict(dataset_to_validate)

    roc_on_valid = roc_auc(y_true=dataset_to_validate.target,
                           y_score=predicted_gp_composed.predict)

    assert roc_on_valid > 0.6
Пример #14
0
def test_random_composer(data_fixture, request):
    """Random-search composer should produce a pipeline with ROC AUC above 0.6."""
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = dataset_to_validate = data

    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)
    metric_function = MetricsRepository().metric_by_id(
        ClassificationMetricsEnum.ROCAUC)

    req = ComposerRequirements(primary=available_model_types,
                               secondary=available_model_types)
    composer = RandomSearchComposer(iter_num=1)
    composed_pipeline = composer.compose_pipeline(data=dataset_to_compose,
                                                  initial_pipeline=None,
                                                  composer_requirements=req,
                                                  metrics=metric_function)
    composed_pipeline.fit_from_scratch(input_data=dataset_to_compose)

    predicted = composed_pipeline.predict(dataset_to_validate)

    roc_value = roc_auc(y_true=dataset_to_validate.target,
                        y_score=predicted.predict)

    assert roc_value > 0.6
Пример #15
0
def test_cv_ts_and_cluster_raise():
    """Cross-validation is not implemented for clustering and must raise."""
    task = Task(task_type=TaskTypesEnum.clustering)
    dataset_to_compose, dataset_to_validate = get_data(task)

    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)
    requirements = GPComposerRequirements(primary=available_model_types,
                                          secondary=available_model_types,
                                          cv_folds=4)
    composer = GPComposerBuilder(task).with_requirements(
        requirements).with_metrics(ClusteringMetricsEnum.silhouette).build()

    with pytest.raises(NotImplementedError):
        composer.compose_pipeline(data=dataset_to_compose, is_visualise=False)
Пример #16
0
def test_evaluate_individuals():
    """Check that evaluation respects the timer and scores every individual.

    With a near-zero time budget only the first individual is evaluated;
    with a generous budget the whole population gets a fitness value.
    """
    # Build the path once: previously the already-absolute path was joined
    # with the project root a second time, which os.path.join makes a no-op
    full_path_train = os.path.join(str(fedot_project_root()),
                                   'test/data/simple_classification.csv')

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)

    composer = builder.build()

    pipelines_to_evaluate = [
        pipeline_first(),
        pipeline_second(),
        pipeline_third(),
        pipeline_fourth()
    ]

    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ratio_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    adapter = PipelineAdapter()
    params = GraphGenerationParams(adapter=PipelineAdapter(),
                                   advisor=PipelineChangeAdvisor())

    # Near-zero timeout: only the first individual should be evaluated
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    with OptimisationTimer(timeout=datetime.timedelta(minutes=0.001)) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    # Generous timeout: every individual should be evaluated
    population = [Individual(adapter.adapt(c)) for c in pipelines_to_evaluate]
    with OptimisationTimer(timeout=datetime.timedelta(minutes=5)) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             graph_generation_params=params,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 4
    assert all(ind.fitness is not None for ind in population)
Пример #17
0
def test_chain_from_automl_example():
    """Smoke-test the AutoML chain example against a small dataset."""
    project_root_path = str(project_root())
    # A single-argument os.path.join was a no-op; use the file name directly
    experimental_repo_file = 'model_repository_with_automl.json'
    # Temporarily use the repository that includes AutoML models
    with OperationTypesRepository(experimental_repo_file) as _:
        file_path_train = os.path.join(project_root_path, 'test/data/simple_classification.csv')
        file_path_test = file_path_train

        auc = run_chain_from_automl(file_path_train, file_path_test, max_run_time=timedelta(seconds=1))

    assert auc > 0.5
Пример #18
0
def test_search_in_repository_by_tag_correct():
    """Tag search should respect full-match vs any-match semantics."""
    with OperationTypesRepository() as repo:
        # Full match: an operation must carry every requested tag
        names, _ = repo.operations_with_tag(tags=['simple', 'linear'], is_full_match=True)
        assert {'linear', 'logit', 'lasso', 'ridge'}.issubset(names)
        assert len(names) == 4

        # Partial match: any one of the requested tags is enough
        names, _ = repo.operations_with_tag(tags=['simple', 'linear'])
        assert {'linear', 'logit', 'knn', 'lda', 'lasso', 'ridge'}.issubset(names)
        assert len(names) == 9

        # Unknown tag yields an empty result
        names, _ = repo.operations_with_tag(tags=['non_real_tag'])
        assert len(names) == 0
Пример #19
0
def operations_for_task(task_name: str):
    """ Function filter operations by task and returns dictionary with names of
    models and data operations

    :param task_name: name of available task type

    :return dict_with_operations: dictionary with operations
        - models: appropriate models for task
        - data operations: appropriate data operations for task
    """

    task = _get_task_by_name(task_name)

    # Query both repositories for operations suitable for the task
    models, _ = OperationTypesRepository().suitable_operation(task_type=task)
    data_operations, _ = OperationTypesRepository(
        operation_type='data_operation').suitable_operation(task_type=task)

    return {
        'model': models,
        'data operation': data_operations,
    }
Пример #20
0
def has_no_conflicts_with_data_flow(pipeline: 'Pipeline'):
    """ Check if the pipeline contains incorrect connections between nodes """
    repo = OperationTypesRepository(operation_type='data_operation')
    forbidden_parents_combination, _ = repo.suitable_operation()
    forbidden_parents_combination = set(forbidden_parents_combination)

    for node in pipeline.nodes:
        parents = node.nodes_from
        if parents is None or len(parents) <= 1:
            continue

        # Names of the operations in all parent nodes
        parent_operations = {parent.operation.operation_type for parent in parents}

        # Identical data operations feeding one node are forbidden to combine
        if len(parent_operations) == 1 and parent_operations.pop() in forbidden_parents_combination:
            raise ValueError(
                f'{ERROR_PREFIX} Pipeline has incorrect subgraph with identical data operations'
            )
    return True
Пример #21
0
    def _node_generation(self,
                         node_type: Type[Node],
                         number_of_operations=None) -> List[Node]:
        """
        Generate nodes of the given type with randomly sampled operations.

        :param node_type: node class to instantiate for each operation
        :param number_of_operations: how many operations to sample; if falsy,
            all suitable operations are used
        :return: list of generated nodes
        """
        task = self._train_data.task.task_type
        # Get models
        app_models, _ = OperationTypesRepository().suitable_operation(
            task_type=task)
        # Get data operations for such task
        app_data_operations, _ = OperationTypesRepository(
            'data_operation').suitable_operation(task_type=task)

        # Unite two lists
        # NOTE(review): app_data_operations is fetched but never merged in -
        # only models are used below; confirm whether the union was intended
        app_operations = app_models
        if number_of_operations:
            random_operations = random.sample(app_operations,
                                              number_of_operations)
        else:
            random_operations = app_operations

        nodes = []
        for operation in random_operations:
            nodes.append(node_type(operation_type=operation))

        return nodes
Пример #22
0
def test_classification_data_operations():
    """Each suitable operation should feed a logit node and predict all rows."""
    train_input, predict_input, y_test = get_small_classification_dataset()

    operation_names, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)

    for operation_name in operation_names:
        # Single preprocessing node followed by a logit classifier
        primary = PrimaryNode(operation_name)
        final = SecondaryNode('logit', nodes_from=[primary])
        pipeline = Pipeline(final)

        pipeline.fit_from_scratch(train_input)
        predicted = pipeline.predict(predict_input).predict

        assert len(predicted) == len(y_test)
Пример #23
0
def test_ts_forecasting_smoothing_data_operation():
    """Each smoothing operation should work inside a lagged-ridge forecasting pipeline."""
    train_input, predict_input, y_test = get_time_series()

    smoothing_names, _ = OperationTypesRepository().operations_with_tag(
        tags=['smoothing'])

    for smoothing_name in smoothing_names:
        # smoothing -> lagged transform -> ridge regressor
        node_smoothing = PrimaryNode(smoothing_name)
        node_lagged = SecondaryNode('lagged', nodes_from=[node_smoothing])
        node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
        pipeline = Pipeline(node_ridge)

        pipeline.fit_from_scratch(train_input)
        predicted = np.ravel(pipeline.predict(predict_input).predict)

        assert len(predicted) == len(np.ravel(y_test))
Пример #24
0
def test_evaluate_individuals():
    """Check timer-bounded evaluation of a chain population."""
    project_root_path = str(project_root())
    file_path_train = os.path.join(project_root_path,
                                   'test/data/simple_classification.csv')
    # NOTE(review): file_path_train is already absolute, so this second join
    # simply returns it unchanged - the extra join is redundant
    full_path_train = os.path.join(str(project_root()), file_path_train)

    task = Task(TaskTypesEnum.classification)
    dataset_to_compose = InputData.from_csv(full_path_train, task=task)
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=task.task_type)

    metric_function = ClassificationMetricsEnum.ROCAUC_penalty
    composer_requirements = GPComposerRequirements(
        primary=available_model_types, secondary=available_model_types)

    builder = GPComposerBuilder(task=task).with_requirements(composer_requirements). \
        with_metrics(metric_function)

    composer = builder.build()

    # NOTE(review): 'sample_split_ration_for_tasks' looks like a misspelling of
    # 'sample_split_ratio_for_tasks' - it must match the imported module name
    train_data, test_data = train_test_data_setup(
        dataset_to_compose,
        sample_split_ration_for_tasks[dataset_to_compose.task.task_type])
    metric_function_for_nodes = partial(composer.composer_metric,
                                        composer.metrics, train_data,
                                        test_data)
    # With a near-zero time budget only the first chain gets evaluated
    population = [chain_first(), chain_second(), chain_third(), chain_fourth()]
    max_lead_time = datetime.timedelta(minutes=0.001)
    with CompositionTimer(max_lead_time=max_lead_time) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 1
    assert population[0].fitness is not None

    # With a generous budget every chain is evaluated
    population = [chain_first(), chain_second(), chain_third(), chain_fourth()]
    max_lead_time = datetime.timedelta(minutes=5)
    with CompositionTimer(max_lead_time=max_lead_time) as t:
        evaluate_individuals(individuals_set=population,
                             objective_function=metric_function_for_nodes,
                             is_multi_objective=False,
                             timer=t)
    assert len(population) == 4
    assert all([ind.fitness is not None for ind in population])
Пример #25
0
class DataOperation(Operation):
    """
    Class with fit/predict methods defining the evaluation strategy for the task

    :param operation_type: name of the data operation
    :param log: Log object to record messages
    """

    def __init__(self, operation_type: str, log: Log = None):
        super().__init__(operation_type, log)
        # Repository with metadata for all available data operations
        self.operations_repo = OperationTypesRepository('data_operation')

    @property
    def metadata(self) -> OperationMetaInfo:
        """Metadata record for this data operation; raises ValueError if unknown."""
        info = self.operations_repo.operation_info_by_id(self.operation_type)
        if not info:
            raise ValueError(f'Data operation {self.operation_type} not found')
        return info
Пример #26
0
def test_inf_and_nan_absence_after_pipeline_fitting_from_scratch():
    """Predictions must be finite after operations processed nan/inf input."""
    train_input = get_nan_inf_data()

    operation_names, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.regression)

    for operation_name in operation_names:
        # Preprocessing node followed by a linear model
        node_operation = PrimaryNode(operation_name)
        node_final = SecondaryNode('linear', nodes_from=[node_operation])
        pipeline = Pipeline(node_final)

        pipeline.fit_from_scratch(train_input)
        predicted = pipeline.predict(train_input).predict

        assert np.sum(np.isinf(predicted)) == 0
        assert np.sum(np.isnan(predicted)) == 0
Пример #27
0
    def _define_operation_type(self) -> str:
        """
        The method determines what type of operations is set for this node

        :return : operations type 'model' or 'data_operation'
        """

        # Models registered in model_repository.json
        registered_models = OperationTypesRepository().operations
        model_ids = {model.id for model in registered_models}

        # Anything not registered as a model is treated as preprocessing
        if self.operation_name in model_ids:
            return 'model'
        return 'data_operation'
Пример #28
0
def print_data_operations_info(task_name):
    """ Function display data operations and information about it for considered task

    :param task_name: name of available task type
    """

    task = _get_task_by_name(task_name)
    repo = OperationTypesRepository(repository_name='data_operation_repository.json')

    # Keep only the operations suitable for the task
    for operation in _filter_operations_by_type(repo, task):
        strategy = operation.current_strategy(task)
        hyperparameters = get_operation_parameter_range(str(operation.id))
        implementation_info = strategy(operation.id).implementation_info
        print(f"Data operation name - '{operation.id}'")
        print(f"Available hyperparameters to optimize with tuner - {hyperparameters}")
        print(f"Strategy implementation - {strategy}")
        print(f"Operation implementation - {implementation_info}\n")
Пример #29
0
def test_multi_objective_composer(data_fixture, request):
    """Multi-objective composition returns a Pareto set of fit-worthy pipelines."""
    random.seed(1)
    np.random.seed(1)
    data = request.getfixturevalue(data_fixture)
    dataset_to_compose = data
    dataset_to_validate = data
    available_model_types, _ = OperationTypesRepository().suitable_operation(
        task_type=TaskTypesEnum.classification)
    # Optimise quality and structural complexity simultaneously
    quality_metric = ClassificationMetricsEnum.ROCAUC
    complexity_metric = ComplexityMetricsEnum.node_num
    metrics = [quality_metric, complexity_metric]
    req = GPComposerRequirements(primary=available_model_types,
                                 secondary=available_model_types,
                                 max_arity=2,
                                 max_depth=2,
                                 pop_size=2,
                                 num_of_generations=1,
                                 crossover_prob=0.4,
                                 mutation_prob=0.5)
    scheme_type = GeneticSchemeTypesEnum.steady_state
    optimiser_parameters = GPGraphOptimiserParameters(
        genetic_scheme_type=scheme_type,
        selection_types=[SelectionTypesEnum.nsga2])
    builder = GPComposerBuilder(task=Task(
        TaskTypesEnum.classification)).with_requirements(req).with_metrics(
            metrics).with_optimiser_parameters(optimiser_parameters)
    composer = builder.build()
    pipelines_evo_composed = composer.compose_pipeline(data=dataset_to_compose)
    pipelines_roc_auc = []
    for pipeline_evo_composed in pipelines_evo_composed:
        pipeline_evo_composed.fit_from_scratch(input_data=dataset_to_compose)
        predicted_gp_composed = pipeline_evo_composed.predict(
            dataset_to_validate)

        roc_on_valid_gp_composed = roc_auc(
            y_true=dataset_to_validate.target,
            y_score=predicted_gp_composed.predict)

        pipelines_roc_auc.append(roc_on_valid_gp_composed)

    assert type(composer.metrics) is list and len(composer.metrics) > 1
    assert type(pipelines_evo_composed) is list
    assert composer.optimiser.parameters.multi_objective
    # 'score' instead of 'roc_auc' - the old comprehension variable shadowed
    # the imported roc_auc function used earlier in this test
    assert all(score > 0.6 for score in pipelines_roc_auc)
Пример #30
0
class Model(Operation):
    """
    Class with fit/predict methods defining the evaluation strategy for the task

    :param operation_type: name of the model
    :param log: Log object to record messages
    """

    def __init__(self, operation_type: str, log: Log = None):
        super().__init__(operation_type=operation_type, log=log)
        # Repository holding metadata for every registered model
        self.operations_repo = OperationTypesRepository(
            repository_name='model_repository.json')

    @property
    def metadata(self) -> OperationMetaInfo:
        """Metadata record of this model; raises ValueError for unknown types."""
        info = self.operations_repo.operation_info_by_id(self.operation_type)
        if not info:
            raise ValueError(f'Model {self.operation_type} not found')
        return info