Example #1
def test_forecast_with_exog():
    train_source_ts, predict_source_ts, train_exog_ts, predict_exog_ts, ts_test = \
        synthetic_with_exogenous_ts()

    # Source data for lagged node
    node_lagged = PrimaryNode('lagged')
    # Set window size for lagged transformation (the concrete value is an
    # assumption; the original test defines window_size outside this snippet)
    window_size = 4
    node_lagged.custom_params = {'window_size': window_size}
    # Exogenous variable for exog node
    node_exog = PrimaryNode('exog_ts_data_source')

    node_final = SecondaryNode('linear', nodes_from=[node_lagged, node_exog])
    pipeline = Pipeline(node_final)

    pipeline.fit(input_data=MultiModalData({
        'exog_ts_data_source': train_exog_ts,
        'lagged': train_source_ts
    }))

    forecast = pipeline.predict(
        input_data=MultiModalData({
            'exog_ts_data_source': predict_exog_ts,
            'lagged': predict_source_ts
        }))
    prediction = np.ravel(np.array(forecast.predict))

    assert tuple(prediction) == tuple(ts_test)
Example #2
    def fit(self,
            features: Union[str, np.ndarray, pd.DataFrame, InputData, dict],
            target: Union[str, np.ndarray, pd.Series] = 'target',
            predefined_model: Union[str, Pipeline] = None):
        """
        Fit the graph with a predefined structure, or compose and fit a new graph

        :param features: the array with features of train data
        :param target: the array with target values of train data
        :param predefined_model: the name of the atomic model or Pipeline instance
        :return: Pipeline object
        """

        self.target_name = target
        self.train_data = _define_data(ml_task=self.problem,
                                       features=features,
                                       target=target,
                                       is_predict=False)

        is_composing_required = True
        if self.current_pipeline is not None:
            is_composing_required = False

        if predefined_model is not None:
            is_composing_required = False
            if isinstance(predefined_model, Pipeline):
                self.current_pipeline = predefined_model
            elif isinstance(predefined_model, str):
                self.current_pipeline = Pipeline(PrimaryNode(predefined_model))
            else:
                raise ValueError(
                    f'{type(predefined_model)} is not supported as Fedot model'
                )

        return self._obtain_model(is_composing_required)
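
A minimal usage sketch for the fit method above, via FEDOT's main API (the file name, problem type, and model name are assumptions, not taken from the source):

from fedot.api.main import Fedot

# Compose-and-fit path: no predefined model, so composing is triggered
model = Fedot(problem='classification')
pipeline = model.fit(features='train.csv', target='target')

# Predefined-model path: skip composing and fit a single 'rf' node
model_fixed = Fedot(problem='classification')
model_fixed.fit(features='train.csv', target='target', predefined_model='rf')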
Example #3
def create_pipeline_with_several_nested_atomized_model() -> Pipeline:
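    # Graph assembled below (edges derived from the code, parent -> child):
    #   atomized_primary   -> atomized_secondary, knn(k=9), knn(k=5)
    #   atomized_secondary -> knn(k=5)
    #   knn(k=9)           -> knn(k=5)
    #   knn(k=5)           -> atomized_root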
    pipeline = Pipeline()
    atomized_op = create_atomized_model_with_several_atomized_models()
    node_atomized_model = PrimaryNode(operation_type=atomized_op)

    node_atomized_model_secondary = SecondaryNode(
        operation_type=create_atomized_model())
    node_atomized_model_secondary.nodes_from = [node_atomized_model]

    node_knn = SecondaryNode('knn')
    node_knn.custom_params = {'n_neighbors': 9}
    node_knn.nodes_from = [node_atomized_model]

    node_knn_second = SecondaryNode('knn')
    node_knn_second.custom_params = {'n_neighbors': 5}
    node_knn_second.nodes_from = [
        node_atomized_model, node_atomized_model_secondary, node_knn
    ]

    node_atomized_model_secondary_second = \
        SecondaryNode(operation_type=create_atomized_model_with_several_atomized_models())

    node_atomized_model_secondary_second.nodes_from = [node_knn_second]

    pipeline.add_node(node_atomized_model_secondary_second)

    return pipeline
Example #4
def test_pipeline_sequential_fit_correct(data_setup):
    data = data_setup
    train, _ = train_test_data_setup(data)

    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[second])
    final = SecondaryNode(operation_type='logit', nodes_from=[third])

    pipeline = Pipeline()
    for node in [first, second, third, final]:
        pipeline.add_node(node)

    train_predicted = pipeline.fit(input_data=train, use_fitted=False)

    assert pipeline.root_node.descriptive_id == (
        '(((/n_logit_default_params;)/'
        'n_logit_default_params;)/'
        'n_logit_default_params;)/'
        'n_logit_default_params')

    assert pipeline.length == 4
    assert pipeline.depth == 4
    assert train_predicted.predict.shape[0] == train.target.shape[0]
    assert final.fitted_operation is not None
Example #5
def run_tpot_vs_fedot_example(train_file_path: str, test_file_path: str):
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    training_features = train_data.features
    testing_features = test_data.features
    training_target = train_data.target
    testing_target = test_data.target

    # Average CV score on the training set was: 0.93755
    exported_pipeline = make_pipeline(
        StackingEstimator(estimator=BernoulliNB()), RandomForestClassifier())
    # Fix random state for all the steps in exported pipeline
    set_param_recursive(exported_pipeline.steps, 'random_state', 1)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict_proba(testing_features)[:, 1]

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results)

    print(f'ROC AUC for TPOT: {roc_auc_value}')

    node_scaling = PrimaryNode('scaling')
    node_bernb = SecondaryNode('bernb', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_bernb, node_scaling])
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(f'ROC AUC for FEDOT: {roc_auc_value}')

    return roc_auc_value
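
A hypothetical entry point for the comparison above (the CSV paths are placeholders, not taken from the source):

if __name__ == '__main__':
    run_tpot_vs_fedot_example(train_file_path='path/to/train.csv',
                              test_file_path='path/to/test.csv')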
Example #6
def get_roc_auc_value(pipeline: Pipeline, train_data: InputData, test_data: InputData) -> (float, float):
    train_pred = pipeline.predict(input_data=train_data)
    test_pred = pipeline.predict(input_data=test_data)
    roc_auc_value_test = roc_auc(y_true=test_data.target, y_score=test_pred.predict)
    roc_auc_value_train = roc_auc(y_true=train_data.target, y_score=train_pred.predict)

    return roc_auc_value_train, roc_auc_value_test
Example #7
def run_pipeline_from_automl(train_file_path: str,
                             test_file_path: str,
                             max_run_time: timedelta = timedelta(minutes=10)):
    """ Function run pipeline with Auto ML models in nodes

    :param train_file_path: path to the csv file with data for train
    :param test_file_path: path to the csv file with data for validation
    :param max_run_time: maximum running time for customization of the "tpot" model

    :return roc_auc_value: ROC AUC metric for pipeline
    """
    train_data = InputData.from_csv(train_file_path)
    test_data = InputData.from_csv(test_file_path)

    testing_target = test_data.target

    node_scaling = PrimaryNode('scaling')
    node_tpot = PrimaryNode('tpot')

    node_tpot.operation.params = {'max_run_time_sec': max_run_time.seconds}

    node_lda = SecondaryNode('lda', nodes_from=[node_scaling])
    node_rf = SecondaryNode('rf', nodes_from=[node_tpot, node_lda])
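    # Point the 'model' repository at the AutoML repository file so that
    # AutoML operations such as 'tpot' can be resolved within the pipeline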
    OperationTypesRepository.assign_repo('model', 'automl_repository.json')
    pipeline = Pipeline(node_rf)

    pipeline.fit(train_data)
    results = pipeline.predict(test_data)

    roc_auc_value = roc_auc(y_true=testing_target, y_score=results.predict)
    print(roc_auc_value)

    return roc_auc_value
Example #8
def run_import_export_example(pipeline_path):
    # Prepare data to train the model
    train_data, test_data = get_scoring_data()

    # Get pipeline and fit it
    pipeline = get_three_depth_manual_class_pipeline()
    pipeline.fit_from_scratch(train_data)

    predicted_output = pipeline.predict(test_data)
    prediction_before_export = np.array(predicted_output.predict)
    print(f'Before export {prediction_before_export[:4]}')

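    # Sensitivity analysis: estimate how deleting each node, or replacing
    # its operation, affects the pipeline before exporting it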
    NodesAnalysis(
        pipeline,
        train_data,
        test_data,
        approaches=[NodeDeletionAnalyze,
                    NodeReplaceOperationAnalyze]).analyze()

    # Export it
    pipeline.save(path=pipeline_path)

    # Import pipeline
    json_path_load = create_correct_path(pipeline_path)
    new_pipeline = Pipeline()
    new_pipeline.load(json_path_load)

    predicted_output_after_export = new_pipeline.predict(test_data)
    prediction_after_export = np.array(predicted_output_after_export.predict)

    print(f'After import {prediction_after_export[:4]}')
Example #9
    def _get_metric_value(self, pipeline: Pipeline,
                          metric: MetricByTask) -> float:
        pipeline.fit(self._train_data, use_fitted=False)
        predicted = pipeline.predict(self._test_data)
        metric_value = metric.get_value(true=self._test_data,
                                        predicted=predicted)

        return metric_value
Example #10
    def restore(self, opt_graph: OptGraph):
        # TODO improve transformation
        pipeline_nodes = []
        for node in opt_graph.nodes:
            self._transform_to_pipeline_node(node)
            pipeline_nodes.append(node)
        pipeline = Pipeline(pipeline_nodes)
        pipeline.uid = opt_graph.uid
        return pipeline
Example #11
def pipeline_with_multiple_roots():
    first = PrimaryNode(operation_type='logit')
    root_first = SecondaryNode(operation_type='logit', nodes_from=[first])
    root_second = SecondaryNode(operation_type='logit', nodes_from=[first])
    pipeline = Pipeline()

    for node in [first, root_first, root_second]:
        pipeline.add_node(node)

    return pipeline
Example #12
def pipeline_with_cycle():
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[second, first])
    second.nodes_from.append(third)
    pipeline = Pipeline()
    for node in [first, second, third]:
        pipeline.add_node(node)

    return pipeline
Example #13
def pipeline_with_isolated_components():
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[])
    fourth = SecondaryNode(operation_type='logit', nodes_from=[third])

    pipeline = Pipeline()
    for node in [first, second, third, fourth]:
        pipeline.add_node(node)

    return pipeline
Example #14
def pipeline_third():
    #    QDA
    #  |     \
    # RF     RF
    pipeline = Pipeline()
    new_node = SecondaryNode('qda')
    for model_type in ('rf', 'rf'):
        new_node.nodes_from.append(PrimaryNode(model_type))
    pipeline.add_node(new_node)
    for node_from in new_node.nodes_from:
        pipeline.add_node(node_from)
    return pipeline
Example #15
def test_data_preparation_for_multi_target_correct(multi_target_data_setup):
    train, test = multi_target_data_setup
    simple_pipeline = Pipeline(PrimaryNode('linear'))
    simple_pipeline.fit(input_data=train)

    source_shape = test.target.shape
    # Get converted data
    results, new_test = QualityMetric()._simple_prediction(
        simple_pipeline, test)
    number_elements = len(new_test.target)
    assert source_shape[0] * source_shape[1] == number_elements
Example #16
def test_import_json_to_pipeline_correctly():
    json_path_load = create_correct_path('test_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    json_actual = pipeline.save('test_import_json_to_pipeline_correctly_1')

    pipeline_expected = create_pipeline()
    json_expected = pipeline_expected.save('test_import_json_to_pipeline_correctly_2')

    assert json.dumps(json_actual) == json.dumps(json_expected)
Example #17
def test_import_json_to_fitted_pipeline_correctly():
    json_path_load = create_correct_path('test_fitted_pipeline_convert_to_json')

    pipeline = Pipeline()
    pipeline.load(json_path_load)
    json_actual = pipeline.save('test_import_json_to_fitted_pipeline_correctly')

    with open(json_path_load, 'r') as json_file:
        json_expected = json.load(json_file)

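    # save() appears to return the pipeline as an indented JSON string,
    # which is why it can be compared with the raw file contents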
    assert json_actual == json.dumps(json_expected, indent=4)
Example #18
def test_pipeline_with_wrong_data():
    pipeline = Pipeline(PrimaryNode('linear'))
    data_seq = np.arange(0, 10)
    task = Task(TaskTypesEnum.ts_forecasting,
                TsForecastingParams(forecast_length=10))

    data = InputData(idx=data_seq, features=data_seq, target=data_seq,
                     data_type=DataTypesEnum.ts, task=task)

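    # A 10-point series cannot support a 10-step forecast horizon,
    # so fitting this pipeline is expected to fail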
    with pytest.raises(ValueError):
        pipeline.fit(data)
Example #19
def valid_pipeline():
    first = PrimaryNode(operation_type='logit')
    second = SecondaryNode(operation_type='logit', nodes_from=[first])
    third = SecondaryNode(operation_type='logit', nodes_from=[second])
    last = SecondaryNode(operation_type='logit', nodes_from=[third])

    pipeline = Pipeline()
    for node in [first, second, third, last]:
        pipeline.add_node(node)

    return pipeline
Example #20
def get_rmse_value(pipeline: Pipeline, train_data: InputData,
                   test_data: InputData) -> (float, float):
    train_pred = pipeline.predict(input_data=train_data)
    test_pred = pipeline.predict(input_data=test_data)
    rmse_value_test = mse(y_true=test_data.target,
                          y_pred=test_pred.predict,
                          squared=False)
    rmse_value_train = mse(y_true=train_data.target,
                           y_pred=train_pred.predict,
                           squared=False)

    return rmse_value_train, rmse_value_test
Example #21
def create_json_models_files():
    """
    Create the JSON files used by the tests before they run.
    """
    pipeline = create_pipeline()
    pipeline.save('test_pipeline_convert_to_json')

    pipeline_fitted = create_fitted_pipeline()
    pipeline_fitted.save('test_fitted_pipeline_convert_to_json')

    pipeline_empty = Pipeline()
    pipeline_empty.save('test_empty_pipeline_convert_to_json')
Example #22
def test_pipeline_repr():
    first = PrimaryNode(operation_type='logit')
    second = PrimaryNode(operation_type='lda')
    third = PrimaryNode(operation_type='knn')
    final = SecondaryNode(operation_type='xgboost',
                          nodes_from=[first, second, third])
    pipeline = Pipeline()
    pipeline.add_node(final)

    expected_pipeline_description = "{'depth': 2, 'length': 4, 'nodes': [xgboost, logit, lda, knn]}"

    assert repr(pipeline) == expected_pipeline_description
Example #23
def test_ts_forecasting_lagged_data_operation():
    train_input, predict_input, y_test = get_time_series()

    node_lagged = PrimaryNode('lagged')
    node_ridge = SecondaryNode('ridge', nodes_from=[node_lagged])
    pipeline = Pipeline(node_ridge)

    pipeline.fit_from_scratch(train_input)
    predicted_output = pipeline.predict(predict_input)
    predicted = np.ravel(predicted_output.predict)

    assert len(predicted) == len(np.ravel(y_test))
Example #24
def test_log_clustering_fit_correct(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Build a scaling pipeline, fit it and get its predictions
    scaling_pipeline = Pipeline(PrimaryNode('normalization'))
    scaling_pipeline.fit(train_data)
    scaled_data = scaling_pipeline.predict(train_data)

    kmeans = Model(operation_type='kmeans')
    _, train_predicted = kmeans.fit(data=scaled_data)

    assert all(np.unique(train_predicted.predict) == [0, 1])
Example #25
def execute_pipeline_for_text_problem(train_data, test_data):
    node_text_clean = PrimaryNode('text_clean')
    node_tfidf = SecondaryNode('tfidf', nodes_from=[node_text_clean])
    model_node = SecondaryNode('multinb', nodes_from=[node_tfidf])
    pipeline = Pipeline(model_node)
    pipeline.fit(train_data)

    predicted = pipeline.predict(test_data)

    roc_auc_metric = roc_auc(y_true=test_data.target,
                             y_score=predicted.predict)

    return roc_auc_metric
Example #26
def test_save_load_fitted_atomized_pipeline_correctly():
    pipeline = create_pipeline_with_several_nested_atomized_model()

    train_data, test_data = create_data_for_train()
    pipeline.fit(train_data)

    json_actual = pipeline.save(
        'test_save_load_fitted_atomized_pipeline_correctly')

    json_path_load = create_correct_path(
        'test_save_load_fitted_atomized_pipeline_correctly')

    pipeline_loaded = Pipeline()
    pipeline_loaded.load(json_path_load)
    json_expected = pipeline_loaded.save(
        'test_save_load_fitted_atomized_pipeline_correctly_loaded')

    assert pipeline.length == pipeline_loaded.length
    assert json_actual == json_expected

    before_save_predicted = pipeline.predict(test_data)

    pipeline_loaded.fit(train_data)
    after_save_predicted = pipeline_loaded.predict(test_data)

    bfr_tun_mse = mean_squared_error(y_true=test_data.target,
                                     y_pred=before_save_predicted.predict)
    aft_tun_mse = mean_squared_error(y_true=test_data.target,
                                     y_pred=after_save_predicted.predict)

    assert aft_tun_mse <= bfr_tun_mse
Example #27
def create_pipeline() -> Pipeline:
    pipeline = Pipeline()
    node_logit = PrimaryNode('logit')

    node_lda = PrimaryNode('lda')
    node_lda.custom_params = {'n_components': 1}

    node_xgboost = SecondaryNode('xgboost')
    node_xgboost.custom_params = {'n_components': 1}
    node_xgboost.nodes_from = [node_logit, node_lda]

    pipeline.add_node(node_xgboost)

    return pipeline
Example #28
def test_multi_modal_pipeline():
    task = Task(TaskTypesEnum.classification)
    images_size = (128, 128)

    files_path = os.path.join('test', 'data', 'multi_modal')
    path = os.path.join(str(fedot_project_root()), files_path)

    train_num, _, train_img, _, train_text, _ = \
        prepare_multi_modal_data(path, task, images_size, with_split=False)

    # image
    ds_image = PrimaryNode('data_source_img')
    image_node = SecondaryNode('cnn', nodes_from=[ds_image])
    image_node.custom_params = {'image_shape': (images_size[0], images_size[1], 1),
                                'architecture': 'simplified',
                                'num_classes': 2,
                                'epochs': 15,
                                'batch_size': 128}

    # table
    ds_table = PrimaryNode('data_source_table')
    scaling_node = SecondaryNode('scaling', nodes_from=[ds_table])
    numeric_node = SecondaryNode('rf', nodes_from=[scaling_node])

    # text
    ds_text = PrimaryNode('data_source_text')
    node_text_clean = SecondaryNode('text_clean', nodes_from=[ds_text])
    text_node = SecondaryNode('tfidf', nodes_from=[node_text_clean])

    pipeline = Pipeline(SecondaryNode('logit', nodes_from=[numeric_node, image_node, text_node]))

    fit_data = MultiModalData({
        'data_source_img': train_img,
        'data_source_table': train_num,
        'data_source_text': train_text
    })

    pipeline.fit(fit_data)
    prediction = pipeline.predict(fit_data)

    assert prediction is not None
Example #29
def sample_pipeline():
    return Pipeline(
        SecondaryNode(operation_type='logit',
                      nodes_from=[
                          PrimaryNode(operation_type='xgboost'),
                          PrimaryNode(operation_type='scaling')
                      ]))
Example #30
def return_working_pipeline():
    node_lagged_1 = PrimaryNode('lagged/1')
    node_exog = PrimaryNode('exog_ts_data_source')

    node_final = SecondaryNode('ridge', nodes_from=[node_lagged_1, node_exog])
    pipeline = Pipeline(node_final)
    return pipeline