示例#1
0
def test_fine_tune_primary_nodes(data_fixture, request):
    """Tuning the primary nodes must not worsen test MSE in at least one of three runs."""
    # TODO: still stochastic
    attempts = 3
    not_worse_flags = []
    for _ in range(attempts):
        dataset = request.getfixturevalue(data_fixture)
        train_part, test_part = train_test_data_setup(data=dataset)

        regr_chain = get_regr_chain()

        # Baseline: fit without cache and predict before any tuning.
        regr_chain.fit(train_part, use_cache=False)
        baseline_output = regr_chain.predict(test_part)

        # Tune the primary nodes, then refit and predict again.
        regr_chain.fine_tune_primary_nodes(train_part,
                                           max_lead_time=timedelta(minutes=1),
                                           iterations=10)
        regr_chain.fit(train_part)
        tuned_output = regr_chain.predict(test_part)

        mse_before = mse(y_true=test_part.target,
                         y_pred=baseline_output.predict)
        mse_after = mse(y_true=test_part.target,
                        y_pred=tuned_output.predict)

        print(f'Before tune test {mse_before}')
        print(f'After tune test {mse_after}', '\n')
        not_worse_flags.append(mse_after <= mse_before)

    assert any(not_worse_flags)
示例#2
0
def test_fine_tune_all_nodes(data_fixture, request):
    """Tuning all nodes must not increase the test MSE (rounded to 2 digits)."""
    dataset = request.getfixturevalue(data_fixture)
    train_part, test_part = train_test_data_setup(data=dataset)

    class_chain = get_class_chain()

    # Baseline prediction from the untuned chain, fitted without cache.
    class_chain.fit(train_part, use_cache=False)
    baseline_output = class_chain.predict(test_part)

    # Tune every node and predict again; no explicit refit is done here.
    class_chain.fine_tune_all_nodes(train_part,
                                    max_lead_time=timedelta(minutes=1),
                                    iterations=30)
    tuned_output = class_chain.predict(test_part)

    # The compared metric is MSE, rounded so that tiny differences are ignored.
    mse_before = round(
        mse(y_true=test_part.target, y_pred=baseline_output.predict), 2)
    mse_after = round(
        mse(y_true=test_part.target, y_pred=tuned_output.predict), 2)

    print(f'Before tune test {mse_before}')
    print(f'After tune test {mse_after}', '\n')

    assert mse_after <= mse_before
示例#3
0
def test_chain_hierarchy_fit_correct(data_setup):
    """A diamond-shaped chain of logit nodes must report the expected id, size and output shape."""
    train_part, _ = train_test_data_setup(data_setup)
    logit = ModelTypesIdsEnum.logit

    # One primary node feeds two secondary nodes, which both feed the final node.
    base_node = NodeGenerator.primary_node(model_type=logit)
    left_node = NodeGenerator.secondary_node(model_type=logit,
                                             nodes_from=[base_node])
    right_node = NodeGenerator.secondary_node(model_type=logit,
                                              nodes_from=[base_node])
    top_node = NodeGenerator.secondary_node(model_type=logit,
                                            nodes_from=[left_node, right_node])

    hierarchy = Chain()
    for node in (base_node, left_node, right_node, top_node):
        hierarchy.add_node(node)

    fit_output = hierarchy.fit(input_data=train_part, use_cache=False)

    expected_id = ('((/n_ModelTypesIdsEnum.logit_defaultparams;)/'
                   'n_ModelTypesIdsEnum.logit_defaultparams;;(/'
                   'n_ModelTypesIdsEnum.logit_defaultparams;)/'
                   'n_ModelTypesIdsEnum.logit_defaultparams;)/'
                   'n_ModelTypesIdsEnum.logit_defaultparams')
    assert hierarchy.root_node.descriptive_id == expected_id

    assert hierarchy.length == 4
    assert hierarchy.depth == 3
    assert fit_output.predict.shape == train_part.target.shape
示例#4
0
文件: tuners.py 项目: gnodvi/FEDOT
    def tune(self, fit: Callable, predict: Callable, tune_data: InputData,
             params_range: dict, default_params: dict,
             iterations: int) -> dict:
        """Select model parameters by random search.

        ``tune_data`` is split in two with ``train_test_data_setup(tune_data, 0.5)``.
        The default parameters and every random candidate are fitted on the
        first part and scored on the second one with
        ``_regression_prediction_quality``; a lower score is treated as better.

        :param fit: callable ``fit(data, params)`` that returns a trained model
        :param predict: callable ``predict(trained_model, data)`` that returns a prediction
        :param tune_data: data used for the search
        :param params_range: passed to ``get_random_params`` to sample candidates
        :param default_params: baseline parameters, returned if no candidate scores lower
        :param iterations: number of random candidates to evaluate
        :return: the parameters with the lowest score seen during the search
        """
        tune_train_data, tune_test_data = train_test_data_setup(tune_data, 0.5)

        # The baseline is fitted on the train part, exactly like the candidates,
        # so that its score on the held-out part is comparable with theirs.
        trained_model_default = fit(tune_train_data, default_params)
        prediction_default = predict(trained_model_default, tune_test_data)
        best_quality_metric = _regression_prediction_quality(
            prediction=prediction_default, real=tune_test_data.target)
        best_params = default_params

        for _ in range(iterations):
            random_params = get_random_params(params_range)
            try:
                trained_model_candidate = fit(tune_train_data, random_params)
                prediction_candidate = predict(trained_model_candidate,
                                               tune_test_data)
                quality_metric = _regression_prediction_quality(
                    prediction=prediction_candidate,
                    real=tune_test_data.target)
                if quality_metric < best_quality_metric:
                    # Remember the score too, so later candidates must beat
                    # the best result so far and not only the baseline.
                    best_quality_metric = quality_metric
                    best_params = random_params
            except ValueError:
                # Best-effort search: a candidate that raises ValueError
                # (presumably an invalid parameter combination) is skipped.
                continue
        return best_params
示例#5
0
def test_pca_model_removes_redunant_features_correct():
    """The 'pca_data_model' output must have fewer columns than the input features."""
    informative_count = 5
    dataset = classification_dataset_with_redunant_features(
        n_samples=1000, n_features=100, n_informative=informative_count)
    train_part, _ = train_test_data_setup(data=dataset)

    pca_model = Model(model_type='pca_data_model')
    _, transformed = pca_model.fit(data=train_part)

    output_columns = transformed.shape[1]
    input_columns = dataset.features.shape[1]
    assert output_columns < input_columns
def test_log_clustering_fit_correct(data_fixture, request):
    """K-means fitted on scaled features must produce exactly the labels 0 and 1."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, _ = train_test_data_setup(data=data)

    kmeans = Model(model_type=ModelTypesIdsEnum.kmeans)

    _, train_predicted = kmeans.fit(data=train_data)

    # array_equal compares shape as well as values, so a different number of
    # distinct labels fails the assertion cleanly instead of raising from the
    # element-wise ``==`` / ``all`` combination.
    assert np.array_equal(np.unique(train_predicted), [0, 1])
def test_regression_chain_fit_correct():
    """Test RMSE of the composed chain must be below 5% of the target's std."""
    dataset = get_synthetic_regression_data()

    regression_chain = compose_chain(data=dataset)
    train_part, test_part = train_test_data_setup(dataset)

    regression_chain.fit(input_data=train_part)
    _, test_rmse = get_rmse_value(regression_chain, train_part, test_part)

    max_allowed_rmse = np.std(dataset.target) * 0.05
    assert test_rmse < max_allowed_rmse
def test_log_regression_fit_correct(classification_dataset):
    """The logit model must reach a train ROC AUC of at least 0.95 on scaled features."""
    min_roc = 0.95
    dataset = classification_dataset

    # Scale the features on the fixture object itself before splitting.
    scaler = Scaling().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)
    train_part, _ = train_test_data_setup(data=dataset)

    logit_model = Model(model_type=ModelTypesIdsEnum.logit)
    _, fit_output = logit_model.fit(data=train_part)

    train_roc = roc_auc(y_true=train_part.target, y_score=fit_output)
    assert train_roc >= min_roc
def test_qda_fit_correct(data_fixture, request):
    """The qda model must reach a train ROC AUC of at least 0.95 on scaled features."""
    min_roc = 0.95
    dataset = request.getfixturevalue(data_fixture)

    # Scale the features on the fixture object itself before splitting.
    scaler = Scaling().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)
    train_part, _ = train_test_data_setup(data=dataset)

    qda_model = Model(model_type=ModelTypesIdsEnum.qda)
    _, fit_output = qda_model.fit(data=train_part)

    train_roc = roc_auc(y_true=train_part.target, y_score=fit_output)
    assert train_roc >= min_roc
示例#10
0
def test_log_regression_fit_correct(classification_dataset):
    """The 'logit' model must reach a train ROC AUC of at least 0.95 on scaled features."""
    min_roc = 0.95
    dataset = classification_dataset

    # Scale the features on the fixture object itself before splitting.
    scaler = Scaling().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)
    train_part, _ = train_test_data_setup(data=dataset)

    logit_model = Model(model_type='logit')
    _, fit_output = logit_model.fit(data=train_part)

    train_roc = get_roc_auc(train_part, fit_output)
    assert train_roc >= min_roc
示例#11
0
def test_secondary_nodes_is_invariant_to_inputs_order(data_setup):
    """Fit and predict results must not depend on the order of a secondary node's parents."""
    train_part, test_part = train_test_data_setup(data_setup)

    logit_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.logit)
    lda_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.lda)
    knn_node = NodeGenerator.primary_node(model_type=ModelTypesIdsEnum.knn)
    root_node = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[logit_node, lda_node, knn_node])

    reference_chain = Chain()
    for node in (logit_node, lda_node, knn_node, root_node):
        reference_chain.add_node(node)

    # Copies of the primary nodes, attached to a new root in another parent order.
    logit_copy = deepcopy(logit_node)
    lda_copy = deepcopy(lda_node)
    knn_copy = deepcopy(knn_node)
    shuffled_root = NodeGenerator.secondary_node(
        model_type=ModelTypesIdsEnum.xgboost,
        nodes_from=[knn_copy, logit_copy, lda_copy])

    shuffled_chain = Chain()
    # The nodes are also added to the chain in a different order, root first.
    for node in (shuffled_root, knn_copy, logit_copy, lda_copy):
        shuffled_chain.add_node(node)

    reference_fit = reference_chain.fit(input_data=train_part)
    shuffled_fit = shuffled_chain.fit(input_data=train_part)

    # Fit results and the structural id should be the same for both chains.
    assert reference_chain.root_node.descriptive_id == shuffled_chain.root_node.descriptive_id
    assert all(np.equal(reference_fit.predict, shuffled_fit.predict))

    reference_prediction = reference_chain.predict(input_data=test_part)
    shuffled_prediction = shuffled_chain.predict(input_data=test_part)

    # Predictions on the test part should be the same as well.
    assert all(
        np.equal(reference_prediction.predict, shuffled_prediction.predict))

    # Re-order the parents of the already fitted root, drop its cache and refit.
    parents = reference_chain.nodes[3].nodes_from
    reference_chain.nodes[3].nodes_from = [parents[2], parents[0], parents[1]]
    reference_chain.nodes[3].cache.clear()
    reference_chain.fit(train_part)
    reordered_prediction = reference_chain.predict(input_data=test_part)

    # The prediction must still match the one made before re-ordering.
    assert all(
        np.equal(reference_prediction.predict, reordered_prediction.predict))
示例#12
0
    def compose_chain(
            self,
            data: InputData,
            initial_chain: Optional[NASChain],
            composer_requirements: Optional[GPNNComposerRequirements],
            metrics: Optional[Callable],
            optimiser_parameters: GPChainOptimiserParameters = None,
            is_visualise: bool = False) -> NASChain:
        """Run the GP optimiser over NAS chains and return the best chain found.

        :param data: input data, split with ``train_test_data_setup(data, 0.8)``
        :param initial_chain: forwarded to ``GPChainOptimiser`` as ``initial_chain``
        :param composer_requirements: source of image size, channel count,
            class count, batch size, epoch count and population size
        :param metrics: first argument bound into ``self.metric_for_nodes``
        :param optimiser_parameters: optimiser settings; defaults are built when falsy
        :param is_visualise: when true, the history is passed to ``ComposerVisualiser``
        :return: the best chain returned by the optimiser
        """
        train_part, test_part = train_test_data_setup(data, 0.8)

        # Input shape is the image dimensions followed by the channel count.
        self.input_shape = (*composer_requirements.image_size,
                            composer_requirements.channels_num)

        if optimiser_parameters:
            self.optimiser_parameters = optimiser_parameters
        else:
            self.optimiser_parameters = GPChainOptimiserParameters(
                chain_generation_function=random_cnn_chain,
                crossover_types=[CrossoverTypesEnum.subtree],
                crossover_types_dict=crossover_by_type,
                mutation_types=[MutationTypesEnum.simple],
                mutation_types_dict=mutation_by_type,
                selection_types=[SelectionTypesEnum.tournament])

        # Bind everything except the chain itself into the fitness function.
        chain_metric = partial(
            self.metric_for_nodes, metrics, train_part, test_part,
            self.input_shape, composer_requirements.num_of_classes,
            composer_requirements.batch_size,
            composer_requirements.train_epochs_num)

        gp_optimiser = GPChainOptimiser(
            initial_chain=initial_chain,
            requirements=composer_requirements,
            primary_node_func=NNNodeGenerator.primary_node,
            secondary_node_func=NNNodeGenerator.secondary_node,
            chain_class=NASChain,
            parameters=self.optimiser_parameters)

        best_chain, self.history = gp_optimiser.optimise(chain_metric)

        fitness_log = [candidate.fitness for candidate in self.history]

        if is_visualise:
            ComposerVisualiser.visualise_history(self.history, fitness_log)

        write_composer_history_to_csv(historical_fitness=fitness_log,
                                      historical_chains=self.history,
                                      pop_size=composer_requirements.pop_size)

        print('GP composition finished')
        return best_chain
示例#13
0
def test_composite_lstm_chain_fit_correct():
    """Test RMSE of the decomposed chain must stay below the std of the target."""
    dataset = get_synthetic_ts_data()
    decomposed_chain = get_decomposed_chain()
    train_part, test_part = train_test_data_setup(dataset)

    decomposed_chain.fit(input_data=train_part)
    _, test_rmse = get_rmse_value(decomposed_chain, train_part, test_part)

    max_allowed_rmse = np.std(dataset.target)
    assert test_rmse < max_allowed_rmse
示例#14
0
def test_lda_fit_correct(data_fixture, request):
    """The 'lda' model must reach a train ROC AUC of at least 0.95 on scaled features."""
    min_roc = 0.95
    dataset = request.getfixturevalue(data_fixture)

    # Scale the features on the fixture object itself before splitting.
    scaler = Scaling().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)
    train_part, _ = train_test_data_setup(data=dataset)

    lda_model = Model(model_type='lda')
    _, fit_output = lda_model.fit(data=train_part)

    train_roc = get_roc_auc(train_part, fit_output)
    assert train_roc >= min_roc
示例#15
0
    def compose_chain(self,
                      data: InputData,
                      initial_chain: Optional[Chain],
                      composer_requirements: Optional[GPComposerRequirements],
                      metrics: Optional[Callable],
                      optimiser_parameters: GPChainOptimiserParameters = None,
                      is_visualise: bool = False) -> Chain:
        """Run the GP optimiser over chains and return the best chain found.

        :param data: input data, split with ``train_test_data_setup(data, 0.8)``
        :param initial_chain: forwarded to ``GPChainOptimiser`` as ``initial_chain``
        :param composer_requirements: optimiser requirements; ``pop_size`` is
            also used when the history is written out
        :param metrics: first argument bound into ``self.metric_for_nodes``
        :param optimiser_parameters: optimiser settings; defaults are built when falsy
        :param is_visualise: when true, the history is passed to ``ComposerVisualiser``
        :return: the best chain returned by the optimiser
        """
        train_part, test_part = train_test_data_setup(data, 0.8)
        self.shared_cache.clear()

        if optimiser_parameters:
            self.optimiser_parameters = optimiser_parameters
        else:
            default_crossovers = [
                CrossoverTypesEnum.subtree, CrossoverTypesEnum.onepoint
            ]
            default_mutations = [
                MutationTypesEnum.simple, MutationTypesEnum.local_growth,
                MutationTypesEnum.reduce
            ]
            self.optimiser_parameters = GPChainOptimiserParameters(
                chain_generation_function=random_ml_chain,
                crossover_types=default_crossovers,
                crossover_types_dict=crossover_by_type,
                mutation_types=default_mutations,
                mutation_types_dict=mutation_by_type,
                selection_types=[SelectionTypesEnum.tournament])

        # Bind everything except the chain itself into the fitness function.
        chain_metric = partial(self.metric_for_nodes, metrics, train_part,
                               test_part, True)

        gp_optimiser = GPChainOptimiser(
            initial_chain=initial_chain,
            requirements=composer_requirements,
            primary_node_func=NodeGenerator.primary_node,
            secondary_node_func=NodeGenerator.secondary_node,
            chain_class=Chain,
            parameters=self.optimiser_parameters)

        best_chain, self.history = gp_optimiser.optimise(chain_metric)
        fitness_log = [candidate.fitness for candidate in self.history]

        if is_visualise:
            ComposerVisualiser.visualise_history(self.history, fitness_log)

        write_composer_history_to_csv(historical_fitness=fitness_log,
                                      historical_chains=self.history,
                                      pop_size=composer_requirements.pop_size)

        print('GP composition finished')
        return best_chain
示例#16
0
文件: test_node.py 项目: gnodvi/FEDOT
def test_eval_strategy_logreg(data_setup):
    """A 'logit' PrimaryNode must return as many predictions as a directly fitted LogisticRegression."""
    train_part, test_part = train_test_data_setup(data=data_setup)

    # Reference: LogisticRegression fitted and applied directly.
    reference_model = LogisticRegression(C=10.,
                                         random_state=1,
                                         solver='liblinear',
                                         max_iter=10000,
                                         verbose=0)
    reference_model.fit(train_part.features, train_part.target)
    reference_labels = reference_model.predict(test_part.features)

    # Same task solved through the node interface.
    logit_node = PrimaryNode(model_type='logit')
    logit_node.fit(input_data=train_part)
    node_output = logit_node.predict(input_data=test_part)

    assert len(node_output.predict) == len(reference_labels)
示例#17
0
def test_multiclassification_chain_fit_correct():
    """Test ROC AUC ('ovo', macro average) of the composed chain must exceed 0.95."""
    iris = get_iris_data()
    multiclass_chain = compose_chain()
    train_part, test_part = train_test_data_setup(iris, shuffle_flag=True)

    multiclass_chain.fit(input_data=train_part)
    test_output = multiclass_chain.predict(input_data=test_part)

    averaging_options = {'multi_class': 'ovo', 'average': 'macro'}
    test_roc = roc_auc(y_true=test_part.target,
                       y_score=test_output.predict,
                       **averaging_options)

    assert test_roc > 0.95
def test_model_fit_and_predict_correctly():
    """Train and test AUC thresholds on the synthetic dataset must both reach CORRECT_MODEL_AUC_THR."""
    dataset = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)

    composed = compose_chain(data=dataset)
    train_part, test_part = train_test_data_setup(dataset)

    composed.fit(input_data=train_part)
    train_roc, test_roc = get_roc_auc_value(composed, train_part, test_part)

    train_thr = get_auc_threshold(train_roc)
    test_thr = get_auc_threshold(test_roc)

    assert train_thr >= CORRECT_MODEL_AUC_THR
    assert test_thr >= CORRECT_MODEL_AUC_THR
示例#19
0
def test_regression_composite_fit_correct():
    """Test RMSE of the linear/linear decomposed chain must be below 1.5 std of the target."""
    dataset = get_synthetic_ts_data()
    decomposed_chain = get_decomposed_chain(model_trend='linear',
                                            model_residual='linear')
    train_part, test_part = train_test_data_setup(dataset)

    decomposed_chain.fit(input_data=train_part)
    _, test_rmse = get_rmse_value(decomposed_chain, train_part, test_part)

    print(test_rmse)

    max_allowed_rmse = np.std(dataset.target) * 1.5
    assert test_rmse < max_allowed_rmse
示例#20
0
def test_regression_chain_fit_correct():
    """Test RMSE of a single-node 'rfr' chain must be below 1.5 std of the target."""
    dataset = get_synthetic_ts_data()

    single_node_chain = Chain()
    rfr_node = PrimaryNode('rfr')
    single_node_chain.add_node(rfr_node)

    train_part, test_part = train_test_data_setup(dataset)

    single_node_chain.fit(input_data=train_part)
    _, test_rmse = get_rmse_value(single_node_chain, train_part, test_part)

    max_allowed_rmse = np.std(dataset.target) * 1.5
    assert test_rmse < max_allowed_rmse
示例#21
0
def test_regression_chain_with_datamodel_fit_correct():
    """A lasso node fed by ridge and 'direct_data_model' must predict in the shape of the test target."""
    dataset = get_synthetic_regression_data()
    train_part, test_part = train_test_data_setup(dataset)

    passthrough_node = PrimaryNode('direct_data_model')
    ridge_node = PrimaryNode('ridge')
    lasso_node = SecondaryNode('lasso')
    # Parents are attached after construction, model node first.
    lasso_node.nodes_from = [ridge_node, passthrough_node]

    regression_chain = Chain(lasso_node)

    regression_chain.fit(train_part)
    test_output = regression_chain.predict(test_part)

    assert test_output.predict.shape == test_part.target.shape
def test_chain_with_clusters_fit_correct():
    """Mean test ROC AUC over repeated runs must exceed 0.6.

    The mean over several independent runs is analysed because clustering
    is stochastic.
    """
    runs_count = 15
    roc_threshold = 0.6

    roc_on_test_per_run = []
    for _ in range(runs_count):
        data = get_synthetic_input_data(n_samples=10000)

        chain = compose_chain(data=data)
        train_data, test_data = train_test_data_setup(data)

        chain.fit(input_data=train_data)
        _, roc_on_test = get_roc_auc_value(chain, train_data, test_data)
        roc_on_test_per_run.append(roc_on_test)

    # Arithmetic mean with equal weight per run. Averaging the running value
    # with each new score pairwise would start from 0 and let the last run
    # alone contribute half of the result.
    mean_roc_on_test = np.mean(roc_on_test_per_run)
    assert mean_roc_on_test > roc_threshold
示例#23
0
def test_classification_manual_tuning_correct(data_fixture, request):
    """Setting n_neighbors=1 by hand must change the knn predictions on the test part."""
    dataset = request.getfixturevalue(data_fixture)
    scaler = Scaling().fit(dataset.features)
    dataset.features = scaler.apply(dataset.features)
    train_part, test_part = train_test_data_setup(data=dataset)

    # knn with its default parameters.
    default_knn = Model(model_type='knn')
    default_fitted, _ = default_knn.fit(data=train_part)
    default_prediction = default_knn.predict(fitted_model=default_fitted,
                                             data=test_part)

    # knn with the parameters overridden before fitting.
    manual_knn = Model(model_type='knn')
    manual_knn.params = {'n_neighbors': 1}
    manual_fitted, _ = manual_knn.fit(data=train_part)
    manual_prediction = manual_knn.predict(fitted_model=manual_fitted,
                                           data=test_part)

    assert not np.array_equal(default_prediction, manual_prediction)
示例#24
0
def synthetic_benchmark_composing_example():
    """Generate a synthetic benchmark dataset, fit a two-level chain on it and print ROC scores."""
    n_samples = 5000
    n_features = 10

    generator_chain = separately_fit_chain(samples=n_samples,
                                           features_amount=n_features,
                                           classes=2)
    dataset = synthetic_benchmark_dataset(samples_amount=n_samples,
                                          features_amount=n_features,
                                          fitted_chain=generator_chain)

    # Show the first ten rows of the generated data.
    print(f'Synthetic features: {dataset.features[:10]}')
    print(f'Synthetic target: {dataset.target[:10]}')

    train_part, test_part = train_test_data_setup(dataset)
    baseline_chain = two_level_chain()
    baseline_chain.fit(input_data=train_part, use_cache=False)

    print(f'ROC score on train: {roc_value(baseline_chain, train_part)}')
    print(f'ROC score on test {roc_value(baseline_chain, test_part)}')
示例#25
0
def test_nodes_sequence_fit_correct(data_fixture, request):
    """Fitting the final node of a diamond of nodes must yield the expected id, output length and cache."""
    dataset = request.getfixturevalue(data_fixture)
    train_part, _ = train_test_data_setup(dataset)

    # One primary node feeds two secondary nodes, which both feed the final node.
    logit_node = PrimaryNode(model_type='logit')
    lda_node = SecondaryNode(model_type='lda', nodes_from=[logit_node])
    qda_node = SecondaryNode(model_type='qda', nodes_from=[logit_node])
    knn_node = SecondaryNode(model_type='knn', nodes_from=[lda_node, qda_node])

    fit_output = knn_node.fit(input_data=train_part)

    expected_id = ('((/n_logit_default_params;)/'
                   'n_lda_default_params;;(/'
                   'n_logit_default_params;)/'
                   'n_qda_default_params;)/'
                   'n_knn_default_params')
    assert knn_node.descriptive_id == expected_id

    assert fit_output.predict.shape[0] == train_part.target.shape[0]
    assert knn_node.cache.actual_cached_state is not None
示例#26
0
def test_chain_with_datamodel_fit_correct(data_setup):
    """An 'rf' node fed by 'bernb' and 'direct_data_model' must give labels in the shape of the test target."""
    train_part, test_part = train_test_data_setup(data_setup)

    mixed_chain = Chain()
    passthrough_node = PrimaryNode('direct_data_model')
    bernb_node = PrimaryNode('bernb')
    rf_node = SecondaryNode('rf')
    # Parents are attached after construction, model node first.
    rf_node.nodes_from = [bernb_node, passthrough_node]

    for node in (passthrough_node, bernb_node, rf_node):
        mixed_chain.add_node(node)

    mixed_chain.fit(train_part)
    test_output = mixed_chain.predict(test_part)
    labels = np.asarray(probs_to_labels(test_output.predict))

    assert labels.shape == test_part.target.shape
示例#27
0
def test_arima_tune_correct():
    """The fine-tuned 'arima' model must have a test error below the std of the test target."""
    dataset = get_synthetic_ts_data()
    train_part, test_part = train_test_data_setup(data=dataset)

    arima_model = Model(model_type='arima')
    tuned_fitted, _ = arima_model.fine_tune(
        data=train_part,
        iterations=5,
        max_lead_time=timedelta(minutes=0.1))

    tuned_prediction = arima_model.predict(fitted_model=tuned_fitted,
                                           data=test_part)

    # squared=False is forwarded to the metric unchanged.
    tuned_rmse = mse(y_true=test_part.target,
                     y_pred=tuned_prediction,
                     squared=False)

    max_allowed_rmse = np.std(test_part.target)
    assert tuned_rmse < max_allowed_rmse
def test_model_predictions_on_train_test_random():
    """Check that a chain cannot learn a target replaced by ``get_random_target_data``.

    Both the train and the test AUC thresholds must not exceed
    CORRECT_MODEL_AUC_THR.
    """
    dataset = get_synthetic_input_data(N_SAMPLES, N_FEATURES, random_state=1)
    dataset = get_random_target_data(dataset)

    train_part, test_part = train_test_data_setup(dataset)

    # The chain is composed on the train part only.
    composed = compose_chain(data=train_part)
    composed.fit(input_data=train_part)
    train_roc, test_roc = get_roc_auc_value(composed, train_part, test_part)

    train_thr = get_auc_threshold(train_roc)
    test_thr = get_auc_threshold(test_roc)
    print(train_roc)
    print(test_roc)

    assert test_thr <= CORRECT_MODEL_AUC_THR
    assert train_thr <= CORRECT_MODEL_AUC_THR
示例#29
0
    def compose_chain(self,
                      data: InputData,
                      initial_chain: Optional[Chain],
                      composer_requirements: Optional[GPComposerRequirements],
                      metrics: Optional[Callable],
                      optimiser_parameters: GPChainOptimiserParameters = None,
                      is_visualise: bool = False,
                      is_tune: bool = False) -> Chain:
        """Run the GP optimiser over chains and return the best chain found.

        :param data: input data, split with ``train_test_data_setup(data, 0.8)``;
            the unsplit data is what ``self.tune_chain`` receives
        :param initial_chain: forwarded to ``GPChainOptimiser`` as ``initial_chain``
        :param composer_requirements: optimiser requirements; ``pop_size`` and
            ``max_lead_time`` are also read here
        :param metrics: first argument bound into ``self.metric_for_nodes``
        :param optimiser_parameters: forwarded to ``GPChainOptimiser`` unchanged
        :param is_visualise: when true, the history is passed to ``ComposerVisualiser``
        :param is_tune: when true, ``self.tune_chain`` is called on the best chain
        :return: the best chain returned by the optimiser
        """
        train_part, test_part = train_test_data_setup(data, 0.8)
        self.shared_cache.clear()

        # Bind everything except the chain itself into the fitness function.
        chain_metric = partial(self.metric_for_nodes, metrics, train_part,
                               test_part, True)

        gp_optimiser = GPChainOptimiser(initial_chain=initial_chain,
                                        requirements=composer_requirements,
                                        primary_node_func=PrimaryNode,
                                        secondary_node_func=SecondaryNode,
                                        chain_class=Chain,
                                        parameters=optimiser_parameters)

        best_chain, self.history = gp_optimiser.optimise(chain_metric)
        fitness_log = [candidate.fitness for candidate in self.history]

        if is_visualise:
            ComposerVisualiser.visualise_history(self.history, fitness_log)

        write_composer_history_to_csv(historical_fitness=fitness_log,
                                      historical_chains=self.history,
                                      pop_size=composer_requirements.pop_size)

        print('GP composition finished')

        if is_tune:
            # The return value of tune_chain is not used; best_chain is returned.
            self.tune_chain(best_chain, data,
                            composer_requirements.max_lead_time)
        return best_chain
示例#30
0
def test_knn_classification_tune_correct(data_fixture, request):
    """Fine-tuned knn must beat default knn on test ROC AUC, and both must exceed 0.6."""
    data = request.getfixturevalue(data_fixture)
    data.features = Scaling().fit(data.features).apply(data.features)
    train_data, test_data = train_test_data_setup(data=data)

    # Baseline: knn with its default parameters.
    knn = Model(model_type='knn')
    model, _ = knn.fit(data=train_data)
    test_predicted = knn.predict(fitted_model=model, data=test_data)

    roc_on_test = roc_auc(y_true=test_data.target, y_score=test_predicted)

    knn_for_tune = Model(model_type='knn')
    model, _ = knn_for_tune.fine_tune(data=train_data,
                                      iterations=10,
                                      max_lead_time=timedelta(minutes=1))

    # Predict through the instance that was tuned, so the tuned model is
    # evaluated by the object that produced it.
    test_predicted_tuned = knn_for_tune.predict(fitted_model=model,
                                                data=test_data)

    roc_on_test_tuned = roc_auc(y_true=test_data.target,
                                y_score=test_predicted_tuned)
    roc_threshold = 0.6
    assert roc_on_test_tuned > roc_on_test > roc_threshold