Пример #1
0
 def test_validate_hdl2(self):
     """Verify HDL validation rejects a nonexistent imputer package.

     The 'highR_nan->nan' workflow step is configured with the bogus
     package name 'operate.pop'; ``fit`` must raise an exception whose
     message names that package.
     """
     train_df, test_df = datasets.load("titanic", return_train_test=True)
     trained_pipeline = AutoFlowClassifier(
         initial_runs=1,
         run_limit=3,
         n_jobs=1,
         included_highR_nan_imputers=["operate.pop"],  # deliberately invalid package
         debug=True,
         n_jobs_in_algorithm=5,
         resource_manager=self.mock_resource_manager)
     column_descriptions = {
         "id": "PassengerId",
         "target": "Survived",
         "text": "Name"
     }
     try:
         trained_pipeline.fit(X_train=train_df,
                              X_test=test_df,
                              column_descriptions=column_descriptions,
                              splitter=KFold(n_splits=3,
                                             shuffle=True,
                                             random_state=42),
                              fit_ensemble_params=True,
                              is_not_realy_run=True)
     except Exception as e:
         self.assertEqual(
             str(e),
             "In step 'highR_nan->nan', user defined packege : 'operate.pop' does not exist!"
         )
     else:
         # BUG FIX: the original used ``assert Exception("...")``, which is
         # always truthy, so the test silently passed when fit() raised
         # nothing at all.  Fail explicitly when no exception occurs.
         self.fail("didn't detect wrong HDL.")
Пример #2
0
 def test_splitter(self):
     """Check that the fold indices recorded during fit match StratifiedKFold.

     Fits a tiny search on iris with artificial group labels, then replays
     a StratifiedKFold(5) split and asserts the recorded validation index
     arrays are identical, fold by fold.
     """
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=42)
     # Two artificial groups: second half 0s, first half 1s.
     group_labels = np.zeros([X_train.shape[0]])
     half = len(group_labels) // 2
     group_labels[:half] = 1
     classifier = AutoFlowClassifier(
         DAG_workflow={
             "num->target": ["linearsvc", "svc", "logistic_regression"]
         },
         initial_runs=1,
         run_limit=1,
         debug=True,
         resource_manager=self.mock_resource_manager)
     classifier.fit(X_train, y_train, groups=group_labels)
     test_score = classifier.score(X_test, y_test)
     print(test_score)
     recorded_indexes = classifier.estimator.y_true_indexes_list[0]
     reference_splitter = StratifiedKFold(n_splits=5,
                                          shuffle=True,
                                          random_state=42)
     reference_folds = reference_splitter.split(X_train, y_train, group_labels)
     for recorded, (_, valid_index) in zip(recorded_indexes, reference_folds):
         assert np.all(recorded == valid_index)
     self.assertGreater(test_score, 0.5)
Пример #3
0
    def test_classifier(self):
        """Register a custom MLP model in the HDL bank and run a short search.

        Verifies that a user-registered model ("mlp") can be searched, that
        the fitted pipeline survives a joblib round-trip, and that the
        reloaded pipeline can score held-out data.
        """
        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        hdl_bank = get_default_hdl_bank()
        # Describe the MLP hyper-parameter search space in HDL.
        hdl_bank["classification"]["mlp"] = {
            "hidden_layer_sizes": {
                "_type": "int_quniform",
                "_value": [10, 1000, 10]
            },
            "activation": {
                "_type": "choice",
                "_value": ["relu", "tanh", "logistic"]
            },
        }

        trained_pipeline = AutoFlowClassifier(
            initial_runs=1,
            run_limit=2,
            n_jobs=1,
            included_classifiers=["mlp"],
            hdl_bank=hdl_bank,
            model_registry={"mlp": MLPClassifier},
            debug=True,
            random_state=55,
            resource_manager=self.mock_resource_manager)
        # BUG FIX: previously fitted on the full (X, y), so the held-out
        # X_test leaked into the training data and the final score was
        # optimistic.  Train only on the train split.
        trained_pipeline.fit(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            splitter=KFold(n_splits=3, shuffle=True, random_state=42),
        )
        joblib.dump(trained_pipeline, "autoflow_classification.bz2")
        # --- reload and make sure the persisted pipeline still works ---
        predict_pipeline = joblib.load("autoflow_classification.bz2")
        score = predict_pipeline.score(X_test, y_test)
        print(score)
Пример #4
0
import pandas as pd
from sklearn.model_selection import KFold

from autoflow import AutoFlowClassifier

# Load the example train/test CSV files (paths relative to cwd).
train_df = pd.read_csv("./data/train_classification.csv")
test_df = pd.read_csv("./data/test_classification.csv")
# Distributed configuration: trial records go to a PostgreSQL database and
# artifacts/intermediate results are stored on HDFS.  Credentials are
# masked ("******") in this example -- fill in real values before running.
trained_pipeline = AutoFlowClassifier(initial_runs=5,
                                      run_limit=10,
                                      n_jobs=3,
                                      included_classifiers=["lightgbm"],
                                      db_type="postgresql",
                                      db_params={
                                          "user": "******",
                                          "host": "0.0.0.0",
                                          "port": 5432
                                      },
                                      store_path="/autoflow",
                                      file_system="hdfs",
                                      should_store_intermediate_result=True,
                                      file_system_params={
                                          "url": "http://0.0.0.0:50070",
                                          "user": "******"
                                      })
# Column roles: "id" = row identifier, "target" = label to predict,
# "ignore" = column excluded from the features.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
trained_pipeline.fit(
    X_train=train_df,
Пример #5
0
from pathlib import Path

import joblib
import pandas as pd
from sklearn.model_selection import KFold

import autoflow
from autoflow import AutoFlowClassifier

# Locate the examples directory relative to the installed autoflow package.
examples_path = Path(autoflow.__file__).parent.parent / "examples"
train_df = pd.read_csv(examples_path / "data/train_classification.csv")
test_df = pd.read_csv(examples_path / "data/test_classification.csv")
# Small debug run (1 random trial + 1 search trial) that also attaches
# user-defined metadata to the HDL record.
trained_pipeline = AutoFlowClassifier(
    initial_runs=1,
    run_limit=1,
    n_jobs=1,
    included_classifiers=["lightgbm"],
    debug=True,
    should_store_intermediate_result=True,
    hdl_metadata={"key1": "hdl_metadata_test1"})
# Column roles: "id" = row identifier, "target" = label to predict,
# "ignore" = column excluded from the features.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
# if not os.path.exists("autoflow_classification.bz2"):
trained_pipeline.fit(task_metadata={"key1": "task_metadata_test1"},
                     dataset_metadata={"key1": "dataset_metadata_test1"},
                     X_train=train_df,
                     X_test=test_df,
                     column_descriptions=column_descriptions,
                     splitter=KFold(n_splits=3, shuffle=True, random_state=42),
# @Author  : qichun tang
# @Contact    : [email protected]

import joblib
import pandas as pd
from sklearn.model_selection import ShuffleSplit

from autoflow import AutoFlowClassifier

# Load the example train/test CSV files (paths relative to cwd).
train_df = pd.read_csv("./data/train_classification.csv")
test_df = pd.read_csv("./data/test_classification.csv")
# Search across several candidate pipelines.  Entries of the form
# "preprocessor|classifier" presumably chain a scaling step before the
# estimator -- confirm against AutoFlow's workflow syntax.
trained_pipeline = AutoFlowClassifier(
    initial_runs=12,
    run_limit=12,
    n_jobs=3,
    included_classifiers=[
        "scale.standardize|svc", "scale.standardize|knn",
        "scale.standardize|logistic_regression", "gaussian_nb", "extra_trees",
        "lightgbm"
    ],
)
# Column roles: "id" = row identifier, "target" = label to predict,
# "ignore" = column excluded from the features.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
trained_pipeline.fit(
    X_train=train_df,
    X_test=test_df,
    column_descriptions=column_descriptions,
    fit_ensemble_params=False,
    splitter=ShuffleSplit(n_splits=1, test_size=0.25, random_state=42),
Пример #7
0
import pandas as pd
from sklearn.model_selection import KFold

import autoflow
from autoflow import AutoFlowClassifier
from autoflow.datasets import load

# QSAR dataset bundled with autoflow; no separate test split is used here.
train_df = load("qsar")
# Customize the "num->purified" sub-workflow: a "compress.f1score" step
# with threshold 0.9 feeds into standardization -- presumably a feature
# compression stage; verify against AutoFlow's workflow documentation.
trained_pipeline = AutoFlowClassifier(
    initial_runs=1,
    run_limit=5,
    n_jobs=1,
    included_classifiers=["lightgbm"],
    debug=True,
    num2purified_workflow={
        "num->compressed": {
            "_name": "compress.f1score",
            "threshold": 0.9,
            "n_jobs": 12,
            # "cache_intermediate":False
        },
        "compressed->purified": ["scale.standardize", "operate.keep_going"],
    }
    # should_store_intermediate_result=True,  # tests correctness of intermediate-result storage
)
column_descriptions = {"target": "target"}
# if not os.path.exists("autoflow_classification.bz2"):
# 3-fold cross-validation with a fixed seed for reproducibility.
trained_pipeline.fit(X_train=train_df,
                     column_descriptions=column_descriptions,
                     splitter=KFold(n_splits=3, shuffle=True, random_state=42),
                     fit_ensemble_params=False)
Пример #8
0
# @Author  : qichun tang
# @Contact    : [email protected]
import pickle
from pathlib import Path

from sklearn.model_selection import KFold

from autoflow import AutoFlowClassifier
from autoflow import datasets

# Titanic dataset bundled with autoflow, pre-split into train/test frames.
train_df, test_df = datasets.load("titanic", return_train_test=True)
trained_pipeline = AutoFlowClassifier(
    initial_runs=1,
    run_limit=1,
    n_jobs=1,
    included_classifiers=["catboost"],
    debug=True,
    n_jobs_in_algorithm=5
    # should_store_intermediate_result=True,  # tests correctness of intermediate-result storage
)
# Column roles: "id" = row identifier, "target" = label to predict,
# "text" = free-text column (passenger name).
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "text": "Name"
}
# if not os.path.exists("autoflow_classification.bz2"):
trained_pipeline.fit(
    X_train=train_df,
    X_test=test_df,
    column_descriptions=column_descriptions,
    splitter=KFold(n_splits=3, shuffle=True, random_state=42),
Пример #9
0
    "hidden_layer_sizes": {
        "_type": "int_quniform",
        "_value": [10, 1000, 10]
    },
    "activation": {
        "_type": "choice",
        "_value": ["relu", "tanh", "logistic"]
    },
}
# NOTE(review): this fragment depends on `hdl_bank`, `MLPClassifier`, `pd`,
# `AutoFlowClassifier` and `KFold` defined earlier in the full example.
train_df = pd.read_csv("./data/train_classification.csv")
test_df = pd.read_csv("./data/test_classification.csv")
# Search only the custom-registered "mlp" model using the extended HDL bank.
trained_pipeline = AutoFlowClassifier(initial_runs=5,
                                      run_limit=10,
                                      n_jobs=1,
                                      included_classifiers=["mlp"],
                                      should_store_intermediate_result=True,
                                      hdl_bank=hdl_bank,
                                      model_registry={"mlp": MLPClassifier},
                                      debug=True,
                                      random_state=55)
# Column roles: "id" = row identifier, "target" = label to predict,
# "ignore" = column excluded from the features.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
# 3-fold cross-validation with a fixed seed for reproducibility.
trained_pipeline.fit(
    X_train=train_df,
    X_test=test_df,
    column_descriptions=column_descriptions,
    splitter=KFold(n_splits=3, shuffle=True, random_state=42),
)
Пример #10
0
import os
from pathlib import Path

import pandas as pd

import autoflow
from autoflow import AutoFlowClassifier, DataManager

# Locate the examples directory relative to the installed autoflow package.
examples_path = Path(autoflow.__file__).parent.parent / "examples"
train_df = pd.read_csv(examples_path / "data/train_classification.csv")
test_df = pd.read_csv(examples_path / "data/test_classification.csv")
pipe = AutoFlowClassifier(initial_runs=5, run_limit=10, n_jobs=1, included_classifiers=["lightgbm"])
# Column roles: "id" = row identifier, "target" = label to predict,
# "ignore" = column excluded from the features.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
# Build the data manager directly (no fit) so the workflow space can be drawn.
pipe.data_manager = DataManager(X_train=train_df, X_test=test_df, column_descriptions=column_descriptions)
pipe.hdl_constructors[0].run(pipe.data_manager, pipe.random_state, pipe.highR_cat_threshold)
graph = pipe.hdl_constructors[0].draw_workflow_space()
# BUG FIX: the original `open("workflow_space.gv").write(...)` opened the
# file in the default read mode, so .write() raised io.UnsupportedOperation.
# Open for writing and close deterministically via a context manager.
with open("workflow_space.gv", "w") as gv_file:
    gv_file.write(graph.source)
# Render the Graphviz source to a PNG (requires the `dot` binary on PATH).
cmd = f'''dot -Tpng -Gsize=9,15\! -Gdpi=300 -oworkflow_space.png workflow_space.gv'''
os.system(cmd)
Пример #11
0
# BUG FIX: `os.path.exists(...)` is called later in this script, but `os`
# was never imported, causing a NameError at runtime.
import os

import joblib
import pandas as pd
from sklearn.model_selection import KFold

from autoflow import AutoFlowClassifier

# Load the example train/test CSV files (paths relative to this script).
train_df = pd.read_csv("../data/train_classification.csv")
test_df = pd.read_csv("../data/test_classification.csv")
# initial_runs  -- initial runs are purely random search, providing experience for the SMAC algorithm.
# run_limit     -- the maximum number of runs.
# n_jobs        -- how many search processes are started.
# included_classifiers -- restricts the search space; lightgbm is the only classifier allowed here.
# per_run_time_limit -- restricts run time: a trial exceeding 60 seconds is expired and killed.
trained_pipeline = AutoFlowClassifier(initial_runs=5, run_limit=10, n_jobs=1, included_classifiers=["lightgbm"],
                                       per_run_time_limit=60)
# Describing the meaning of columns; `id`, `target` and `ignore` all have specific meanings:
# `id` is a column name that uniquely identifies each row,
# `target` is the column the model will learn to predict,
# `ignore` marks columns containing irrelevant information.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
if not os.path.exists("autoflow_classification.bz2"):
    # pass `train_df`, `test_df` and `column_descriptions` to classifier,
    # if param `fit_ensemble_params` set as "auto", Stack Ensemble will be used
    # ``splitter`` is train-valid-dataset splitter, in here it is set as 3-Fold Cross Validation
    trained_pipeline.fit(
        X_train=train_df, X_test=test_df, column_descriptions=column_descriptions,