def test_validate_hdl2(self):
    """Fitting with a nonexistent imputer ('operate.pop') must raise a
    descriptive HDL-validation error with the exact expected message.
    """
    train_df, test_df = datasets.load("titanic", return_train_test=True)
    trained_pipeline = AutoFlowClassifier(
        initial_runs=1, run_limit=3, n_jobs=1,
        included_highR_nan_imputers=["operate.pop"],
        debug=True,
        n_jobs_in_algorithm=5,
        resource_manager=self.mock_resource_manager)
    column_descriptions = {
        "id": "PassengerId",
        "target": "Survived",
        "text": "Name"
    }
    try:
        trained_pipeline.fit(
            X_train=train_df, X_test=test_df,
            column_descriptions=column_descriptions,
            splitter=KFold(n_splits=3, shuffle=True, random_state=42),
            fit_ensemble_params=True,
            is_not_realy_run=True)
        # BUG FIX: the original `assert Exception(...)` always passed,
        # because an exception *instance* is truthy. To fail the test when
        # no validation error occurs, we must raise.
        raise AssertionError("didn't detect wrong HDL.")
    except AssertionError:
        # Re-raise our own failure so the broad handler below can't
        # swallow it.
        raise
    except Exception as e:
        self.assertEqual(
            str(e),
            "In step 'highR_nan->nan', user defined packege : 'operate.pop' does not exist!"
        )
def test_splitter(self):
    """The validation-fold indexes recorded by the fitted estimator must
    match the folds produced by a StratifiedKFold with identical settings,
    and the resulting test score must beat 0.5.
    """
    features, labels = load_iris(return_X_y=True)
    tr_X, te_X, tr_y, te_y = train_test_split(features, labels, random_state=42)
    # First half of the training rows get group label 1, the rest stay 0.
    group_labels = np.zeros([tr_X.shape[0]])
    group_labels[:len(group_labels) // 2] = 1
    clf = AutoFlowClassifier(
        DAG_workflow={
            "num->target": ["linearsvc", "svc", "logistic_regression"]
        },
        initial_runs=1,
        run_limit=1,
        debug=True,
        resource_manager=self.mock_resource_manager)
    clf.fit(tr_X, tr_y, groups=group_labels)
    test_score = clf.score(te_X, te_y)
    print(test_score)
    recorded_indexes = clf.estimator.y_true_indexes_list[0]
    reference_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    expected_folds = reference_splitter.split(tr_X, tr_y, group_labels)
    for recorded, (_, expected_valid) in zip(recorded_indexes, expected_folds):
        assert np.all(recorded == expected_valid)
    self.assertGreater(test_score, 0.5)
def test_classifier(self):
    """End-to-end check: register a custom MLP learner in the HDL bank,
    fit on the training split, persist the pipeline with joblib, then
    reload and score the persisted copy on the held-out split.
    """
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    hdl_bank = get_default_hdl_bank()
    # Register a custom hyperparameter search space for the "mlp" learner.
    hdl_bank["classification"]["mlp"] = {
        "hidden_layer_sizes": {
            "_type": "int_quniform",
            "_value": [10, 1000, 10]
        },
        "activation": {
            "_type": "choice",
            "_value": ["relu", "tanh", "logistic"]
        },
    }
    trained_pipeline = AutoFlowClassifier(
        initial_runs=1, run_limit=2, n_jobs=1,
        included_classifiers=["mlp"],
        hdl_bank=hdl_bank,
        model_registry={"mlp": MLPClassifier},
        debug=True,
        random_state=55,
        resource_manager=self.mock_resource_manager)
    # BUG FIX: the original fitted on the full (X, y), leaking the held-out
    # X_test rows into training; fit on the training split only.
    trained_pipeline.fit(
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
        splitter=KFold(n_splits=3, shuffle=True, random_state=42),
    )
    joblib.dump(trained_pipeline, "autoflow_classification.bz2")
    # ---
    # Reload from disk to verify the persisted artifact still predicts.
    predict_pipeline = joblib.load("autoflow_classification.bz2")
    score = predict_pipeline.score(X_test, y_test)
    print(score)
import pandas as pd from sklearn.model_selection import KFold from autoflow import AutoFlowClassifier train_df = pd.read_csv("./data/train_classification.csv") test_df = pd.read_csv("./data/test_classification.csv") trained_pipeline = AutoFlowClassifier(initial_runs=5, run_limit=10, n_jobs=3, included_classifiers=["lightgbm"], db_type="postgresql", db_params={ "user": "******", "host": "0.0.0.0", "port": 5432 }, store_path="/autoflow", file_system="hdfs", should_store_intermediate_result=True, file_system_params={ "url": "http://0.0.0.0:50070", "user": "******" }) column_descriptions = { "id": "PassengerId", "target": "Survived", "ignore": "Name" } trained_pipeline.fit( X_train=train_df,
from pathlib import Path import joblib import pandas as pd from sklearn.model_selection import KFold import autoflow from autoflow import AutoFlowClassifier examples_path = Path(autoflow.__file__).parent.parent / "examples" train_df = pd.read_csv(examples_path / "data/train_classification.csv") test_df = pd.read_csv(examples_path / "data/test_classification.csv") trained_pipeline = AutoFlowClassifier( initial_runs=1, run_limit=1, n_jobs=1, included_classifiers=["lightgbm"], debug=True, should_store_intermediate_result=True, hdl_metadata={"key1": "hdl_metadata_test1"}) column_descriptions = { "id": "PassengerId", "target": "Survived", "ignore": "Name" } # if not os.path.exists("autoflow_classification.bz2"): trained_pipeline.fit(task_metadata={"key1": "task_metadata_test1"}, dataset_metadata={"key1": "dataset_metadata_test1"}, X_train=train_df, X_test=test_df, column_descriptions=column_descriptions, splitter=KFold(n_splits=3, shuffle=True, random_state=42),
# @Author : qichun tang # @Contact : [email protected] import joblib import pandas as pd from sklearn.model_selection import ShuffleSplit from autoflow import AutoFlowClassifier train_df = pd.read_csv("./data/train_classification.csv") test_df = pd.read_csv("./data/test_classification.csv") trained_pipeline = AutoFlowClassifier( initial_runs=12, run_limit=12, n_jobs=3, included_classifiers=[ "scale.standardize|svc", "scale.standardize|knn", "scale.standardize|logistic_regression", "gaussian_nb", "extra_trees", "lightgbm" ], ) column_descriptions = { "id": "PassengerId", "target": "Survived", "ignore": "Name" } trained_pipeline.fit( X_train=train_df, X_test=test_df, column_descriptions=column_descriptions, fit_ensemble_params=False, splitter=ShuffleSplit(n_splits=1, test_size=0.25, random_state=42),
# Example: fit an AutoFlow classifier on the QSAR dataset using a custom
# numeric-preprocessing workflow (f1-score feature compression followed by
# standardization).
import pandas as pd
from sklearn.model_selection import KFold

import autoflow
from autoflow import AutoFlowClassifier
from autoflow.datasets import load

qsar_df = load("qsar")

# Workflow: compress numeric features by f1-score, then standardize.
custom_workflow = {
    "num->compressed": {
        "_name": "compress.f1score",
        "threshold": 0.9,
        "n_jobs": 12,
        # "cache_intermediate": False
    },
    "compressed->purified": ["scale.standardize", "operate.keep_going"],
}

classifier = AutoFlowClassifier(
    initial_runs=1,
    run_limit=5,
    n_jobs=1,
    included_classifiers=["lightgbm"],
    debug=True,
    num2purified_workflow=custom_workflow,
    # should_store_intermediate_result=True,  # verify intermediate-result storage
)

column_descriptions = {"target": "target"}

# if not os.path.exists("autoflow_classification.bz2"):
classifier.fit(
    X_train=qsar_df,
    column_descriptions=column_descriptions,
    splitter=KFold(n_splits=3, shuffle=True, random_state=42),
    fit_ensemble_params=False,
)
# @Author : qichun tang # @Contact : [email protected] import pickle from pathlib import Path from sklearn.model_selection import KFold from autoflow import AutoFlowClassifier from autoflow import datasets train_df, test_df = datasets.load("titanic", return_train_test=True) trained_pipeline = AutoFlowClassifier( initial_runs=1, run_limit=1, n_jobs=1, included_classifiers=["catboost"], debug=True, n_jobs_in_algorithm=5 # should_store_intermediate_result=True, # 测试对中间结果存储的正确性 ) column_descriptions = { "id": "PassengerId", "target": "Survived", "text": "Name" } # if not os.path.exists("autoflow_classification.bz2"): trained_pipeline.fit( X_train=train_df, X_test=test_df, column_descriptions=column_descriptions, splitter=KFold(n_splits=3, shuffle=True, random_state=42),
"hidden_layer_sizes": { "_type": "int_quniform", "_value": [10, 1000, 10] }, "activation": { "_type": "choice", "_value": ["relu", "tanh", "logistic"] }, } train_df = pd.read_csv("./data/train_classification.csv") test_df = pd.read_csv("./data/test_classification.csv") trained_pipeline = AutoFlowClassifier(initial_runs=5, run_limit=10, n_jobs=1, included_classifiers=["mlp"], should_store_intermediate_result=True, hdl_bank=hdl_bank, model_registry={"mlp": MLPClassifier}, debug=True, random_state=55) column_descriptions = { "id": "PassengerId", "target": "Survived", "ignore": "Name" } trained_pipeline.fit( X_train=train_df, X_test=test_df, column_descriptions=column_descriptions, splitter=KFold(n_splits=3, shuffle=True, random_state=42), )
# Example: build a DataManager for the Titanic CSVs, construct the HDL
# workflow space, and render it to workflow_space.png via Graphviz.
import os
from pathlib import Path

import pandas as pd

import autoflow
from autoflow import AutoFlowClassifier, DataManager

# Locate the bundled example CSVs relative to the installed package.
examples_path = Path(autoflow.__file__).parent.parent / "examples"
train_df = pd.read_csv(examples_path / "data/train_classification.csv")
test_df = pd.read_csv(examples_path / "data/test_classification.csv")

pipe = AutoFlowClassifier(initial_runs=5, run_limit=10, n_jobs=1,
                          included_classifiers=["lightgbm"])
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "ignore": "Name"
}
pipe.data_manager = DataManager(X_train=train_df, X_test=test_df,
                                column_descriptions=column_descriptions)
pipe.hdl_constructors[0].run(pipe.data_manager, pipe.random_state,
                             pipe.highR_cat_threshold)
graph = pipe.hdl_constructors[0].draw_workflow_space()

# BUG FIX: the original `open("workflow_space.gv").write(...)` opened the
# file in the default read-only mode (so .write raises) and leaked the
# handle; open for writing inside a context manager instead.
with open("workflow_space.gv", "w") as gv_file:
    gv_file.write(graph.source)

# Render the DOT source to a PNG (requires Graphviz's `dot` on PATH).
cmd = f'''dot -Tpng -Gsize=9,15\! -Gdpi=300 -oworkflow_space.png workflow_space.gv'''
os.system(cmd)
import joblib import pandas as pd from sklearn.model_selection import KFold from autoflow import AutoFlowClassifier # load data from csv file train_df = pd.read_csv("../data/train_classification.csv") test_df = pd.read_csv("../data/test_classification.csv") # initial_runs -- initial runs are totally random search, to provide experience for SMAC algorithm. # run_limit -- is the maximum number of runs. # n_jobs -- defines how many search processes are started. # included_classifiers -- restrict the search space . lightgbm is the only classifier that needs to be selected # per_run_time_limit -- restrict the run time. if a trial during 60 seconds, it is expired, should be killed. trained_pipeline = AutoFlowClassifier(initial_runs=5, run_limit=10, n_jobs=1, included_classifiers=["lightgbm"], per_run_time_limit=60) # describing meaning of columns. `id`, `target` and `ignore` all has specific meaning # `id` is a column name means unique descriptor of each rows, # `target` column in the dataset is what your model will learn to predict # `ignore` is some columns which contains irrelevant information column_descriptions = { "id": "PassengerId", "target": "Survived", "ignore": "Name" } if not os.path.exists("autoflow_classification.bz2"): # pass `train_df`, `test_df` and `column_descriptions` to classifier, # if param `fit_ensemble_params` set as "auto", Stack Ensemble will be used # ``splitter`` is train-valid-dataset splitter, in here it is set as 3-Fold Cross Validation trained_pipeline.fit( X_train=train_df, X_test=test_df, column_descriptions=column_descriptions,