示例#1
0
def test_multioutput():
    """Fit AutoML regressors wrapped in sklearn multi-output meta-estimators.

    Generates a three-target regression problem, holds out 30% of it, and
    then fits and predicts with ``MultiOutputRegressor`` followed by
    ``RegressorChain``, each wrapping ``AutoML(task="regression",
    time_budget=1)``. Predictions on the held-out rows are printed.
    """
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.multioutput import MultiOutputRegressor, RegressorChain

    # Synthetic regression data with three targets.
    features, targets = make_regression(n_targets=3)

    # 70/30 split with a fixed seed; the held-out targets are not used.
    features_fit, features_holdout, targets_fit, _ = train_test_split(
        features, targets, test_size=0.30, random_state=42
    )

    # Same fit/predict cycle for both meta-estimators, in this order.
    for wrapper_cls in (MultiOutputRegressor, RegressorChain):
        wrapped = wrapper_cls(AutoML(task="regression", time_budget=1))
        wrapped.fit(features_fit, targets_fit)
        print(wrapped.predict(features_holdout))
示例#2
0
 def test_regression_xgboost(self):
     """Fit a regression task on sparse data using two custom learners.

     Registers ``MyXGB1`` and ``MyXGB2`` under the names ``my_xgb1`` and
     ``my_xgb2``, fits with explicit validation data, asserts that the
     validation matrix kept in the search state has the same shape as the
     one passed in, and prints the fitted results.
     """
     train_x = scipy.sparse.random(300, 900, density=0.0001)
     train_y = np.random.uniform(size=300)
     val_x = scipy.sparse.random(100, 900, density=0.0001)
     val_y = np.random.uniform(size=100)

     experiment = AutoML()
     experiment.add_learner(learner_name="my_xgb1", learner_class=MyXGB1)
     experiment.add_learner(learner_name="my_xgb2", learner_class=MyXGB2)

     experiment.fit(
         X_train=train_x,
         y_train=train_y,
         X_val=val_x,
         y_val=val_y,
         time_budget=2,
         estimator_list=["my_xgb1", "my_xgb2"],
         task="regression",
         log_file_name="test/regression_xgboost.log",
         n_jobs=1,
         model_history=True,
         keep_search_state=True,
         early_stop=True,
     )
     assert experiment._state.X_val.shape == val_x.shape

     print(experiment.predict(train_x))
     print(experiment.model)
     print(experiment.config_history)
     print(experiment.best_model_for_estimator("my_xgb2"))
     # Remaining summary attributes, printed in a fixed order.
     for attribute in (
         "best_iteration",
         "best_estimator",
         "best_config",
         "best_loss",
         "best_config_train_time",
     ):
         print(getattr(experiment, attribute))
示例#3
0
    def test_sparse_matrix_regression(self):
        """Fit a MAE-scored regression on sparse inputs with validation data.

        After fitting, asserts that ``X_val`` on the experiment has the same
        shape as the validation matrix passed in, then prints predictions on
        the training matrix and the experiment's summary attributes.
        """
        experiment = AutoML()

        train_x = scipy.sparse.random(300, 900, density=0.0001)
        train_y = np.random.uniform(size=300)
        val_x = scipy.sparse.random(100, 900, density=0.0001)
        val_y = np.random.uniform(size=100)

        experiment.fit(
            X_train=train_x,
            y_train=train_y,
            X_val=val_x,
            y_val=val_y,
            time_budget=2,
            metric='mae',
            task='regression',
            log_file_name="test/sparse_regression.log",
            model_history=True,
        )
        assert experiment.X_val.shape == val_x.shape

        print(experiment.predict(train_x))
        for attribute in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
            "best_config",
            "best_loss",
            "best_config_train_time",
        ):
            print(getattr(experiment, attribute))
示例#4
0
    def test_training_log(self):
        """Check that a fit writes a non-empty training log file.

        Runs a short regression fit that logs to a file inside a temporary
        directory, then asserts that the file exists and that
        ``training_log_reader`` yields at least one record from it. Each
        record is printed as it is read.
        """
        with TemporaryDirectory() as tmp_dir:
            log_path = os.path.join(tmp_dir, 'test_training_log.log')

            # Run a simple job.
            experiment = AutoML()
            fit_options = dict(
                time_budget=2,
                metric='mse',
                task='regression',
                log_file_name=log_path,
                log_training_metric=True,
                mem_thres=1024 * 1024,
                n_jobs=1,
                model_history=True,
            )
            features, targets = load_boston(return_X_y=True)
            experiment.fit(X_train=features, y_train=targets, **fit_options)

            # Check if the training log file is populated.
            self.assertTrue(os.path.exists(log_path))
            with training_log_reader(log_path) as reader:
                record_total = 0
                for record_total, entry in enumerate(reader.records(), start=1):
                    print(entry)
                self.assertGreater(record_total, 0)
示例#5
0
    def test_regression(self):
        """Fit an MSE regression with the second half of the data as validation.

        Splits the ``load_boston`` data at its midpoint, fits on the first
        half, validates on the second, and asserts both the size of the
        stored validation labels and that the evaluation method is
        ``'holdout'``. Results and the parsed log output are printed.
        """
        experiment = AutoML()
        log_file = "test/boston.log"

        features, targets = load_boston(return_X_y=True)
        total = len(targets)
        half = total // 2  # floor division, same as total >> 1

        experiment.fit(
            X_train=features[:half],
            y_train=targets[:half],
            X_val=features[half:],
            y_val=targets[half:],
            time_budget=2,
            metric='mse',
            task='regression',
            log_file_name=log_file,
            log_training_metric=True,
            model_history=True,
        )
        assert experiment.y_val.shape[0] == total - half
        assert experiment.eval_method == 'holdout'

        print(experiment.predict(features))
        for attribute in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attribute))
        print(get_output_from_log(log_file, 1))
示例#6
0
def _test_custom_data():
    """Fit a ``seq-classification`` task on TSV files read from ``data/input``.

    Reads the train/dev/test TSV files, fits on the two sentence columns
    against the ``Quality`` label with the dev set as validation data, and
    then calls ``predict`` on the test frame, on a single string, and on a
    list of sentence pairs. Returns early if reading the files raises
    ``requests.exceptions.HTTPError``.
    """
    from flaml import AutoML
    import requests
    import pandas as pd

    # NOTE(review): the handler below only wraps local file reads; confirm
    # whether HTTPError can actually be raised here.
    try:
        train_dataset, dev_dataset, test_dataset = [
            pd.read_csv(tsv_path, delimiter="\t", quoting=3)
            for tsv_path in (
                "data/input/train.tsv",
                "data/input/dev.tsv",
                "data/input/test.tsv",
            )
        ]
    except requests.exceptions.HTTPError:
        return

    sentence_columns = ["#1 String", "#2 String"]
    label_column = "Quality"

    train_x = train_dataset[sentence_columns]
    train_y = train_dataset[label_column]

    val_x = dev_dataset[sentence_columns]
    val_y = dev_dataset[label_column]

    test_x = test_dataset[sentence_columns]

    automl = AutoML()

    fit_options = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "data/output/",
            "ckpt_per_epoch": 1,
        },
    }

    automl.fit(
        X_train=train_x, y_train=train_y, X_val=val_x, y_val=val_y, **fit_options
    )

    automl.predict(test_x)
    automl.predict(["test test"])
    # Three identical sentence pairs.
    automl.predict([["test test", "test test"] for _ in range(3)])
示例#7
0
def _test_ray_classification():
    """Fit a classification task with two concurrent trials.

    Uses 1000 synthetic samples with 10 features from
    ``make_classification`` and ``time_budget=10``.
    """
    from sklearn.datasets import make_classification

    features, labels = make_classification(1000, 10)
    classifier_search = AutoML()
    fit_options = {
        "time_budget": 10,
        "task": "classification",
        "n_concurrent_trials": 2,
    }
    classifier_search.fit(features, labels, **fit_options)
示例#8
0
    def test_classification(self, as_frame=False):
        """Fit an accuracy-scored classifier on iris, then retrain from its log.

        Args:
            as_frame: forwarded to ``load_iris`` to choose its return format.

        The first experiment fits and prints its results. A fresh ``AutoML``
        instance then calls ``retrain_from_log`` on the same log file with
        ``record_id=0`` and ``train_full=True``; the returned duration, the
        model and the first five probability rows are printed.
        """
        experiment = AutoML()
        settings = dict(
            time_budget=4,
            metric='accuracy',
            task='classification',
            log_file_name="test/iris.log",
            log_training_metric=True,
            model_history=True,
        )
        features, labels = load_iris(return_X_y=True, as_frame=as_frame)
        experiment.fit(X_train=features, y_train=labels, **settings)

        print(experiment.classes_)
        print(experiment.predict_proba(features)[:5])
        for attribute in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attribute))

        # Drop the fit-only options; only the log file name is used below.
        for key in ("metric", "model_history", "log_training_metric"):
            del settings[key]

        experiment = AutoML()
        elapsed = experiment.retrain_from_log(
            log_file_name=settings["log_file_name"],
            X_train=features,
            y_train=labels,
            train_full=True,
            record_id=0,
        )
        print(elapsed)
        print(experiment.model)
        print(experiment.predict_proba(features)[:5])
示例#9
0
 def test_roc_auc_ovo(self):
     """Fit a classifier on iris using the ``roc_auc_ovo`` metric."""
     experiment = AutoML()
     features, labels = load_iris(return_X_y=True)
     experiment.fit(
         X_train=features,
         y_train=labels,
         time_budget=1,
         metric="roc_auc_ovo",
         task="classification",
         log_file_name="test/roc_auc_ovo.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
示例#10
0
    def test_custom_metric(self):
        """Fit on an iris dataframe with a custom metric, then read the log back.

        Passes the data through the ``dataframe``/``label`` options with
        cross-validation, per-sample weights of one, and ``ensemble=True``.
        A fresh ``AutoML`` instance then rebuilds an estimator from log
        record 0 and the metric history is read with ``get_output_from_log``.
        If ``ray`` can be imported, the fit is repeated on that second
        instance with the dataframe stored via ``ray.put`` and
        ``use_ray=True``.
        """
        frame, labels = load_iris(return_X_y=True, as_frame=True)
        frame["label"] = labels

        experiment = AutoML()
        settings = {
            "dataframe": frame,
            "label": "label",
            "time_budget": 5,
            "eval_method": "cv",
            "metric": custom_metric,
            "task": "classification",
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            "log_type": "all",
            "n_jobs": 1,
            "model_history": True,
            "sample_weight": np.ones(len(labels)),
            "pred_time_limit": 1e-5,
            "ensemble": True,
        }
        experiment.fit(**settings)

        print(experiment.classes_)
        print(experiment.model)
        print(experiment.config_history)
        print(experiment.best_model_for_estimator("rf"))
        print(experiment.best_iteration)
        print(experiment.best_estimator)

        experiment = AutoML()
        log_file = settings["log_file_name"]
        rebuilt = experiment.get_estimator_from_log(
            log_file, record_id=0, task="multi"
        )
        print(rebuilt)

        # Only the last of the five returned histories is inspected.
        (
            _time_history,
            _best_valid_loss_history,
            _valid_loss_history,
            _config_history,
            metric_history,
        ) = get_output_from_log(filename=log_file, time_budget=6)
        print(metric_history)

        try:
            import ray

            settings.update(dataframe=ray.put(frame), use_ray=True)
            experiment.fit(**settings)
        except ImportError:
            pass
示例#11
0
def test_cv():
    """Fit a ``seq-classification`` task on four sentence pairs with ``n_splits=3``.

    The training frame is built inline from two sentence columns, a label
    column and an index column; only the sentence columns and the label are
    passed to ``fit``. Returns quietly if the fit raises
    ``requests.exceptions.HTTPError``.
    """
    from flaml import AutoML
    import pandas as pd
    import requests

    first_sentences = [
        'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
        "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
        "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
        "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
    ]
    second_sentences = [
        'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
        "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
        "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
        "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
    ]
    train_frame = pd.DataFrame(
        {
            "sentence1": first_sentences,
            "sentence2": second_sentences,
            "label": [1, 0, 1, 0],
            "idx": [0, 1, 2, 3],
        }
    )

    train_x = train_frame[["sentence1", "sentence2"]]
    train_y = train_frame["label"]

    automl = AutoML()

    fit_options = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "n_splits": 3,
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "test/data/output/",
            "ckpt_per_epoch": 1,
            "fp16": False,
        },
    }

    try:
        automl.fit(X_train=train_x, y_train=train_y, **fit_options)
    except requests.exceptions.HTTPError:
        return
示例#12
0
    def test_custom_metric(self):
        """Fit on iris with a custom metric and holdout evaluation, then read the log.

        After the fit and result printing, a fresh ``AutoML`` instance
        rebuilds an estimator from log record 0 and the training-loss
        history returned by ``get_output_from_log`` is printed.
        """
        experiment = AutoML()
        log_file = "test/iris_custom.log"

        features, labels = load_iris(return_X_y=True)
        experiment.fit(
            X_train=features,
            y_train=labels,
            time_budget=10,
            eval_method='holdout',
            metric=custom_metric,
            task='classification',
            log_file_name=log_file,
            log_training_metric=True,
            log_type='all',
            model_history=True,
        )

        print(experiment.classes_)
        print(experiment.predict_proba(features))
        for attribute in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attribute))

        experiment = AutoML()
        rebuilt = experiment.get_estimator_from_log(
            log_file, record_id=0, objective='multi')
        print(rebuilt)

        # Only the last of the five returned histories is inspected.
        (
            _time_history,
            _best_valid_loss_history,
            _valid_loss_history,
            _config_history,
            train_loss_history,
        ) = get_output_from_log(filename=log_file, time_budget=6)
        print(train_loss_history)
示例#13
0
 def test_sparse_matrix_lr(self):
     """Fit the ``lrl1``/``lrl2`` learners twice on a sparse binary problem.

     The first fit uses ``time_budget=3`` with ``train_time_limit=1``; the
     second reuses the same experiment and data with ``time_budget=5`` and
     no train time limit. Results of the second fit are printed.
     """
     experiment = AutoML()
     shared_options = {
         "metric": "f1",
         "task": "classification",
         "log_file_name": "test/sparse_classification.log",
         "estimator_list": ["lrl1", "lrl2"],
         "log_type": "all",
         "n_jobs": 1,
     }
     features = scipy.sparse.random(3000, 3000, density=0.1)
     labels = np.random.randint(2, size=3000)

     experiment.fit(
         X_train=features,
         y_train=labels,
         train_time_limit=1,
         time_budget=3,
         **shared_options
     )
     experiment.fit(
         X_train=features, y_train=labels, time_budget=5, **shared_options
     )

     print(experiment.predict(features))
     print(experiment.model)
     print(experiment.config_history)
     print(experiment.best_model_for_estimator("lrl2"))
     print(experiment.best_iteration)
     print(experiment.best_estimator)
示例#14
0
    def test_random_skip_oom(self):
        """Fit the custom ``large_lgbm`` learner with random search and two concurrent trials.

        Uses a 900000 x 900000 sparse identity matrix with random binary
        labels. If the fit or the result printing raises ``ImportError``,
        a skip message is printed instead.
        """
        experiment = AutoML()
        experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        fit_options = dict(
            time_budget=2,
            task="classification",
            log_file_name="test/sparse_classification_oom.log",
            estimator_list=["large_lgbm"],
            log_type="all",
            n_jobs=1,
            hpo_method="random",
            n_concurrent_trials=2,
        )
        features = scipy.sparse.eye(900000)
        labels = np.random.randint(2, size=900000)

        try:
            experiment.fit(X_train=features, y_train=labels, **fit_options)
            print(experiment.predict(features))
            print(experiment.model)
            print(experiment.config_history)
            print(experiment.best_model_for_estimator("large_lgbm"))
            print(experiment.best_iteration)
            print(experiment.best_estimator)
        except ImportError:
            print("skipping concurrency test as ray is not installed")
示例#15
0
    def test_micro_macro_f1(self):
        """Fit iris classifiers with ``micro_f1`` and ``macro_f1``, then inspect the latter.

        Both experiments share the same options apart from the metric. The
        model from the ``macro_f1`` run is used to print a normalized
        confusion matrix and the multi-class ROC and precision-recall curves.
        """
        # Both instances are created up front; fits run micro first, then macro.
        experiments = {"micro_f1": AutoML(), "macro_f1": AutoML()}
        shared_options = {
            "time_budget": 2,
            "task": "classification",
            "log_file_name": "test/micro_macro_f1.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        features, labels = load_iris(return_X_y=True)
        for metric_name, experiment in experiments.items():
            experiment.fit(
                X_train=features,
                y_train=labels,
                metric=metric_name,
                **shared_options
            )

        macro_model = experiments["macro_f1"].model
        predicted = macro_model.predict(features)
        predicted_proba = macro_model.predict_proba(features)
        from flaml.ml import norm_confusion_matrix, multi_class_curves

        print(norm_confusion_matrix(labels, predicted))
        from sklearn.metrics import roc_curve, precision_recall_curve

        for curve_func in (roc_curve, precision_recall_curve):
            print(multi_class_curves(labels, predicted_proba, curve_func))
示例#16
0
 def test_binary(self):
     """Fit with ``task="binary"`` on the breast-cancer data and run ``predict``."""
     experiment = AutoML()
     features, labels = load_breast_cancer(return_X_y=True)
     experiment.fit(
         X_train=features,
         y_train=labels,
         time_budget=1,
         task="binary",
         log_file_name="test/breast_cancer.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     # The prediction is computed but deliberately not inspected.
     _ = experiment.predict(features)
示例#17
0
 def test_roc_auc_ovr(self):
     """Fit a classifier on iris with ``roc_auc_ovr``, holdout evaluation and unit sample weights."""
     experiment = AutoML()
     features, labels = load_iris(return_X_y=True)
     unit_weights = np.ones(len(labels))
     experiment.fit(
         X_train=features,
         y_train=labels,
         time_budget=1,
         metric="roc_auc_ovr",
         task="classification",
         log_file_name="test/roc_auc_ovr.log",
         log_training_metric=True,
         n_jobs=1,
         sample_weight=unit_weights,
         eval_method="holdout",
         model_history=True,
     )
示例#18
0
def test_numpy_large():
    """Fit a ``ts_forecast`` task on a large series passed as numpy arrays.

    Builds 70000 minute-spaced timestamps starting at 2017-01-01 with random
    integer values in [6500, 7500), and fits on everything except the last
    10 rows with ``period=10``.
    """
    import numpy as np
    import pandas as pd
    from flaml import AutoML

    # "min" is the minute frequency alias; the single-letter "T" spelling is
    # deprecated in recent pandas releases and emits a FutureWarning.
    X_train = pd.date_range("2017-01-01", periods=70000, freq="min")
    y_train = pd.DataFrame(np.random.randint(6500, 7500, 70000))
    automl = AutoML()
    automl.fit(
        X_train=X_train[:-10].values,  # a single column of timestamp
        y_train=y_train[:-10].values,  # value for each timestamp
        period=10,  # time horizon to forecast
        task="ts_forecast",
        time_budget=10,  # time budget in seconds
    )
示例#19
0
    def test_fit_w_starting_point(self, as_frame=True):
        """Fit on iris, then start a second search from the first one's best configs.

        Args:
            as_frame: forwarded to ``load_iris``; when true, the columns are
                renamed to integers and an extra all-zero column is appended.

        The first experiment's ``best_config_per_estimator`` is passed as
        ``starting_points`` to a fresh ``AutoML`` instance. A short report is
        printed after each fit.
        """

        def _print_report(fitted):
            # Validation accuracy is derived as one minus the best loss.
            val_accuracy = 1.0 - fitted.best_loss
            print("Best ML leaner:", fitted.best_estimator)
            print("Best hyperparmeter config:", fitted.best_config)
            print(
                "Best accuracy on validation data: {0:.4g}".format(val_accuracy)
            )
            print(
                "Training duration of best run: {0:.4g} s".format(
                    fitted.best_config_train_time
                )
            )

        first_experiment = AutoML()
        first_settings = dict(
            time_budget=3,
            metric="accuracy",
            task="classification",
            log_file_name="test/iris.log",
            log_training_metric=True,
            n_jobs=1,
            model_history=True,
        )
        features, labels = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            column_count = features.shape[1]
            features.columns = range(column_count)
            features[column_count] = np.zeros(len(labels))
        first_experiment.fit(X_train=features, y_train=labels, **first_settings)
        _print_report(first_experiment)

        starting_points = first_experiment.best_config_per_estimator
        print("starting_points", starting_points)
        print(
            "loss of the starting_points",
            first_experiment.best_loss_per_estimator,
        )

        resume_settings = dict(
            time_budget=2,
            metric="accuracy",
            task="classification",
            log_file_name="test/iris_resume.log",
            log_training_metric=True,
            n_jobs=1,
            model_history=True,
            log_type="all",
            starting_points=starting_points,
        )
        resumed_experiment = AutoML()
        resumed_experiment.fit(
            X_train=features, y_train=labels, **resume_settings
        )
        _print_report(resumed_experiment)
示例#20
0
    def test_logging_level(self):
        """Check that a fit emits output on the FLAML logger at INFO level.

        Attaches a ``StreamHandler`` backed by an in-memory buffer to the
        FLAML logger, runs a short regression fit on the ``load_boston`` data
        (first half for training, second half for validation), and asserts
        that the buffer received output. The fitted ``AutoML`` object is then
        pickled to ``automl.pkl`` in the working directory and its
        ``__version__`` is printed.
        """

        from flaml import logger, logger_formatter

        with tempfile.TemporaryDirectory() as d:

            training_log = os.path.join(d, "training.log")

            # Configure logging for the FLAML logger
            # and add a handler that outputs to a buffer.
            logger.setLevel(logging.INFO)
            buf = io.StringIO()
            ch = logging.StreamHandler(buf)
            ch.setFormatter(logger_formatter)
            logger.addHandler(ch)

            try:
                # Run a simple job.
                automl = AutoML()
                automl_settings = {
                    "time_budget": 1,
                    "metric": 'mse',
                    "task": 'regression',
                    "log_file_name": training_log,
                    "log_training_metric": True,
                    "n_jobs": 1,
                    "model_history": True,
                }
                X_train, y_train = load_boston(return_X_y=True)
                n = len(y_train) >> 1
                automl.fit(X_train=X_train[:n],
                           y_train=y_train[:n],
                           X_val=X_train[n:],
                           y_val=y_train[n:],
                           **automl_settings)
            finally:
                # The logger is a module-level object imported from flaml;
                # detach the handler so it does not stay attached (and keep
                # capturing output) after this test, even if the fit fails.
                logger.removeHandler(ch)

            # Check if the log buffer is populated.
            self.assertGreater(len(buf.getvalue()), 0)

        import pickle
        with open('automl.pkl', 'wb') as f:
            pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
        print(automl.__version__)
示例#21
0
 def test_regression(self):
     """Fit a regression on California housing with the last tenth as validation.

     Asserts that the evaluation method recorded in the search state is
     ``"holdout"``, prints the results and parsed log output, and then calls
     ``retrain_from_log`` on the full data twice, with ``time_budget`` 1 and
     then 0.
     """
     experiment = AutoML()
     log_file = "test/california.log"

     features, targets = fetch_california_housing(return_X_y=True)
     split_at = len(targets) * 9 // 10  # first nine tenths are for training
     experiment.fit(
         X_train=features[:split_at],
         y_train=targets[:split_at],
         X_val=features[split_at:],
         y_val=targets[split_at:],
         time_budget=2,
         task="regression",
         log_file_name=log_file,
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     assert experiment._state.eval_method == "holdout"

     print(experiment.predict(features))
     print(experiment.model)
     print(experiment.config_history)
     print(experiment.best_model_for_estimator("xgboost"))
     print(experiment.best_iteration)
     print(experiment.best_estimator)
     print(get_output_from_log(log_file, 1))

     # Retrain from the log with a one-second budget, then a zero budget.
     for retrain_budget in (1, 0):
         experiment.retrain_from_log(
             task="regression",
             log_file_name=log_file,
             X_train=features,
             y_train=targets,
             train_full=True,
             time_budget=retrain_budget,
         )
示例#22
0
    def test_sparse_matrix_regression_cv(self):
        """Fit a regression with ``eval_method='cv'`` on a 100 x 100 sparse matrix.

        Prints predictions on the training matrix and the experiment's
        summary attributes.
        """
        experiment = AutoML()

        features = scipy.sparse.random(100, 100)
        targets = np.random.uniform(size=100)
        experiment.fit(
            X_train=features,
            y_train=targets,
            time_budget=2,
            eval_method='cv',
            task='regression',
            log_file_name="test/sparse_regression.log",
            model_history=True,
        )

        print(experiment.predict(features))
        for attribute in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attribute))
示例#23
0
    def test_parallel_xgboost(self, hpo_method=None):
        """Fit ``xgboost`` with two concurrent trials on data stored via ``ray.put``.

        Args:
            hpo_method: forwarded to ``fit`` as the ``hpo_method`` option.

        Uses a 900000 x 900000 sparse identity matrix with random binary
        labels. Does nothing further if an ``ImportError`` is raised (for
        example when ``ray`` cannot be imported).
        """
        experiment = AutoML()
        fit_options = dict(
            time_budget=10,
            metric="ap",
            task="classification",
            log_file_name="test/sparse_classification.log",
            estimator_list=["xgboost"],
            log_type="all",
            n_jobs=1,
            n_concurrent_trials=2,
            hpo_method=hpo_method,
        )
        features = scipy.sparse.eye(900000)
        labels = np.random.randint(2, size=900000)
        try:
            import ray

            features_ref = ray.put(features)
            experiment.fit(X_train=features_ref, y_train=labels, **fit_options)
            print(experiment.predict(features))
            print(experiment.model)
            print(experiment.config_history)
            print(experiment.best_model_for_estimator("xgboost"))
            print(experiment.best_iteration)
            print(experiment.best_estimator)
        except ImportError:
            pass
示例#24
0
    def test_ray_classification(self):
        """Fit the breast-cancer data twice: with ``use_ray=True``, then with two concurrent trials.

        A random 25% of the data is held out and passed as validation data
        to both fits. Does nothing further if either fit raises
        ``ImportError``.
        """
        features, labels = load_breast_cancer(return_X_y=True)
        train_x, holdout_x, train_y, holdout_y = train_test_split(
            features, labels, test_size=0.25
        )

        automl = AutoML()
        try:
            # The two fits differ only in the final option.
            for extra_option in ({"use_ray": True}, {"n_concurrent_trials": 2}):
                automl.fit(
                    train_x,
                    train_y,
                    X_val=holdout_x,
                    y_val=holdout_y,
                    time_budget=10,
                    task="classification",
                    **extra_option
                )
        except ImportError:
            pass
示例#25
0
def test_mlflow():
    """Log a fitted AutoML model to mlflow and load it back, without and with ray.

    Installs ``mlflow`` through pip, loads OpenML task 7592, fits inside an
    mlflow run, logs the model with ``mlflow.sklearn.log_model`` and reloads
    it with ``mlflow.pyfunc.load_model``. The same cycle is repeated with
    ``use_ray=True``, reloading through ``mlflow.sklearn.load_model``; an
    ``ImportError`` in that second cycle is ignored. Returns early if loading
    the OpenML task raises ``OpenMLServerException`` or
    ``ChunkedEncodingError``.
    """
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"])
    import mlflow
    from flaml.data import load_openml_task

    try:
        X_train, X_test, y_train, y_test = load_openml_task(
            task_id=7592, data_dir="test/"
        )
    except (OpenMLServerException, ChunkedEncodingError) as load_error:
        print(load_error)
        return

    # import AutoML class from flaml package
    from flaml import AutoML

    automl = AutoML()
    settings = dict(
        time_budget=5,  # total running time in seconds
        # primary metrics can be chosen from: ['accuracy','roc_auc',
        # 'roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
        metric="accuracy",
        estimator_list=["lgbm", "rf", "xgboost"],  # list of ML learners
        task="classification",  # task type
        sample=False,  # whether to subsample training data
        log_file_name="adult.log",  # flaml log file
    )

    mlflow.set_experiment("flaml")
    with mlflow.start_run() as run:
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        mlflow.sklearn.log_model(automl, "automl")
    pyfunc_model = mlflow.pyfunc.load_model(f"{run.info.artifact_uri}/automl")
    print(pyfunc_model.predict(X_test))

    automl._mem_thres = 0
    first_point = automl.points_to_evaluate[0]
    print(automl.trainable(first_point))

    settings["use_ray"] = True
    try:
        with mlflow.start_run() as run:
            automl.fit(X_train=X_train, y_train=y_train, **settings)
            mlflow.sklearn.log_model(automl, "automl")
        automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl")
        print(automl.predict_proba(X_test))
    except ImportError:
        pass
示例#26
0
 def test_sparse_matrix_regression(self):
     """Fit a MAE-scored regression on sparse inputs, keeping the search state.

     Asserts that the validation matrix kept in the search state has the
     same shape as the one passed in, then prints predictions on the
     training matrix and the experiment's summary attributes.
     """
     train_x = scipy.sparse.random(300, 900, density=0.0001)
     train_y = np.random.uniform(size=300)
     val_x = scipy.sparse.random(100, 900, density=0.0001)
     val_y = np.random.uniform(size=100)

     experiment = AutoML()
     experiment.fit(
         X_train=train_x,
         y_train=train_y,
         X_val=val_x,
         y_val=val_y,
         time_budget=2,
         metric="mae",
         task="regression",
         log_file_name="test/sparse_regression.log",
         n_jobs=1,
         model_history=True,
         keep_search_state=True,
         verbose=0,
         early_stop=True,
     )
     assert experiment._state.X_val.shape == val_x.shape

     print(experiment.predict(train_x))
     print(experiment.model)
     print(experiment.config_history)
     print(experiment.best_model_for_estimator("rf"))
     for attribute in (
         "best_iteration",
         "best_estimator",
         "best_config",
         "best_loss",
         "best_config_train_time",
     ):
         print(getattr(experiment, attribute))
示例#27
0
 def test_datetime_columns(self):
     """Fit and predict on a frame whose three columns all hold datetimes.

     Column ``B`` holds the same date in every row, and one column is named
     ``year_A``. The target is a six-element 0/1 array.
     """
     experiment = AutoML()
     fit_options = dict(
         time_budget=2,
         metric='mse',
         task='regression',
         log_file_name="test/datetime_columns.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )

     # (month, day) pairs; every date is in the year 1900.
     column_a_dates = [(2, 3), (3, 4), (3, 4), (3, 4), (7, 2), (8, 9)]
     year_a_dates = [(1, 2), (8, 1), (1, 4), (6, 1), (1, 5), (4, 1)]
     fake_df = pd.DataFrame({
         'A': [datetime(1900, month, day) for month, day in column_a_dates],
         'B': [datetime(1900, 1, 1) for _ in range(6)],
         'year_A': [datetime(1900, month, day) for month, day in year_a_dates],
     })
     targets = np.array([0, 1, 0, 1, 0, 0])

     experiment.fit(X_train=fake_df, y_train=targets, **fit_options)
     # The prediction is computed but deliberately not inspected.
     _ = experiment.predict(fake_df)
示例#28
0
文件: test_gpu.py 项目: sonichi/FLAML
def test_xgboost():
    """Fit the ``xgb_limitdepth`` and ``xgboost`` learners with ``gpu_per_trial=1``.

    Runs three fits: one on a sparse identity matrix with random binary
    labels and one on ``make_moons`` data, both with ``gpu_per_trial=1``,
    followed by a refit of the moons data without that option. Returns
    quietly if any step raises ``XGBoostError``.
    """
    from flaml import AutoML
    from sklearn.datasets import make_moons
    import scipy.sparse
    import numpy as np
    from xgboost.core import XGBoostError

    learners = ["xgb_limitdepth", "xgboost"]

    try:
        sparse_features = scipy.sparse.eye(900000)
        sparse_labels = np.random.randint(2, size=900000)
        automl = AutoML()
        automl.fit(
            sparse_features,
            sparse_labels,
            estimator_list=learners,
            time_budget=5,
            gpu_per_trial=1,
        )

        moon_points, moon_labels = make_moons(
            n_samples=300000, shuffle=True, noise=0.3, random_state=None
        )
        automl = AutoML()
        automl.fit(
            moon_points,
            moon_labels,
            estimator_list=learners,
            time_budget=5,
            gpu_per_trial=1,
        )
        # Same instance and data again, this time without gpu_per_trial.
        automl.fit(
            moon_points,
            moon_labels,
            estimator_list=learners,
            time_budget=5,
        )
    except XGBoostError:
        # No visible GPU is found for XGBoost.
        return
示例#29
0
def test_package_minimum():
    """Smoke-test a minimal classification fit on iris.

    Asserts that ``best_config`` exists on the fitted object, that
    ``iris.log`` was written, that a model is available, and that
    ``predict_proba`` on the training data has shape ``(150, 3)``.
    """
    # Goal and constraints for the search.
    fit_options = dict(
        time_budget=10,  # in seconds
        metric="accuracy",
        task="classification",
        log_file_name="iris.log",
    )
    classifier_search = AutoML()
    features, labels = load_iris(return_X_y=True)

    # Train with labeled input data.
    classifier_search.fit(X_train=features, y_train=labels, **fit_options)

    # The best config, the log file and the best model must all be present.
    assert hasattr(classifier_search, "best_config")
    assert Path("iris.log").exists()
    assert classifier_search.model is not None
    print(classifier_search.model)

    # Prediction shape: one row per sample, one column per class.
    probabilities = classifier_search.predict_proba(features)
    assert probabilities.shape == (150, 3)
    print(probabilities)
示例#30
0
    def test_sparse_matrix_xgboost(self):
        """Fit ``xgboost`` with the ``ap`` metric on a large sparse identity matrix.

        Uses a 900000 x 900000 sparse identity matrix with random binary
        labels, then prints predictions on the training matrix and the
        experiment's summary attributes.
        """
        experiment = AutoML()

        features = scipy.sparse.eye(900000)
        labels = np.random.randint(2, size=900000)
        experiment.fit(
            X_train=features,
            y_train=labels,
            time_budget=2,
            metric='ap',
            task='classification',
            log_file_name="test/sparse_classification.log",
            estimator_list=["xgboost"],
            log_type="all",
        )

        print(experiment.predict(features))
        for attribute in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attribute))