Exemplo n.º 1
0
def test_numpy():
    """Smoke-test time-series forecasting on plain NumPy inputs.

    A monthly ``datetime64[M]`` axis with random targets is created; the
    first 72 entries are used for fitting and the remaining ones for
    prediction. The fit is run twice: once without an estimator list and
    once restricted to ``arima``/``sarimax``.
    """
    n_fit = 72  # number of leading timestamps used for fitting
    timestamps = np.arange("2014-01", "2021-01", dtype="datetime64[M]")
    targets = np.random.random(size=len(timestamps))
    future_stamps = timestamps[n_fit:]

    # Arguments shared by both runs.
    shared = dict(
        X_train=timestamps[:n_fit],  # a single column of timestamp
        y_train=targets[:n_fit],  # value for each timestamp
        period=12,  # time horizon to forecast, e.g., 12 months
        task="ts_forecast",
        log_file_name="test/ts_forecast.log",
    )

    # First run: 3 second budget and 3 splits.
    forecaster = AutoML()
    forecaster.fit(time_budget=3, n_splits=3, **shared)
    print(forecaster.predict(future_stamps))

    # Second run: 1 second budget, only the listed estimators.
    forecaster = AutoML()
    forecaster.fit(time_budget=1, estimator_list=["arima", "sarimax"], **shared)
    print(forecaster.predict(future_stamps))
    # an alternative way to specify predict steps for arima/sarimax
    print(forecaster.predict(12))
Exemplo n.º 2
0
    def test_parallel_xgboost(self, hpo_method=None):
        """Run concurrent XGBoost trials on a large sparse identity matrix.

        The training matrix is passed to ``fit`` as a ``ray.put`` object
        reference. The test returns silently if an ``ImportError`` is raised
        inside the guarded section (which includes ``import ray``).
        """
        n_rows = 900000
        experiment = AutoML()
        settings = dict(
            time_budget=10,
            metric="ap",
            task="classification",
            log_file_name="test/sparse_classification.log",
            estimator_list=["xgboost"],
            log_type="all",
            n_jobs=1,
            n_concurrent_trials=2,
            hpo_method=hpo_method,
        )
        features = scipy.sparse.eye(n_rows)
        labels = np.random.randint(2, size=n_rows)
        try:
            import ray

            features_ref = ray.put(features)
            experiment.fit(X_train=features_ref, y_train=labels, **settings)
            print(experiment.predict(features))
            for attr in ("model", "config_history"):
                print(getattr(experiment, attr))
            print(experiment.best_model_for_estimator("xgboost"))
            for attr in ("best_iteration", "best_estimator"):
                print(getattr(experiment, attr))
        except ImportError:
            return
Exemplo n.º 3
0
 def test_regression_xgboost(self):
     """Fit two custom XGBoost learners on sparse regression data.

     Registers ``MyXGB1`` and ``MyXGB2`` as learners, fits with an explicit
     validation set, checks the stored validation shape and prints the
     results of the search.
     """
     features = scipy.sparse.random(300, 900, density=0.0001)
     targets = np.random.uniform(size=300)
     val_features = scipy.sparse.random(100, 900, density=0.0001)
     val_targets = np.random.uniform(size=100)

     experiment = AutoML()
     for learner_name, learner_class in (("my_xgb1", MyXGB1), ("my_xgb2", MyXGB2)):
         experiment.add_learner(learner_name=learner_name,
                                learner_class=learner_class)

     settings = dict(
         time_budget=2,
         estimator_list=["my_xgb1", "my_xgb2"],
         task="regression",
         log_file_name="test/regression_xgboost.log",
         n_jobs=1,
         model_history=True,
         keep_search_state=True,
         early_stop=True,
     )
     experiment.fit(
         X_train=features,
         y_train=targets,
         X_val=val_features,
         y_val=val_targets,
         **settings
     )
     # The validation matrix kept in the search state must keep its shape.
     assert experiment._state.X_val.shape == val_features.shape

     print(experiment.predict(features))
     for attr in ("model", "config_history"):
         print(getattr(experiment, attr))
     print(experiment.best_model_for_estimator("my_xgb2"))
     for attr in (
         "best_iteration",
         "best_estimator",
         "best_config",
         "best_loss",
         "best_config_train_time",
     ):
         print(getattr(experiment, attr))
Exemplo n.º 4
0
 def test_sparse_matrix_regression(self):
     """Fit a regression task on random sparse matrices with a validation set.

     Checks that the validation matrix kept in ``_state`` has the shape of
     the one passed in, then prints the results of the search.
     """
     features = scipy.sparse.random(300, 900, density=0.0001)
     targets = np.random.uniform(size=300)
     val_features = scipy.sparse.random(100, 900, density=0.0001)
     val_targets = np.random.uniform(size=100)

     experiment = AutoML()
     settings = dict(
         time_budget=2,
         metric="mae",
         task="regression",
         log_file_name="test/sparse_regression.log",
         n_jobs=1,
         model_history=True,
         keep_search_state=True,
         verbose=0,
         early_stop=True,
     )
     experiment.fit(
         X_train=features,
         y_train=targets,
         X_val=val_features,
         y_val=val_targets,
         **settings
     )
     assert experiment._state.X_val.shape == val_features.shape

     print(experiment.predict(features))
     for attr in ("model", "config_history"):
         print(getattr(experiment, attr))
     print(experiment.best_model_for_estimator("rf"))
     for attr in (
         "best_iteration",
         "best_estimator",
         "best_config",
         "best_loss",
         "best_config_train_time",
     ):
         print(getattr(experiment, attr))
Exemplo n.º 5
0
    def test_sparse_matrix_regression(self):
        """Fit a regression task on random sparse matrices with a validation set.

        Checks that ``X_val`` stored on the AutoML object has the shape of
        the validation matrix passed in, then prints the search results.
        """
        experiment = AutoML()
        settings = dict(
            time_budget=2,
            metric='mae',
            task='regression',
            log_file_name="test/sparse_regression.log",
            model_history=True,
        )
        features = scipy.sparse.random(300, 900, density=0.0001)
        targets = np.random.uniform(size=300)
        val_features = scipy.sparse.random(100, 900, density=0.0001)
        val_targets = np.random.uniform(size=100)

        experiment.fit(
            X_train=features,
            y_train=targets,
            X_val=val_features,
            y_val=val_targets,
            **settings
        )
        assert experiment.X_val.shape == val_features.shape

        print(experiment.predict(features))
        for attr in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
            "best_config",
            "best_loss",
            "best_config_train_time",
        ):
            print(getattr(experiment, attr))
Exemplo n.º 6
0
 def test_sparse_matrix_lr(self):
     """Fit the ``lrl1``/``lrl2`` estimators on a sparse classification task.

     ``fit`` is called twice on the same AutoML object: first with a
     3 second budget and ``train_time_limit=1``, then with a 5 second
     budget and no train time limit.
     """
     experiment = AutoML()
     settings = dict(
         time_budget=3,
         metric="f1",
         task="classification",
         log_file_name="test/sparse_classification.log",
         estimator_list=["lrl1", "lrl2"],
         log_type="all",
         n_jobs=1,
     )
     features = scipy.sparse.random(3000, 3000, density=0.1)
     labels = np.random.randint(2, size=3000)

     # First pass: per-model training time capped through train_time_limit.
     experiment.fit(X_train=features, y_train=labels, train_time_limit=1,
                    **settings)

     # Second pass: same settings with a larger overall budget.
     settings.update(time_budget=5)
     experiment.fit(X_train=features, y_train=labels, **settings)

     print(experiment.predict(features))
     for attr in ("model", "config_history"):
         print(getattr(experiment, attr))
     print(experiment.best_model_for_estimator("lrl2"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(experiment, attr))
Exemplo n.º 7
0
    def test_random_skip_oom(self):
        """Run random search with the custom ``large_lgbm`` learner.

        Two concurrent trials are requested on a large sparse identity
        matrix. If an ``ImportError`` is raised the test prints a skip
        message and returns.
        """
        n_rows = 900000
        experiment = AutoML()
        experiment.add_learner(learner_name="large_lgbm",
                               learner_class=MyLargeLGBM)
        settings = dict(
            time_budget=2,
            task="classification",
            log_file_name="test/sparse_classification_oom.log",
            estimator_list=["large_lgbm"],
            log_type="all",
            n_jobs=1,
            hpo_method="random",
            n_concurrent_trials=2,
        )
        features = scipy.sparse.eye(n_rows)
        labels = np.random.randint(2, size=n_rows)

        try:
            experiment.fit(X_train=features, y_train=labels, **settings)
            print(experiment.predict(features))
            for attr in ("model", "config_history"):
                print(getattr(experiment, attr))
            print(experiment.best_model_for_estimator("large_lgbm"))
            for attr in ("best_iteration", "best_estimator"):
                print(getattr(experiment, attr))
        except ImportError:
            print("skipping concurrency test as ray is not installed")
            return
Exemplo n.º 8
0
    def test_regression(self):
        """Fit a regression task on the dataset returned by ``load_boston``.

        The first half of the rows is used for training and the second half
        as an explicit validation set; the test asserts the validation size
        and that the evaluation method is ``'holdout'``.
        """
        experiment = AutoML()
        settings = dict(
            time_budget=2,
            metric='mse',
            task='regression',
            log_file_name="test/boston.log",
            log_training_metric=True,
            model_history=True,
        )
        features, targets = load_boston(return_X_y=True)
        n = len(targets)
        half = n >> 1  # index splitting training rows from validation rows

        experiment.fit(
            X_train=features[:half],
            y_train=targets[:half],
            X_val=features[half:],
            y_val=targets[half:],
            **settings
        )
        assert experiment.y_val.shape[0] == n - half
        assert experiment.eval_method == 'holdout'

        print(experiment.predict(features))
        for attr in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attr))
        print(get_output_from_log(settings["log_file_name"], 1))
Exemplo n.º 9
0
def run(dataset, config):
    """Train FLAML on a benchmark task and package its predictions.

    Args:
        dataset: Benchmark dataset; ``train.X``/``train.y`` are used for
            fitting and ``test.X``/``test.y`` for prediction and as truth.
        config: Task configuration. The fields read here are ``type``,
            ``metric``, ``cores``, ``max_runtime_seconds``,
            ``framework_params`` and ``output_predictions_file``.

    Returns:
        The value returned by ``result(...)``, built from the predictions,
        the class probabilities and labels (classification only), the
        ground truth, the number of tried configurations and the measured
        training/prediction durations.
    """
    log.info(f"\n**** FLAML [v{__version__}] ****\n")

    X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

    is_classification = config.type == 'classification'
    time_budget = config.max_runtime_seconds
    # The private '_n_jobs' framework parameter overrides the core count.
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    log.info("Running FLAML with {} number of cores".format(config.cores))
    aml = AutoML()

    # Mapping of benchmark metrics to flaml metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='log_loss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        r2='r2',
    )
    perf_metric = metrics_mapping.get(config.metric)
    if perf_metric is None:
        # Previously the fallback to 'auto' was applied before this check,
        # so the warning could never fire. Warn first, then fall back.
        log.warning("Performance metric %s not supported.", config.metric)
        perf_metric = 'auto'

    # Parameters whose name starts with '_' are consumed here and are not
    # forwarded to AutoML.fit.
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log_dir = output_subdir("logs", config)
    flaml_log_file_name = os.path.join(log_dir, "flaml.log")
    with Timer() as training:
        aml.fit(X_train,
                y_train,
                metric=perf_metric,
                task=config.type,
                n_jobs=n_jobs,
                log_file_name=flaml_log_file_name,
                time_budget=time_budget,
                **training_params)

    # Only predict() is timed; predict_proba() runs outside the timer.
    with Timer() as predict:
        predictions = aml.predict(X_test)
    probabilities = aml.predict_proba(X_test) if is_classification else None
    labels = aml.classes_ if is_classification else None
    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        models_count=len(aml.config_history),
        training_duration=training.duration,
        predict_duration=predict.duration,
        probabilities_labels=labels,
    )
Exemplo n.º 10
0
def _test_custom_data():
    """Fit a ``seq-classification`` task on TSV files under ``data/input/``.

    The function returns early when reading the files raises
    ``requests.exceptions.HTTPError``. After fitting, ``predict`` is called
    on the test frame, on a flat list of strings and on a list of string
    pairs.
    """
    from flaml import AutoML
    import requests
    import pandas as pd

    try:
        train_dataset, dev_dataset, test_dataset = [
            pd.read_csv(path, delimiter="\t", quoting=3)
            for path in (
                "data/input/train.tsv",
                "data/input/dev.tsv",
                "data/input/test.tsv",
            )
        ]
    except requests.exceptions.HTTPError:
        return

    sentence_columns = ["#1 String", "#2 String"]
    target_column = "Quality"

    train_features = train_dataset[sentence_columns]
    train_labels = train_dataset[target_column]
    dev_features = dev_dataset[sentence_columns]
    dev_labels = dev_dataset[target_column]
    test_features = test_dataset[sentence_columns]

    automl = AutoML()
    settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "data/output/",
            "ckpt_per_epoch": 1,
        },
    }

    automl.fit(
        X_train=train_features,
        y_train=train_labels,
        X_val=dev_features,
        y_val=dev_labels,
        **settings
    )
    automl.predict(test_features)
    automl.predict(["test test"])
    sentence_pair = ["test test", "test test"]
    automl.predict([list(sentence_pair) for _ in range(3)])
Exemplo n.º 11
0
 def test_binary(self):
     """Fit the ``binary`` task on the ``load_breast_cancer`` data and predict."""
     experiment = AutoML()
     settings = dict(
         time_budget=1,
         task="binary",
         log_file_name="test/breast_cancer.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     features, labels = load_breast_cancer(return_X_y=True)
     experiment.fit(X_train=features, y_train=labels, **settings)
     # Only checks that predict() runs; the result is discarded.
     _ = experiment.predict(features)
Exemplo n.º 12
0
 def test_regression(self):
     """Fit a regression task on ``fetch_california_housing`` and retrain from log.

     The first nine tenths of the rows are used for training and the rest
     as validation. After the search the model is retrained from the log
     file twice, with a time budget of 1 and then 0.
     """
     experiment = AutoML()
     settings = dict(
         time_budget=2,
         task="regression",
         log_file_name="test/california.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     features, targets = fetch_california_housing(return_X_y=True)
     split = int(len(targets) * 9 // 10)

     experiment.fit(
         X_train=features[:split],
         y_train=targets[:split],
         X_val=features[split:],
         y_val=targets[split:],
         **settings
     )
     assert experiment._state.eval_method == "holdout"

     print(experiment.predict(features))
     for attr in ("model", "config_history"):
         print(getattr(experiment, attr))
     print(experiment.best_model_for_estimator("xgboost"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(experiment, attr))
     print(get_output_from_log(settings["log_file_name"], 1))

     # Retrain on the full data from the logged configurations, first with
     # a one second budget and then with a zero budget.
     for retrain_budget in (1, 0):
         experiment.retrain_from_log(
             task="regression",
             log_file_name=settings["log_file_name"],
             X_train=features,
             y_train=targets,
             train_full=True,
             time_budget=retrain_budget,
         )
Exemplo n.º 13
0
    def test_sparse_matrix_regression_cv(self):
        """Fit a regression task on a random sparse matrix with ``eval_method='cv'``."""
        experiment = AutoML()
        settings = dict(
            time_budget=2,
            eval_method='cv',
            task='regression',
            log_file_name="test/sparse_regression.log",
            model_history=True,
        )
        features = scipy.sparse.random(100, 100)
        targets = np.random.uniform(size=100)
        experiment.fit(X_train=features, y_train=targets, **settings)

        print(experiment.predict(features))
        for attr in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attr))
Exemplo n.º 14
0
 def test_datetime_columns(self):
     """Fit and predict on a DataFrame whose three columns hold datetimes.

     Column ``B`` is constant, and one column is named ``year_A`` next to a
     column named ``A``.
     """

     def _dates_in_1900(month_day_pairs):
         # Build datetime objects in the year 1900 from (month, day) pairs.
         return [datetime(1900, month, day) for month, day in month_day_pairs]

     experiment = AutoML()
     settings = dict(
         time_budget=2,
         metric='mse',
         task='regression',
         log_file_name="test/datetime_columns.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     frame = pd.DataFrame({
         'A': _dates_in_1900(
             [(2, 3), (3, 4), (3, 4), (3, 4), (7, 2), (8, 9)]),
         'B': _dates_in_1900([(1, 1)] * 6),
         'year_A': _dates_in_1900(
             [(1, 2), (8, 1), (1, 4), (6, 1), (1, 5), (4, 1)]),
     })
     targets = np.array([0, 1, 0, 1, 0, 0])
     experiment.fit(X_train=frame, y_train=targets, **settings)
     # Only checks that predict() runs; the result is discarded.
     _ = experiment.predict(frame)
Exemplo n.º 15
0
    def test_sparse_matrix_xgboost(self):
        """Fit XGBoost on a large sparse identity matrix with random binary labels."""
        n_rows = 900000
        experiment = AutoML()
        settings = dict(
            time_budget=2,
            metric='ap',
            task='classification',
            log_file_name="test/sparse_classification.log",
            estimator_list=["xgboost"],
            log_type="all",
        )
        features = scipy.sparse.eye(n_rows)
        labels = np.random.randint(2, size=n_rows)
        experiment.fit(X_train=features, y_train=labels, **settings)

        print(experiment.predict(features))
        for attr in (
            "model",
            "config_history",
            "model_history",
            "best_iteration",
            "best_estimator",
        ):
            print(getattr(experiment, attr))
Exemplo n.º 16
0
 def test_classification(self, as_frame=False):
     """Fit a classification task on ``load_iris`` and retrain from its log.

     With ``as_frame=True`` the columns are renamed to integers and an
     extra all-zero column is appended before fitting. After the search a
     fresh ``AutoML(task="classification")`` retrains record 0 from the
     log file on the full training data.
     """
     experiment = AutoML()
     settings = dict(
         time_budget=4,
         metric="accuracy",
         task="classification",
         log_file_name="test/iris.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     features, labels = load_iris(return_X_y=True, as_frame=as_frame)
     if as_frame:
         # test drop column
         n_columns = features.shape[1]
         features.columns = range(n_columns)
         features[features.shape[1]] = np.zeros(len(labels))
     experiment.fit(X_train=features, y_train=labels, **settings)

     print(experiment.classes_)
     print(experiment.predict(features)[:5])
     for attr in ("model", "config_history"):
         print(getattr(experiment, attr))
     print(experiment.best_model_for_estimator("catboost"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(experiment, attr))

     for key in ("metric", "model_history", "log_training_metric"):
         del settings[key]

     experiment = AutoML(task="classification")
     elapsed = experiment.retrain_from_log(
         log_file_name=settings["log_file_name"],
         X_train=features,
         y_train=labels,
         train_full=True,
         record_id=0,
     )
     print(elapsed)
     print(experiment.model)
     print(experiment.predict_proba(features)[:5])
Exemplo n.º 17
0
 def test_sparse_matrix_regression_cv(self):
     """Fit a cross-validated regression on an 8-row sparse matrix.

     A ``sample_weight`` vector of ones, one entry per training row, is
     passed through the settings.
     """
     n_rows = 8
     features = scipy.sparse.random(n_rows, 100)
     targets = np.random.uniform(size=n_rows)
     experiment = AutoML()
     settings = dict(
         time_budget=2,
         eval_method='cv',
         task='regression',
         log_file_name="test/sparse_regression.log",
         n_jobs=1,
         model_history=True,
         metric="mse",
         sample_weight=np.ones(len(targets)),
     )
     experiment.fit(X_train=features, y_train=targets, **settings)

     print(experiment.predict(features))
     for attr in (
         "model",
         "config_history",
         "model_history",
         "best_iteration",
         "best_estimator",
     ):
         print(getattr(experiment, attr))
Exemplo n.º 18
0
 def test_parallel(self, hpo_method=None):
     """Run a regression search with ten concurrent trials.

     Uses the ``fetch_california_housing`` data. The test returns silently
     if fitting or reporting raises ``ImportError``.
     """
     experiment = AutoML()
     settings = dict(
         time_budget=10,
         task="regression",
         log_file_name="test/california.log",
         log_type="all",
         n_jobs=1,
         n_concurrent_trials=10,
         hpo_method=hpo_method,
     )
     features, targets = fetch_california_housing(return_X_y=True)
     try:
         experiment.fit(X_train=features, y_train=targets, **settings)
         print(experiment.predict(features))
         for attr in ("model", "config_history"):
             print(getattr(experiment, attr))
         print(experiment.best_model_for_estimator("xgboost"))
         for attr in ("best_iteration", "best_estimator"):
             print(getattr(experiment, attr))
     except ImportError:
         return
Exemplo n.º 19
0
def test_regression():
    """Fit a ``seq-regression`` task with ray on two tiny sentence-pair sets.

    The test returns immediately when ray cannot be imported. ray is shut
    down and re-initialised right before fitting, and the fitted model is
    used to predict on the validation sentences.
    """
    try:
        import ray
    except ImportError:
        return
    from flaml import AutoML
    import pandas as pd

    sentence_columns = ["sentence1", "sentence2"]
    target_column = "label"

    train_dataset = pd.DataFrame({
        "sentence1": [
            "A plane is taking off.",
            "A man is playing a large flute.",
            "A man is spreading shreded cheese on a pizza.",
            "Three men are playing chess.",
        ],
        "sentence2": [
            "An air plane is taking off.",
            "A man is playing a flute.",
            "A man is spreading shredded cheese on an uncooked pizza.",
            "Two men are playing chess.",
        ],
        "label":
        [5.0, 3.799999952316284, 3.799999952316284, 2.5999999046325684],
        "idx": [0, 1, 2, 3],
    })

    dev_dataset = pd.DataFrame({
        "sentence1": [
            "A man is playing the cello.",
            "Some men are fighting.",
            "A man is smoking.",
            "The man is playing the piano.",
        ],
        "sentence2": [
            "A man seated is playing the cello.",
            "Two men are fighting.",
            "A man is skating.",
            "The man is playing the guitar.",
        ],
        "label": [4.25, 4.25, 0.5, 1.600000023841858],
        "idx": [4, 5, 6, 7],
    })

    train_features = train_dataset[sentence_columns]
    train_targets = train_dataset[target_column]
    dev_features = dev_dataset[sentence_columns]
    dev_targets = dev_dataset[target_column]

    automl = AutoML()
    settings = {
        "gpu_per_trial": 0,
        "max_iter": 2,
        "time_budget": 5,
        "task": "seq-regression",
        "metric": "pearsonr",
        "starting_points": {
            "transformer": {
                "num_train_epochs": 1
            }
        },
        "use_ray": True,
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "test/data/output/",
            "ckpt_per_epoch": 1,
            "fp16": False,
        },
    }

    # Restart ray before fitting.
    ray.shutdown()
    ray.init()
    automl.fit(
        X_train=train_features,
        y_train=train_targets,
        X_val=dev_features,
        y_val=dev_targets,
        **settings
    )
    automl.predict(dev_features)
Exemplo n.º 20
0
def test_hf_data():
    """Fit, retrain, pickle and predict a ``seq-classification`` model.

    Steps: fit on four sentence pairs with four more as validation (return
    early on ``requests.exceptions.HTTPError``), retrain record 0 from the
    log with a fresh AutoML object, round-trip that object through
    ``automl.pkl``, delete ``test/data/output/`` and finally call
    ``predict``/``predict_proba`` on several input shapes.
    """
    from flaml import AutoML
    import pandas as pd

    sentence_columns = ["sentence1", "sentence2"]
    target_column = "label"

    train_dataset = pd.DataFrame({
        "sentence1": [
            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
        ],
        "sentence2": [
            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
        ],
        "label": [1, 0, 1, 0],
        "idx": [0, 1, 2, 3],
    })

    dev_dataset = pd.DataFrame({
        "sentence1": [
            "The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
            "Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
            "The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
            "The DVD-CCA then appealed to the state Supreme Court .",
        ],
        "sentence2": [
            "PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
            "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
            "The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
            "The DVD CCA appealed that decision to the U.S. Supreme Court .",
        ],
        "label": [1, 1, 0, 1],
        "idx": [4, 5, 6, 7],
    })

    test_dataset = pd.DataFrame({
        "sentence1": [
            "That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .",
            "Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .",
            "Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .",
            "The Nasdaq composite index increased 10.73 , or 0.7 percent , to 1,514.77 .",
        ],
        "sentence2": [
            "Earnings were affected by a non-recurring $ 8 million tax benefit in the year-ago period .",
            "Shares of Xoma fell 16 percent in early trade , while shares of Genentech , a much larger company with several products on the market , were up 2 percent .",
            "Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .",
            "The Nasdaq Composite index , full of technology stocks , was lately up around 18 points .",
        ],
        "label": [0, 0, 0, 0],
        "idx": [8, 10, 11, 12],
    })

    train_features = train_dataset[sentence_columns]
    train_labels = train_dataset[target_column]
    dev_features = dev_dataset[sentence_columns]
    dev_labels = dev_dataset[target_column]
    test_features = test_dataset[sentence_columns]

    automl = AutoML()
    settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 10,
        "task": "seq-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "test/data/output/",
            "ckpt_per_epoch": 5,
            "fp16": False,
        },
    }

    try:
        automl.fit(
            X_train=train_features,
            y_train=train_labels,
            X_val=dev_features,
            y_val=dev_labels,
            **settings
        )
    except requests.exceptions.HTTPError:
        return

    # Retrain the first logged record on the full training data.
    automl = AutoML()
    automl.retrain_from_log(
        X_train=train_features,
        y_train=train_labels,
        train_full=True,
        record_id=0,
        **settings
    )

    # Round-trip the retrained object through a pickle file.
    pickle_path = "automl.pkl"
    with open(pickle_path, "wb") as sink:
        pickle.dump(automl, sink, pickle.HIGHEST_PROTOCOL)
    with open(pickle_path, "rb") as source:
        automl = pickle.load(source)

    # The output directory is removed before the unpickled object predicts.
    shutil.rmtree("test/data/output/")
    automl.predict(test_features)
    automl.predict(["test test", "test test"])
    sentence_pair = ["test test", "test test"]
    automl.predict([list(sentence_pair) for _ in range(3)])

    automl.predict_proba(test_features)
    print(automl.classes_)
Exemplo n.º 21
0
    def test_logging_level(self):
        """Check that FLAML logs at INFO level and exercise search internals.

        The test attaches a buffer-backed handler to the FLAML logger, runs
        a short regression search on the ``fetch_california_housing`` data,
        then drives the same search space through optuna sampling, the
        ``trainable`` function, ``BlendSearch`` and ``tune.run``. It asserts
        that the log buffer is not empty and, afterwards, that predictions
        are identical before and after a pickle round trip.
        """

        from flaml import logger, logger_formatter

        with tempfile.TemporaryDirectory() as d:

            training_log = os.path.join(d, "training.log")

            # Configure logging for the FLAML logger
            # and add a handler that outputs to a buffer.
            # NOTE(review): the handler is never removed from the logger in
            # this method, so it stays attached after the test finishes.
            logger.setLevel(logging.INFO)
            buf = io.StringIO()
            ch = logging.StreamHandler(buf)
            ch.setFormatter(logger_formatter)
            logger.addHandler(ch)

            # Run a simple job.
            automl = AutoML()
            automl_settings = {
                "time_budget": 1,
                "metric": "rmse",
                "task": "regression",
                "log_file_name": training_log,
                "log_training_metric": True,
                "n_jobs": 1,
                "model_history": True,
                "keep_search_state": True,
                "learner_selector": "roundrobin",
            }
            X_train, y_train = fetch_california_housing(return_X_y=True)
            n = len(y_train) >> 1
            # model, classes_ and predict() are accessed before fit() is called.
            print(automl.model, automl.classes_, automl.predict(X_train))
            # First half of the rows trains, second half validates.
            automl.fit(X_train=X_train[:n],
                       y_train=y_train[:n],
                       X_val=X_train[n:],
                       y_val=y_train[n:],
                       **automl_settings)
            # Log the search-related attributes of the fitted object.
            logger.info(automl.search_space)
            logger.info(automl.low_cost_partial_config)
            logger.info(automl.points_to_evaluate)
            logger.info(automl.cat_hp_cost)
            import optuna as ot

            # Sample one configuration from the search space via an optuna
            # trial.
            study = ot.create_study()
            from flaml.tune.space import define_by_run_func, add_cost_to_space

            sample = define_by_run_func(study.ask(), automl.search_space)
            logger.info(sample)
            # NOTE(review): unflatten_hierarchical is not imported in this
            # method; presumably it is imported at module level - verify.
            logger.info(unflatten_hierarchical(sample, automl.search_space))
            add_cost_to_space(automl.search_space,
                              automl.low_cost_partial_config,
                              automl.cat_hp_cost)
            logger.info(automl.search_space["ml"].categories)
            # If a best config exists, evaluate it once through trainable().
            if automl.best_config:
                config = automl.best_config.copy()
                config["learner"] = automl.best_estimator
                automl.trainable({"ml": config})
            from flaml import tune, BlendSearch
            from flaml.automl import size
            from functools import partial

            # Build a BlendSearch instance from the AutoML object's own
            # search attributes and run it for one second through tune.run.
            low_cost_partial_config = automl.low_cost_partial_config
            search_alg = BlendSearch(
                metric="val_loss",
                mode="min",
                space=automl.search_space,
                low_cost_partial_config=low_cost_partial_config,
                points_to_evaluate=automl.points_to_evaluate,
                cat_hp_cost=automl.cat_hp_cost,
                resource_attr=automl.resource_attr,
                min_resource=automl.min_resource,
                max_resource=automl.max_resource,
                config_constraints=[(partial(size, automl._state), "<=",
                                     automl._mem_thres)],
                metric_constraints=automl.metric_constraints,
            )
            analysis = tune.run(
                automl.trainable,
                search_alg=search_alg,  # verbose=2,
                time_budget_s=1,
                num_samples=-1,
            )
            # Smallest validation loss over all trials of the run.
            print(
                min(trial.last_result["val_loss"]
                    for trial in analysis.trials))
            # Train once more with the configuration of the last trial.
            config = analysis.trials[-1].last_result["config"]["ml"]
            automl._state._train_with_config(config["learner"], config)
            for _ in range(3):
                print(
                    search_alg._ls.complete_config(
                        low_cost_partial_config,
                        search_alg._ls_bound_min,
                        search_alg._ls_bound_max,
                    ))
            # Check if the log buffer is populated.
            self.assertTrue(len(buf.getvalue()) > 0)

        import pickle

        # Pickle round trip: predictions before and after must be identical.
        # "automl.pkl" is a relative path, outside the temporary directory.
        with open("automl.pkl", "wb") as f:
            pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
        print(automl.__version__)
        pred1 = automl.predict(X_train)
        with open("automl.pkl", "rb") as f:
            automl = pickle.load(f)
        pred2 = automl.predict(X_train)
        delta = pred1 - pred2
        assert max(delta) == 0 and min(delta) == 0
        automl.save_best_config("test/housing.json")
Exemplo n.º 22
0
def test_automl(budget=5, dataset_format="dataframe", hpo_method=None):
    """Run an end-to-end classification search on OpenML dataset 1169.

    The test fits ``AutoML`` within ``budget`` seconds, prints the best
    learner and its validation results, pickles the fitted object, scores the
    predictions on the held-out split, replays the search log, and finally
    fits a second time with ``ensemble=True``.

    Args:
        budget: Search time budget in seconds (passed as ``time_budget``).
        dataset_format: Forwarded to ``load_openml_dataset``.
        hpo_method: Forwarded to ``AutoML.fit`` as ``hpo_method``.

    Returns:
        None. The function returns early (after printing the error) when the
        dataset cannot be loaded.
    """
    import pickle

    from flaml import AutoML
    from flaml.data import get_output_from_log, load_openml_dataset
    from flaml.ml import sklearn_metric_loss_score

    try:
        X_train, X_test, y_train, y_test = load_openml_dataset(
            dataset_id=1169, data_dir="test/", dataset_format=dataset_format
        )
    except (OpenMLServerException, ChunkedEncodingError) as e:
        # Without the dataset there is nothing to test; treat it as a skip.
        print(e)
        return

    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "accuracy",  # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
        "task": "classification",  # task type
        "log_file_name": "airlines_experiment.log",  # flaml log file
        "seed": 7654321,  # random seed
        "hpo_method": hpo_method,
    }

    # The main flaml automl API.
    automl.fit(X_train=X_train, y_train=y_train, **settings)

    # Retrieve the best config and best learner.
    print("Best ML leaner:", automl.best_estimator)
    print("Best hyperparmeter config:", automl.best_config)
    print("Best accuracy on validation data: {0:.4g}".format(1 - automl.best_loss))
    print(
        "Training duration of best run: {0:.4g} s".format(automl.best_config_train_time)
    )
    print(automl.model.estimator)
    print("time taken to find best model:", automl.time_to_find_best_model)

    # Pickle and save the automl object.
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

    # Compute predictions on the testing dataset.
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)
    # Keep only the second probability column for the scalar-score metrics.
    y_pred_proba = automl.predict_proba(X_test)[:, 1]

    # Compute different metric values on the testing dataset.
    print("accuracy", "=", 1 - sklearn_metric_loss_score("accuracy", y_pred, y_test))
    print(
        "roc_auc", "=", 1 - sklearn_metric_loss_score("roc_auc", y_pred_proba, y_test)
    )
    print("log_loss", "=", sklearn_metric_loss_score("log_loss", y_pred_proba, y_test))

    # Replay the search log. The read-back window follows ``budget`` with one
    # second of slack, so a caller-supplied budget is not silently truncated
    # (for the default budget of 5 this is the previously hard-coded 6).
    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(
        filename=settings["log_file_name"], time_budget=budget + 1
    )
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)

    # Fit once more with ensembling enabled on the same data and settings.
    automl.fit(X_train=X_train, y_train=y_train, ensemble=True, **settings)
Exemplo n.º 23
0
def _test_hf_data():
    """Fine-tune a sequence classifier on a 1% slice of GLUE/MRPC.

    The flow is: search with ``AutoML.fit``, rebuild a model with
    ``retrain_from_log``, round-trip the object through pickle, delete the
    checkpoint output directory, and then call ``predict`` /
    ``predict_proba`` with several input shapes.

    The leading underscore keeps the default pytest name pattern (``test_*``)
    from collecting this function.

    Returns:
        None. Returns early when the datasets cannot be downloaded.
    """
    # Imported locally so the function does not rely on module-level imports.
    import pickle
    import shutil

    import requests
    from datasets import load_dataset
    from flaml import AutoML

    try:
        train_dataset = load_dataset("glue", "mrpc", split="train[:1%]").to_pandas()
        dev_dataset = load_dataset("glue", "mrpc", split="validation[:1%]").to_pandas()
        test_dataset = load_dataset("glue", "mrpc", split="test[:1%]").to_pandas()
    except requests.exceptions.ConnectionError:
        # No connectivity: nothing to test.
        return

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 1,
        "max_iter": 2,
        "time_budget": 5000,
        "task": "seq-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
        "use_ray": True,
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "facebook/muppet-roberta-base",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": True,
    }

    automl.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
    )

    # Start from a fresh object and rebuild the model from the search log.
    automl = AutoML()
    automl.retrain_from_log(
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        record_id=0,
        **automl_settings
    )

    # Round-trip through pickle; the predictions below use the reloaded object.
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    with open("automl.pkl", "rb") as f:
        automl = pickle.load(f)

    # Remove the output directory before predicting, so the calls below run
    # without the files written under it.
    shutil.rmtree("test/data/output/")

    # Predict from a DataFrame, a flat list of strings and a list of pairs.
    automl.predict(X_test)
    automl.predict(["test test", "test test"])
    automl.predict(
        [
            ["test test", "test test"],
            ["test test", "test test"],
            ["test test", "test test"],
        ]
    )

    automl.predict_proba(X_test)
    print(automl.classes_)
Exemplo n.º 24
0
def test_multivariate_forecast_cat(budget=5):
    """Forecast ``demand`` from a multivariate frame with categorical regressors.

    Fits ``AutoML`` on the training frame returned by
    ``load_multi_dataset_cat``, falling back to the arima/sarimax estimators
    when an ``ImportError`` is raised, then prints the best model, pickles the
    fitted object, scores the predictions and replays the search log.

    Args:
        budget: Search time budget in seconds; also used as the log read-back
            window.
    """
    import pickle

    from flaml.data import get_output_from_log
    from flaml.ml import sklearn_metric_loss_score

    horizon = 180
    train_df, test_df = load_multi_dataset_cat(horizon)

    # The test frame must carry values for the regressors / multivariate
    # variables alongside the timestamp.
    regressor_columns = ["timeStamp", "season", "above_monthly_avg"]
    X_test = test_df[regressor_columns]
    y_test = test_df["demand"]

    automl = AutoML()
    settings = dict(
        time_budget=budget,  # total running time in seconds
        metric="mape",  # primary metric
        task="ts_forecast",  # task type
        log_file_name="test/energy_forecast_categorical.log",  # flaml log file
        eval_method="holdout",
        log_type="all",
        label="demand",
    )

    # The main flaml automl API.
    try:
        import prophet  # noqa: F401  (imported only to detect availability)

        automl.fit(dataframe=train_df, **settings, period=horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        fallback_estimators = ["arima", "sarimax"]
        automl.fit(
            dataframe=train_df,
            **settings,
            estimator_list=fallback_estimators,
            period=horizon,
        )

    # Retrieve the best config and best learner.
    print("Best ML leaner:", automl.best_estimator)
    print("Best hyperparmeter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)

    # Pickle and save the automl object.
    with open("automl.pkl", "wb") as pickle_file:
        pickle.dump(automl, pickle_file, pickle.HIGHEST_PROTOCOL)

    # Compute predictions on the testing dataset.
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)

    # Compute different metric values on the testing dataset.
    for metric_name in ("mape", "rmse", "mse", "mae"):
        print(metric_name, "=", sklearn_metric_loss_score(metric_name, y_pred, y_test))

    # Only the config history is printed; the other four histories are unused.
    _, _, _, config_history, _ = get_output_from_log(
        filename=settings["log_file_name"], time_budget=budget
    )
    for logged_config in config_history:
        print(logged_config)

    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)
Exemplo n.º 25
0
                automl.best_config_train_time))
            time_history, best_valid_loss_history, valid_loss_history, config_history, train_loss_history = \
                get_output_from_log(filename=settings['log_file_name'], time_budget=TIME_BUDGET)
            plt.title(f'Learning Curve - {m}')
            plt.xlabel('Wall Clock Time (s)')
            plt.ylabel('Validation Accuracy')
            plt.scatter(time_history, 1 - np.array(valid_loss_history))
            plt.step(time_history,
                     1 - np.array(best_valid_loss_history),
                     where='post')
            file_path = f'{PLOTS_ROOT}/automl_learning_curve_{m}.png'
            plt.savefig(file_path)
            plt.close()
            mlflow.log_artifact(file_path)

            y_pred = automl.predict(X_test)
            y_prob = automl.predict_proba(X_test)[:, 1]

            model_scores = calculate_model_score(y_test, y_pred)
            mlflow.log_metrics({
                f'accuracy': model_scores['accuracy'],
                f'f1': model_scores['f1'],
                f'f1_micro': model_scores['f1_micro'],
                f'f1_macro': model_scores['f1_macro'],
                f'precision': model_scores['precision'],
                f'recall': model_scores['recall'],
                f'roc_auc': model_scores['roc_auc']
            })

            f1_timestep = calc_score_and_std_per_timestep(
                X_test_df, y_test, y_pred)
def test_mcc():
    """Fit a multiple-choice classifier on a tiny hand-written fixture.

    Builds train/dev/test frames of sentence contexts with four candidate
    endings each, runs ``AutoML.fit`` with the ``multichoice-classification``
    task, then prints the predictions, the class probabilities and the
    accuracy against the test labels. Returns early if fitting raises
    ``requests.exceptions.HTTPError``.
    """
    import pandas as pd
    import requests
    from flaml import AutoML

    train_dataset = pd.DataFrame({
        "video-id": [
            "anetv_fruimvo90vA",
            "anetv_fruimvo90vA",
            "anetv_fruimvo90vA",
            "anetv_MldEr60j33M",
            "lsmdc0049_Hannah_and_her_sisters-69438",
        ],
        "fold-ind": ["10030", "10030", "10030", "5488", "17405"],
        "startphrase": [
            "A woman is seen running down a long track and jumping into a pit. The camera",
            "A woman is seen running down a long track and jumping into a pit. The camera",
            "A woman is seen running down a long track and jumping into a pit. The camera",
            "A man in a white shirt bends over and picks up a large weight. He",
            "Someone furiously shakes someone away. He",
        ],
        "sent1": [
            "A woman is seen running down a long track and jumping into a pit.",
            "A woman is seen running down a long track and jumping into a pit.",
            "A woman is seen running down a long track and jumping into a pit.",
            "A man in a white shirt bends over and picks up a large weight.",
            "Someone furiously shakes someone away.",
        ],
        "sent2": ["The camera", "The camera", "The camera", "He", "He"],
        "gold-source": ["gen", "gen", "gold", "gen", "gold"],
        "ending0": [
            "captures her as well as lifting weights down in place.",
            "follows her spinning her body around and ends by walking down a lane.",
            "watches her as she walks away and sticks her tongue out to another person.",
            "lifts the weights over his head.",
            "runs to a woman standing waiting.",
        ],
        "ending1": [
            "pans up to show another woman running down the track.",
            "pans around the two.",
            "captures her as well as lifting weights down in place.",
            "also lifts it onto his chest before hanging it back out again.",
            "tackles him into the passenger seat.",
        ],
        "ending2": [
            "follows her movements as the group members follow her instructions.",
            "captures her as well as lifting weights down in place.",
            "follows her spinning her body around and ends by walking down a lane.",
            "spins around and lifts a barbell onto the floor.",
            "pounds his fist against a cupboard.",
        ],
        "ending3": [
            "follows her spinning her body around and ends by walking down a lane.",
            "follows her movements as the group members follow her instructions.",
            "pans around the two.",
            "bends down and lifts the weight over his head.",
            "offers someone the cup on his elbow and strides out.",
        ],
        "label": [1, 3, 0, 0, 2],
    })
    dev_dataset = pd.DataFrame({
        "video-id": [
            "lsmdc3001_21_JUMP_STREET-422",
            "lsmdc0001_American_Beauty-45991",
            "lsmdc0001_American_Beauty-45991",
            "lsmdc0001_American_Beauty-45991",
        ],
        "fold-ind": ["11783", "10977", "10970", "10968"],
        "startphrase": [
            "Firing wildly he shoots holes through the tanker. He",
            "He puts his spatula down. The Mercedes",
            "He stands and looks around, his eyes finally landing on: "
            "The digicam and a stack of cassettes on a shelf. Someone",
            "He starts going through someone's bureau. He opens the drawer "
            "in which we know someone keeps his marijuana, but he",
        ],
        "sent1": [
            "Firing wildly he shoots holes through the tanker.",
            "He puts his spatula down.",
            "He stands and looks around, his eyes finally landing on: "
            "The digicam and a stack of cassettes on a shelf.",
            "He starts going through someone's bureau.",
        ],
        "sent2": [
            "He",
            "The Mercedes",
            "Someone",
            "He opens the drawer in which we know someone keeps his marijuana, but he",
        ],
        "gold-source": ["gold", "gold", "gold", "gold"],
        "ending0": [
            "overtakes the rig and falls off his bike.",
            "fly open and drinks.",
            "looks at someone's papers.",
            "stops one down and rubs a piece of the gift out.",
        ],
        "ending1": [
            "squeezes relentlessly on the peanut jelly as well.",
            "walks off followed driveway again.",
            "feels around it and falls in the seat once more.",
            "cuts the mangled parts.",
        ],
        "ending2": [
            "scrambles behind himself and comes in other directions.",
            "slots them into a separate green.",
            "sprints back from the wreck and drops onto his back.",
            "hides it under his hat to watch.",
        ],
        "ending3": [
            "sweeps a explodes and knocks someone off.",
            "pulls around to the drive - thru window.",
            "sits at the kitchen table, staring off into space.",
            "does n't discover its false bottom.",
        ],
        "label": [0, 3, 3, 3],
    })
    test_dataset = pd.DataFrame({
        "video-id": [
            "lsmdc0001_American_Beauty-45991",
            "lsmdc0001_American_Beauty-45991",
            "lsmdc0001_American_Beauty-45991",
            "lsmdc0001_American_Beauty-45991",
        ],
        "fold-ind": ["10980", "10976", "10978", "10969"],
        "startphrase": [
            "Someone leans out of the drive - thru window, "
            "grinning at her, holding bags filled with fast food. The Counter Girl",
            "Someone looks up suddenly when he hears. He",
            "Someone drives; someone sits beside her. They",
            "He opens the drawer in which we know someone "
            "keeps his marijuana, but he does n't discover"
            " its false bottom. He stands and looks around, his eyes",
        ],
        "sent1": [
            "Someone leans out of the drive - thru "
            "window, grinning at her, holding bags filled with fast food.",
            "Someone looks up suddenly when he hears.",
            "Someone drives; someone sits beside her.",
            "He opens the drawer in which we know"
            " someone keeps his marijuana, but he does n't discover its false bottom.",
        ],
        "sent2": [
            "The Counter Girl",
            "He",
            "They",
            "He stands and looks around, his eyes",
        ],
        "gold-source": ["gold", "gold", "gold", "gold"],
        "ending0": [
            "stands next to him, staring blankly.",
            "puts his spatula down.",
            "rise someone's feet up.",
            "moving to the side, the houses rapidly stained.",
        ],
        "ending1": [
            "with auditorium, filmed, singers the club.",
            "bumps into a revolver and drops surreptitiously into his weapon.",
            "lift her and they are alarmed.",
            "focused as the sight of someone making his way down a trail.",
        ],
        "ending2": [
            "attempts to block her ransacked.",
            "talks using the phone and walks away for a few seconds.",
            "are too involved with each other to "
            "notice someone watching them from the drive - thru window.",
            "finally landing on: the digicam and a stack of cassettes on a shelf.",
        ],
        "ending3": [
            "is eating solid and stinky.",
            "bundles the flaxen powder beneath the car.",
            "sit at a table with a beer from a table.",
            "deep and continuing, its bleed - length sideburns pressing on him.",
        ],
        "label": [0, 0, 2, 2],
    })

    # Feature columns, in the order they are handed to the model.
    feature_columns = [
        "sent1",
        "sent2",
        "ending0",
        "ending1",
        "ending2",
        "ending3",
        "gold-source",
        "video-id",
        "startphrase",
        "fold-ind",
    ]
    label_column = "label"

    X_train, y_train = train_dataset[feature_columns], train_dataset[label_column]
    X_val, y_val = dev_dataset[feature_columns], dev_dataset[label_column]
    X_test, expected_labels = test_dataset[feature_columns], test_dataset[label_column]

    automl = AutoML()
    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 2,
        "time_budget": 5,
        "task": "multichoice-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
        "custom_hpo_args": {
            "model_path": "google/electra-small-discriminator",
            "output_dir": "test/data/output/",
            "ckpt_per_epoch": 1,
            "fp16": False,
        },
    }

    try:
        automl.fit(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            **automl_settings,
        )
    except requests.exceptions.HTTPError:
        # An HTTP failure during fit is treated as a skip.
        return

    y_pred = automl.predict(X_test)
    proba = automl.predict_proba(X_test)
    print(f"{len(automl.classes_)} classes")
    print(y_pred)
    print(expected_labels)
    print(proba)

    # Count predictions that match the label stored under the same index.
    hits = sum(
        1 for row, expected in expected_labels.items() if y_pred[row] == expected
    )
    accuracy = round(hits / len(y_pred), 5)
    print(f"Accuracy: {accuracy}")
Exemplo n.º 27
0
def test_forecast_classification(budget=5):
    """Run a time-series classification forecast on synthetic sales data.

    A binary label ``above_mean_sales`` is derived from the ``Sales`` column,
    the last ``time_horizon`` rows are held out, and ``AutoML`` is fitted with
    the ``ts_forecast_classification`` task. The best model is printed and
    pickled, the held-out accuracy is reported, and the search log is
    replayed.

    Args:
        budget: Search time budget in seconds; also used as the log read-back
            window.
    """
    import pickle

    import numpy as np
    from flaml.data import get_output_from_log
    from flaml.ml import sklearn_metric_loss_score
    from hcrystalball.utils import get_sales_data

    time_horizon = 30
    df = get_sales_data(n_dates=180, n_assortments=1, n_states=1, n_stores=1)
    # Take an explicit copy: a column is added below, and assigning into a
    # column subset of another frame can trigger SettingWithCopyWarning.
    df = df[["Sales", "Open", "Promo", "Promo2"]].copy()

    # Feature engineering: label each row by whether sales exceed the mean.
    df["above_mean_sales"] = np.where(df["Sales"] > df["Sales"].mean(), 1, 0)
    # Move the index into a regular column; the "Date" column selected below
    # presumably comes from here -- verify against get_sales_data.
    df.reset_index(inplace=True)

    train_df = df[:-time_horizon]
    test_df = df[-time_horizon:]
    feature_columns = ["Date", "Open", "Promo", "Promo2"]
    X_train, X_test = train_df[feature_columns], test_df[feature_columns]
    y_train, y_test = train_df["above_mean_sales"], test_df["above_mean_sales"]

    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "accuracy",  # primary metric
        "task": "ts_forecast_classification",  # task type
        "log_file_name":
        "test/sales_classification_forecast.log",  # flaml log file
        "eval_method": "holdout",
    }

    # The main flaml automl API.
    automl.fit(X_train=X_train,
               y_train=y_train,
               **settings,
               period=time_horizon)

    # Retrieve the best config and best learner.
    print("Best ML leaner:", automl.best_estimator)
    print("Best hyperparmeter config:", automl.best_config)
    # The metric is accuracy, so report 1 - loss (as test_automl does) rather
    # than labelling the raw loss as "mape".
    print(f"Best accuracy on validation data: {1 - automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)

    # Pickle and save the automl object.
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

    # Compute predictions on the testing dataset.
    y_pred = automl.predict(X_test)

    # Compute metric values on the testing dataset. Predictions go first and
    # ground truth second, matching the other tests in this file.
    print(y_test)
    print(y_pred)
    print("accuracy", "=",
          1 - sklearn_metric_loss_score("accuracy", y_pred, y_test))

    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"],
                            time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)
Exemplo n.º 28
0
def test_summarization():
    """Fit a summarization model on a four-row toy dataset, then retrain it.

    Runs ``AutoML.fit`` with the ``summarization`` task and the ``rouge1``
    metric, rebuilds a model with ``retrain_from_log`` on a fresh ``AutoML``
    object, and predicts on the test documents. Returns early if fitting
    raises ``requests.exceptions.HTTPError``.
    """
    # ``requests`` is needed by the ``except`` clause below; import it here so
    # that clause cannot fail with a NameError and mask the real exception.
    import requests
    from flaml import AutoML
    from pandas import DataFrame

    train_dataset = DataFrame([
        ("The cat is alive", "The cat is dead"),
        ("The cat is alive", "The cat is dead"),
        ("The cat is alive", "The cat is dead"),
        ("The cat is alive", "The cat is dead"),
    ])
    dev_dataset = DataFrame([
        ("The old woman is beautiful", "The old woman is ugly"),
        ("The old woman is beautiful", "The old woman is ugly"),
        ("The old woman is beautiful", "The old woman is ugly"),
        ("The old woman is beautiful", "The old woman is ugly"),
    ])
    test_dataset = DataFrame([
        ("The purse is cheap", "The purse is expensive"),
        ("The purse is cheap", "The purse is expensive"),
        ("The purse is cheap", "The purse is expensive"),
        ("The purse is cheap", "The purse is expensive"),
    ])

    # Name the two positional columns: input text first, target text second.
    for each_dataset in [train_dataset, dev_dataset, test_dataset]:
        each_dataset.columns = ["document", "summary"]

    custom_sent_keys = ["document"]
    label_key = "summary"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 20,
        "task": "summarization",
        "metric": "rouge1",
        "log_file_name": "seqclass.log",
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "patrickvonplaten/t5-tiny-random",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
        "fp16": False,
    }

    try:
        automl.fit(X_train=X_train,
                   y_train=y_train,
                   X_val=X_val,
                   y_val=y_val,
                   **automl_settings)
    except requests.exceptions.HTTPError:
        # An HTTP failure during fit is treated as a skip.
        return

    # Start from a fresh object and rebuild the model from the search log.
    automl = AutoML()
    automl.retrain_from_log(X_train=X_train,
                            y_train=y_train,
                            train_full=True,
                            record_id=0,
                            **automl_settings)
    automl.predict(X_test)
Exemplo n.º 29
0
def test_forecast_automl(budget=5):
    """Forecast the statsmodels CO2 series with AutoML, via both input styles.

    First fits on a single dataframe (``dataframe=`` plus ``label="y"``),
    prints the best model, pickles the fitted object, scores the 12-step
    forecast and replays the search log. Then fits a fresh ``AutoML`` object
    on the same data passed as separate ``X_train`` / ``y_train``. Both fits
    fall back to the arima/sarimax estimators when an ``ImportError`` is
    raised.

    Args:
        budget: Search time budget in seconds; also used as the log read-back
            window.
    """
    # using dataframe
    import statsmodels.api as sm

    # Resample the series to month-start means.
    data = sm.datasets.co2.load_pandas().data["co2"].resample("MS").mean()
    # Fill gaps from the following observation, then expose the index and
    # values as columns named "ds" and "y".
    data = (data.fillna(
        data.bfill()).to_frame().reset_index().rename(columns={
            "index": "ds",
            "co2": "y"
        }))
    num_samples = data.shape[0]
    time_horizon = 12
    # Hold out the last ``time_horizon`` rows for testing.
    split_idx = num_samples - time_horizon
    df = data[:split_idx]
    X_test = data[split_idx:]["ds"]
    y_test = data[split_idx:]["y"]
    automl = AutoML()
    settings = {
        "time_budget": budget,  # total running time in seconds
        "metric": "mape",  # primary metric
        "task": "ts_forecast",  # task type
        "log_file_name": "test/CO2_forecast.log",  # flaml log file
        "eval_method": "holdout",
        "label": "y",
    }
    """The main flaml automl API"""
    try:
        # Imported only to detect whether prophet is available.
        import prophet

        automl.fit(dataframe=df, **settings, period=time_horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            dataframe=df,
            **settings,
            estimator_list=["arima", "sarimax"],
            period=time_horizon,
        )
    """ retrieve best config and best learner"""
    print("Best ML leaner:", automl.best_estimator)
    print("Best hyperparmeter config:", automl.best_config)
    print(f"Best mape on validation data: {automl.best_loss}")
    print(f"Training duration of best run: {automl.best_config_train_time}s")
    print(automl.model.estimator)
    """ pickle and save the automl object """
    import pickle

    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    """ compute predictions of testing dataset """
    y_pred = automl.predict(X_test)
    print("Predicted labels", y_pred)
    print("True labels", y_test)
    """ compute different metric values on testing dataset"""
    from flaml.ml import sklearn_metric_loss_score

    print("mape", "=", sklearn_metric_loss_score("mape", y_pred, y_test))
    from flaml.data import get_output_from_log

    # Replay the search log; only the config history is printed below.
    (
        time_history,
        best_valid_loss_history,
        valid_loss_history,
        config_history,
        metric_history,
    ) = get_output_from_log(filename=settings["log_file_name"],
                            time_budget=budget)
    for config in config_history:
        print(config)
    print(automl.resource_attr)
    print(automl.max_resource)
    print(automl.min_resource)

    # Second pass: same data and settings, passed as X_train / y_train
    # instead of a single dataframe.
    X_train = df[["ds"]]
    y_train = df["y"]
    automl = AutoML()
    try:
        automl.fit(X_train=X_train,
                   y_train=y_train,
                   **settings,
                   period=time_horizon)
    except ImportError:
        print("not using prophet due to ImportError")
        automl.fit(
            X_train=X_train,
            y_train=y_train,
            **settings,
            estimator_list=["arima", "sarimax"],
            period=time_horizon,
        )