Example #1
    def test_classification(self, as_frame=False):
        from flaml import AutoML
        from sklearn.datasets import load_iris

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
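        # search within the 4-second budget; results are logged to test/iris.log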
        automl_experiment.fit(X_train=X_train,
                              y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            record_id=0)
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])
Example #2
    def test_regression(self):
        from flaml import AutoML
        from flaml.automl.data import get_output_from_log  # flaml.data in older flaml versions
        from sklearn.datasets import fetch_california_housing

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "regression",
            "log_file_name": "test/california.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = fetch_california_housing(return_X_y=True)
        n = int(len(y_train) * 9 // 10)
        # pass the last 10% as an explicit validation set, so holdout evaluation is used
        automl_experiment.fit(
            X_train=X_train[:n],
            y_train=y_train[:n],
            X_val=X_train[n:],
            y_val=y_train[n:],
            **automl_settings
        )
        assert automl_experiment._state.eval_method == "holdout"
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.best_model_for_estimator("xgboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
        # retrain on the full data from the log, first with a 1-second budget, then with a zero budget
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=1,
        )
        automl_experiment.retrain_from_log(
            task="regression",
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            time_budget=0,
        )
Example #3
    def test_classification(self, as_frame=False):
        import numpy as np
        from flaml import AutoML
        from sklearn.datasets import load_iris

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column: add a constant all-zero column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.best_model_for_estimator("catboost"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        # retrain the configuration from the first log record (record_id=0) on the full training data
        automl_experiment = AutoML(task="classification")
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            record_id=0,
        )
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])
Example #4
def test_hf_data():
    from flaml import AutoML
    import pandas as pd
    import pickle
    import requests
    import shutil

    train_data = {
        "sentence1": [
            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
        ],
        "sentence2": [
            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
        ],
        "label": [1, 0, 1, 0],
        "idx": [0, 1, 2, 3],
    }
    train_dataset = pd.DataFrame(train_data)

    dev_data = {
        "sentence1": [
            "The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .",
            "Revenue in the first quarter of the year dropped 15 percent from the same period a year earlier .",
            "The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .",
            "The DVD-CCA then appealed to the state Supreme Court .",
        ],
        "sentence2": [
            "PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .",
            "With the scandal hanging over Stewart 's company , revenue the first quarter of the year dropped 15 percent from the same period a year earlier .",
            "The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .",
            "The DVD CCA appealed that decision to the U.S. Supreme Court .",
        ],
        "label": [1, 1, 0, 1],
        "idx": [4, 5, 6, 7],
    }
    dev_dataset = pd.DataFrame(dev_data)

    test_data = {
        "sentence1": [
            "That compared with $ 35.18 million , or 24 cents per share , in the year-ago period .",
            "Shares of Genentech , a much larger company with several products on the market , rose more than 2 percent .",
            "Legislation making it harder for consumers to erase their debts in bankruptcy court won overwhelming House approval in March .",
            "The Nasdaq composite index increased 10.73 , or 0.7 percent , to 1,514.77 .",
        ],
        "sentence2": [
            "Earnings were affected by a non-recurring $ 8 million tax benefit in the year-ago period .",
            "Shares of Xoma fell 16 percent in early trade , while shares of Genentech , a much larger company with several products on the market , were up 2 percent .",
            "Legislation making it harder for consumers to erase their debts in bankruptcy court won speedy , House approval in March and was endorsed by the White House .",
            "The Nasdaq Composite index , full of technology stocks , was lately up around 18 points .",
        ],
        "label": [0, 0, 0, 0],
        "idx": [8, 10, 11, 12],
    }
    test_dataset = pd.DataFrame(test_data)

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 10,
        "task": "seq-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": False,
    }

    try:
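        # the pretrained checkpoint is downloaded from the Hugging Face hub; skip if the download fails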
        automl.fit(X_train=X_train,
                   y_train=y_train,
                   X_val=X_val,
                   y_val=y_val,
                   **automl_settings)
    except requests.exceptions.HTTPError:
        return

    automl = AutoML()
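    # retrain the first logged configuration on the full data, then round-trip the AutoML object through pickle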
    automl.retrain_from_log(X_train=X_train,
                            y_train=y_train,
                            train_full=True,
                            record_id=0,
                            **automl_settings)
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    with open("automl.pkl", "rb") as f:
        automl = pickle.load(f)
    shutil.rmtree("test/data/output/")
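    # predict on a DataFrame, on a plain list of texts, and on a list of sentence pairs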
    automl.predict(X_test)
    automl.predict(["test test", "test test"])
    automl.predict([
        ["test test", "test test"],
        ["test test", "test test"],
        ["test test", "test test"],
    ])

    automl.predict_proba(X_test)
    print(automl.classes_)
Example #5
def test_summarization():
    from flaml import AutoML
    from pandas import DataFrame
    import requests

    train_dataset = DataFrame([
        ("The cat is alive", "The cat is dead"),
        ("The cat is alive", "The cat is dead"),
        ("The cat is alive", "The cat is dead"),
        ("The cat is alive", "The cat is dead"),
    ])
    dev_dataset = DataFrame([
        ("The old woman is beautiful", "The old woman is ugly"),
        ("The old woman is beautiful", "The old woman is ugly"),
        ("The old woman is beautiful", "The old woman is ugly"),
        ("The old woman is beautiful", "The old woman is ugly"),
    ])
    test_dataset = DataFrame([
        ("The purse is cheap", "The purse is expensive"),
        ("The purse is cheap", "The purse is expensive"),
        ("The purse is cheap", "The purse is expensive"),
        ("The purse is cheap", "The purse is expensive"),
    ])

    for each_dataset in [train_dataset, dev_dataset, test_dataset]:
        each_dataset.columns = ["document", "summary"]

    custom_sent_keys = ["document"]
    label_key = "summary"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 20,
        "task": "summarization",
        "metric": "rouge1",
        "log_file_name": "seqclass.log",
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "patrickvonplaten/t5-tiny-random",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
        "fp16": False,
    }

    try:
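        # the tiny T5 checkpoint is downloaded from the Hugging Face hub; skip if the download fails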
        automl.fit(X_train=X_train,
                   y_train=y_train,
                   X_val=X_val,
                   y_val=y_val,
                   **automl_settings)
    except requests.exceptions.HTTPError:
        return
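    # retrain the first logged configuration on the full data and predict on the test set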
    automl = AutoML()
    automl.retrain_from_log(X_train=X_train,
                            y_train=y_train,
                            train_full=True,
                            record_id=0,
                            **automl_settings)
    automl.predict(X_test)
Example #6
def _test_hf_data():
    from flaml import AutoML
    import pickle
    import requests
    import shutil
    from datasets import load_dataset

    try:
        train_dataset = load_dataset("glue", "mrpc", split="train[:1%]").to_pandas()
        dev_dataset = load_dataset("glue", "mrpc", split="validation[:1%]").to_pandas()
        test_dataset = load_dataset("glue", "mrpc", split="test[:1%]").to_pandas()
    except requests.exceptions.ConnectionError:
        return

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 1,
        "max_iter": 2,
        "time_budget": 5000,
        "task": "seq-classification",
        "metric": "accuracy",
        "log_file_name": "seqclass.log",
        "use_ray": True,
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "facebook/muppet-roberta-base",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 5,
        "fp16": True,
    }

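    # unlike the CPU-only tests above, this run assumes a GPU and Ray are available (gpu_per_trial=1, use_ray=True)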
    automl.fit(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **automl_settings
    )

    automl = AutoML()
    automl.retrain_from_log(
        X_train=X_train,
        y_train=y_train,
        train_full=True,
        record_id=0,
        **automl_settings
    )
    with open("automl.pkl", "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    with open("automl.pkl", "rb") as f:
        automl = pickle.load(f)
    shutil.rmtree("test/data/output/")
    automl.predict(X_test)
    automl.predict(["test test", "test test"])
    automl.predict(
        [
            ["test test", "test test"],
            ["test test", "test test"],
            ["test test", "test test"],
        ]
    )

    automl.predict_proba(X_test)
    print(automl.classes_)