Example #1
# imports used by this excerpt
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML


def train_digits():
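    """Train AutoML ("Perform" mode) on the scikit-learn digits dataset
    and report the test accuracy."""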
    digits = load_digits()
    X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(
        digits.data),
                                                        digits.target,
                                                        stratify=digits.target,
                                                        test_size=0.25,
                                                        random_state=123)

    # train models with AutoML
    automl = AutoML(mode="Perform", results_path="AutoML_digits")
    automl.fit(X_train, y_train)

    # compute predictions and evaluate on the test set
    predictions = automl.predict_all(X_test)
    print(predictions.head())
    print("Test accuracy:",
          accuracy_score(y_test, predictions["label"].astype(int)))

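    # plot_digits is a plotting helper defined elsewhere in the original example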
    plot_digits(X_test, predictions)
Example #2
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import log_loss
import warnings

# warnings.filterwarnings("error", category=RuntimeWarning) #pd.core.common.SettingWithCopyWarning)

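# Binary classification on the Adult (census income) dataset; the "income"
# column (<=50K / >50K) is the target.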
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

X = df[df.columns[:-1]]
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

automl = AutoML(results_path="AutoML_7", mode="Compete")
automl.fit(X_train, y_train)

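# For classification, predict_all() returns one probability column per class
# (named "prediction_" plus the class name, e.g. "prediction_>50K" here) and a
# "label" column with the predicted class.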
predictions = automl.predict_all(X_test)

print(predictions.head())
print(predictions.tail())
print(X_test.shape, predictions.shape)
print("LogLoss", log_loss(y_test, predictions["prediction_>50K"]))
Example #3
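# Benchmark driver: rebuilds train/test DataFrames from the provided dataset
# object, fits MLJAR AutoML under the configured time limit and metric, and
# returns predictions, probabilities, and timings via result().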
def run(dataset, config):
    log.info(f"\n**** mljar-supervised [v{supervised.__version__}] ****\n")
    save_metadata(config, version=supervised.__version__)

    # Mapping of benchmark metrics to MLJAR metrics
    metrics_mapping = dict(auc='auc', logloss='logloss', rmse='rmse')
    eval_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else "auto"

    # Mapping of benchmark task to MLJAR ML task
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    is_classification = config.type == "classification"
    results_path = output_subdir("results", config)
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    column_names, _ = zip(*dataset.columns)
    column_types = dict(dataset.columns)
    label = dataset.target.name

    train = pd.DataFrame(dataset.train.data,
                         columns=column_names).astype(column_types, copy=False)
    X_train = train.drop(columns=label)
    y_train = train[label]

    test = pd.DataFrame(dataset.test.data,
                        columns=column_names).astype(column_types, copy=False)
    X_test = test.drop(columns=label)
    y_test = test[label]

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    random_state=config.seed,
                    ml_task=ml_task,
                    eval_metric=eval_metric,
                    **training_params)

    with utils.Timer() as training:
        automl.fit(X_train, y_train)

    with utils.Timer() as predict:
        preds = automl.predict_all(X_test)

    predictions, probabilities = None, None
    if is_classification:
        predictions = preds["label"].values
        cols = [f"prediction_{c}" for c in np.unique(y_train)]
        probabilities = preds[cols].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  models_count=len(automl._models),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example #4
import numpy as np
import pandas as pd
from supervised.automl import AutoML
from sklearn.metrics import accuracy_score
import os

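# Smoke test on random data: 100 rows of random features with string class
# labels ("a" / "B"); each model is capped at 10 seconds via model_time_limit.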
nrows = 100
ncols = 3
X = np.random.rand(nrows, ncols)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(ncols)])
#y = np.random.randint(0, 2, nrows)
y = np.random.permutation(["a", "B"] * 50)

automl = AutoML(model_time_limit=10)  #, algorithms=["Decision Tree"])
automl.fit(X, y)
print("Train accuracy", accuracy_score(y, automl.predict_all(X)["label"]))

#X = np.random.rand(1000, 10)
#X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])
#y = np.random.randint(0, 2, 1000)
#print("Test accuracy", accuracy_score(y, automl.predict(X)["label"]))
Example #5
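# Excerpt from a Titanic example: df (the training data with a "Survived"
# column), X (the feature frame), and the pandas/sklearn/AutoML imports are
# prepared earlier in the original script.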
y = df["Survived"]

for col in df.columns:
    print(col, df[col].dtype)

automl = AutoML(
    results_path="AutoML_39",
    # algorithms=["Xgboost"],
    # model_time_limit=20,
    # train_ensemble=True,
    mode="Explain"
)
# automl.set_advanced(start_random_models=3)
automl.fit(X, y)

pred = automl.predict_all(X)

print("Train accuracy", accuracy_score(y, pred["label"]))

test = pd.read_csv("tests/data/Titanic/test_with_Survived.csv")
test_cols = [
    "Parch",
    "Ticket",
    "Fare",
    "Pclass",
    "Name",
    "Sex",
    "Age",
    "SibSp",
    "Cabin",
    "Embarked",
]
Example #6
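# Later variant of the benchmark driver: takes X/y directly from the dataset
# object, maps the prediction_* probability columns back to class labels, and
# works around integer predictions for boolean targets
# (https://github.com/mljar/mljar-supervised/issues/442).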
def run(dataset, config):
    log.info(f"\n**** mljar-supervised [v{supervised.__version__}] ****\n")

    # Mapping of benchmark metrics to MLJAR metrics
    metrics_mapping = dict(auc='auc', logloss='logloss', rmse='rmse')
    eval_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else "auto"

    # Mapping of benchmark task to MLJAR ML task
    problem_mapping = dict(
        binary="binary_classification",
        multiclass="multiclass_classification",
        regression="regression",
    )
    ml_task = problem_mapping.get(
        dataset.problem_type
    )  # if None the AutoML will guess about the ML task
    is_classification = config.type == "classification"
    results_path = output_subdir("results", config)
    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith("_")
    }

    X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

    automl = AutoML(results_path=results_path,
                    total_time_limit=config.max_runtime_seconds,
                    random_state=config.seed,
                    ml_task=ml_task,
                    eval_metric=eval_metric,
                    **training_params)

    with Timer() as training:
        automl.fit(X_train, y_train)

    with Timer() as predict:
        preds = automl.predict_all(X_test)

    predictions, probabilities, probabilities_labels = None, None, None
    if is_classification:
        # preds is a dataframe with columns ["prediction_LABEL", .., "label"]
        if y_train.dtype == bool and preds["label"].dtype == int:
            # boolean target produces integer predictions for mljar-supervised <= 0.10.6
            # https://github.com/mljar/mljar-supervised/issues/442
            preds = preds.rename(
                {
                    "prediction_0": "False",
                    "prediction_1": "True"
                }, axis=1)
            preds["label"] = preds["label"].astype(bool)
        else:
            preds.columns = [
                c.replace("prediction_", "", 1) for c in preds.columns
            ]

        predictions = preds["label"].values
        probabilities_labels = list(preds.columns)[:-1]
        probabilities = preds[probabilities_labels].values
    else:
        predictions = preds["prediction"].values

    # clean the results
    if not config.framework_params.get("_save_artifacts", False):
        shutil.rmtree(results_path, ignore_errors=True)

    return result(output_file=config.output_predictions_file,
                  predictions=predictions,
                  truth=y_test,
                  probabilities=probabilities,
                  probabilities_labels=probabilities_labels,
                  models_count=len(automl._models),
                  training_duration=training.duration,
                  predict_duration=predict.duration)
Example #7
import warnings

import pandas as pd
from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

from supervised import AutoML
from supervised.exceptions import AutoMLException

# warnings.filterwarnings('error')
warnings.filterwarnings(
    "error",
    category=pd.errors.SettingWithCopyWarning)  # message="*ndarray*")

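# Iris variant with missing feature values and missing targets, presumably
# used to exercise AutoML's handling of incomplete data.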
df = pd.read_csv("tests/data/iris_missing_values_missing_target.csv")
X = df[["feature_1", "feature_2", "feature_3", "feature_4"]]
y = df["class"]

automl = AutoML()

automl.fit(X, y)

predictions = automl.predict_all(X)

print(predictions.head())
print(predictions.tail())

print(X.shape)
print(predictions.shape)