Пример #1
0
def set_classifier(clf_name, fit_seed, n_jobs=-1):
    """Build the classifier registered under ``clf_name``.

    Only the requested estimator is instantiated: each entry of the registry
    is a zero-argument factory, so asking for one classifier does not pay the
    construction cost of the five others.

    Parameters
    ----------
    clf_name : str
        One of ``"RandomForestClassifier"``,
        ``"HistGradientBoostingClassifier"``, ``"XGBClassifier"``,
        ``"LGBMClassifier"``, ``"CatBoostClassifier"`` or ``"WildWood"``.

    fit_seed : int
        Value passed as ``random_state`` to the estimator.

    n_jobs : int, default=-1
        Passed as ``n_jobs`` (``thread_count`` for CatBoost) to every
        estimator except ``HistGradientBoostingClassifier``, which is built
        without it.

    Returns
    -------
    output : object
        A newly constructed classifier instance.

    Raises
    ------
    KeyError
        If ``clf_name`` is not one of the names listed above.
    """
    classifier_factories = {
        "RandomForestClassifier": lambda: RandomForestClassifier(
            n_estimators=100,
            n_jobs=n_jobs,
            random_state=fit_seed,
        ),
        "HistGradientBoostingClassifier": lambda: HistGradientBoostingClassifier(
            random_state=fit_seed,
        ),
        "XGBClassifier": lambda: xgb.XGBClassifier(
            use_label_encoder=False,
            n_jobs=n_jobs,
            tree_method="hist",
            random_state=fit_seed,
        ),
        "LGBMClassifier": lambda: lgb.LGBMClassifier(
            n_jobs=n_jobs,
            random_state=fit_seed,
        ),
        "CatBoostClassifier": lambda: CatBoostClassifier(
            thread_count=n_jobs,
            random_state=fit_seed,
            logging_level="Silent",
            allow_writing_files=False,
        ),
        "WildWood": lambda: ForestClassifier(
            n_estimators=10,
            n_jobs=n_jobs,
            random_state=fit_seed,
        ),
    }
    # Look the factory up first so an unknown name raises KeyError before
    # any estimator is constructed.
    factory = classifier_factories[clf_name]
    return factory()
Пример #2
0
def fit_forest(X_train, y_train, n_estimators=10, dirichlet=0.5, step=1.0):
    """Train a ``ForestClassifier`` on the given data and return it.

    The forest is built with ``min_samples_split=2`` and ``n_jobs=1``.
    ``random_state`` is not a parameter of this function: it is read from the
    enclosing scope when the function is called.

    Parameters
    ----------
    X_train, y_train
        Training data, passed unchanged to ``ForestClassifier.fit``.

    n_estimators : int, default=10
        Forwarded to ``ForestClassifier``.

    dirichlet : float, default=0.5
        Forwarded to ``ForestClassifier``.

    step : float, default=1.0
        Forwarded to ``ForestClassifier``.

    Returns
    -------
    output : ForestClassifier
        The classifier after ``fit`` has been called on it.
    """
    forest = ForestClassifier(
        n_estimators=n_estimators,
        min_samples_split=2,
        random_state=random_state,
        n_jobs=1,
        step=step,
        dirichlet=dirichlet,
    )
    forest.fit(X_train, y_train)
    return forest
Пример #3
0
 def fit(
     self,
     params,
     X_train,
     y_train,
     Xy_val,
     sample_weight,
     n_estimators=None,
     seed=None,
 ):
     """Build a ``ForestClassifier`` from ``params`` and fit it.

     Parameters
     ----------
     params : dict
         Keyword arguments for ``ForestClassifier``. The dict is copied, so
         the caller's object is left untouched. It must not contain
         ``"n_jobs"``: that argument is always passed explicitly as ``-1``,
         and a duplicate keyword raises ``TypeError``.

     X_train, y_train
         Training data, passed unchanged to ``ForestClassifier.fit``.

     Xy_val
         Not used by this method.

     sample_weight
         Forwarded to ``ForestClassifier.fit``.

     n_estimators : int or None, default=None
         When not None, overrides ``params["n_estimators"]``.

     seed : int or None, default=None
         When not None, overrides ``params["random_state"]``.

     Returns
     -------
     output : tuple
         ``(clf, None)`` where ``clf`` is the fitted classifier; the second
         item is always ``None``.
     """
     # Work on a copy: applying the overrides to the caller's dict would
     # leak this call's seed / n_estimators into any later use of ``params``.
     clf_params = dict(params)
     if seed is not None:
         clf_params["random_state"] = seed
     if n_estimators is not None:
         clf_params["n_estimators"] = n_estimators
     clf = ForestClassifier(**clf_params, n_jobs=-1)
     clf.fit(
         X_train,
         y_train,
         sample_weight=sample_weight,
         categorical_features=self.categorical_features,
     )
     return clf, None
Пример #4
0
def fit_forest(
    X_train,
    y_train,
    aggregation=True,
    n_estimators=10,
    dirichlet=0.5,
    step=1.0,
    min_samples_split=2,
    n_jobs=1,
):
    """Train a ``ForestClassifier`` with ``max_features=2`` and return it.

    ``random_state`` is not a parameter of this function: it is read from the
    enclosing scope when the function is called.

    Parameters
    ----------
    X_train, y_train
        Training data, passed unchanged to ``ForestClassifier.fit``.

    aggregation : bool, default=True
        Forwarded to ``ForestClassifier``.

    n_estimators : int, default=10
        Forwarded to ``ForestClassifier``.

    dirichlet : float, default=0.5
        Forwarded to ``ForestClassifier``.

    step : float, default=1.0
        Forwarded to ``ForestClassifier``.

    min_samples_split : int, default=2
        Forwarded to ``ForestClassifier``.

    n_jobs : int, default=1
        Forwarded to ``ForestClassifier``.

    Returns
    -------
    output : ForestClassifier
        The classifier after ``fit`` has been called on it.
    """
    forest = ForestClassifier(
        n_estimators=n_estimators,
        aggregation=aggregation,
        min_samples_split=min_samples_split,
        random_state=random_state,
        n_jobs=n_jobs,
        step=step,
        dirichlet=dirichlet,
        max_features=2,
    )
    forest.fit(X_train, y_train)
    return forest
Пример #5
0
def launch_run(*, run_config, experiment_id):
    """Fit WildWood on one dataset and log timings and metrics to mlflow.

    Parameters
    ----------
    run_config : dict
        The configuration of the run. It must contain the keys ``"dataset"``
        and ``"dataset_random_state"``. Every key starting with
        ``"wildwood_"`` is forwarded, with that prefix removed, as a keyword
        argument to ``ForestClassifier``.

    experiment_id : str
        Id of the experiment that groups runs in mlflow

    Returns
    -------
    output : dict
        Metrics computed during this run, together with ``fit_time``,
        ``predict_train_time`` and ``predict_test_time``. This is the same
        dict that is logged to mlflow.
    """
    # Strip only the leading prefix: ``str.replace`` would also remove any
    # later occurrence of "wildwood_" inside the parameter name.
    prefix = "wildwood_"
    wildwood_kwargs = {
        key[len(prefix):]: val
        for key, val in run_config.items()
        if key.startswith(prefix)
    }

    dataset_name = run_config["dataset"]
    dataset_random_state = run_config["dataset_random_state"]
    loader = loader_from_name[dataset_name]

    # Just get the task from the dataset
    dataset = loader()
    learning_task = dataset.task

    # But we use the raw data in wildwood
    X, y = loader(raw=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=dataset_random_state, shuffle=True, stratify=y)

    # The encoder is fitted on the training labels only and then applied to
    # the test labels.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    y_test = label_encoder.transform(y_test)

    kwargs_one_tree = wildwood_kwargs.copy()
    kwargs_one_tree["n_estimators"] = 1

    # Fit a single tree on the full dataset to force pre-compilation (doing so on a
    # subset often fails).
    # TODO: debug such cases
    clf = ForestClassifier(**kwargs_one_tree)
    clf.fit(X_train, y_train)

    # Instantiate again just to be sure
    clf = ForestClassifier(**wildwood_kwargs)

    # NOTE(review): ``filename`` (used in the log messages below) is not
    # defined inside this function; it must exist at module level, otherwise
    # the logging calls raise NameError. Confirm against the full file.
    with mlflow.start_run(experiment_id=experiment_id):
        # Fit and timing
        # TODO: include computations with an without categorical features ?
        tic = time()
        clf.fit(X_train, y_train)
        toc = time()
        fit_time = toc - tic
        logging.info(f"Fitted for experiment {filename} in {fit_time}s")

        # Predict and timing
        tic = time()
        y_scores_train = clf.predict_proba(X_train)
        toc = time()
        predict_train_time = toc - tic

        tic = time()
        y_scores_test = clf.predict_proba(X_test)
        toc = time()
        predict_test_time = toc - tic

        logging.info(
            f"Predicted for experiment {filename} on train in {predict_train_time}s and test in {predict_test_time}s"
        )

        # Hard predictions are computed outside the timed sections.
        y_pred_train = clf.predict(X_train)
        y_pred_test = clf.predict(X_test)

        metrics = compute_metrics(
            learning_task=learning_task,
            y_train=y_train,
            y_test=y_test,
            y_scores_train=y_scores_train,
            y_scores_test=y_scores_test,
            y_pred_train=y_pred_train,
            y_pred_test=y_pred_test,
        )

        mlflow_metrics = dict(
            **metrics,
            fit_time=fit_time,
            predict_train_time=predict_train_time,
            predict_test_time=predict_test_time,
        )

        mlflow_params = dict(
            **wildwood_kwargs,
            dataset=dataset_name,
            dataset_random_state=dataset_random_state,
        )

        mlflow.log_params(mlflow_params)
        mlflow.log_metrics(mlflow_metrics)

    # The docstring has always promised the metrics; actually return them.
    return mlflow_metrics
Пример #6
0
    # "max_bins": 8308,
    "n_jobs": -1,
    "dirichlet": 1e-8,
    "step": 1.0,
    "aggregation": False,
    "verbose": True
}

# classifiers = [
#     ("tree", DecisionTreeClassifier),
#     ("sk_tree", SkDecisionTreeClassifier)
# ]

# Only the WildWood forest is active. The other candidates that used to be
# listed here (ForestBinaryClassifier, RandomForestClassifier,
# DecisionTreeClassifier, SkDecisionTreeClassifier) are disabled.
classifiers = [("forest", ForestClassifier(**clf_kwargs))]

n_classifiers = len(classifiers)
n_datasets = len(datasets)

# NOTE(review): presumably ``h`` is a plotting mesh step and ``i`` a running
# subplot index; neither is used in the visible part of the script.
h, i = 0.2, 1

# iterate over datasets

# for ds_cnt, ds in enumerate(datasets):
#     # preprocess datasets, split into training and test part
#     ds_name, (X, y) = ds
Пример #7
0
# X_train = rcv1_train.data
# y_train = rcv1_train.target
# X_test = rcv1_test.data
# y_test = rcv1_test.target

# Settings of the forest under test; ``random_state`` is defined earlier in
# the script.
clf_kwargs = dict(
    n_estimators=5,
    min_samples_split=2,
    random_state=random_state,
    n_jobs=-1,
    dirichlet=1e-5,
    step=2.0,
    aggregation=True,
)
clf = ForestClassifier(**clf_kwargs)

# Hold out 30% of the samples for testing. The split uses its own fixed seed
# (42), independent of ``random_state``.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Wall-clock time of training.
tic = time()
clf.fit(X_train, y_train)
toc = time()
print("time to fit: ", toc - tic)

# Wall-clock time of probability prediction on the held-out samples.
tic = time()
y_scores = clf.predict_proba(X_test)
toc = time()
print("time to predict_proba: ", toc - tic)
Пример #8
0
data_random_state = 42

# Load the bank dataset with one-hot encoding and standardization switched
# off before extracting the train / test split.
dataset = load_bank()
dataset.one_hot_encode = dataset.standardize = False
X_train, X_test, y_train, y_test = dataset.extract(random_state=data_random_state)

n_estimators = 100

# Forest that is told which columns are categorical.
clf = ForestClassifier(
    n_estimators=n_estimators, random_state=42, aggregation=False,
    max_features=None, categorical_features=dataset.categorical_features_,
    n_jobs=1, class_weight="balanced", criterion="entropy",
)
clf.fit(X_train, y_train)

# Average precision is computed from column 1 of the predicted probabilities.
y_scores_train, y_scores_test = clf.predict_proba(X_train), clf.predict_proba(X_test)
avg_prec_train = average_precision_score(y_train, y_scores_train[:, 1])
avg_prec_test = average_precision_score(y_test, y_scores_test[:, 1])

print("Categorical")
print("AP(train):", avg_prec_train, "AP(test):", avg_prec_test)

clf = ForestClassifier(
    n_estimators=n_estimators,
    random_state=42,
Пример #9
0
random_state = 42

# Each entry is a factory: given a number of trees ``n`` it returns a
# ``(label, estimator)`` pair. All estimators use every core (``n_jobs=-1``)
# and the module-level ``random_state``, which is looked up when the factory
# is called.
classifiers = [
    lambda n: ("RFW", RandomForestClassifier(
        n_estimators=n, n_jobs=-1, random_state=random_state)),
    lambda n: ("WildWood", ForestClassifier(
        n_estimators=n, multiclass="ovr", n_jobs=-1, random_state=random_state)),
    lambda n: ("ET", ExtraTreesClassifier(
        n_estimators=n, n_jobs=-1, random_state=random_state)),
]

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(message)s",