Пример #1
0
def load_data_cache(use_test=False, out_ohe=True):
    """Load and preprocess the "albert" dataset splits.

    Missing values are mean-imputed, non-categorical columns are scaled,
    and the targets are optionally one-hot encoded.

    Args:
        use_test (bool): if True, fold the validation split into the
            training data and return the test split as the "valid" pair.
        out_ohe (bool): if True, one-hot encode ``y_train``/``y_valid``.

    Returns:
        tuple: ``(X_train, y_train), (X_valid, y_valid), categorical_indicator``.
    """
    test_size = 0.33
    valid_size = 0.33 * (1 - test_size)

    if use_test:
        print("!!! USING TEST DATA !!!")

    # The load call is identical in both modes; only the way the returned
    # splits are combined afterwards differs.
    (
        (X_train, y_train),
        (X_valid, y_valid),
        test_split,
        categorical_indicator,
    ) = albert.load_data(
        random_state=RANDOM_STATE,
        test_size=test_size,
        valid_size=valid_size,
        categoricals_to_integers=True,
    )

    if use_test:
        # Train on train+valid and evaluate on the held-out test split.
        X_train = np.concatenate([X_train, X_valid])
        y_train = np.concatenate([y_train, y_valid])
        X_valid, y_valid = test_split

    # Replace missing values with mean value
    # https://scikit-learn.org/stable/modules/impute.html
    print("Replacing missing values")
    imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    X_train = imputer.fit_transform(X_train)
    X_valid = imputer.transform(X_valid)

    # Min Max => Std scaler preprocessing for non categorical variables.
    # NOTE(review): each categorical_indicator entry looks like a
    # (is_categorical, ...) pair — confirm against albert.load_data.
    for col, (is_categorical, _) in enumerate(categorical_indicator):
        if is_categorical:
            continue
        scaler = minmaxstdscaler()
        X_train[:, col:col + 1] = scaler.fit_transform(X_train[:, col:col + 1])
        X_valid[:, col:col + 1] = scaler.transform(X_valid[:, col:col + 1])

    # One Hot Encoding of Outputs (fit on train, reuse on valid).
    if out_ohe:
        encoder = preprocessing.OneHotEncoder()
        y_train = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
        y_valid = encoder.transform(y_valid.reshape(-1, 1)).toarray()

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")
    return (X_train, y_train), (X_valid, y_valid), categorical_indicator
Пример #2
0
def load_data_cache(use_test=False):
    """Load the "airlines" dataset with one-hot targets and scaled inputs.

    Args:
        use_test (bool): if True, fold the validation split into the
            training data and return the test split as the "valid" pair.

    Returns:
        tuple: ``(X_train, y_train), (X_valid, y_valid)``.
    """
    # Random state
    random_state = np.random.RandomState(seed=42)

    if use_test:
        print("!!! USING TEST DATA !!!")

    # Same split sizes in both modes; only the post-processing differs.
    splits = airlines.load_data(
        random_state=random_state,
        test_size=0.33,
        valid_size=0.33 * (1 - 0.33),
    )
    (X_train, y_train), (X_valid, y_valid), test_pair = splits

    if use_test:
        # Train on train+valid and evaluate on the held-out test split.
        X_train = np.concatenate([X_train, X_valid])
        y_train = np.concatenate([y_train, y_valid])
        X_valid, y_valid = test_pair

    # One-hot encode the targets (fit on train, reuse on valid).
    encoder = preprocessing.OneHotEncoder()
    y_train = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
    y_valid = encoder.transform(y_valid.reshape(-1, 1)).toarray()

    # MinMax then standard scaling of the inputs.
    scaler = minmaxstdscaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")
    return (X_train, y_train), (X_valid, y_valid)
Пример #3
0
 def __init__(
     self,
     clf=KNeighborsClassifier(n_jobs=4),
     load_data_func=lambda: load_breast_cancer(return_X_y=True),
     preproc=minmaxstdscaler(),
     seed=42,
 ):
     """Configure the problem with a k-nearest-neighbors classifier.

     Defaults to the breast-cancer dataset and a MinMax+Std scaler
     pipeline; everything is forwarded to the base-class constructor.
     """
     super().__init__(clf=clf,
                      load_data_func=load_data_func,
                      preproc=preproc,
                      seed=seed)
Пример #4
0
def run(config: dict, load_data: callable) -> float:
    """Run function which can be used for AutoML classification.

    Args:
        config (dict): hyperparameter configuration; must contain a
            ``"classifier"`` key naming an entry of ``CLASSIFIERS``, and may
            contain keyword arguments for that classifier's constructor.
        load_data (callable): a function returning the data as ``(X, y)``
            arrays.

    Returns:
        float: accuracy on the held-out test split, or ``-1.0`` if fitting
        the classifier failed.
    """
    seed = 42
    config["random_state"] = seed

    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=seed)

    preproc = minmaxstdscaler()
    X_train = preproc.fit_transform(X_train)
    X_test = preproc.transform(X_test)

    clf_class = CLASSIFIERS[config["classifier"]]

    # keep parameters possible for the current classifier
    sig = signature(clf_class)
    clf_allowed_params = list(sig.parameters.keys())
    clf_params = {
        k: v
        for k, v in config.items()
        if k in clf_allowed_params and v not in ("nan", "NA")
    }

    if "n_jobs" in clf_allowed_params:  # performance parameter
        clf_params["n_jobs"] = 8

    # Manage the fail value ourselves: a failed fit yields the sentinel
    # -1.0 instead of crashing the AutoML search. A bare ``except:`` would
    # also swallow KeyboardInterrupt/SystemExit, so catch Exception only.
    try:
        clf = clf_class(**clf_params)
        clf.fit(X_train, y_train)
    except Exception:
        return -1.0

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc
Пример #5
0
 def __init__(
         self,
         clf=RandomForestRegressor(n_jobs=4, random_state=42),
         load_data_func=lambda: load_boston(return_X_y=True),
         preproc=minmaxstdscaler(),
         seed=42,
 ):
     """Configure the problem with a random-forest regressor.

     Defaults to the Boston housing dataset and a MinMax+Std scaler
     pipeline; everything is forwarded to the base-class constructor.
     """
     super().__init__(
         clf=clf, load_data_func=load_data_func, preproc=preproc, seed=seed
     )
Пример #6
0
 def __init__(
         self,
         clf=None,
         load_data_func=lambda: load_breast_cancer(return_X_y=True),
         preproc=minmaxstdscaler(),
         seed=42,
 ):
     """Store the classifier, data loader, preprocessor and seed.

     No work is done here; the arguments are only kept as attributes.
     """
     self.clf = clf
     self.load_data_func = load_data_func
     self.preproc = preproc
     self.seed = seed
Пример #7
0
def load_data_cache():
    """Load the "covertype" dataset with one-hot targets and scaled inputs.

    Returns:
        tuple: ``(X_train, y_train), (X_valid, y_valid)``.
    """
    # Random state
    random_state = np.random.RandomState(seed=42)

    train_pair, valid_pair, _ = covertype.load_data(random_state=random_state)
    X_train, y_train = train_pair
    X_valid, y_valid = valid_pair

    # One-hot encode the targets (fit on train, reuse on valid).
    encoder = preprocessing.OneHotEncoder()
    y_train = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
    y_valid = encoder.transform(y_valid.reshape(-1, 1)).toarray()

    # MinMax then standard scaling of the inputs.
    scaler = minmaxstdscaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    print(f"X_train shape: {np.shape(X_train)}")
    print(f"y_train shape: {np.shape(y_train)}")
    print(f"X_valid shape: {np.shape(X_valid)}")
    print(f"y_valid shape: {np.shape(y_valid)}")
    return (X_train, y_train), (X_valid, y_valid)
Пример #8
0
def run_autosklearn1(config: dict, load_data: callable) -> float:
    """Run function which can be used for AutoML classification.

    It has to be used with the ``deephyper.sklearn.classifier.problem_autosklearn1``  problem definition which corresponds to:

    .. code-block::

        Configuration space object:
            Hyperparameters:
                C, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                alpha, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                classifier, Type: Categorical, Choices: {RandomForest, Logistic, AdaBoost, KNeighbors, MLP, SVC, XGBoost}, Default: RandomForest
                gamma, Type: UniformFloat, Range: [1e-05, 10.0], Default: 0.01, on log-scale
                kernel, Type: Categorical, Choices: {linear, poly, rbf, sigmoid}, Default: linear
                max_depth, Type: UniformInteger, Range: [2, 100], Default: 14, on log-scale
                n_estimators, Type: UniformInteger, Range: [1, 2000], Default: 45, on log-scale
                n_neighbors, Type: UniformInteger, Range: [1, 100], Default: 50
            Conditions:
                (C | classifier == 'Logistic' || C | classifier == 'SVC')
                (gamma | kernel == 'rbf' || gamma | kernel == 'poly' || gamma | kernel == 'sigmoid')
                (n_estimators | classifier == 'RandomForest' || n_estimators | classifier == 'AdaBoost')
                alpha | classifier == 'MLP'
                kernel | classifier == 'SVC'
                max_depth | classifier == 'RandomForest'
                n_neighbors | classifier == 'KNeighbors'

    Args:
        config (dict): an hyperparameter configuration ``dict`` corresponding to the ``deephyper.sklearn.classifier.problem_autosklearn1``.
        load_data (callable): a function returning data as Numpy arrays ``(X, y)``.

    Returns:
        float: returns the accuracy on the validation set, or ``-1.0`` if
        fitting the classifier failed.
    """
    seed = 42
    config["random_state"] = seed

    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=seed)

    preproc = minmaxstdscaler()
    X_train = preproc.fit_transform(X_train)
    X_test = preproc.transform(X_test)

    clf_class = CLASSIFIERS[config["classifier"]]

    # keep parameters possible for the current classifier
    sig = signature(clf_class)
    clf_allowed_params = list(sig.parameters.keys())
    clf_params = {
        k: v
        for k, v in config.items()
        if k in clf_allowed_params and v not in ("nan", "NA")
    }

    if "n_jobs" in clf_allowed_params:  # performance parameter
        clf_params["n_jobs"] = 8

    # Manage the fail value ourselves: a failed fit yields the sentinel
    # -1.0 instead of crashing the AutoML search. A bare ``except:`` would
    # also swallow KeyboardInterrupt/SystemExit, so catch Exception only.
    try:
        clf = clf_class(**clf_params)
        clf.fit(X_train, y_train)
    except Exception:
        return -1.0

    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc