示例#1
0
def run_mlbox(params: 'ExecutionParams'):
    """Tune, fit and predict with MLBox; return the true target and predictions.

    ``separate_target_column`` splits the test CSV into a target-free file
    (which is what MLBox is given) and the true target values.  After
    ``Predictor.fit_predict`` runs, the predictions are read back from
    ``save/<target_name>_predictions.csv`` under the current working directory.

    Args:
        params: Execution settings.  The attributes read here are
            ``train_file``, ``test_file``, ``target_name`` and ``task``.

    Returns:
        A ``(true_target, predicted)`` tuple, where ``predicted`` is the
        ``'1.0'`` column of the MLBox predictions file.
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    target_name = params.target_name
    task = params.task

    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, true_target = separate_target_column(
        test_file_path, target_name)

    # The target-free test file must be removed even when MLBox or the CSV
    # read-back fails, otherwise every failed run leaves a stray file behind.
    try:
        paths = [train_file_path, new_test_file_path]

        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        score = 'roc_auc' if task is TaskTypesEnum.classification else 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        # Kept under its own name so the ``params`` argument is not shadowed.
        best_params = opt.optimise(config_data['space'],
                                   data,
                                   max_evals=config_data['max_evals'])
        opt.evaluate(best_params, data)

        Predictor(verbose=False).fit_predict(best_params, data)

        cur_work_dir = os.path.abspath(os.curdir)

        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
        # NOTE(review): '1.0' is presumably the positive-class probability
        # column; it likely does not exist for non-classification tasks or
        # differently encoded labels -- confirm against MLBox's output format.
        predicted = predicted_df['1.0']
    finally:
        os.remove(new_test_file_path)

    return true_target, predicted
def run_mlbox(train_file_path: str, test_file_path: str, target_name: str,
              task: MachineLearningTasksEnum):
    """Tune, fit and predict with MLBox, then print and return the ROC AUC.

    ``separate_target_column`` splits the test CSV into a target-free file
    (which is what MLBox is given) and the true target values.  After
    ``Predictor.fit_predict`` runs, the predictions are read back from
    ``save/<target_name>_predictions.csv`` under the current working directory
    and scored against the true target with ``roc_auc_score``.

    Args:
        train_file_path: Path to the training CSV.
        test_file_path: Path to the test CSV, which still contains the target.
        target_name: Name of the target column.
        task: Selects the optimiser scoring: ``'roc_auc'`` for
            ``MachineLearningTasksEnum.classification``, otherwise
            ``'neg_mean_squared_error'``.

    Returns:
        The ROC AUC of the ``'1.0'`` prediction column against the true target.
    """
    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, test_target = separate_target_column(
        test_file_path, target_name)

    # The target-free test file must be removed even when MLBox, the CSV
    # read-back or the scoring fails, otherwise failed runs leave it behind.
    try:
        paths = [train_file_path, new_test_file_path]

        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        score = 'roc_auc' if task is MachineLearningTasksEnum.classification else 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        params = opt.optimise(config_data['space'],
                              data,
                              max_evals=config_data['max_evals'])
        opt.evaluate(params, data)

        Predictor(verbose=False).fit_predict(params, data)

        cur_work_dir = os.path.abspath(os.curdir)

        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
        # NOTE(review): '1.0' is presumably the positive-class probability
        # column, and ROC AUC is computed regardless of ``task`` -- confirm
        # this function is only meant for binary classification.
        predicted = predicted_df['1.0']
        metric = roc_auc_score(test_target, predicted)

        print(f'ROC_AUC: {metric}')
    finally:
        os.remove(new_test_file_path)

    return metric
示例#3
0
def mlbox_counter():
    """Run a small MLBox LightGBM search on the egg CSVs, then fit and predict.

    Reads ``train_egg.csv`` and ``test_egg.csv`` with ``'601'`` as the target
    column, passes the result through ``Drift_thresholder``, optimises the
    search space below (accuracy scoring, 10 folds, third positional argument
    15) and finally calls ``Predictor.fit_predict`` with the best parameters
    found.  Nothing is returned.
    """
    from mlbox.preprocessing import Reader, Drift_thresholder
    from mlbox.optimisation import Optimiser
    from mlbox.prediction import Predictor

    def _choice(options):
        # One search-space entry that picks among discrete options.
        return {"search": "choice", "space": options}

    def _uniform(low, high):
        # One search-space entry sampled uniformly between two bounds.
        return {"search": "uniform", "space": [low, high]}

    target_name = '601'
    csv_files = ['train_egg.csv', 'test_egg.csv']

    dataset = Reader(sep=",").train_test_split(csv_files, target_name)
    # Per the original author: drops non-stable features (like ID, ...).
    dataset = Drift_thresholder().fit_transform(dataset)

    optimiser = Optimiser(scoring="accuracy", n_folds=10)
    search_space = {
        'est__strategy': _choice(["LightGBM"]),
        'est__n_estimators': _choice([150]),
        'est__colsample_bytree': _uniform(0.8, 0.95),
        'est__subsample': _uniform(0.8, 0.95),
        'est__max_depth': _choice([5, 6, 7, 8, 9]),
        'est__learning_rate': _choice([0.07]),
    }
    best_params = optimiser.optimise(search_space, dataset, 15)

    Predictor().fit_predict(best_params, dataset)
# Regression example: read the data, drop drifting features, and score the
# MLBox pipeline with a custom MAPE scorer.

# Paths to the train set and the test set.
paths = ["train_regression.csv", "test_regression.csv"]
# Name of the feature to predict.
# This column should only be present in the train set.
target_name = "SalePrice"

# Reading and cleaning all files
# Declare a reader for csv files
rd = Reader(sep=',')
# Returns a dictionary containing three entries:
# data["train"] contains training samples without target columns
# data["test"] contains testing elements without target columns
# data["target"] contains target columns for training samples.
# (Named ``data`` rather than ``dict`` so the builtin is not shadowed.)
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)


# Tuning
def _mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    The absolute value is taken over the whole ratio so that negative
    targets contribute a positive error; the result is undefined (division
    by zero) when ``y_true`` contains zeros.
    """
    return 100 * np.sum(np.abs((y_true - y_pred) / y_true)) / len(y_true)


# NOTE(review): ``needs_proba`` is deprecated in recent scikit-learn releases
# in favour of ``response_method`` -- confirm against the pinned version.
mape = make_scorer(_mape,
                   greater_is_better=False,
                   needs_proba=False)
# Declare an optimiser. You can declare your own score
# as presented here or use one in
# {"mean_absolute_error", "mean_squared_error","median_absolute_error","r2"}
opt = Optimiser(scoring=mape, n_folds=3)
# params=None: presumably evaluates MLBox's default pipeline -- confirm.
opt.evaluate(None, data)

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
示例#5
0
# Classification example: read the data, drop drifting features, and score
# the MLBox pipeline with 3-fold accuracy.

# Paths to the train set and the test set.
paths = ["train_classification.csv", "test_classification.csv"]
# Name of the feature to predict.
# This column should only be present in the train set.
target_name = "Survived"

# Reading and cleaning all files
# Declare a reader for csv files
rd = Reader(sep=',')
# Returns a dictionary containing three entries:
# data["train"] contains training samples without target columns
# data["test"] contains testing elements without target columns
# data["target"] contains target columns for training samples.
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

# Tuning
# Declare an optimiser. Scoring possibilities for classification lie in:
# {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"}
opt = Optimiser(scoring='accuracy', n_folds=3)
# params=None: presumably evaluates MLBox's default pipeline -- confirm.
opt.evaluate(None, data)

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
#   "enc" = "ne" for na encoder
#   "enc" = "ce" for categorical encoder
#   "enc" = "fs" for feature selector [OPTIONAL]
#   "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
#   "enc" = "est" for the final estimator
示例#6
0
def model_auto_mlbox( filepath= [ "train.csv", "test.csv" ],
    colX=None, coly=None,
    do="predict",
    outfolder="aaserialize/",
    model_type="regressor/classifier",
    params={ "csv_seprator" : ",", "train_size" : 0.5, "score_metric" : "accuracy",
             "n_folds": 3, "n_step": 10},
    param_space =  {
        'est__strategy':{"search":"choice",                         "space":["LightGBM"]},
        'est__n_estimators':{"search":"choice",                     "space":[150]},
        'est__colsample_bytree':{"search":"uniform",                "space":[0.8,0.95]},
        'est__subsample':{"search":"uniform",                       "space":[0.8,0.95]},
        'est__max_depth':{"search":"choice",                        "space":[5,6,7,8,9]},
        'est__learning_rate':{"search":"choice",                    "space":[0.07]}
    },
    generation=1,
    population_size=5,
    verbosity=2,
):
    """
      Using mlbox
      https://www.analyticsvidhya.com/blog/2017/07/mlbox-library-automated-machine-learning/


    Parameters
    ----------
    df : TYPE
        DESCRIPTION.
    colX : TYPE
        DESCRIPTION.
    coly : TYPE
        DESCRIPTION.
    outfolder : TYPE, optional
        DESCRIPTION. The default is "aaserialize/".
    model_type : TYPE, optional
        DESCRIPTION. The default is "regressor/classifier".
    params : TYPE, optional
        DESCRIPTION. The default is {"train_size" : 0.5}.
    generation : TYPE, optional
        DESCRIPTION. The default is 1.
    population_size : TYPE, optional
        DESCRIPTION. The default is 5.
    verbosity : TYPE, optional
        DESCRIPTION. The default is 2.

    Returns
    -------
    None.

    """
    from mlbox.preprocessing import Reader,Drift_thresholder
    from mlbox.optimisation import Optimiser
    from mlbox.prediction import Predictor

    p = dict2(params)


    ## Pre-process
    """
    df (dict, default = None) –
    Dataset dictionary. Must contain keys and values:

    ”train”: pandas DataFrame for the train set.
    ”test” : pandas DataFrame for the test set.
    ”target” : encoded pandas Serie for the target on train set (with dtype=’float’ for a regression or dtype=’int’ for a classification). Indexes should match the train set.

    """
    rd = Reader(sep = p.csv_separator)
    df = rd.train_test_split( filepath, coly)   # Reading and preprocessing (dates, ...)
    dft = Drift_thresholder()
    df = dft.fit_transform(df)      # Removing non-stable features (like ID,...)


    ### Optimal parameter
    # score_rmse = make_scorer(lambda y_true, y_pred: np.sqrt(np.sum((y_true - y_pred)**2)/len(y_true)), greater_is_better=False, needs_proba=False)
    #                    opt = Optimiser(scoring = rmse, n_folds = 3)

    opt = Optimiser(scoring = p.score_metric, n_folds = p.n_folds)
    param_optim = opt.optimise(param_space, df, p.n_step)


    if do == "prediction" :
      clf = Predictor(to_path= outfolder, verbose=True)

      #Fit and predict and save on disk
      clf.fit_predict(param_optim, df)

      # Load the predictions
      preds = pd.read_csv("save/"+coly+"_predictions.csv")
      print(preds.shape, preds.head(5))




      """