def run_mlbox(params: 'ExecutionParams'):
    """Tune, fit and predict with MLBox for a single benchmark case.

    The target column is split off the test file into a temporary copy,
    MLBox searches the configured hyperparameter space, and the
    predictions are read back from the CSV that ``Predictor`` writes to
    ``save/<target_name>_predictions.csv`` under the current working
    directory.

    Args:
        params: Execution parameters exposing ``train_file``,
            ``test_file``, ``target_name`` and ``task``.

    Returns:
        A ``(true_target, predicted)`` pair: the target values separated
        from the test file and the prediction column read from the MLBox
        output file.
    """
    train_file_path = params.train_file
    test_file_path = params.test_file
    target_name = params.target_name
    task = params.task

    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, true_target = separate_target_column(
        test_file_path, target_name)

    # Everything below may raise (I/O, optimisation, fitting); the
    # temporary test file must be removed in either case.
    try:
        paths = [train_file_path, new_test_file_path]
        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        is_classification = task is TaskTypesEnum.classification
        score = 'roc_auc' if is_classification else 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        # Kept in its own name so the ``params`` argument is not rebound.
        best_params = opt.optimise(config_data['space'], data,
                                   max_evals=config_data['max_evals'])
        opt.evaluate(best_params, data)

        Predictor(verbose=False).fit_predict(best_params, data)

        cur_work_dir = os.path.abspath(os.curdir)
        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))

        # Classification output holds one probability column per class
        # label ('1.0' is the positive class); regression output only has
        # the '<target>_predicted' column, so '1.0' would raise KeyError.
        prediction_column = (
            '1.0' if is_classification else f'{target_name}_predicted')
        predicted = predicted_df[prediction_column]
    finally:
        os.remove(new_test_file_path)

    return true_target, predicted
def run_mlbox(train_file_path: str, test_file_path: str, target_name: str,
              task: MachineLearningTasksEnum):
    """Tune, fit and score MLBox on one train/test pair.

    The target column is split off the test file into a temporary copy,
    MLBox searches the configured hyperparameter space, and the
    predictions written by ``Predictor`` to
    ``save/<target_name>_predictions.csv`` (relative to the current
    working directory) are scored against the separated target.

    Args:
        train_file_path: Path to the training CSV.
        test_file_path: Path to the test CSV that still contains the
            target column.
        target_name: Name of the target column.
        task: Task type; classification selects the ``roc_auc`` scorer
            for the search, anything else ``neg_mean_squared_error``.

    Returns:
        The ROC AUC of the ``'1.0'`` prediction column against the
        separated test target. The value is also printed.
    """
    config_data = get_models_hyperparameters()['MLBox']
    new_test_file_path, test_target = separate_target_column(
        test_file_path, target_name)

    # Any step below may raise; the temporary test file has to be
    # removed whether or not the run succeeds.
    try:
        paths = [train_file_path, new_test_file_path]
        data = Reader(sep=",").train_test_split(paths, target_name)
        data = Drift_thresholder().fit_transform(data)

        if task is MachineLearningTasksEnum.classification:
            score = 'roc_auc'
        else:
            score = 'neg_mean_squared_error'

        opt = Optimiser(scoring=score, n_folds=5)
        params = opt.optimise(config_data['space'], data,
                              max_evals=config_data['max_evals'])
        opt.evaluate(params, data)

        Predictor(verbose=False).fit_predict(params, data)

        cur_work_dir = os.path.abspath(os.curdir)
        predicted_df = pd.read_csv(
            os.path.join(cur_work_dir, f'save/{target_name}_predictions.csv'))
        predicted = predicted_df['1.0']

        metric = roc_auc_score(test_target, predicted)
        print(f'ROC_AUC: {metric}')
    finally:
        os.remove(new_test_file_path)

    return metric
def mlbox_counter():
    """Run a fixed MLBox LightGBM search on the egg CSVs and predict.

    Reads ``train_egg.csv`` / ``test_egg.csv`` with target column
    ``'601'``, applies the drift thresholder, searches a small LightGBM
    space with 10-fold accuracy for 15 evaluations, then fits and
    predicts with the best configuration found. Returns nothing.
    """
    from mlbox.preprocessing import Reader, Drift_thresholder
    from mlbox.optimisation import Optimiser
    from mlbox.prediction import Predictor

    def _choice(*options):
        # One search-space entry that picks among discrete options.
        return {"search": "choice", "space": list(options)}

    def _uniform(low, high):
        # One search-space entry sampled uniformly between two bounds.
        return {"search": "uniform", "space": [low, high]}

    target_name = '601'

    dataset = Reader(sep=",").train_test_split(
        ['train_egg.csv', 'test_egg.csv'], target_name)

    # Drop non-stable features (like ID, ...).
    dataset = Drift_thresholder().fit_transform(dataset)

    optimiser = Optimiser(scoring="accuracy", n_folds=10)

    search_space = {
        'est__strategy': _choice("LightGBM"),
        'est__n_estimators': _choice(150),
        'est__colsample_bytree': _uniform(0.8, 0.95),
        'est__subsample': _uniform(0.8, 0.95),
        'est__max_depth': _choice(5, 6, 7, 8, 9),
        'est__learning_rate': _choice(0.07),
    }

    best_params = optimiser.optimise(search_space, dataset, 15)

    Predictor().fit_predict(best_params, dataset)
# Paths to the train set and the test set.
paths = ["train_regression.csv", "test_regression.csv"]

# Name of the feature to predict.
# This column should only be present in the train set.
target_name = "SalePrice"

# Reading and cleaning all files.
# Declare a reader for csv files.
rd = Reader(sep=',')

# Returns a dictionary containing three entries:
# data["train"] contains training samples without target columns
# data["test"] contains testing elements without target columns
# data["target"] contains target columns for training samples.
# (Named ``data`` rather than ``dict`` so the builtin is not shadowed.)
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

# Tuning
# Mean absolute percentage error. The absolute value wraps the whole
# relative error so negative targets cannot produce negative terms.
# ``needs_proba`` is left at its default (False): the keyword is
# deprecated/removed in recent scikit-learn releases.
mape = make_scorer(
    lambda y_true, y_pred: 100 * np.sum(
        np.abs((y_true - y_pred) / y_true)) / len(y_true),
    greater_is_better=False)

# Declare an optimiser. You can declare your own score
# as presented here or use one in
# {"mean_absolute_error", "mean_squared_error","median_absolute_error","r2"}
opt = Optimiser(scoring=mape, n_folds=3)
opt.evaluate(None, data)

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
# Locations of the training and test CSV files.
paths = [
    "train_classification.csv",
    "test_classification.csv",
]

# Column to predict; it should only be present in the train set.
target_name = "Survived"

# Read and clean all files with a comma-separated reader.
rd = Reader(sep=",")

# The split yields a dictionary with three entries:
#   data["train"]  - training samples without the target column
#   data["test"]   - test samples without the target column
#   data["target"] - target column for the training samples
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

# Tuning.
# Declare an optimiser. Scoring choices for classification are:
# {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"}
opt = Optimiser(scoring="accuracy", n_folds=3)
opt.evaluate(None, data)

# Space of hyperparameters.
# Keys must follow the syntax "enc__param", where:
#   "enc" = "ne" for the NA encoder
#   "enc" = "ce" for the categorical encoder
#   "enc" = "fs" for the feature selector [OPTIONAL]
#   "enc" = "stck"+str(i) to add layer number i of meta-features [OPTIONAL]
#   "enc" = "est" for the final estimator
def model_auto_mlbox( filepath= [ "train.csv", "test.csv" ], colX=None, coly=None, do="predict", outfolder="aaserialize/", model_type="regressor/classifier", params={ "csv_seprator" : ",", "train_size" : 0.5, "score_metric" : "accuracy", "n_folds": 3, "n_step": 10}, param_space = { 'est__strategy':{"search":"choice", "space":["LightGBM"]}, 'est__n_estimators':{"search":"choice", "space":[150]}, 'est__colsample_bytree':{"search":"uniform", "space":[0.8,0.95]}, 'est__subsample':{"search":"uniform", "space":[0.8,0.95]}, 'est__max_depth':{"search":"choice", "space":[5,6,7,8,9]}, 'est__learning_rate':{"search":"choice", "space":[0.07]} }, generation=1, population_size=5, verbosity=2, ): """ Using mlbox https://www.analyticsvidhya.com/blog/2017/07/mlbox-library-automated-machine-learning/ Parameters ---------- df : TYPE DESCRIPTION. colX : TYPE DESCRIPTION. coly : TYPE DESCRIPTION. outfolder : TYPE, optional DESCRIPTION. The default is "aaserialize/". model_type : TYPE, optional DESCRIPTION. The default is "regressor/classifier". params : TYPE, optional DESCRIPTION. The default is {"train_size" : 0.5}. generation : TYPE, optional DESCRIPTION. The default is 1. population_size : TYPE, optional DESCRIPTION. The default is 5. verbosity : TYPE, optional DESCRIPTION. The default is 2. Returns ------- None. """ from mlbox.preprocessing import Reader,Drift_thresholder from mlbox.optimisation import Optimiser from mlbox.prediction import Predictor p = dict2(params) ## Pre-process """ df (dict, default = None) – Dataset dictionary. Must contain keys and values: ”train”: pandas DataFrame for the train set. ”test” : pandas DataFrame for the test set. ”target” : encoded pandas Serie for the target on train set (with dtype=’float’ for a regression or dtype=’int’ for a classification). Indexes should match the train set. """ rd = Reader(sep = p.csv_separator) df = rd.train_test_split( filepath, coly) # Reading and preprocessing (dates, ...) 
dft = Drift_thresholder() df = dft.fit_transform(df) # Removing non-stable features (like ID,...) ### Optimal parameter # score_rmse = make_scorer(lambda y_true, y_pred: np.sqrt(np.sum((y_true - y_pred)**2)/len(y_true)), greater_is_better=False, needs_proba=False) # opt = Optimiser(scoring = rmse, n_folds = 3) opt = Optimiser(scoring = p.score_metric, n_folds = p.n_folds) param_optim = opt.optimise(param_space, df, p.n_step) if do == "prediction" : clf = Predictor(to_path= outfolder, verbose=True) #Fit and predict and save on disk clf.fit_predict(param_optim, df) # Load the predictions preds = pd.read_csv("save/"+coly+"_predictions.csv") print(preds.shape, preds.head(5)) """