def cross_validate_model(self, X, y, model, num_folds, score):
    '''
    Validates a model on the datasets X and y, running num_folds-fold
    cross-validation and returning the metric given by score.

    Parameters
    ----------
    X: numpy.array
        Instance data of the dataset used for training.
    y: numpy.array
        Targets of the instances in X.
    model: scikit-learn model
        Model to train.
    num_folds: int
        Number of folds for the cross-validation.
    score: string
        Metric to return; must be one of the strings available in
        sklearn.metrics.SCORERS.keys().

    Return
    ------
    array of results
        Array with the result of each fold of the CV.
    '''
    if score not in SCORERS.keys():
        raise AttributeError(
            "The score attribute must be valid. See valid values in "
            "sklearn.metrics.SCORERS.keys()")
    print('\t' + str(model)[:20], end=' - ')
    mod_scores = cross_val_score(model, X, y, cv=num_folds, scoring=score)
    print('FM')
    return np.array(mod_scores)
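# A minimal usage sketch for the function above (the ModelValidator wrapper class, the
# iris data and the DecisionTreeClassifier are illustrative assumptions, not part of
# the original code):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import SCORERS
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier


class ModelValidator:
    # reuse the function defined above as a method of this throwaway wrapper
    cross_validate_model = cross_validate_model


X_iris, y_iris = load_iris(return_X_y=True)
fold_scores = ModelValidator().cross_validate_model(
    X_iris, y_iris, DecisionTreeClassifier(random_state=0), num_folds=5, score='accuracy')
print(fold_scores.mean())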
def _fit_by_cross_validation(self, X, y, number_of_splits: int = 5, label_name: str = None,
                             cores_for_training: int = 1,
                             optimization_metric: str = "balanced_accuracy"):
    model = self._get_ml_model()
    scoring = optimization_metric

    if optimization_metric not in SCORERS.keys():
        scoring = "balanced_accuracy"
        warnings.warn(f"{self.__class__.__name__}: specified optimization metric "
                      f"({optimization_metric}) is not defined as a sklearn scoring function, "
                      f"using {scoring} instead...")

    if not self.show_warnings:
        warnings.simplefilter("ignore")
        os.environ["PYTHONWARNINGS"] = "ignore"

    self.model = RandomizedSearchCV(model, param_distributions=self._parameter_grid,
                                    cv=number_of_splits, n_jobs=cores_for_training,
                                    scoring=scoring, refit=True)
    self.model.fit(X, y)

    if not self.show_warnings:
        del os.environ["PYTHONWARNINGS"]
        warnings.simplefilter("always")

    # do not keep the RandomizedSearchCV object in self.model; store the best estimator instead
    self.model = self.model.best_estimator_

    return self.model
def score_options():
    '''Return a list of possible scorers for a regression model'''
    from sklearn.metrics import SCORERS
    score_types = sorted(SCORERS.keys())
    print('Possible scores to choose from: ')
    for s in score_types:
        print(s)
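# Note: on recent scikit-learn releases the SCORERS dict has been removed in favour of
# sklearn.metrics.get_scorer_names(); a version-tolerant variant of the listing above
# could look like this (a sketch, not part of the original code):
def score_options_compat():
    try:
        from sklearn.metrics import get_scorer_names
        score_types = sorted(get_scorer_names())
    except ImportError:  # older scikit-learn versions
        from sklearn.metrics import SCORERS
        score_types = sorted(SCORERS.keys())
    print('Possible scores to choose from: ')
    for s in score_types:
        print(s)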
def evaluation_p(population: pd.DataFrame, list_caracteres: np.array, data: pd.DataFrame,
                 target: pd.Series, model: Any, scorer: str, n_cv: int = 5,
                 sort_scores: bool = True) -> pd.DataFrame:
    """
    Evaluate the individuals of a population.

    :param population: set of individuals to evaluate
    :param list_caracteres: list of the characters (features) that can be expressed
    :param data: data to use for the evaluation
    :param target: output to predict
    :param model: model to fit
    :param scorer: performance score to maximise
    :param n_cv: number of folds for cross-validation, must be at least 2
    :param sort_scores: whether to sort the result (descending) or not
    :return: population with the performance score obtained by cross-validation
    """
    # TODO: consider passing the scorer=make_scorer() function and its associated
    # parameters directly instead of hard-coding it here
    population_eval = population.copy()
    mean_scores = []
    xval_strategy = KFold(n_cv, shuffle=True, random_state=123)

    for indiv in population_eval.values:
        lstcols = list_caracteres[indiv]
        if scorer in SCORERS.keys():
            scoring_function = scorer
        elif scorer == 'bic':
            kwargs = {'k': len(lstcols)}
            scoring_function = make_scorer(get_bic, greater_is_better=False, **kwargs)
        else:
            print(f'Scorer {scorer} unknown')
            break
        scores = cross_val_score(model, data[lstcols], target, cv=xval_strategy,
                                 scoring=scoring_function, n_jobs=-1)
        mean_scores.append(scores.mean())

    population_scores = pd.Series(mean_scores, index=population_eval.index, name='score')
    population_eval['score'] = population_scores

    if sort_scores:
        population_eval = population_eval.sort_values(by='score', ascending=False)
    else:
        population_eval = population_eval.reset_index(drop=True)

    return population_eval
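# A minimal usage sketch for evaluation_p (the toy data, feature names and the
# LinearRegression model below are illustrative assumptions; it also assumes the
# imports used by evaluation_p - KFold, SCORERS, cross_val_score, make_scorer -
# are already in scope):
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X_demo, y_demo = make_regression(n_samples=150, n_features=5, noise=10.0, random_state=0)
caracteres = np.array([f"x{i}" for i in range(5)])
data_demo = pd.DataFrame(X_demo, columns=caracteres)
target_demo = pd.Series(y_demo)
# each row of the population is a boolean mask over caracteres (an individual's genome)
population_demo = pd.DataFrame([[True, True, False, False, True],
                                [True, False, True, True, False]])
ranked = evaluation_p(population_demo, caracteres, data_demo, target_demo,
                      LinearRegression(), scorer='r2', n_cv=3)
print(ranked['score'])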
def cv(self):
    data = input("Predictors Name: ")
    target = input("Target: ")
    scaler = input("Input type of Problem: Regression = R, Classification = C")
    no_cv = input("k-folds, k = ")

    from sklearn.metrics import SCORERS
    list(SCORERS.keys())

    reg_scorers = [
        'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error',
        'neg_mean_squared_error', 'neg_mean_squared_log_error',
        'explained_variance'
    ]
    class_scorers = ['precision', 'recall', 'f1', 'accuracy', 'roc_auc']

    step2 = "Get scorers to cross validate on. Please separate scorers by a comma only. "
    print(step2)
    if scaler == "R":
        metrics = input(", ".join(reg_scorers))
    elif scaler == "C":
        metrics = input(", ".join(class_scorers))
    scorers = [i.strip() for i in metrics.split(",")]

    print(self.get_script)
    print("\n_____________Copy from Here_____________\n")

    step1 = "# Import module to cross-validate"
    print(step1)
    module_import = "from sklearn.model_selection import cross_validate"
    print(module_import + "\n")

    step3 = "# Create base model to cross validate"
    print(step3)
    print("model = ...\n")

    step4 = "# Define the scorers to validate on"
    print(step4)
    print(f"scorers = {scorers}")

    step5 = f"# Cross validate model on {data}"
    print(step5)
    print(f"scores = cross_validate(model, X = {data}, y = {target}, "
          f"scoring = scorers, cv = {no_cv})\n")

    step6 = f"# Check performance on {data}"
    print(step6)
    perf_line = r"print(f'Performance : { scores }')"
    print(perf_line)
def parse_cfg(self):
    transformations = self.cfg['dataset'].get('transform')
    if transformations:
        self.cfg['dataset']['transform'] = [eval(t) for t in transformations]

    target_transform = self.cfg['dataset'].get('target_transform')
    if target_transform:
        self.cfg['dataset']['target_transform'] = eval(target_transform)

    scorers = self.cfg['training'].get('scorers')
    if isinstance(scorers, str):
        scorers = [scorers]
    scorers_dict = {}
    for s in scorers:
        scorers_dict[s] = s if s in SCORERS.keys() else make_scorer(eval(s))
    self.cfg['training']['scorers'] = scorers_dict

    models = self.cfg['training'].get('models')
    if isinstance(models, list):
        self.cfg['training']['models'] = [
            Pipeline([(e, _get_model(e)) for e in model]) for model in models
        ]
    elif isinstance(models, str):
        self.cfg['training']['models'] = jb_load(models)
    self.cfg['training']['cfg_path'] = self.cfg_path

    if 'holdout' in self.cfg:
        self.cfg['holdout']['cfg_path'] = self.cfg_path
        scorers = self.cfg['holdout'].get('scorers')
        if scorers:
            valid_scorers = {
                'balanced_accuracy': balanced_accuracy_score,
                'accuracy': accuracy_score,
                'roc_auc': roc_auc_score,
                'recall': recall_score,
                'specificity': specificity,
            }
            self.cfg['holdout']['scorers'] = [valid_scorers[s] for s in scorers]
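# A sketch of the kind of cfg dict parse_cfg expects. The key names are inferred from
# the code above; the concrete values are illustrative assumptions, not a documented
# schema:
example_cfg = {
    'dataset': {
        'transform': ['np.log1p'],          # strings eval'd into callables
        'target_transform': 'np.log1p',
    },
    'training': {
        'scorers': ['balanced_accuracy'],   # sklearn scorer names, or eval'able metric functions
        'models': [['scaler', 'svc']],      # each inner list becomes a Pipeline via _get_model
    },
    'holdout': {
        'scorers': ['accuracy', 'roc_auc'],
    },
}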
def test_scorer_memmap_input():
    # Non-regression test for #6147: some score functions would
    # return singleton memmap when computed on memmap data instead of scalar
    # float values.
    for name in SCORERS.keys():
        yield check_scorer_memmap, name
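# The generator-style test above relies on the legacy nose/yield test protocol; a rough
# pytest-parametrized equivalent could look like this (a sketch, assuming the same
# check_scorer_memmap helper is importable):
import pytest
from sklearn.metrics import SCORERS


@pytest.mark.parametrize("name", sorted(SCORERS.keys()))
def test_scorer_memmap_input_parametrized(name):
    check_scorer_memmap(name)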
X_test[sc_cols] = sc.transform(X_test[sc_cols])

features = list(X_train.columns.values)
print(X_train.shape)
print(X_test.shape)

##############################################################

param_grid = {
    # commented-out alternatives: [.1, .9], [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]
    'C': [0.001, 0.01, 0.1, 1, 10],
    # commented-out alternatives: ['scale', 1, 10, 100], ['auto', 'scale', 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['sigmoid']
}

print(sorted(SCORERS.keys()))

clf = GridSearchCV(
    svm.SVC(probability=True),
    param_grid=param_grid,
    # other scorers: 'roc_auc', 'f1_macro', 'f1_weighted', 'precision_weighted', 'average_precision'
    scoring='accuracy',
    cv=3,
    refit=True,
    verbose=10,  # 10 to see results; higher verbose = more printed
    return_train_score=True,
    # n_jobs=multiprocessing.cpu_count() - 5  # runs out of memory if n_jobs is too high; keep it below the available CPUs minus 2
    # n_jobs=30
)
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import SCORERS

data = pd.read_csv('IRIS.csv')
print(data.describe())
print(data.head())

y = data['species']
X = data.drop(['species'], axis=1)
print(y.head())
print(X.head())

dt_model = DecisionTreeClassifier()
print(SCORERS.keys())

scores = cross_val_score(dt_model, X, y, cv=5, n_jobs=4, scoring='accuracy')
print(scores)
print(scores.mean())
print(scores.std())
import xgboost as xgb
from xgboost import XGBRegressor as XGBR
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as mse, SCORERS

data = load_boston()
X = data.data
Y = data.target

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

sk_xgb_model = XGBR(n_estimators=100, random_state=0).fit(X_train, Y_train)
pre1 = sk_xgb_model.predict(X_test)
score1 = sk_xgb_model.score(X_test, Y_test)
test_mse = mse(y_true=Y_test, y_pred=pre1)  # avoid shadowing the imported mse function
important = sk_xgb_model.feature_importances_

print('pre: ', pre1)
print('score1: ', score1)
print('mse: ', test_mse)
print('important: ', important)
print('mean: ', Y.mean())
print(SCORERS.keys())  # all available evaluation metrics
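# Note: load_boston was deprecated and later removed from scikit-learn; on recent
# releases the regression example above can be pointed at another built-in dataset
# instead, e.g. (a sketch, not part of the original code):
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
X, Y = housing.data, housing.target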
from sklearn.metrics import SCORERS

if __name__ == "__main__":
    print("These below are going to be lots of fun")
    for scorer in SCORERS.keys():
        print(scorer)
def evaluate(self, params, df):
    """Evaluates the data.

    Evaluates the data with a given scoring function and given
    hyper-parameters of the whole pipeline. If no parameters are set,
    the default configuration for each step is evaluated: no feature
    selection is applied and no meta features are created.

    Parameters
    ----------
    params : dict, default = None.
        Hyper-parameters dictionary for the whole pipeline.

        - The keys must respect the following syntax : "enc__param".

            - "enc" = "ne" for na encoder
            - "enc" = "ce" for categorical encoder
            - "enc" = "fs" for feature selector [OPTIONAL]
            - "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
            - "enc" = "est" for the final estimator

            - "param" : a correct associated parameter for each step.
              Ex: "max_depth" for "enc"="est", ...

        - The values are those of the parameters.
          Ex: 4 for key = "est__max_depth", ...

    df : dict, default = None
        Dataset dictionary. Must contain keys and values:

        - "train": pandas DataFrame for the train set.
        - "target": encoded pandas Series for the target on train set
          (with dtype='float' for a regression or dtype='int' for a
          classification). Indexes should match the train set.

    Returns
    -------
    float.
        The score. The higher the better.
        Positive for a score and negative for a loss.

    Examples
    --------
    >>> from mlbox.optimisation import *
    >>> from sklearn.datasets import load_boston
    >>> #load data
    >>> dataset = load_boston()
    >>> #evaluating the pipeline
    >>> opt = Optimiser()
    >>> params = {
    ...     "ne__numerical_strategy" : 0,
    ...     "ce__strategy" : "label_encoding",
    ...     "fs__threshold" : 0.1,
    ...     "stck__base_estimators" : [Regressor(strategy="RandomForest"), Regressor(strategy="ExtraTrees")],
    ...     "est__strategy" : "Linear"
    ... }
    >>> df = {"train" : pd.DataFrame(dataset.data), "target" : pd.Series(dataset.target)}
    >>> opt.evaluate(params, df)
    """
    ne = NA_encoder()
    ce = Categorical_encoder()

    ##########################################
    # Automatically checking the task
    ##########################################

    # TODO: a lot of code can be factorized for the different tasks

    ##########################################
    # Classification
    ##########################################

    if (df['target'].dtype == 'int'):

        # Cross validation
        counts = df['target'].value_counts()
        classes_to_drop = counts[counts < self.n_folds].index
        mask_to_drop = df['target'].apply(lambda x: x in classes_to_drop)
        indexes_to_drop = df['target'][mask_to_drop].index
        n_classes = len(counts) - len(classes_to_drop)

        if n_classes == 1:
            raise ValueError("Your target has not enough classes. "
                             "You can't run the optimiser")

        cv = StratifiedKFold(n_splits=self.n_folds,
                             shuffle=True,
                             random_state=self.random_state)

        # Estimator
        est = Classifier()

        # Feature selection if specified
        fs = None
        if (params is not None):
            for p in params.keys():
                if (p.startswith("fs__")):
                    fs = Clf_feature_selector()
                else:
                    pass

        # Stacking if specified
        STCK = {}
        if (params is not None):
            for p in params.keys():
                if (p.startswith("stck")):
                    # TODO: Check if p.split("__")[1] instead?
                    STCK[p.split("__")[0]] = StackingClassifier(verbose=False)  # noqa
                else:
                    pass

        # Default scoring for classification
        if (self.scoring is None):
            self.scoring = 'neg_log_loss'  # works also for multiclass pb

        else:
            if (type(self.scoring) == str):
                if (self.scoring not in list(SCORERS.keys())):
                    warnings.warn("Unknown or invalid scoring metric. "
                                  "neg_log_loss is used instead.")
                    self.scoring = 'neg_log_loss'

                else:
                    # binary classification
                    if n_classes <= 2:
                        pass

                    # multiclass classification
                    else:
                        warnings.warn("This is a multiclass problem. "
                                      "Please make sure that your scoring "
                                      "metric is appropriate.")

                        if self.scoring + "_weighted" in list(SCORERS.keys()):
                            warnings.warn("Weighted strategy for the scoring "
                                          "metric is used.")
                            self.scoring = self.scoring + "_weighted"

                        # specific scenarios
                        else:
                            if self.scoring == "roc_auc":
                                self.scoring = make_scorer(
                                    lambda y_true, y_pred: roc_auc_score(
                                        pd.get_dummies(y_true), y_pred),  # noqa
                                    greater_is_better=True,
                                    needs_proba=True)
            else:
                pass

    ##########################################
    # Regression
    ##########################################

    elif (df['target'].dtype == 'float'):

        # Cross validation
        indexes_to_drop = []
        cv = KFold(n_splits=self.n_folds,
                   shuffle=True,
                   random_state=self.random_state)

        # Estimator
        est = Regressor()

        # Feature selection if specified
        fs = None
        if (params is not None):
            for p in params.keys():
                if (p.startswith("fs__")):
                    fs = Reg_feature_selector()
                else:
                    pass

        # Stacking if specified
        STCK = {}
        if (params is not None):
            for p in params.keys():
                if (p.startswith("stck")):
                    # TODO: Check if p.split("__")[1] instead?
                    STCK[p.split("__")[0]] = StackingRegressor(verbose=False)
                else:
                    pass

        # Default scoring for regression
        if (self.scoring is None):
            self.scoring = "neg_mean_squared_error"
        else:
            if (type(self.scoring) == str):
                if (self.scoring not in list(SCORERS.keys())):
                    warnings.warn("Unknown or invalid scoring metric. "
                                  "neg_mean_squared_error is used instead.")
                    self.scoring = 'neg_mean_squared_error'
                else:
                    pass
            else:
                pass

    else:
        raise ValueError("Impossible to determine the task. "
                         "Please check that your target is encoded.")

    ##########################################
    # Creating the Pipeline
    ##########################################

    pipe = [("ne", ne), ("ce", ce)]

    # Do we need to cache transformers?
    cache = False

    if (params is not None):
        if ("ce__strategy" in params):
            if (params["ce__strategy"] == "entity_embedding"):
                cache = True
            else:
                pass
        else:
            pass

    if (fs is not None):
        if ("fs__strategy" in params):
            if (params["fs__strategy"] != "variance"):
                cache = True
            else:
                pass
        else:
            pass

    if (len(STCK) != 0):
        cache = True
    else:
        pass

    # Pipeline creation
    if (fs is not None):
        pipe.append(("fs", fs))
    else:
        pass

    for stck in np.sort(list(STCK)):
        pipe.append((stck, STCK[stck]))

    pipe.append(("est", est))

    if cache:
        pp = Pipeline(pipe, memory=self.to_path)
    else:
        pp = Pipeline(pipe)

    ##########################################
    # Fitting the Pipeline
    ##########################################

    start_time = time.time()

    # No params : default configuration
    if (params is None):
        set_params = True
        print('No parameters set. Default configuration is tested')
    else:
        try:
            pp = pp.set_params(**params)
            set_params = True
        except Exception:
            set_params = False

    if (set_params):

        if (self.verbose):
            print("")
            print("#####################################################"
                  " testing hyper-parameters... "
                  "#####################################################")
            print("")
            print(">>> NA ENCODER :" + str(ne.get_params()))
            print("")
            print(">>> CA ENCODER :" + str({'strategy': ce.strategy}))

            if (fs is not None):
                print("")
                print(">>> FEATURE SELECTOR :" + str(fs.get_params()))

            for i, stck in enumerate(np.sort(list(STCK))):
                stck_params = STCK[stck].get_params().copy()
                stck_params_display = {
                    k: stck_params[k]
                    for k in stck_params.keys()
                    if k not in ["level_estimator", "verbose", "base_estimators"]
                }
                print("")
                print(">>> STACKING LAYER n°" + str(i + 1) + " :" +
                      str(stck_params_display))

                for j, model in enumerate(stck_params["base_estimators"]):
                    print("")
                    print(" > base_estimator n°" + str(j + 1) + " :" +
                          str(dict(list(model.get_params().items()) +
                                   list(model.get_estimator().get_params().items()))))

            print("")
            print(">>> ESTIMATOR :" +
                  str(dict(list(est.get_params().items()) +
                           list(est.get_estimator().get_params().items()))))
            print("")

        try:
            # Computing the mean cross validation score across the folds
            scores = cross_val_score(estimator=pp,
                                     X=df['train'].drop(indexes_to_drop),
                                     y=df['target'].drop(indexes_to_drop),
                                     scoring=self.scoring,
                                     cv=cv)
            score = np.mean(scores)
        except Exception:
            scores = [-np.inf for _ in range(self.n_folds)]
            score = -np.inf

    else:
        raise ValueError("Pipeline cannot be set with these parameters."
                         " Check the name of your stages.")

    if (score == -np.inf):
        warnings.warn("An error occurred while computing the cross "
                      "validation mean score. Please check that the parameter "
                      "values are correct and that your scoring function is "
                      "valid and appropriate to the task.")

    ##########################################
    # Reporting scores
    ##########################################

    out = " ("

    for i, s in enumerate(scores[:-1]):
        out = out + "fold " + str(i + 1) + " = " + str(s) + ", "

    if (self.verbose):
        print("")
        print("MEAN SCORE : " + str(self.scoring) + " = " + str(score))
        print("VARIANCE : " + str(np.std(scores)) + out + "fold " +
              str(i + 2) + " = " + str(scores[-1]) + ")")
        print("CPU time: %s seconds" % (time.time() - start_time))
        print("")

    return score
import random
from pprint import pprint

from numpy import ravel
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, SCORERS

if __name__ == '__main__':
    data = load_boston()
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    cross = cross_val_score(model, data.data, data.target, cv=10,
                            scoring='neg_mean_squared_error')

    # list of available model evaluation metrics
    k = sorted(SCORERS.keys())
    pprint(k)
rmse = math.sqrt(mse)
print(rmse)

'''summary table for coefficients'''
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression as lr

X2_train = sm.add_constant(X_train)  # add a column of ones alongside the X columns
ols = sm.OLS(y_train.astype(float),
             X2_train.astype(float))  # ordinary least squares = linear regression
lr = ols.fit()
print(lr.summary())

'''Cross validation'''
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression as lr
from sklearn.metrics import SCORERS

SCORERS.keys()

kf = KFold(n_splits=5, shuffle=True, random_state=1)
lr = lr()

'''Cross validation score (R2 for test data, and full data)'''
r2score = cross_val_score(lr1, X_test, y_test, cv=kf, scoring='r2')
print(r2score.mean())
r2score_b = cross_val_score(lr, X, y, cv=kf, scoring='r2')
print(r2score_b.mean())

'''Cross validation score (RMSE for test data, and full data)'''
RMSE = np.sqrt(-cross_val_score(lr1, X_test, y_test, cv=kf,
                                scoring='neg_mean_squared_error'))
print(RMSE.mean())
RMSE_b = np.sqrt(-cross_val_score(lr, X, y, cv=kf,
                                  scoring='neg_mean_squared_error'))
print(RMSE_b.mean())
              ['grid_search', 'random_search', 'bayesian_search']),
    required=True)
@click.option(
    '--cv',
    help="Number of cross validation steps",
    type=int,
    required=False,
    default=5,
    show_default=True,
)
@click.option(
    '-m',
    '--metrics',
    help="Metrics that should be tested during cross validation (comma separated)",
    type=click.Choice(list(SCORERS.keys())),
    required=False,
    multiple=True,
)
@click.option(
    '--randomize',
    help="Randomize sample labels to test the stability and effectiveness of the machine learning algorithm",
    is_flag=True,
    required=False,
)
def classify(
    data: str,
    out: str,
    model: str,
    optimizer: str,