def _check_output_directory(path):
    """Abort via ``gs.fatal`` when the directory meant to hold *path* is missing.

    A bare file name (no directory component) is resolved against the current
    working directory instead of being rejected outright.
    """
    if not os.path.exists(os.path.dirname(os.path.abspath(path))):
        gs.fatal("Directory for output file {} does not exist".format(path))


def main():
    """Train a scikit-learn estimator on raster training data and save it.

    Reads its configuration from the module-level ``options`` and ``flags``
    mappings (presumably filled by the GRASS parser -- confirm against the
    script entry point). The steps are:

    1. validate the scikit-learn / pandas installation and the options,
    2. build the estimator and the hyperparameter grid,
    3. load or extract the training data (optionally saving it),
    4. wrap the estimator in a preprocessing pipeline and/or grid search,
    5. fit, optionally cross-validate and compute permutation importances,
    6. dump ``(estimator, y, class_labels)`` to ``options["save_model"]``.

    Fatal problems are reported through ``gs.fatal``; nothing is returned.
    """
    import re

    try:
        import sklearn
    except ImportError:
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    # Compare versions numerically: a plain string comparison sorts "0.9"
    # after "0.20" and is therefore unreliable.
    version_match = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
    sklearn_version = (
        tuple(int(part) for part in version_match.groups())
        if version_match
        else (0, 0)
    )
    if sklearn_version < (0, 20):
        gs.fatal("Package python3-scikit-learn 0.20 or newer is not installed")

    try:
        import pandas as pd
    except ImportError:
        gs.fatal("Package python3-pandas 0.25 or newer is not installed")

    # parser options ----------------------------------------------------------
    group = options["group"]
    training_map = options["training_map"]
    training_points = options["training_points"]
    field = options["field"]
    model_save = options["save_model"]
    model_name = options["model_name"]
    hyperparams = {
        "penalty": options["penalty"],
        "alpha": options["alpha"],
        "l1_ratio": options["l1_ratio"],
        "C": options["c"],
        "epsilon": options["epsilon"],
        "min_samples_leaf": options["min_samples_leaf"],
        "n_estimators": options["n_estimators"],
        "learning_rate": options["learning_rate"],
        "subsample": options["subsample"],
        "max_depth": options["max_depth"],
        "max_features": options["max_features"],
        "n_neighbors": options["n_neighbors"],
        "weights": options["weights"],
        "hidden_layer_sizes": options["hidden_units"],
    }
    cv = int(options["cv"])
    group_raster = options["group_raster"]
    importances = flags["f"]
    preds_file = options["preds_file"]
    classif_file = options["classif_file"]
    fimp_file = options["fimp_file"]
    param_file = options["param_file"]
    norm_data = flags["s"]
    random_state = int(options["random_state"])
    load_training = options["load_training"]
    save_training = options["save_training"]
    n_jobs = int(options["n_jobs"])
    balance = flags["b"]
    category_maps = option_to_list(options["category_maps"])

    # define estimator --------------------------------------------------------
    hyperparams, param_grid = process_param_grid(hyperparams)
    estimator, mode = predefined_estimators(
        model_name, random_state, n_jobs, hyperparams
    )

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # checks of input options -------------------------------------------------
    if (
        mode == "classification"
        and balance is True
        and model_name not in check_class_weights()
    ):
        gs.warning(model_name + " does not support class weights")
        balance = False

    if mode == "regression" and balance is True:
        gs.warning("Balancing of class weights is only possible for classification")
        balance = False

    if classif_file:
        if cv <= 1:
            gs.fatal(
                "Output of cross-validation global accuracy requires "
                "cross-validation cv > 1"
            )
        _check_output_directory(classif_file)

    # permutation importances need sklearn.inspection (scikit-learn >= 0.22)
    if importances and sklearn_version < (0, 22):
        gs.fatal(
            "Feature importances calculation requires scikit-learn version >= 0.22"
        )

    if fimp_file:
        if importances is False:
            gs.fatal('Output of feature importance requires the "f" flag to be set')
        _check_output_directory(fimp_file)

    # predictions file selected but no cross-validation scheme used
    if preds_file:
        if cv <= 1:
            gs.fatal(
                "Output of cross-validation predictions requires "
                "cross-validation cv > 1"
            )
        _check_output_directory(preds_file)

    # define RasterStack ------------------------------------------------------
    stack = RasterStack(group=group)

    if category_maps is not None:
        stack.categorical = category_maps

    # extract training data ---------------------------------------------------
    if load_training != "":
        X, y, cat, class_labels, group_id = load_training_data(load_training)

        if class_labels is not None:
            # collapse the per-sample labels into a {response: label} lookup
            pairs = pd.DataFrame({"response": y, "labels": class_labels})
            pairs = pairs.drop_duplicates().values
            class_labels = {k: v for (k, v) in pairs}

    else:
        gs.message("Extracting training data")

        # the group raster is sampled as an extra (last) predictor column
        if group_raster != "":
            stack.append(group_raster)

        if training_map != "":
            X, y, cat = stack.extract_pixels(training_map)
            y = y.flatten()

            with RasterRow(training_map) as src:
                class_labels = {v: k for (k, v, m) in src.cats}

                # an empty label means the map categories carry no usable names
                if "" in class_labels.values():
                    class_labels = None

        elif training_points != "":
            X, y, cat = stack.extract_points(training_points, field)
            y = y.flatten()

            # ``np.object`` no longer exists in NumPy >= 1.24, so only the
            # canonical ``np.object_`` is tested here.
            if y.dtype == np.object_:
                from sklearn.preprocessing import LabelEncoder

                le = LabelEncoder()
                y = le.fit_transform(y)
                class_labels = {k: v for (k, v) in enumerate(le.classes_)}
            else:
                class_labels = None

        # take group id from last column and remove from predictors
        if group_raster != "":
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal(
                "No training pixels or pixels in imagery group "
                "...check computational region"
            )

        from sklearn.utils import shuffle

        if group_id is None:
            X, y, cat = shuffle(X, y, cat, random_state=random_state)
        else:
            X, y, cat, group_id = shuffle(
                X, y, cat, group_id, random_state=random_state
            )

        if save_training != "":
            save_training_data(
                save_training, X, y, cat, class_labels, group_id, stack.names
            )

    # cross validation settings -----------------------------------------------
    from sklearn.model_selection import GridSearchCV, StratifiedKFold, GroupKFold, KFold

    # NOTE: ``random_state`` is deliberately not passed to (Stratified)KFold.
    # Without ``shuffle=True`` it never had any effect and newer scikit-learn
    # releases raise an error for that combination.

    # inner resampling method (cv=2)
    if param_grid:
        if group_id is None and mode == "classification":
            inner = StratifiedKFold(n_splits=2)
        elif group_id is None and mode == "regression":
            inner = KFold(n_splits=2)
        else:
            inner = GroupKFold(n_splits=2)
    else:
        inner = None

    # outer resampling method (cv=cv)
    if cv > 1:
        if group_id is None and mode == "classification":
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == "regression":
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # modify estimators that take sample_weights ------------------------------
    if balance is True:
        from sklearn.utils import compute_class_weight

        # passing ``y`` itself as ``classes`` yields one weight per sample
        class_weights = compute_class_weight(class_weight="balanced", classes=(y), y=y)
        fit_params = {"sample_weight": class_weights}
    else:
        class_weights = None
        fit_params = {}

    # preprocessing -----------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder

    # standardization
    if norm_data is True and category_maps is None:
        scaler = StandardScaler()
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[("scaling", scaler, np.arange(0, stack.count))],
        )

    # one-hot encoding
    elif norm_data is False and category_maps is not None:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough", transformers=[("onehot", enc, stack.categorical)]
        )

    # standardization and one-hot encoding
    elif norm_data is True and category_maps is not None:
        scaler = StandardScaler()
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
        trans = ColumnTransformer(
            remainder="passthrough",
            transformers=[
                ("onehot", enc, stack.categorical),
                (
                    "scaling",
                    scaler,
                    # scale every column that is not categorical
                    np.setxor1d(range(stack.count), stack.categorical).astype("int"),
                ),
            ],
        )

    # combine transformers
    if norm_data is True or category_maps is not None:
        estimator = Pipeline([("preprocessing", trans), ("estimator", estimator)])
        param_grid = wrap_named_step(param_grid)
        fit_params = wrap_named_step(fit_params)

    if param_grid:
        estimator = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            scoring=search_scorer,
            n_jobs=n_jobs,
            cv=inner,
        )

    # estimator training ------------------------------------------------------
    gs.message(os.linesep)
    gs.message(("Fitting model using " + model_name))

    # ``groups`` is only understood by the grid search (its inner CV is a
    # GroupKFold whenever group ids exist); plain estimators and pipelines
    # would reject it. It is needed regardless of the balancing flag.
    fit_kwargs = dict(fit_params)
    if param_grid and group_id is not None:
        fit_kwargs["groups"] = group_id

    estimator.fit(X, y, **fit_kwargs)

    # message best hyperparameter setup and optionally save using pandas
    if param_grid:
        gs.message(os.linesep)
        gs.message("Best parameters:")

        optimal_pars = [
            (k.replace("estimator__", "").replace("selection__", "") + " = " + str(v))
            for (k, v) in estimator.best_params_.items()
        ]

        for par in optimal_pars:
            gs.message(par)

        if param_file != "":
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # cross-validation --------------------------------------------------------
    if cv > 1:
        from sklearn.metrics import classification_report
        from sklearn import metrics

        # Count samples per class directly; np.histogram with the unique
        # values as bin edges merges the last two classes into one bin.
        if (
            mode == "classification"
            and cv > np.unique(y, return_counts=True)[1].min()
        ):
            gs.message(os.linesep)
            gs.fatal(
                "Number of cv folds is greater than number of "
                "samples in some classes"
            )

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        if (
            mode == "classification"
            and len(np.unique(y)) == 2
            and all([0, 1] == np.unique(y))
        ):
            scoring["roc_auc"] = metrics.roc_auc_score

        from sklearn.model_selection import cross_val_predict

        # ``fit_kwargs`` also carries the group ids for a nested grid search;
        # array-like fit parameters are subset to each training fold.
        preds = cross_val_predict(
            estimator,
            X,
            y,
            groups=group_id,
            cv=outer,
            n_jobs=n_jobs,
            fit_params=fit_kwargs,
        )

        # Label every sample with the fold in which it was predicted. The
        # predictions come back in sample order, so the fold number has to be
        # written at the test indices rather than stacked fold after fold.
        n_fold = np.zeros(y.shape[0])
        for fold, (_, test) in enumerate(outer.split(X, y, group_id)):
            n_fold[test] = fold

        preds = {"y_pred": preds, "y_true": y, "cat": cat, "fold": n_fold}
        preds = pd.DataFrame(data=preds, columns=["y_pred", "y_true", "cat", "fold"])

        gs.message(os.linesep)
        gs.message("Global cross validation scores...")
        gs.message(os.linesep)
        gs.message("Metric \t Mean \t Error")

        for name, func in scoring.items():
            # score each fold once, then summarise
            fold_scores = preds.groupby("fold").apply(
                lambda x: func(x["y_true"], x["y_pred"])
            )
            score_mean = fold_scores.mean()
            score_std = fold_scores.std()

            gs.message(
                name + "\t" + str(score_mean.round(3)) + "\t" + str(score_std.round(3))
            )

        if mode == "classification":
            gs.message(os.linesep)
            gs.message("Cross validation class performance measures......:")

            report_str = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=False,
            )

            report = classification_report(
                y_true=preds["y_true"],
                y_pred=preds["y_pred"],
                sample_weight=class_weights,
                output_dict=True,
            )
            report = pd.DataFrame(report)

            gs.message(report_str)

            if classif_file != "":
                report.to_csv(classif_file, mode="w", index=True)

        # write cross-validation predictions to csv file
        if preds_file != "":
            preds.to_csv(preds_file, mode="w", index=False)

            # companion ".csvt" file describing the column types
            with open(preds_file + "t", "w") as text_file:
                text_file.write('"Real", "Real", "integer", "integer"')

    # feature importances -----------------------------------------------------
    if importances is True:
        from sklearn.inspection import permutation_importance

        fimp = permutation_importance(
            estimator,
            X,
            y,
            scoring=search_scorer,
            n_repeats=5,
            n_jobs=n_jobs,
            random_state=random_state,
        )

        # strip the "@mapset" suffix from the raster names
        feature_names = [name.split("@")[0] for name in stack.names]

        fimp = pd.DataFrame(
            {
                "feature": feature_names,
                "importance": fimp["importances_mean"],
                "std": fimp["importances_std"],
            }
        )

        gs.message(os.linesep)
        gs.message("Feature importances")
        gs.message("Feature" + "\t" + "Score")

        for index, row in fimp.iterrows():
            gs.message(
                row["feature"] + "\t" + str(row["importance"]) + "\t" + str(row["std"])
            )

        if fimp_file != "":
            fimp.to_csv(fimp_file, index=False)

    # save the fitted model
    import joblib

    joblib.dump((estimator, y, class_labels), model_save)
from raster import RasterStack

# Scratch / exploratory calls against the RasterStack API. The statements are
# order dependent: ``stack`` is rebound several times and later calls act on
# the most recent binding.
# NOTE(review): ``maplist`` and ``deepcopy`` are not defined in this snippet;
# presumably they come from an interactive session or an earlier part of the
# file -- confirm before running this as a script.

# build a stack from an explicit list of raster names
stack = RasterStack(rasters=[
    "lsat5_1987_10", "lsat5_1987_20", "lsat5_1987_30", "lsat5_1987_40",
    "lsat5_1987_50", "lsat5_1987_70"
])

# build a stack from ``maplist`` and access one raster as an attribute
stack = RasterStack(rasters=maplist)
stack.lsat5_1987_10

# same again, with everything from "@" onwards stripped from each name
maplist2 = deepcopy(maplist)
maplist2 = [i.split('@')[0] for i in maplist2]

stack = RasterStack(rasters=maplist2)
stack.lsat5_1987_10

# sample the stack at vector points / labelled pixels, as arrays or with
# ``as_df=True``
# NOTE(review): the first call uses ``fields=`` while the next uses
# ``field=`` -- verify which keyword extract_points actually accepts.
X, y, crd = stack.extract_points(vect_name='landclass96_roi', fields=['value', 'cat'])
df = stack.extract_points(vect_name='landclass96_roi', field='value', as_df=True)
df = stack.extract_pixels(response='landclass96_roi', as_df=True)
X, y, crd = stack.extract_pixels(response='landclass96_roi')

# quick inspection helpers
stack.head()
stack.tail()

# read the stack and look at the shape of the result
data = stack.read()
data.shape

# convert the stack with to_pandas()
df = stack.to_pandas()
# df = stack.to_pandas(res=500)
def main():
    """Train, tune and cross-validate a scikit-learn model on raster data.

    Reads its configuration from the module-level ``options`` and ``flags``
    mappings (presumably filled by the GRASS parser -- confirm against the
    script entry point). The steps are:

    1. validate the scikit-learn / pandas installation and the options,
    2. convert the hyperparameter strings and build the tuning grid,
    3. load or extract the training data (optionally saving it),
    4. wrap the estimator in a preprocessing pipeline and/or grid search,
    5. fit and, unless tuning only, cross-validate and report the scores,
    6. dump ``(X, y, sample_coords, group_id, estimator)`` to
       ``options['save_model']``.

    Fatal problems are reported through ``gs.fatal``; nothing is returned.
    """
    import re

    try:
        import sklearn
    except ImportError:
        gs.fatal("Scikit learn 0.20 or newer is not installed")

    # Compare versions numerically: a plain string comparison sorts "0.9"
    # after "0.20" and is therefore unreliable.
    version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    sklearn_version = (
        tuple(int(part) for part in version_match.groups())
        if version_match else (0, 0))
    if sklearn_version < (0, 20):
        gs.fatal("Scikit learn 0.20 or newer is required")

    try:
        import pandas as pd
    except ImportError:
        gs.fatal("Pandas is not installed ")

    # -------------------------------------------------------------------------
    # Parser options
    # -------------------------------------------------------------------------

    # required gui section
    group = options['group']
    training_map = options['training_map']
    training_points = options['training_points']
    field = options['field']
    model_save = options['save_model']

    # estimator gui section
    model_name = options['model_name']
    grid_search = options['grid_search']
    hyperparams = {
        'C': options['c'],
        'min_samples_split': options['min_samples_split'],
        'min_samples_leaf': options['min_samples_leaf'],
        'n_estimators': options['n_estimators'],
        'learning_rate': options['learning_rate'],
        'subsample': options['subsample'],
        'max_depth': options['max_depth'],
        'max_features': options['max_features'],
        'max_degree': options['max_degree'],
        'n_neighbors': options['n_neighbors'],
        'weights': options['weights'],
    }

    # cross validation
    cv = int(options['cv'])
    group_raster = options['group_raster']
    tune_only = flags['t']
    importances = flags['f']
    n_permutations = int(options['n_permutations'])
    errors_file = options['errors_file']
    preds_file = options['preds_file']
    fimp_file = options['fimp_file']
    param_file = options['param_file']

    # general options
    norm_data = flags['s']
    category_maps = option_to_list(options['category_maps'])
    random_state = int(options['random_state'])
    load_training = options['load_training']
    save_training = options['save_training']
    n_jobs = int(options['n_jobs'])
    balance = flags['b']

    # -------------------------------------------------------------------------
    # Make dicts for hyperparameters, datatypes and parameters for tuning
    # -------------------------------------------------------------------------
    hyperparams_type = dict.fromkeys(hyperparams, int)
    hyperparams_type['C'] = float
    hyperparams_type['learning_rate'] = float
    hyperparams_type['subsample'] = float
    hyperparams_type['weights'] = str

    # Comma separated option values become a tuning grid; the first value is
    # used as the default. Single values are simply converted to their type.
    param_grid = {}
    for key, val in hyperparams.items():
        convert = hyperparams_type[key]
        if ',' in val:
            candidates = [convert(i) for i in val.split(',')]
            param_grid[key] = candidates
            hyperparams[key] = candidates[0]
        else:
            hyperparams[key] = convert(val)

    if hyperparams['max_depth'] == 0:
        hyperparams['max_depth'] = None
    if hyperparams['max_features'] == 0:
        # NOTE(review): 'auto' is no longer accepted by recent scikit-learn
        # releases -- confirm the supported version range.
        hyperparams['max_features'] = 'auto'

    # retrieve sklearn estimator object and parameters
    estimator, mode = model_classifiers(model_name, random_state, n_jobs,
                                        hyperparams, balance)

    # remove dict keys that are incompatible for the selected estimator
    estimator_params = estimator.get_params()
    param_grid = {
        key: value
        for key, value in param_grid.items() if key in estimator_params
    }
    scoring, search_scorer = scoring_metrics(mode)

    # -------------------------------------------------------------------------
    # Error checking of input options
    # -------------------------------------------------------------------------

    # feature importances selected but no cross-validation scheme used
    if importances is True and cv == 1:
        gs.fatal('Feature importances require cross-validation cv > 1')

    # check for field attribute if training_points are used
    if training_points != '' and field == '':
        gs.fatal('No attribute column specified for training points')

    # check that cv > 1 if hyperparameter tuning is selected
    if param_grid and cv == 1 and grid_search == 'cross-validation':
        gs.fatal(
            'Hyperparameter search using cross validation requires cv > 1')

    # check the cross-validation occurs if feature importances is True
    if importances is True and tune_only is True:
        gs.fatal('Permutation feature importances require cross validation')

    # -------------------------------------------------------------------------
    # Define RasterStack
    # -------------------------------------------------------------------------

    # fetch individual raster names from group; splitlines() keeps the last
    # name even when the output has no trailing line separator
    maplist = gs.read_command("i.group", group=group,
                              flags="g").strip().splitlines()

    # create RasterStack
    stack = RasterStack(rasters=maplist)

    if category_maps is not None:
        stack.categorical = category_maps

    # -------------------------------------------------------------------------
    # Extract training data
    # -------------------------------------------------------------------------

    # Sample training data and group id
    if load_training != '':
        X, y, group_id, sample_coords = load_training_data(load_training)

    else:
        gs.message('Extracting training data')

        # append spatial clumps or group raster to the predictors
        if group_raster != '':
            stack.append(group_raster)

        # extract training data
        if training_map != '':
            X, y, sample_coords = stack.extract_pixels(training_map)
        elif training_points != '':
            X, y, sample_coords = stack.extract_points(training_points, field)

        y = y.flatten()  # reshape to 1 dimension

        # take group id from last column and remove from predictors
        if group_raster != '':
            group_id = X[:, -1]
            X = np.delete(X, -1, axis=1)
            stack.drop(group_raster)
        else:
            group_id = None

        # check for labelled pixels and training data
        if y.shape[0] == 0 or X.shape[0] == 0:
            gs.fatal('No training pixels or pixels in imagery group '
                     '...check computational region')

        # shuffle data
        from sklearn.utils import shuffle

        if group_id is None:
            X, y, sample_coords = shuffle(X, y, sample_coords,
                                          random_state=random_state)
        else:
            X, y, sample_coords, group_id = shuffle(
                X, y, sample_coords, group_id, random_state=random_state)

        # optionally save extracted data to .csv file
        if save_training != '':
            save_training_data(X, y, group_id, sample_coords, save_training)

    # ---------------------------------------------------------------------
    # Define the inner search resampling method
    # ---------------------------------------------------------------------
    from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                         GroupKFold, KFold, ShuffleSplit,
                                         GroupShuffleSplit)

    # NOTE: ``random_state`` is deliberately not passed to (Stratified)KFold.
    # Without ``shuffle=True`` it never had any effect and newer scikit-learn
    # releases raise an error for that combination.

    # define inner resampling using cross-validation method
    if param_grid and grid_search == 'cross-validation':
        if group_id is None and mode == 'classification':
            inner = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == 'regression':
            inner = KFold(n_splits=cv)
        else:
            inner = GroupKFold(n_splits=cv)

    # define inner resampling using the holdout method
    elif param_grid and grid_search == 'holdout':
        if group_id is None:
            inner = ShuffleSplit(n_splits=1, test_size=0.33,
                                 random_state=random_state)
        else:
            inner = GroupShuffleSplit(n_splits=1, test_size=0.33,
                                      random_state=random_state)
    else:
        inner = None

    # ---------------------------------------------------------------------
    # Define the outer search resampling method
    # ---------------------------------------------------------------------
    if cv > 1:
        if group_id is None and mode == 'classification':
            outer = StratifiedKFold(n_splits=cv)
        elif group_id is None and mode == 'regression':
            outer = KFold(n_splits=cv)
        else:
            outer = GroupKFold(n_splits=cv)

    # ---------------------------------------------------------------------
    # Define sample weights for estimators that require weights in fit method
    # ---------------------------------------------------------------------
    if balance is True and mode == 'classification' and model_name in (
            'GradientBoostingClassifier', 'GaussianNB'):
        from sklearn.utils import compute_class_weight

        # passing ``y`` itself as ``classes`` yields one weight per sample
        class_weights = compute_class_weight(class_weight='balanced',
                                             classes=(y), y=y)
    else:
        class_weights = None

    # ---------------------------------------------------------------------
    # Define the preprocessing pipeline
    # ---------------------------------------------------------------------
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer

    # The transformer list is collected first so that one-hot encoding also
    # works when standardization is switched off.
    transformers = []

    # standardization of every non-categorical column
    if norm_data is True:
        from sklearn.preprocessing import StandardScaler

        if category_maps is None:
            numeric_columns = np.arange(stack.count)
        else:
            numeric_columns = np.setxor1d(
                range(stack.count), stack.categorical).astype('int')
        transformers.append(('scaling', StandardScaler(), numeric_columns))

    # onehot encoding
    if category_maps is not None:
        from sklearn.preprocessing import OneHotEncoder

        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
        transformers.append(('onehot', enc, stack.categorical))

    # combine transformers
    uses_pipeline = bool(transformers)
    if uses_pipeline:
        trans = ColumnTransformer(remainder='passthrough',
                                  transformers=transformers)
        estimator = Pipeline([('preprocessing', trans),
                              ('estimator', estimator)])

    # ---------------------------------------------------------------------
    # Create the hyperparameter grid search method
    # ---------------------------------------------------------------------
    if param_grid:

        # if Pipeline then change param_grid keys to named_step; a new dict
        # is built because keys cannot be replaced while iterating over them
        if uses_pipeline:
            param_grid = {
                'estimator__' + key: value
                for key, value in param_grid.items()
            }

        # create grid search method
        estimator = GridSearchCV(estimator=estimator, param_grid=param_grid,
                                 scoring=search_scorer, n_jobs=n_jobs,
                                 cv=inner)

    # ---------------------------------------------------------------------
    # Estimator training
    # ---------------------------------------------------------------------
    gs.message(os.linesep)
    gs.message(('Fitting model using ' + model_name))

    # Sample weights are passed whenever they were computed. The key needs
    # the step prefix if a pipeline is used, even when that pipeline is
    # itself wrapped in a grid search.
    if class_weights is not None:
        if uses_pipeline:
            fit_params = {'estimator__sample_weight': class_weights}
        else:
            fit_params = {'sample_weight': class_weights}
    else:
        fit_params = {}

    if isinstance(inner, (GroupKFold, GroupShuffleSplit)):
        estimator.fit(X, y, groups=group_id, **fit_params)
    else:
        estimator.fit(X, y, **fit_params)

    # message best hyperparameter setup and optionally save using pandas
    if param_grid:
        gs.message(os.linesep)
        gs.message('Best parameters:')
        gs.message(str(estimator.best_params_))
        if param_file != '':
            param_df = pd.DataFrame(estimator.cv_results_)
            param_df.to_csv(param_file)

    # ---------------------------------------------------------------------
    # Cross-validation
    # ---------------------------------------------------------------------
    if cv > 1 and tune_only is not True:
        # Count samples per class directly; np.histogram with the unique
        # values as bin edges merges the last two classes into one bin.
        if mode == 'classification' and \
                cv > np.unique(y, return_counts=True)[1].min():
            gs.message(os.linesep)
            gs.fatal('Number of cv folds is greater than number of ' +
                     'samples in some classes')

        gs.message(os.linesep)
        gs.message("Cross validation global performance measures......:")

        # add auc and mcc as scorer if classification is binary
        if mode == 'classification' and \
                len(np.unique(y)) == 2 and all([0, 1] == np.unique(y)):
            scoring.append('roc_auc')
            scoring.append('matthews_corrcoef')

        # perform the cross-validatation
        scores, cscores, fimp, models, preds = cross_val_scores(
            estimator, X, y, group_id, class_weights, outer, scoring,
            importances, n_permutations, random_state, n_jobs)

        preds = np.hstack((preds, sample_coords))

        for method, val in scores.items():
            gs.message(method + ":\t%0.3f\t+/-SD\t%0.3f" %
                       (val.mean(), val.std()))

        # individual class scores
        if mode == 'classification' and len(np.unique(y)) != 2:
            gs.message(os.linesep)
            gs.message('Cross validation class performance measures......:')
            gs.message('Class \t' + '\t'.join(map(str, np.unique(y))))

            for method, val in cscores.items():
                # assumes ``val`` holds one row of per-class scores per fold
                # -- TODO confirm against cross_val_scores. A plain 2-D array
                # gives one tab separated value per class, matching the
                # "Class" header row.
                class_scores = np.atleast_2d(np.asarray(val))
                gs.message(method + ':\t' + '\t'.join(
                    map(str, np.round(class_scores.mean(axis=0), 2))))
                gs.message(method + ' std:\t' + '\t'.join(
                    map(str, np.round(class_scores.std(axis=0), 2))))

        # write cross-validation results for csv file
        if errors_file != '':
            errors = pd.DataFrame(scores)
            errors.to_csv(errors_file, mode='w')

        # write cross-validation predictions to csv file
        if preds_file != '':
            preds = pd.DataFrame(preds)
            preds.columns = ['y_true', 'y_pred', 'fold', 'x', 'y']
            preds.to_csv(preds_file, mode='w')

            # companion ".csvt" file describing the column types (the first
            # entry is for the index column written by to_csv)
            with open(preds_file + 't', "w") as text_file:
                text_file.write(
                    '"Integer","Real","Real","integer","Real","Real"')

        # feature importances
        if importances is True:
            gs.message(os.linesep)
            gs.message("Feature importances")
            gs.message("id" + "\t" + "Raster" + "\t" + "Importance")

            # mean of cross-validation feature importances
            mean_importances = fimp.mean(axis=0)
            for i, importance in enumerate(mean_importances):
                gs.message(
                    str(i) + "\t" + maplist[i] + "\t" +
                    str(round(importance, 4)))

            if fimp_file != '':
                np.savetxt(fname=fimp_file, X=fimp, delimiter=',',
                           header=','.join(maplist), comments='')

    # Save the fitted model; joblib is a standalone package and the copy
    # bundled as sklearn.externals.joblib is gone from newer scikit-learn.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    joblib.dump((X, y, sample_coords, group_id, estimator), model_save)