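# Module header reconstructed for this excerpt: os, sys, pprint and numpy are
# certain from the usage below; the remaining helpers (get_msgr, extract_classes,
# extract_training, save2npy, get_indexes, get_rules, substitute, load_decompositions,
# DECMP, export_results, get_colors, Vector, Module, overwrite) are assumed to be
# imported from the addon's companion modules and from grass.pygrass / grass.script,
# whose exact import lines are not shown here.
import os
import sys
from pprint import pprint

import numpy as np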
def main(opt, flg):
    # import functions that depend on sklearn only after the parser has run
    from ml_functions import (
        balance,
        explorer_clsfiers,
        run_classifier,
        optimize_training,
        explore_SVC,
        plot_grid,
    )
    from features import importances, tocsv

    msgr = get_msgr()
    indexes = None
    vect = opt["vector"]
    vtraining = opt["vtraining"] if opt["vtraining"] else None
    scaler, decmp = None, None
    vlayer = opt["vlayer"] if opt["vlayer"] else vect + "_stats"
    tlayer = opt["tlayer"] if opt["tlayer"] else vect + "_training"
    rlayer = opt["rlayer"] if opt["rlayer"] else vect + "_results"

    labels = extract_classes(vtraining, 1)
    pprint(labels)

    if opt["scalar"]:
        scapar = opt["scalar"].split(",")
        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler(with_mean="with_mean" in scapar,
                                with_std="with_std" in scapar)

    if opt["decomposition"]:
        dec, params = (opt["decomposition"].split("|", 1)
                       if "|" in opt["decomposition"]
                       else (opt["decomposition"], ""))
        kwargs = ({k: v for k, v in (p.split("=") for p in params.split(","))}
                  if params else {})
        load_decompositions()
        decmp = DECMP[dec](**kwargs)

    # if requested, extract the training set from the training vector map
    if vtraining and flg["e"]:
        msgr.message("Extract training from: <%s> to <%s>." % (vtraining, vect))
        extract_training(vect, vtraining, tlayer)
        flg["n"] = True

    if flg["n"]:
        msgr.message("Save arrays to npy files.")
        save2npy(
            vect,
            vlayer,
            tlayer,
            fcats=opt["npy_cats"],
            fcols=opt["npy_cols"],
            fdata=opt["npy_data"],
            findx=opt["npy_index"],
            fclss=opt["npy_tclasses"],
            ftdata=opt["npy_tdata"],
        )

    # define the classifiers to use/test
    if opt["pyclassifiers"] and opt["pyvar"]:
        # import the user-defined classifiers module from file
        import importlib.util

        spec = importlib.util.spec_from_file_location("mycls",
                                                      opt["pyclassifiers"])
        mycls = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mycls)
        classifiers = getattr(mycls, opt["pyvar"])
    else:
        from ml_classifiers import CLASSIFIERS

        classifiers = CLASSIFIERS

    # append the SVC classifier
    if opt["svc_c"] and opt["svc_gamma"]:
        from sklearn.svm import SVC

        svc = {
            "name": "SVC",
            "classifier": SVC,
            "kwargs": {
                "C": float(opt["svc_c"]),
                "gamma": float(opt["svc_gamma"]),
                "kernel": opt["svc_kernel"],
            },
        }
        classifiers.append(svc)

    # extract classifiers from pyindx
    if opt["pyindx"]:
        indexes = list(get_indexes(opt["pyindx"]))
        classifiers = [classifiers[i] for i in indexes]

    num = int(opt["n_training"]) if opt["n_training"] else None

    # load from npy files
    Xt = np.load(opt["npy_tdata"])
    Yt = np.load(opt["npy_tclasses"])
    cols = np.load(opt["npy_cols"])

    # define rules to substitute NaN, Inf, posInf and negInf values
    rules = {}
    for key in ("nan", "inf", "neginf", "posinf"):
        if opt[key]:
            rules[key] = get_rules(opt[key])
    pprint(rules)

    # substitute (skip the cat column)
    Xt, rules_vals = substitute(Xt, rules, cols[1:])
    Xtoriginal = Xt

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xt, Yt)
        Xt = scaler.transform(Xt)

    # decompose the data
    if decmp:
        msgr.message("Decomposing the training data set.")
        decmp.fit(Xt)
        Xt = decmp.transform(Xt)

    # feature importances with forests of trees
    if flg["f"]:
        np.save("training_transformed.npy", Xt)
        importances(
            Xt,
            Yt,
            cols[1:],
            csv=opt["imp_csv"],
            img=opt["imp_fig"],
            # default parameters to save the matplotlib figure
            dpi=300,
            transparent=False,
            bbox_inches="tight",
        )

    # optimize the training set
    if flg["o"]:
        ind_optimize = (int(opt["pyindx_optimize"])
                        if opt["pyindx_optimize"] else 0)
        cls = classifiers[ind_optimize]
        msgr.message("Find the optimum training set.")
        best, Xbt, Ybt = optimize_training(
            cls,
            Xt,
            Yt,
            labels,  # {v: k for k, v in labels.items()},
            scaler,
            decmp,
            num=num,
            maxiterations=1000,
        )
        msg = " - save the optimum training data set to: %s."
        msgr.message(msg % opt["npy_btdata"])
        np.save(opt["npy_btdata"], Xbt)
        msg = " - save the optimum training classes set to: %s."
        msgr.message(msg % opt["npy_btclasses"])
        np.save(opt["npy_btclasses"], Ybt)

    # balance the data
    if flg["b"]:
        msg = "Balancing the training data set, each class has <%d> samples."
        msgr.message(msg % num)
        Xbt, Ybt = balance(Xt, Yt, num)
    elif not flg["o"]:
        Xbt = (np.load(opt["npy_btdata"])
               if os.path.isfile(opt["npy_btdata"]) else Xt)
        Ybt = (np.load(opt["npy_btclasses"])
               if os.path.isfile(opt["npy_btclasses"]) else Yt)

    # scale the data
    if scaler:
        msgr.message("Scaling the training data set.")
        scaler.fit(Xbt, Ybt)
        Xt = scaler.transform(Xt)
        Xbt = scaler.transform(Xbt)

    if flg["d"]:
        C_range = [float(c) for c in opt["svc_c_range"].split(",") if c]
        gamma_range = [float(g) for g in opt["svc_gamma_range"].split(",") if g]
        kernel_range = [str(s) for s in opt["svc_kernel_range"].split(",") if s]
        poly_range = [int(i) for i in opt["svc_poly_range"].split(",") if i]
        allkwargs = dict(C=C_range, gamma=gamma_range,
                         kernel=kernel_range, degree=poly_range)
        # keep only the parameter ranges that were actually given
        kwargs = {k: v for k, v in allkwargs.items() if v}
        msgr.message("Exploring the SVC domain.")
        grid = explore_SVC(Xbt, Ybt, n_folds=5,
                           n_jobs=int(opt["svc_n_jobs"]), **kwargs)
        import pickle

        krnlstr = "_".join(s for s in opt["svc_kernel_range"].split(",") if s)
        # pickle requires a binary file handle
        with open("grid%s.pkl" % krnlstr, "wb") as pkl:
            pickle.dump(grid, pkl)
        # pkl = open('grid.pkl', 'rb')
        # grid = pickle.load(pkl)
        # pkl.close()
        plot_grid(grid, save=opt["svc_img"])

    # test the accuracy of different classifiers
    if flg["t"]:
        msgr.message("Exploring different classifiers.")
        msgr.message("cls_id cls_name mean max min std")
        res = explorer_clsfiers(
            classifiers,
            Xt,
            Yt,
            labels=labels,
            indexes=indexes,
            n_folds=5,
            bv=flg["v"],
            extra=flg["x"],
        )
        # TODO: sort(order=...) works only in the terminal, why?
        # res.sort(order='mean')
        with open(opt["csv_test_cls"], "w") as csvfile:
            csvfile.write(tocsv(res))

    if flg["c"]:
        # classify
        data = np.load(opt["npy_data"])
        indx = np.load(opt["npy_index"])

        # substitute using column values
        data, dummy = substitute(data, rules, cols[1:])
        Xt = data[indx]

        if scaler:
            msgr.message("Scaling the training data set.")
            scaler.fit(Xt, Yt)
            Xt = scaler.transform(Xt)
            msgr.message("Scaling the whole data set.")
            data = scaler.transform(data)
        if decmp:
            msgr.message("Decomposing the training data set.")
            decmp.fit(Xt)
            Xt = decmp.transform(Xt)
            msgr.message("Decomposing the whole data set.")
            data = decmp.transform(data)
        cats = np.load(opt["npy_cats"])

        np.save("data_filled_scaled.npy", data)
        tcols = []
        for cls in classifiers:
            # write the classification report to file, or to stdout if no
            # report file was given
            report = (open(opt["report_class"], "w")
                      if opt["report_class"] else sys.stdout)
            run_classifier(cls, Xt, Yt, Xt, Yt, labels, data, report=report)
            tcols.append((cls["name"], "INTEGER"))

        import pickle

        # pickle requires a binary file handle
        with open("classification_results.pkl", "wb") as res:
            pickle.dump(classifiers, res)
        # classifiers = pickle.load(res)
        msgr.message("Export the results to layer: <%s>" % str(rlayer))
        export_results(
            vect,
            classifiers,
            cats,
            rlayer,
            vtraining,
            tcols,
            overwrite(),
            pkl="res.pkl",
            append=flg["a"],
        )

    if flg["r"]:
        rules = ("\n".join(["%d %s" % (k, v)
                            for k, v in get_colors(vtraining).items()])
                 if vtraining else None)
        msgr.message("Export the layer with results to raster")
        with Vector(vect, mode="r") as vct:
            tab = vct.dblinks.by_name(rlayer).table()
            rasters = [c for c in tab.columns]
            rasters.remove(tab.key)

        v2rst = Module("v.to.rast")
        rclrs = Module("r.colors")
        for rst in rasters:
            v2rst(
                input=vect,
                layer=rlayer,
                type="area",
                use="attr",
                attrcolumn=rst,
                output=opt["rst_names"] % rst,
                memory=1000,
                overwrite=overwrite(),
            )
            if rules:
                rclrs(map=rst, rules="-", stdin_=rules)
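# Entry-point sketch, assuming the conventional GRASS GIS module pattern in
# which parser() (e.g. grass.script.parser) returns an (options, flags) pair;
# the real module may wire this up differently.
if __name__ == "__main__":
    main(*parser())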