def featureExtractionInstructions2Xy(wavAnnColl, lt, TpipeSettings, labelSet=None, **kwargs):
    """Turn an annotated-wav collection into a numeric (X, y) pair.

    Parameters
    ----------
    wavAnnColl :
        collection handed unchanged to
        fex.wavAnnCollection2Xy_ensemble_datXy_names
    lt : label transformer
        its nom2num method converts the filtered labels into the returned
        targets
    TpipeSettings :
        settings handed to fex.makeTransformationsPipeline; the `fun`
        attribute of the resulting pipeline is the feature extraction
        callable
    labelSet : list, optional
        subset of labels to consider, passed to filterInstances
    **kwargs :
        forwarded to fex.wavAnnCollection2Xy_ensemble_datXy_names, eg.
        dict(wavPreprocessingT=None, ensembleSettings=ensembleSettings)

    Returns
    -------
    X_train, y_train :
        the filtered instances and their labels after lt.nom2num
    """
    extract_features = fex.makeTransformationsPipeline(TpipeSettings).fun
    ensemble_data = fex.wavAnnCollection2Xy_ensemble_datXy_names(
        wavAnnColl, extract_features, **kwargs
    )
    instances, label_names = ensemble_data.filterInstances(labelSet)
    return instances, lt.nom2num(label_names)
def runWSD2Experiment(train_coll, test_coll, lt, T_settings, labsHierarchy, cv,
                      out_fN, testColl_scoreClassLabels, readSections,
                      param=None, predictionsDir=None, keepSections='default',
                      scoring=None):
    """Run a grid-searched clf experiment and score it with WSD2 over test files.

    The function appends everything it reports to `out_fN`: dataset details,
    the train/test shapes, the best grid-search parameters, one row with the
    experiment `param` followed by cross-validation scores, the per class
    precision/recall/f1 and the WSD2 scores of every test file.

    NOTE(review): `collFi_train`, `testFrac`, `estimators` and `param_grid`
    are neither parameters nor locals; they are looked up in the enclosing
    module when the function runs and must be defined there.

    Parameters
    ----------
    train_coll: list
        collection handed to fex.wavAnnCollection2datXy
    test_coll: list of (wavF, annF, template_annF)
    lt: ML.labelTransformer
    T_settings: list of tuples
        settings for fex.makeTransformationsPipeline
    labsHierarchy: list of strings
    cv: cv argument of GridSearchCV
    out_fN: str
        status/scores file, opened in append mode
    testColl_scoreClassLabels:
        passed as scoreClassLabels to MLvl.printWSD2_scores
    readSections:
        passed to MLvl.printWSD2_scores and pT.WSD2predictAnnotations
    param: number or None
        value of the experiment parameter, only used for printing and for
        naming the prediction files. None is written as "None".
    predictionsDir: str
        when truthy, predicted annotations are written into this directory
    keepSections:
        passed to pT.WSD2predictAnnotations
    scoring: string or sklearn.metrics.scorer

    Returns
    -------
    clf: clone of the best estimator, fitted with the whole (X, y) dataset
    """
    # The default param=None cannot be formatted with "{:2.2f}" nor converted
    # with int(); fall back to the text "None" so the experiment does not
    # crash after the grid search has already been computed.
    param_str = "None" if param is None else "{:2.2f}".format(param)
    param_tag = "None" if param is None else int(param)

    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun

    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data, keeping only the clf labels
    X, y_names = dataO.filterInstances(lt.classes_)
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()

    with open(out_fN, 'a') as out_file:  # details about the dataset
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            labsD, lt.classes_, Counter(y_names)))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testFrac, random_state=0)

    with open(out_fN, 'a') as out_file:  # details about the split
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    pipe = Pipeline(estimators)
    gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scoring,
                      cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    ## best clf scores
    with open(out_fN, 'a') as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace('\n', ''), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write("{}, {:2.2f}, {:.2f}, ".format(
            param_str, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(
            100 * np.mean(cv_acc), 100 * 2 * np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(
        lt.num2nom(y_true), lt.num2nom(y_pred), out_fN, labels=lt.classes_)

    #### TEST collection
    ### train classifier with whole dataset
    # clone to create a new classifier with the same parameters
    clf = skb.clone(gs.best_estimator_)
    clf.fit(X, y)

    ### print scores
    # NOTE(review): callIx is not used below; the lookup is kept untouched
    callIx = lt.nom2num('c')
    for wavF, annF, template_annF in test_coll[:]:
        ### print WSD2 scores, see WSD2_experiment.ipynb
        MLvl.printWSD2_scores(
            wavF, true_annF=annF, template_annF=template_annF,
            WSD2_clf=clf, WSD2_feExFun=feExFun, lt=lt,
            scoreClassLabels=testColl_scoreClassLabels,
            outF=out_fN, strini=", ", strend="", m='auto',
            readSectionsWSD2=readSections,  # for WSD2
            labelsHierarchy=['c'])

        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(
                predictionsDir, "{}_{}".format(param_tag, bN))
            pT.WSD2predictAnnotations(
                wavF, template_annF, feExFun, lt, clf,
                outF=annFile_predict, readSections=readSections,
                keepSections=keepSections)

    with open(out_fN, 'a') as out_file:  # close the scores row
        out_file.write("\n")

    return clf
###################  ASSIGNMENTS  ####################
##### OUTPUT FILES
# Create the output directory. An already existing directory is fine; any
# other failure (eg. missing permissions) is raised instead of being hidden.
try:
    os.makedirs(oDir)
except OSError:
    if not os.path.isdir(oDir):
        raise
out_fN = os.path.join(oDir, "scores.txt")

if savePredictions:
    predictionsDir = os.path.join(oDir, "predictions")
    try:
        os.makedirs(predictionsDir)
    except OSError:
        if not os.path.isdir(predictionsDir):
            raise

Tpipe = fex.makeTransformationsPipeline(T_settings)

## clf settings
clfStr = 'cv{}-'.format(cv)
settingsStr = "{}-{}".format(Tpipe.string, clfStr)
settingsStr += '-labsHierarchy_' + '_'.join(labsHierarchy)

## write the experiment header into the out file
# the context manager closes the file even when a write fails
with open(out_fN, 'a') as out_file:
    out_file.write("#WSD1\n###--------- {} ---------###\n".format(
        time.strftime("%Y.%m.%d\t\t%H:%M:%S")))
    out_file.write("#" + settingsStr + '\n')

## load collections
train_coll = fex.readCols(collFi_train, colIndexes=(0, 1))
def runCallClfExperiment(wavColl, lt, T_settings, out_fN, testFrac, cv,
                         pipe_estimators, gs_grid, filterClfClasses,
                         scoring=None, param=None):
    """
    . . . . .
    USE callClfExperiment instead
    to be deprecated!!!
    . . . . .

    Grid-search a call classifier in the old way, using T_settings instead of
    Tpipe, and append one row of scores to out_fN.

    The row holds: param, mean and 2*std of the cross-validation score over
    the train split (as percentages), followed by accuracy, macro precision,
    macro recall and macro f1 over the test split (as percentages).

    Parameters
    ----------
    wavColl:
        collection handed to fex.wavLCollection2datXy
    lt: ML.labelTransformer
    T_settings: list of tuples
        settings for fex.makeTransformationsPipeline
    out_fN: str
        scores file, opened in append mode
    testFrac:
        test_size of the train/test split
    cv: cv folds of the grid search
    pipe_estimators: list
        for pipeline
    gs_grid: list
        param_grid of the grid search
    filterClfClasses: list
        can use lt.classes_
    scoring: string or sklearn.metrics.scorer
    param: float
        value of the param in experiment, for printing

    Returns
    -------
    the result of fitting the best estimator with the whole (X, y) dataset
    """
    transf_pipeline = fex.makeTransformationsPipeline(T_settings)
    extract_features = transf_pipeline.fun
    sampling_rate = transf_pipeline.Audio_features.fs

    #### prepare DATA: collection --> X y
    dataset = fex.wavLCollection2datXy(
        wavColl, fs=sampling_rate, featExtFun=extract_features
    )
    X, label_names = dataset.filterInstances(filterClfClasses)
    y = lt.nom2num(label_names)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testFrac, random_state=0
    )

    #### CLF
    grid_search = GridSearchCV(
        estimator=Pipeline(pipe_estimators),
        param_grid=gs_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    grid_search = grid_search.fit(X_train, y_train)
    best_clf = grid_search.best_estimator_
    test_predictions = best_clf.predict(X_test)

    ## append the scores row
    with open(out_fN, 'a') as scores_file:
        ### cv score over the train split
        cv_scores = cross_val_score(best_clf, X_train, y_train, scoring=scoring)
        cv_part = "{:}, {:2.2f}, {:.2f}, ".format(
            param, 100 * np.mean(cv_scores), 100 * 2 * np.std(cv_scores)
        )
        scores_file.write(cv_part)

        ### macro averaged scores for the call classes, over the test split
        precision, recall, f1_score, _ = mt.precision_recall_fscore_support(
            y_test, test_predictions, average='macro'
        )
        accuracy = mt.accuracy_score(y_test, test_predictions)
        test_part = "{:.2f}, {:.2f}, {:.2f}, {:.2f}\n".format(
            accuracy * 100, precision * 100, recall * 100, f1_score * 100
        )
        scores_file.write(test_part)

    return best_clf.fit(X, y)
# Classifier settings come from the `svc_l` entry of clf_pool; `fex` is needed
# below to build the string that names the output directory.
from pylotwhale.MLwhales.clf_pool import svc_l as clf_settings
import pylotwhale.MLwhales.featureExtraction as fex

cv = 6  # presumably the number of cv folds -- TODO confirm against its users

## clf settings: a one-step pipeline plus the grid of its parameters
pipe_estimators = [('clf', clf_settings.fun)]
paramsDi = dict(clf_settings.grid_params_di)
gs_grid = [paramsDi]

#### Classes
callSet = ['126i', '130', '127', '129', '128i', '131i', '093ii']

##### FILES
# the output directory is named after the transformations and the clf
settings_str = fex.makeTransformationsPipeline(T_settings).string + clf_settings.clf_name
filesDi = {
    ## INPUT -> collection file
    'train': '/home/flo/x1-flo/whales/MLwhales/callClassification/data/collections/Vocal-repertoire-catalogue-Pilot-whales-Norway-callsL10.txt',
    ## OUTPUT -> DIR
    'outDir': '/home/flo/x1-flo/whales/MLwhales/callClassification/data/experiments/vocaRepNPW-clf/{}'.format(settings_str),
}
def run_iter_clf_experiment(param_grid, paramKey, paramDict, clf_settings,
                            feExParamsDict, wavAnnColl_te, lt,
                            updateTestSet=True, scores_file=None,
                            accum_file=None):
    """
    Run one clf experiment per value of param_grid.

    For every value, paramDict[paramKey] is overwritten in place, a new
    clf_experimentO is created from clf_settings and feExParamsDict, and its
    scores/predictions over the test collection are reported.

    Parameters:
    ----------
    param_grid : array_like
        Experiment parameters. Often repeated according to n_experiments.
        The first element is read before the loop starts.
    paramKey : str
        key of paramDict whose value is replaced by each experiment
        parameter, eg. a feature name (ceps), NFFT or the instructions for
        the ensemble generation.
    paramDict : dict
        dictionary where the experiment param is defined, modified in place.
        Presumably it is referenced from feExParamsDict so that the change
        reaches the feature extraction -- verify against the caller.
    clf_settings : dictionary
        clf settings, first argument of clf_experimentO
    feExParamsDict : dictionary
        Instructions for the extraction of features and ensemble generation,
        expanded as keyword arguments of clf_experimentO. Its
        "TpipeSettings" entry is used to build the feature extraction
        callable for the test set.
    wavAnnColl_te : list,
        test collection
    lt : label transformer
    updateTestSet : bool
        True if feature extraction changes over the experiment, the test set
        is then recomputed for every parameter.
        False, no need to update the feature representation of the test set.
    scores_file : str
        output file for saving clf scores, skipped when None
    accum_file : str
        output file for saving predictions, skipped when None

    Returns
    -------
    True
    """

    def _current_feExFun():
        # feature extraction callable for the settings now in feExParamsDict
        return fex.makeTransformationsPipeline(feExParamsDict["TpipeSettings"]).fun

    ### TEST DATA, computed with the first parameter of the grid
    paramDict[paramKey] = param_grid[0]
    feExFun = _current_feExFun()
    XyDict_test = fex.wavAnnCollection2datXyDict(wavAnnColl_te, feExFun)
    X_test, test_label_names = fex.XyDict2XyO(XyDict_test).filterInstances(lt.classes_)
    y_test = lt.nom2num(test_label_names)

    scoresDict = None
    for param in param_grid:
        paramDict[paramKey] = param
        print(paramKey, paramDict[paramKey], paramDict)
        clfExp = clf_experimentO(clf_settings, **feExParamsDict)

        if updateTestSet:
            # the feature extraction instructions changed, so the test set
            # is recomputed in both of its representations
            feExFun = _current_feExFun()
            XyDict_test = updateParamTestSet(
                wavAnnColl_te, lt, featExtFun=feExFun, output_type='dict'
            )
            X_test, y_test = updateParamTestSet(
                wavAnnColl_te, lt, featExtFun=feExFun, output_type='Xy'
            )

        if scores_file is not None:
            clfExp.print_scores(scores_file, X_test, y_test, param)

        if XyDict_test is not None:
            scoresDict = clfExp.accumPredictions(
                XyDict_test, param, predictionsDict=scoresDict
            )

    # predictions accumulated over the whole grid, printed by the last clfExp
    if accum_file is not None:
        clfExp.print_predictions(accum_file, scoresDict, lt)

    return True
def runExperiment(
    train_coll,
    test_coll,
    lt,
    T_settings,
    labsHierarchy,
    out_fN,
    cv,
    pipe_estimators,
    gs_grid,
    scoring=None,
    param=None,
    predictionsDir=None,
):
    """Run a grid-searched clf experiment and score it over a test collection.

    The function appends everything it reports to `out_fN`: dataset details,
    the train/test shapes, the best grid-search parameters, one row with the
    experiment `param` followed by cross-validation scores, the per class
    precision/recall/f1 and, for every test file, the f1, precision and
    recall of the class 'c'.

    NOTE(review): `collFi_train` and `testFrac` are neither parameters nor
    locals; they are looked up in the enclosing module when the function runs
    and must be defined there.

    Parameters
    ----------
    train_coll: list
        collection handed to fex.wavAnnCollection2datXy
    test_coll: list
        of (wavF, annF) pairs
    lt: ML.labelTransformer
    T_settings: list of tuples
        settings for fex.makeTransformationsPipeline
    labsHierarchy: list of strings
    out_fN: str
        status/scores file, opened in append mode
    cv: cv folds
    pipe_estimators: list
        for pipeline
    gs_grid: list
        param_grid of the grid search
    scoring: string or sklearn.metrics.scorer
    param: number or None
        value of the experiment parameter, only used for printing and for
        naming the prediction files. None is written as "None".
    predictionsDir: str
        when truthy, predicted annotations are written into this directory

    Returns
    -------
    clf: clone of the best estimator, fitted with the whole (X, y) dataset
    """
    # The default param=None cannot be formatted with "{:2.2f}" nor converted
    # with int(); fall back to the text "None" so the experiment does not
    # crash after the grid search has already been computed.
    param_str = "None" if param is None else "{:2.2f}".format(param)
    param_tag = "None" if param is None else int(param)

    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun

    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data, keeping only the clf labels
    X, y_names = dataO.filterInstances(lt.classes_)
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()

    with open(out_fN, "a") as out_file:  # details about the dataset
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            labsD, lt.classes_, Counter(y_names)))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testFrac, random_state=0)

    with open(out_fN, "a") as out_file:  # details about the split
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    pipe = Pipeline(pipe_estimators)
    gs = GridSearchCV(estimator=pipe, param_grid=gs_grid, scoring=scoring,
                      cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    ## best clf scores
    with open(out_fN, "a") as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace("\n", ""), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write("{}, {:2.2f}, {:.2f}, ".format(
            param_str, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(
            100 * np.mean(cv_acc), 100 * 2 * np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    #### TEST collection
    ### train classifier with whole dataset
    # clone to create a new classifier with the same parameters
    clf = skb.clone(gs.best_estimator_)
    clf.fit(X, y)

    ### print scores
    # position of the class 'c' in the per-class score arrays
    callIx = lt.nom2num("c")
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(
            wavF, annF, feExFun, labsHierarchy, filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]

        with open(out_fN, "a") as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(
                100 * f1, 100 * P, 100 * R))

        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(
                predictionsDir,
                "{}_{}_{}".format(int(f1 * 100), param_tag, bN))
            pT.predictSoundSections(
                wavF, clf, lt, feExFun, annSections=labsHierarchy,
                outF=annFile_predict)

    with open(out_fN, "a") as out_file:  # close the scores row
        out_file.write("\n")

    return clf
# Classifier settings come from the `svc_rbf` entry of clf_pool;
# makeTransformationsPipeline is needed below to build the string that names
# the output directory.
from pylotwhale.MLwhales.clf_pool import svc_rbf as clf_settings
from pylotwhale.MLwhales.featureExtraction import makeTransformationsPipeline

metric = 'f1_macro'  # alternative: 'accuracy'
cv = 5  # presumably the number of cv folds -- TODO confirm against its users

## clf settings: a one-step pipeline plus the grid of its parameters
pipe_estimators = [('clf', clf_settings.fun)]
paramsDi = dict(clf_settings.grid_params_di)
gs_grid = [paramsDi]

#### Classes
# optional subset of call classes, currently disabled
# callSet = ['126i', '130', '127', '129', '128i', '131i', '093ii']

##### FILES
# the output directory is named after the transformations, the clf and the metric
settings_str = (
    makeTransformationsPipeline(T_settings).string
    + clf_settings.clf_name
    + '-'
    + metric
)
filesDi = {
    ## INPUT -> collection file
    # alternative: '/home/florencia/whales/data/orchive/flo/data/wavLabelColl-call-catalog-xsilence.txt'
    'train': '/home/florencia/whales/MLwhales/callClassification/data/collections/Vocal-repertoire-catalogue-Pilot-whales-Norway-callsL10.txt',
    ## OUTPUT -> DIR
    # alternatives:
    #   '/home/florencia/whales/data/orchive/flo/data/experiments/{}'
    #   '/home/florencia/whales/MLwhales/callClassification/data/experiments/fullRep-cutWavs/{}'
    'outDir': '/home/florencia/whales/MLwhales/callClassification/data/experiments/fullRep-cutWavs/f1_macro_n_mels_Nslices_NFFT1024/{}'.format(settings_str),
}