def getScoresFromWav(wavF, annF, feExFun, clf, lt, labelsHierarchy): """ extracts features and its labels (ground truth) from wavF and annF files and compares the clf predictions with the ground truth Parameters ---------- wavF: str annF: str feExFun: callable clf : classifier le : label encoder (object) labelsHierarchy: list """ A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labelsHierarchy, filter_classes=lt.classes_) a = lt.nom2num(a_names) return clfScoresO(clf, A, a)
def runExperiment( train_coll, test_coll, lt, T_settings, labsHierarchy, out_fN, cv, pipe_estimators, gs_grid, scoring=None, param=None, predictionsDir=None, ): """Runs clf experiments Parameters ---------- train_coll: list test_coll: list lt: ML.labelTransformer T_settings: list of tuples labelsHierachy: list of strings cv: cv folds pipe_estimators: list for pipline gs_grid: list out_fN: str returnClfs: dict, Flase => clfs are not stored predictionsDir: str scoring: string or sklearn.metrics.scorer """ Tpipe = fex.makeTransformationsPipeline(T_settings) feExFun = Tpipe.fun #### prepare DATA: collections --> X y ## compute features dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy) ## prepare X y data X0, y0_names = dataO.filterInstances(lt.classes_) # filter for clf_labs X, y_names = ( X0, y0_names, ) # myML.balanceToClass(X0, y0_names, 'c') # balance classes X0, y0_names# y = lt.nom2num(y_names) labsD = lt.targetNumNomDict() with open( out_fN, "a" ) as out_file: # print details about the dataset into status file out_file.write("# {} ({})\n".format(collFi_train, len(train_coll))) out_file.write("#label_transformer {} {}\t data {}\n".format( lt.targetNumNomDict(), lt.classes_, Counter(y_names))) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testFrac, random_state=0) with open(out_fN, "a") as out_file: # print details to status file out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train))) out_file.write("#TEST, shape {}\n".format(np.shape(X_test))) #### CLF pipe = Pipeline(pipe_estimators) gs = GridSearchCV(estimator=pipe, param_grid=gs_grid, scoring=scoring, cv=cv, n_jobs=-1) gs = gs.fit(X_train, y_train) ## best clf scores with open(out_fN, "a") as out_file: out_file.write("#CLF\t{}\tbest score {:.3f}\n".format( str(gs.best_params_).replace("\n", ""), gs.best_score_)) clf_best = gs.best_estimator_ ## clf scores over test set with open(out_fN, "a") as out_file: ### cv score cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring) out_file.write("{:2.2f}, {:2.2f}, {:.2f}, ".format( param, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc))) ### cv accuracy cv_acc = cross_val_score(clf_best, X_test, y_test) out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc), 100 * 2 * np.std(cv_acc))) ## print R, P an f1 for each class y_true, y_pred = y_test, clf_best.predict(X_test) MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN) #### TEST collection ### train classifier with whole dataset clf = skb.clone( gs.best_estimator_ ) # clone to create a new classifier with the same parameters clf.fit(X, y) ### print scores callIx = lt.nom2num("c") for wavF, annF in test_coll[:]: A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy, filter_classes=lt.classes_) a_true = lt.nom2num(a_names) a_pred = clf.predict(A) P = mt.precision_score(a_true, a_pred, average=None)[callIx] R = mt.recall_score(a_true, a_pred, average=None)[callIx] f1 = mt.f1_score(a_true, a_pred, average=None)[callIx] with open(out_fN, "a") as out_file: out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format( 100 * f1, 100 * P, 100 * R)) if predictionsDir: bN = os.path.basename(annF) annFile_predict = os.path.join( predictionsDir, "{}_{}_{}".format(int(f1 * 100), int(param), bN)) pT.predictSoundSections(wavF, clf, lt, feExFun, annSections=labsHierarchy, outF=annFile_predict) with open(out_fN, "a") as out_file: out_file.write("\n") return clf