def runWSD2Experiment(train_coll, test_coll, lt, T_settings, labsHierarchy,
                      cv, out_fN, testColl_scoreClassLabels, readSections,
                      param=None, predictionsDir=None, keepSections='default',
                      scoring=None):
    """Runs clf experiments

    Parameters
    ----------
    train_coll: list
    test_coll: list of (wavF, annF, template_annF)
    lt: ML.labelTransformer
    T_settings: list of tuples
    labsHierarchy: list of strings
    cv: cv folds
    out_fN: str
    testColl_scoreClassLabels: class labels to score over the test collection
    readSections: sections read by the WSD2 predictor
    param: parameter value printed together with the scores
    predictionsDir: str
    keepSections: sections kept in the predicted annotations
    scoring: string or sklearn.metrics.scorer
    """
    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun

    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    X, y_names = X0, y0_names  # myML.balanceToClass(X0, y0_names, 'c')  # optionally balance classes
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    with open(out_fN, 'a') as out_file:  # print details about the dataset into status file
        # NOTE: collFi_train (training collection file name) is assumed to be defined
        # in the enclosing module
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            lt.targetNumNomDict(), lt.classes_, Counter(y_names)))

    # NOTE: testFrac (test set fraction) is assumed to be defined in the enclosing module
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testFrac,
                                                        random_state=0)
    with open(out_fN, 'a') as out_file:  # print details to status file
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    # NOTE: estimators and param_grid are assumed to be defined in the enclosing module
    pipe = Pipeline(estimators)
    gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scoring,
                      cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    ## best clf scores
    with open(out_fN, 'a') as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace('\n', ''), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write("{:2.2f}, {:2.2f}, {:.2f}, ".format(
            param, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                  100 * 2 * np.std(cv_acc)))
    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(lt.num2nom(y_true), lt.num2nom(y_pred),
                                               out_fN, labels=lt.classes_)
                                               # strini="\nTest Set", strend="\n")

    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_)  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF, template_annF in test_coll[:]:
        ### print WSD2 scores, see WSD2_experiment.ipynb
        MLvl.printWSD2_scores(wavF, true_annF=annF, template_annF=template_annF,
                              WSD2_clf=clf, WSD2_feExFun=feExFun, lt=lt,
                              scoreClassLabels=testColl_scoreClassLabels,
                              outF=out_fN, strini=", ", strend="", m='auto',
                              # strini="\nTESTCOLL", strend="\n", m='auto',
                              readSectionsWSD2=readSections,  # for WSD2
                              labelsHierarchy=['c'])

        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(param), bN))
            pT.WSD2predictAnnotations(wavF, template_annF, feExFun, lt, clf,
                                      outF=annFile_predict,
                                      readSections=readSections,
                                      keepSections=keepSections)

    with open(out_fN, 'a') as out_file:
        out_file.write("\n")

    return clf
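# Illustrative sketch (hypothetical helper, not part of the original module): the shape of
# the module-level `estimators` and `param_grid` that runWSD2Experiment expects, mirroring
# the PCA + SVC grid of the grid-search fragment below. The step names and value ranges are
# assumptions; the values actually used in the experiments are not shown in this file.
def _example_svc_grid():
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC

    estimators = [('reduce_dim', PCA()), ('clf', SVC())]  # steps for Pipeline(estimators)
    param_grid = [{'reduce_dim__n_components': [6, 12, 24],
                   'clf__C': [0.1, 1.0, 10.0],
                   'clf__gamma': [0.01, 0.1, 1.0],
                   'clf__kernel': ['rbf']}]
    return estimators, param_grid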
# Fragment from inside a larger grid-search experiment function: the enclosing `def`, the
# open `out_file` status-file handle, and the names `X_train`, `X_test`, `labsD`, `trainDat`,
# `estimators`, `pca_range`, `pen_range`, `gamma_range`, `metric`, `cv` and `out_file_scores`
# come from the surrounding scope, which is not shown here.
    out_file.write("\n#TRAIN, shape {}\n".format(np.shape(X_train)))
    out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))
    ## more info
    # out_file.write("#X {}, y {}\n".format(np.shape(X), np.shape(y)))
    out_file.write("#Target dict {}\t{}\n".format(labsD, trainDat.targetFrequencies()))

    ### grid search
    pipe_svc = Pipeline(estimators)
    param_grid = [{'reduce_dim__n_components': pca_range,
                   'clf': [SVC()],
                   'clf__C': pen_range,
                   'clf__gamma': gamma_range,
                   'clf__kernel': ['rbf']}]

    gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                      scoring=metric, cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    with open(out_file_scores, 'a') as out_file:
        out_file.write(MLvl.gridSearchresults(gs))
    print(out_file_scores)
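# A minimal sketch (hypothetical helper and values) of how the search ranges referenced by
# the fragment above could be built; the real pca_range / pen_range / gamma_range are not
# defined in this file.
def _example_search_ranges():
    import numpy as np

    pca_range = [6, 12, 24, 48]        # n_components for the 'reduce_dim' PCA step
    pen_range = np.logspace(-1, 2, 4)  # SVC penalty C: 0.1 ... 100
    gamma_range = np.logspace(-3, 0, 4)  # RBF kernel gamma: 0.001 ... 1
    return pca_range, pen_range, gamma_range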
def runExperiment(
    train_coll,
    test_coll,
    lt,
    T_settings,
    labsHierarchy,
    out_fN,
    cv,
    pipe_estimators,
    gs_grid,
    scoring=None,
    param=None,
    predictionsDir=None,
):
    """Runs clf experiments

    Parameters
    ----------
    train_coll: list
    test_coll: list
    lt: ML.labelTransformer
    T_settings: list of tuples
    labsHierarchy: list of strings
    out_fN: str
    cv: cv folds
    pipe_estimators: list of (name, estimator) steps for the Pipeline
    gs_grid: list of param-grid dicts for GridSearchCV
    scoring: string or sklearn.metrics.scorer
    param: parameter value printed together with the scores
    predictionsDir: str
    """
    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun

    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    X, y_names = (
        X0,
        y0_names,
    )  # myML.balanceToClass(X0, y0_names, 'c')  # optionally balance classes
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    with open(
        out_fN, "a"
    ) as out_file:  # print details about the dataset into status file
        # NOTE: collFi_train (training collection file name) is assumed to be defined
        # in the enclosing module
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            lt.targetNumNomDict(), lt.classes_, Counter(y_names)))

    # NOTE: testFrac (test set fraction) is assumed to be defined in the enclosing module
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=testFrac,
                                                        random_state=0)
    with open(out_fN, "a") as out_file:  # print details to status file
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    pipe = Pipeline(pipe_estimators)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)
    ## best clf scores
    with open(out_fN, "a") as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace("\n", ""), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write("{:2.2f}, {:2.2f}, {:.2f}, ".format(
            param, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                  100 * 2 * np.std(cv_acc)))
    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_)  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num("c")
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, "a") as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(
                100 * f1, 100 * P, 100 * R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(
                predictionsDir, "{}_{}_{}".format(int(f1 * 100), int(param), bN))
            pT.predictSoundSections(wavF, clf, lt, feExFun,
                                    annSections=labsHierarchy,
                                    outF=annFile_predict)

    with open(out_fN, "a") as out_file:
        out_file.write("\n")

    return clf
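# Usage sketch (hypothetical paths, labels and settings; the real collections, label
# transformer and T_settings are built elsewhere in this project):
#
#   train_coll = [("rec01.wav", "rec01.txt"), ("rec02.wav", "rec02.txt")]
#   test_coll = [("rec03.wav", "rec03.txt")]
#   lt = ...                                   # an ML.labelTransformer for the classes of interest
#   T_settings = [...]                         # feature-extraction settings for fex
#   pipe_estimators, gs_grid = _example_svc_grid()
#   clf = runExperiment(train_coll, test_coll, lt, T_settings, ["c"], "scores.txt",
#                       cv=5, pipe_estimators=pipe_estimators, gs_grid=gs_grid,
#                       scoring="f1_macro", param=0, predictionsDir=None)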
def run_experiment_WSD(train_coll, test_coll, test_frac, lt, Tpipe, labsHierarchy,
                       out_fN, cv, clf_pipe, gs_grid, class_balance=None,
                       metric=None, predictionsDir=None):
    """Runs clf experiments

    Parameters
    ----------
    train_coll: list
    test_coll: list
    test_frac: float, fraction of the data held out as test set
    lt: ML.labelTransformer
    Tpipe: feature extraction (transformations) pipeline
    labsHierarchy: list of strings
    out_fN: str
    cv: cv folds
    clf_pipe: list of (name, estimator) steps for the Pipeline
    gs_grid: list of param-grid dicts for GridSearchCV
    class_balance: str, name of the class to balance for
    metric: string or sklearn.metrics.scorer
    predictionsDir: str
    """
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names  # myML.balanceToClass(X0, y0_names, 'c')  # optionally balance classes
    y = lt.nom2num(y_names)
    # labsD = lt.targetNumNomDict()
    ## scores header
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe, param_grid=gs_grid, scoring=scoring,
                      cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(out_fN, 'a') as out_file:  # print details about the dataset into status file
        # out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(", ".join([str(list(y_names).count(item))
                                  for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf scores
    with open(out_fN, 'a') as out_file:
        out_file.write('')  # ", {}".format(str(gs.best_params_).replace('\n', ', '), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))
    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, 'a') as out_file:
        settings_str = Tpipe_settings_and_header(Tpipe)[1]
        out_file.write(", " + settings_str + '\n')
def run_experiment_WSD(
    train_coll,
    test_coll,
    lt,
    Tpipe,
    labsHierarchy,
    out_fN,
    cv,
    clf_pipe,
    gs_grid,
    class_balance=None,
    metric=None,
    predictionsDir=None,
):
    """Runs clf experiments

    Parameters
    ----------
    train_coll: list
    test_coll: list
    lt: ML.labelTransformer
    Tpipe: feature extraction (transformations) pipeline
    labsHierarchy: list of strings
    out_fN: str
    cv: cv folds
    clf_pipe: list of (name, estimator) steps for the Pipeline
    gs_grid: list of param-grid dicts for GridSearchCV
    class_balance: str, name of the class to balance for
    metric: string or sklearn.metrics.scorer
    predictionsDir: str
    """
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = (
        X0,
        y0_names,
    )  # myML.balanceToClass(X0, y0_names, 'c')  # optionally balance classes
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    ## scores header
    # NOTE: testFrac is assumed to be defined in the enclosing module; the previous
    # version of this function takes it as the test_frac parameter instead
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=testFrac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(
        out_fN, "a"
    ) as out_file:  # print details about the dataset into status file
        # out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        ## samples per class
        out_file.write(",".join(
            [str(list(y_names).count(item)) for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf scores
    with open(out_fN, "a") as out_file:
        out_file.write(
            "")  # ", {}".format(str(gs.best_params_).replace('\n', ', '), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))
    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, "a") as out_file:
        settings_str = expT.Tpipe_settings_and_header(Tpipe)[1]
        out_file.write("," + settings_str + "\n")
        ### settings
        # settings_str = Tpipe_settings_and_header(Tpipe)[1]
        # out_file.write(", {}\n".format(settings_str))

    """
    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_)  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, 'a') as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(100*f1, 100*P, 100*R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(f1*100), bN))
            pT.predictSoundSections(wavF, clf, lt, feExFun,
                                    annSections=labsHierarchy,
                                    outF=annFile_predict)
    """
    return clf_best
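# Usage sketch (hypothetical collections and settings; T_settings, the collections and the
# label transformer are built elsewhere in this project, and the module-level testFrac noted
# above must be defined):
#
#   Tpipe = fex.makeTransformationsPipeline(T_settings)
#   clf_pipe, gs_grid = _example_svc_grid()
#   best_clf = run_experiment_WSD(train_coll, test_coll, lt, Tpipe, ["c"],
#                                 "WSD_scores.csv", cv=5, clf_pipe=clf_pipe,
#                                 gs_grid=gs_grid, class_balance="c", metric="f1_macro")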