def wavAnn2annSecs_dataXy_names(wavF, annF, featExtFun=None):
    """Instantiates the annotated sections of a wavfile,
    extracting a feature vector for each annotated section.
    Meant to be used with feature extraction type 'split';
    used for call type classification.

    Parameters
    ----------
    wavF: str
        path to wav file
    annF: str
        path to annotation file
    featExtFun: callable
        feature extraction function

    Returns
    -------
    datO: ML.dataXy_names
        classification features
    """
    ### extract features for each annotated section
    segmentsLi, fs = auf.getAnnWavSec(wavF, annF)
    datO = myML.dataXy_names()
    ## for each annotation in the wavfile compute the features
    for annIndex in range(len(segmentsLi)):
        label = segmentsLi[annIndex]["label"]
        waveform = segmentsLi[annIndex]["waveform"]
        M = featExtFun(waveform)
        datO.addInstances(np.expand_dims(M.flatten(), axis=0), [np.array(label)])
    return datO
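# Usage sketch (illustrative, not from the original source): assuming a
# wav/annotation file pair and a feature-extraction callable `feExFun` that
# maps a waveform to a 2D array (e.g. a spectrogram):
#
# >>> datO = wavAnn2annSecs_dataXy_names("rec.wav", "rec-ann.txt", featExtFun=feExFun)
# >>> datO.X  # one flattened feature vector per annotated section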
def extractFeaturesWDataAugmentation(sampSpace, feExFun, n_instances=10, **ensSettings):
    """Prepares data with the labels in the sample space, balancing the
    classes by generating artificial samples.

    Parameters
    ----------
    sampSpace: dict
        labels and waveforms (sample space)
    feExFun: callable
        feature extraction function
    n_instances: int
        target number of instances per class
    ensSettings: dict
        kwargs for the generation of artificial samples,
        see exT.generateData_ensembleSettings(n_artificial_samples=1)
    """
    datO = myML.dataXy_names()  # data object
    for call in sampSpace.keys():
        ### extract features from original samples
        dat = waveformsLi2DatXy_names(sampSpace[call], call, feExFun,
                                      nInstances=n_instances)
        datO.addInstances(dat.X, dat.y_names)
        n_art_instances = n_instances - dat.m_instances
        ### generate artificial samples
        datArt = waveformsLi2aritificial_DatXy_names(
            sampSpace[call], call, feExFun, n_instances=n_art_instances,
            **ensSettings)
        datO.addInstances(datArt.X, datArt.y_names)
    return datO
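# Usage sketch (illustrative): `sampSpace` maps each call label to a list of
# waveforms; the ensemble settings below are assumed to come from
# exT.generateData_ensembleSettings, as the docstring suggests:
#
# >>> sampSpace = {"c": call_waveforms, "w": whistle_waveforms}  # hypothetical lists
# >>> ensSettings = exT.generateData_ensembleSettings(n_artificial_samples=1)
# >>> datO = extractFeaturesWDataAugmentation(sampSpace, feExFun, n_instances=20,
# ...                                         **ensSettings)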
def wavAnnCollection2datXy(WavAnnCollection, feExFun=None, labelsHierarchy="default"):
    """Extracts features and labels from a wav-ann collection.

    Parameters
    ----------
    WavAnnCollection: list of tuples
        [(<path to wavF>, <path to annF>), ...]
    feExFun: callable
        feature extraction function
    labelsHierarchy: list
        labels in hierarchical order for setting the label of the instances

    Returns
    -------
    datO: myML.dataXy_names
        data object with the features and their labels
    """
    if labelsHierarchy == "default":
        labelsHierarchy = ["c"]

    datO = myML.dataXy_names()  # initialise data object
    for wavF, annF in WavAnnCollection:
        X, y0_names = getXy_fromWavFAnnF(wavF, annF, feExFun, labelsHierarchy)
        datO.addInstances(X, y0_names)
    return datO
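# Usage sketch (illustrative): a collection is a list of (wav, annotation)
# path tuples; with the default hierarchy only the label "c" is used:
#
# >>> coll = [("rec1.wav", "rec1.txt"), ("rec2.wav", "rec2.txt")]  # hypothetical paths
# >>> datO = wavAnnCollection2datXy(coll, feExFun, labelsHierarchy=["c"])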
def waveformsLi2aritificial_DatXy_names(waveformsLi, label, feExFun, n_instances,
                                        **ensemble_settings):
    """Takes a list of waveforms, all with the same label, generates
    artificial samples, extracts their features and returns a data object.

    Parameters
    ----------
    n_instances: int
        total number of artificial samples (instances) to generate
    ensemble_settings: dict
        kwargs for the generation of artificial samples,
        see exT.generateData_ensembleSettings(n_artificial_samples=1)
    """
    n_samps = len(waveformsLi)
    # cycle over the waveforms until the desired number of samples is reached
    indices = np.arange(n_instances) % n_samps
    datO = myML.dataXy_names()  # initialise data object
    for i in indices:
        waveform = waveformsLi[i]
        artificial_waveform = eff.generateWaveformEnsemble(
            waveform, **ensemble_settings)[0]
        art_samp = feExFun(artificial_waveform)
        datO.addInstances(np.expand_dims(art_samp.flatten(), axis=0),
                          [np.array(label)])
    return datO
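# Usage sketch (illustrative): the modulo indexing cycles through the source
# waveforms, so e.g. 3 waveforms and n_instances=5 yields indices [0, 1, 2, 0, 1];
# the ensemble settings below mirror the defaults used elsewhere in this module:
#
# >>> datArt = waveformsLi2aritificial_DatXy_names(call_waveforms, "c", feExFun,
# ...                                              n_instances=5,
# ...                                              effectName="addWhiteNoise",
# ...                                              generate_data_grid=np.ones(1))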
def wavLCollection2datXy(wavLabelCollection, fs=None, featExtFun=None):
    """Returns the data object of a collection of labelled wavs,
    for call type classification.

    Parameters
    ----------
    wavLabelCollection : list of tuples
        tu[0] : path to wav file
        tu[1] : wav label
    fs : int, optional
        sampling rate, passed to wav2waveform
    featExtFun : callable
        feature extraction function

    Returns
    -------
    datO: myML.dataXy_names
        data object
    """
    datO = myML.dataXy_names()  # initialise data object
    for wavF, l in wavLabelCollection:
        waveForm, fs = wav2waveform(wavF, fs=fs)
        M = featExtFun(waveForm)
        datO.addInstances(np.expand_dims(M.flatten(), axis=0), [l])
    return datO
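# Usage sketch (illustrative): each tuple pairs a wav path with its label;
# the paths and the sampling rate are hypothetical:
#
# >>> wavColl = [("call1.wav", "c"), ("noise1.wav", "n")]
# >>> datO = wavLCollection2datXy(wavColl, fs=48000, featExtFun=feExFun)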
def wavAnnCollection2annSecs_dataXy_names(wavAnnColl, featExtFun=None):
    """Computes the X, y data for a collection of annotated wav files,
    with one instance per annotated section.
    Meant to be used with feature extraction type 'split';
    used for call type classification.

    Parameters
    ----------
    wavAnnColl : list of tuples
        collection of annotated wav files
    featExtFun : callable or dict
        feature extraction function, or a dictionary
        with the feature extraction settings

    Returns
    -------
    datO : myML.dataXy_names
        features object
    """
    datO = myML.dataXy_names()
    for wavF, annF in wavAnnColl[:]:
        datO_new = wavAnn2annSecs_dataXy_names(wavF, annF, featExtFun=featExtFun)
        datO.addInstances(datO_new.X, datO_new.y_names)
    return datO
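# Usage sketch (illustrative): aggregates the per-file data objects returned by
# wavAnn2annSecs_dataXy_names over a whole collection:
#
# >>> coll = [("rec1.wav", "rec1.txt"), ("rec2.wav", "rec2.txt")]  # hypothetical
# >>> datO = wavAnnCollection2annSecs_dataXy_names(coll, featExtFun=feExFun)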
def wavFAnnF2sections_wavsEnsemble_datXy_names(wavF, annF, featExtFun=None,
                                               wavPreprocessingT=None,
                                               ensembleSettings=None):
    """Computes the features of each annotated section in the wav file,
    meant to be used with feature extraction type 'split'.

    Parameters
    ----------
    wavF: str
        path to wave file
    annF: str
        path to annotation file
    featExtFun: callable
        feature extraction function
    wavPreprocessingT : callable
        applied before ensemble generation
    ensembleSettings: dict
        instructions for ensemble generation

    Returns
    -------
    datO : myML.dataXy_names
        data object
    """
    ### check feature extraction function
    if not callable(featExtFun):  # dictionary or None (default parameters)
        featExtFun = wavFeatureExtraction(featExtFun).featExtrFun()  # default
    if not callable(wavPreprocessingT):
        wavPreprocessingT = lambda x, y: x
    if ensembleSettings is None:
        ensembleSettings = dict(effectName="addWhiteNoise",
                                generate_data_grid=np.ones(1))

    ### extract features for each annotated section
    segmentsLi, fs = auf.getAnnWavSec(wavF, annF)
    datO = myML.dataXy_names()
    ## for each annotation in the wavfile compute the features
    for annIndex in range(len(segmentsLi)):
        label = segmentsLi[annIndex]["label"]
        waveform = segmentsLi[annIndex]["waveform"]
        ## preprocess waveform
        waveform = wavPreprocessingT(waveform, fs)
        ## generate ensemble of artificial waveforms
        Y = eff.generateWaveformEnsemble(waveform, **ensembleSettings)
        ## extract features for each waveform in the ensemble
        for i in range(len(Y)):
            M = featExtFun(Y[i, :])
            datO.addInstances(np.expand_dims(M.flatten(), axis=0),
                              [np.array(label)])
    return datO
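# Usage sketch (illustrative): with the defaults, each annotated section is
# augmented with white noise before feature extraction; the grid values below
# are an assumption about what generate_data_grid controls:
#
# >>> ensembleSettings = dict(effectName="addWhiteNoise",
# ...                         generate_data_grid=np.ones(1))
# >>> datO = wavFAnnF2sections_wavsEnsemble_datXy_names(
# ...     "rec.wav", "rec.txt", featExtFun=feExFun,
# ...     ensembleSettings=ensembleSettings)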
def test_dataXy_filter():
    # test dataXy data loading
    M = np.random.randint(1, 5, (4, 4))
    labs = np.random.randint(0, 1, (4,))
    datO = myML.dataXy_names(M, labs)
    np.testing.assert_array_equal(M, datO.X)
    # test the None filter of the dataXy_names class:
    # filtering with None should return the data unchanged
    M_NoneFilt, labs_NoneFilt = datO.filterInstances(None)
    np.testing.assert_array_equal(M, M_NoneFilt)
def waveformsLi2DatXy_names(waveformsLi, label, feExFun, nInstances):
    """Extracts features from a list of waveforms, all with the same label,
    and returns a data object with at most nInstances instances."""
    n_samps = len(waveformsLi)
    stopIdx = None
    if n_samps > nInstances:
        stopIdx = nInstances
    datO = myML.dataXy_names()  # initialise data object
    for waveform in waveformsLi[:stopIdx]:
        M = feExFun(waveform)
        datO.addInstances(np.expand_dims(M.flatten(), axis=0), [np.array(label)])
    return datO
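# Usage sketch (illustrative): caps the number of instances taken from the list:
#
# >>> dat = waveformsLi2DatXy_names(call_waveforms, "c", feExFun, nInstances=10)
# >>> dat.m_instances  # min(len(call_waveforms), 10)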
def wavAnnCollection2Xy_ensemble_datXy_names(wavAnnColl, featExtFun,
                                             wavPreprocessingT=None,
                                             ensembleSettings=None):
    datO = myML.dataXy_names()  # initialise data object
    for wavF, annF in wavAnnColl[:]:
        datO_new = wavFAnnF2sections_wavsEnsemble_datXy_names(
            wavF, annF, featExtFun=featExtFun,
            wavPreprocessingT=wavPreprocessingT,
            ensembleSettings=ensembleSettings)
        datO.addInstances(datO_new.X, datO_new.y_names)
    return datO
def get_DataXy_fromWavFannF(wavF, annF, feExFun, labelsHierarchy):
    """Extracts features and their labels (ground truth) from wavF and annF
    files and returns a dataXy_names instance.

    Parameters
    ----------
    wavF: str
    annF: str
    feExFun: callable
    labelsHierarchy: list
    """
    waveForm, fs = wav2waveform(wavF)
    tf = len(waveForm) / fs  # duration of the waveform, in seconds

    M0 = feExFun(waveForm)
    m = len(M0)
    y0_names = auf.annotationsFi2instances(annF, m, tf,
                                           labelsHierarchy=labelsHierarchy)
    datO = myML.dataXy_names(M0, y0_names)
    return datO
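# Usage sketch (illustrative): here the labels come from the annotation file,
# with one instance per feature-matrix row:
#
# >>> datO = get_DataXy_fromWavFannF("rec.wav", "rec.txt", feExFun, ["c"])
# >>> X, y_names = datO.filterInstances(["c", "b"])  # keep only these classes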
clfStr = 'cv{}-'.format(cv)
settingsStr = "{}-{}".format(Tpipe.string, clfStr)
settingsStr += '-labsHierarchy_' + '_'.join(labsHierarchy)

## write in out file
out_file = open(out_fN, 'a')
out_file.write("#WSD1\n###--------- {} ---------###\n".format(
    time.strftime("%Y.%m.%d\t\t%H:%M:%S")))
out_file.write("#" + settingsStr + '\n')
out_file.close()

## load collections
train_coll = fex.readCols(collFi_train, colIndexes=(0, 1))
test_coll = np.genfromtxt(collFi_test, dtype=object)

lt = myML.labelTransformer(clf_labs)


def runWSD2Experiment(train_coll, test_coll, lt, T_settings, labsHierarchy, cv,
                      out_fN, testColl_scoreClassLabels, readSections,
                      param=None, predictionsDir=None, keepSections='default',
                      scoring=None):
out_file.write("\n###--------- {} ---------###\n".format( time.strftime("%Y.%m.%d\t\t%H:%M:%S"))) out_file.write("#" + settingsStr) out_file.close() #### load collection WavAnnCollection = fex.readCols(collFi_train, colIndexes=(0, 1)) print("\ncollection:", len(WavAnnCollection), "\nlast file:", WavAnnCollection[-1]) #### compute features trainDat = fex.wavAnnCollection2datXy( WavAnnCollection, feExFun) #, wavPreprocesingT=wavPreprocessingFun) ## y_names train and test data X, y_names = trainDat.filterInstances(labs) # train lt = myML.labelTransformer(labs) y = lt.nom2num(y_names) labsD = lt.targetNumNomDict() #### train/test split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testFrac, random_state=0) with open(out_file_scores, 'a') as out_file: # print details to status file out_file.write("\n#TRAIN, shape {}\n".format(np.shape(X_train))) out_file.write("#TEST, shape {}\n".format(np.shape(X_test))) ## more info #out_file.write("#X {}, y {}\n".format( np.shape(X), np.shape(y)))
def run_experiment_WSD(train_coll, test_coll, test_frac, lt, Tpipe,
                       labsHierarchy, out_fN, cv, clf_pipe, gs_grid,
                       class_balance=None, metric=None, predictionsDir=None):
    """Runs clf experiments

    Parameters
    ----------
    train_coll: list
    test_coll: list
    test_frac: float
        fraction of the data used as test set
    lt: ML.labelTransformer
    Tpipe: feature extraction pipeline
    labsHierarchy: list of strings
    out_fN: str
    cv: cv folds
    clf_pipe: list of estimators for the pipeline
    gs_grid: list
    class_balance: str
        name of the class to balance for
    metric: string or sklearn.metrics scorer
    predictionsDir: str
    """
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf labels
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)
    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe, param_grid=gs_grid, scoring=scoring,
                      cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(out_fN, 'a') as out_file:  # print dataset details to status file
        ## samples per class
        out_file.write(", ".join([str(list(y_names).count(item))
                                  for item in lt.classes_]))
        ## sizes of the train/test sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    clf_best = gs.best_estimator_
    ## clf scores over the test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))
    ## print P, R and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, 'a') as out_file:
        settings_str = Tpipe_settings_and_header(Tpipe)[1]
        out_file.write(", " + settings_str + '\n')
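# Usage sketch (illustrative): a hypothetical SVC grid search; `clf_pipe` and
# `gs_grid` follow the sklearn Pipeline/GridSearchCV conventions this function
# relies on, and the collections/label transformer are assumed loaded as above:
#
# >>> from sklearn.svm import SVC
# >>> clf_pipe = [("clf", SVC())]
# >>> gs_grid = [{"clf__C": [1, 10, 100], "clf__gamma": [0.01, 0.1]}]
# >>> run_experiment_WSD(train_coll, test_coll, 0.25, lt, Tpipe, ["c"],
# ...                    "scores.txt", cv=5, clf_pipe=clf_pipe,
# ...                    gs_grid=gs_grid, metric="accuracy")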
# In[18]:

# path to files
train_collection = os.path.join(pDir, 'data/groupB_paths2files.csv')

## load data
df = pd.read_csv(train_collection, usecols=['path_to_file', 'call'])
wavColl = df.values


# ## Extract features

# In[7]:

datO = myML.dataXy_names()
datO_new = fex.wavLCollection2datXy(wavColl, featExtFun=feExFun, fs=fs)
datO.addInstances(datO_new.X, datO_new.y_names)

## label transformer
call_labels = [l[1] for l in wavColl]
lt = myML.labelTransformer(call_labels)

X = datO.X
y_names = datO.y_names
y = lt.nom2num(y_names)


# In[8]:
def run_experiment_WSD(
    train_coll,
    test_coll,
    test_frac,
    lt,
    Tpipe,
    labsHierarchy,
    out_fN,
    cv,
    clf_pipe,
    gs_grid,
    class_balance=None,
    metric=None,
    predictionsDir=None,
):
    """Runs clf experiments

    Parameters
    ----------
    train_coll: list
    test_coll: list
    test_frac: float
        fraction of the data used as test set
    lt: ML.labelTransformer
    Tpipe: feature extraction pipeline
    labsHierarchy: list of strings
    out_fN: str
    cv: cv folds
    clf_pipe: list of estimators for the pipeline
    gs_grid: list
    class_balance: str
        name of the class to balance for
    metric: string or sklearn.metrics scorer
    predictionsDir: str
    """
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf labels
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)
    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe, param_grid=gs_grid, scoring=scoring,
                      cv=cv, n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(out_fN, "a") as out_file:  # print dataset details to status file
        ## samples per class
        out_file.write(",".join(
            [str(list(y_names).count(item)) for item in lt.classes_]))
        ## sizes of the train/test sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    clf_best = gs.best_estimator_
    ## clf scores over the test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))
    ## print P, R and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, "a") as out_file:
        settings_str = expT.Tpipe_settings_and_header(Tpipe)[1]
        out_file.write("," + settings_str + "\n")

    # disabled scoring over the TEST collection, kept for reference
    """
    #### TEST collection
    ### train classifier with the whole dataset
    clf = skb.clone(gs.best_estimator_)  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, 'a') as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(
                100 * f1, 100 * P, 100 * R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(f1 * 100), bN))
            pT.predictSoundSections(wavF, clf, lt, feExFun,
                                    annSections=labsHierarchy,
                                    outF=annFile_predict)
    """
    return clf_best
pDir = os.path.dirname(os.path.abspath(__file__))
inF = os.path.join(pDir, "data/annotations_301117.csv")
df0 = pd.read_csv(inF)


# ## Feature extraction settings
#
# Create a pipeline for the feature extraction settings
#
# **y** settings

# In[26]:

classes = ["noise"] + clf_id_str.split("-")
lt = myML.labelTransformer(classes)
df = df0[df0["type"].isin(classes)]


# **X** settings

# In[12]:

T_settings = []

#### preprocessing
## band pass filter
filt = "band_pass_filter"
filtDi = {"fs": sr, "lowcut": 10000, "highcut": 100000, "order": 5}
# T_settings.append(('bandFilter', (filt, filtDi)))

## normalisation