Example #1
def wavAnn2annSecs_dataXy_names(wavF, annF, featExtFun=None):
    """
    Instantiates the annotated sections of a wav file,
    extracting one feature vector per annotated section;
    meant to be used with the 'split' feature extraction mode.

    Used for call type classification.

    Parameters
    ----------
    wavF: str
        path to the wav file
    annF: str
        path to the annotation file
    featExtFun:  callable
        feature extraction function

    Returns
    -------
    datO: ML.dataXy_names
        classification features
    """

    ### extract features for each annotated section
    segmentsLi, fs = auf.getAnnWavSec(wavF, annF)

    datO = myML.dataXy_names()
    ## for each annotation in the wavfile compute the features
    for annIndex in range(len(segmentsLi)):
        label = segmentsLi[annIndex]["label"]
        waveform = segmentsLi[annIndex]["waveform"]
        M = featExtFun(waveform)
        datO.addInstances(np.expand_dims(M.flatten(), axis=0),
                          [np.array(label)])

    return datO
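A minimal usage sketch for the function above; the file paths and the FFT-magnitude extractor are placeholders introduced here for illustration, not part of the source module:

def toy_feExFun(waveform):
    # placeholder feature extractor: magnitude spectrum of the section
    return np.abs(np.fft.rfft(waveform))

datO = wavAnn2annSecs_dataXy_names("calls/recording.wav",
                                   "calls/recording.txt",
                                   featExtFun=toy_feExFun)
print(datO.X.shape, datO.y_names)  # one row / one label per annotated section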
Example #2
def extractFeaturesWDataAugmentation(sampSpace,
                                     feExFun,
                                     n_instances=10,
                                     **ensSettings):
    """Prepares data with the labels in wavAnnCollection, 
    balancing the classes generating artificial samples
    Parameter
    ---------
    sampSpace: dict
        labels and waveforms (samples space)
    feExfun: callable
    n_instances: int
    ensemble_settings: dict
        kwards for the generation of artificial samples
        see exT.generateData_ensembleSettings(n_artificial_samples=1)"""

    datO = myML.dataXy_names()  # data object
    for call in sampSpace.keys():
        ### extract features from original samples
        dat = waveformsLi2DatXy_names(sampSpace[call],
                                      call,
                                      feExFun,
                                      nInstances=n_instances)
        datO.addInstances(dat.X, dat.y_names)
        n_art_instances = n_instances - dat.m_instances
        ### generate artificial samples
        datArt = waveformsLi2aritificial_DatXy_names(
            sampSpace[call],
            call,
            feExFun,
            n_instances=n_art_instances,
            **ensSettings)
        datO.addInstances(datArt.X, datArt.y_names)
    return datO
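An illustrative call, assuming sampSpace maps each call label to a list of 1-D waveform arrays; the labels, array sizes, and the toy_feExFun extractor sketched under Example #1 are made up for this example:

sampSpace = {"A": [np.random.randn(2048) for _ in range(4)],
             "B": [np.random.randn(2048) for _ in range(7)]}
datO = extractFeaturesWDataAugmentation(sampSpace, toy_feExFun, n_instances=10)
# class "A" ends up with 4 real + 6 artificial instances,
# class "B" with 7 real + 3 artificial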
Example #3
def wavAnnCollection2datXy(WavAnnCollection,
                           feExFun=None,
                           labelsHierarchy="default"):
    """
    Extracts features and labels from wav-ann collection
    Parameters
    ----------
    WavAnnCollection: list of tuples
        [(<path to wavF>, <path to annF>), ...]
    feExFun: callable
        feature extraction function
    labelsHierarchy: list
        labels in hierarchical order for setting the label of the instances

    Returns
    -------
    datO: myML.dataXy_names
        data object with the features and their labels
    """
    if labelsHierarchy == "default":
        labelsHierarchy = ["c"]

    datO = myML.dataXy_names()  # initialise data object

    for wavF, annF in WavAnnCollection:
        X, y0_names = getXy_fromWavFAnnF(wavF, annF, feExFun, labelsHierarchy)
        datO.addInstances(X, y0_names)

    return datO
Example #4
def waveformsLi2aritificial_DatXy_names(waveformsLi, label, feExFun,
                                        n_instances, **ensemble_settings):
    """takes a list of waveforms, all with the same label, generates artificial samples, 
    extracts features and returns data object
    Parameters
    ---------
    n_instances: int
        total number of artificial samples (instances) to generate
    ensemble_settings: dict
        kwargs for the generation of artificial samples
        see exT.generateData_ensembleSettings(n_artificial_samples=1)
    """
    n_samps = len(waveformsLi)
    # cycle through the waveforms until the desired number of samples is reached
    indices = np.arange(n_instances) % n_samps
    datO = myML.dataXy_names()  # initialise data object

    for i in indices:
        waveform = waveformsLi[i]
        artificial_waveform = eff.generateWaveformEnsemble(
            waveform, **ensemble_settings)[0]
        art_samp = feExFun(artificial_waveform)
        datO.addInstances(np.expand_dims(art_samp.flatten(), axis=0),
                          [np.array(label)])
    return datO
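The modulo indexing above recycles the source waveforms round-robin until the requested number of artificial samples is reached; a quick check of the pattern:

np.arange(7) % 3
# -> array([0, 1, 2, 0, 1, 2, 0]); with 3 source waveforms and
# n_instances=7, waveforms 0..2 are reused in turn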
Example #5
def wavLCollection2datXy(wavLabelCollection, fs=None, featExtFun=None):
    """
    returns the data object of a collection of labelled wavs

        ..... call type (classification) ....

    Parameters
    ----------
    wavLabelCollection : list of tuples
        tu[0] : path to wav file
        tu[1] : wav label
    featExtFun : callable

    Return
    ------
    > datO: myML.dataXy_names
        data
    """

    datO = myML.dataXy_names()  # initialise data object

    for wavF, l in wavLabelCollection:
        waveForm, fs = wav2waveform(wavF, fs=fs)
        M = featExtFun(waveForm)
        datO.addInstances(np.expand_dims(M.flatten(), axis=0), [l])

    return datO
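An illustrative collection for this function; the paths and labels are placeholders, and toy_feExFun is the placeholder extractor from Example #1:

wavLabelCollection = [("calls/whistle_01.wav", "w"),
                      ("calls/click_01.wav", "c")]
datO = wavLCollection2datXy(wavLabelCollection, featExtFun=toy_feExFun)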
Example #6
def wavAnnCollection2annSecs_dataXy_names(wavAnnColl, featExtFun=None):
    """
    Computes the X, y for a collection of annotated wav files
    for each annotated section in the wav file
    meant to be used with feature extraction 'split'

    ........O Used for call type classification Q....
    
    Parameters
    ----------
    < wavAnnColl : collection of annotated wavfiles
    < featExtFun :  feature extraction function (callable)
                    or a dictionary with the feature extraction settings
                    featureExtractionParams = dict(zip(i, i))
    Return
    ------    
    > datXy_names : features object
    """

    datO = myML.dataXy_names()

    for wavF, annF in wavAnnColl[:]:
        datO_new = wavAnn2annSecs_dataXy_names(wavF, annF,
                                               featExtFun=featExtFun)
        datO.addInstances(datO_new.X, datO_new.y_names)

    return datO
Example #7
def wavFAnnF2sections_wavsEnsemble_datXy_names(wavF,
                                               annF,
                                               featExtFun=None,
                                               wavPreprocessingT=None,
                                               ensembleSettings=None):
    """
    Computes the features of each annotated section in the wav file
    ment to be used with feature extraction 'split' 

    
    Parameters:
    ----------
    wavFi: str
        path to wave file
    featExtFun: callable
        feature extraction function function
    wavPreprocessingT : callable
        applied before ensemble generation
    ensembleSettings: dict
        instructions for ensemble generation

    Return:
    ------
        > datXy_names : data object
    """

    ### check feature extraction function
    if not callable(featExtFun):  # dictionary or None (default parameters)
        featExtFun = wavFeatureExtraction(featExtFun).featExtrFun()  # default
    if not callable(wavPreprocessingT):
        wavPreprocessingT = lambda x, y: x
    if ensembleSettings is None:
        ensembleSettings = dict(effectName="addWhiteNoise",
                                generate_data_grid=np.ones(1))

    ### extract features for each annotated section
    segmentsLi, fs = auf.getAnnWavSec(wavF, annF)

    datO = myML.dataXy_names()
    ## for each annotation in the wavfile compute the features
    for annIndex in range(len(segmentsLi)):
        label = segmentsLi[annIndex]["label"]
        waveform = segmentsLi[annIndex]["waveform"]
        ##
        waveform = wavPreprocessingT(waveform, fs)  # preprocess waveform
        ## generate ensemble
        Y = eff.generateWaveformEnsemble(waveform, **ensembleSettings)
        ## extract features for each ensemble member
        for i in range(len(Y)):
            M = featExtFun(Y[i, :])
            datO.addInstances(np.expand_dims(M.flatten(), axis=0),
                              [np.array(label)])

    return datO
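A sketch of non-default ensemble settings, assuming eff.generateWaveformEnsemble accepts the same two keys used for the defaults above; the grid values, file paths, and toy_feExFun are illustrative:

ensembleSettings = dict(effectName="addWhiteNoise",
                        generate_data_grid=np.linspace(0.01, 0.1, 5))
# five grid points -> five ensemble members (rows of Y) per annotated section
datO = wavFAnnF2sections_wavsEnsemble_datXy_names(
    "calls/recording.wav", "calls/recording.txt",
    featExtFun=toy_feExFun, ensembleSettings=ensembleSettings)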
Example #8
def test_dataXy_filter():
    # test dataXy loading data
    M = np.random.randint(1, 5, (4, 4))
    labs = np.random.randint(0, 1, (4, ))
    datO = myML.dataXy_names(M, labs)
    np.testing.assert_array_equal(M, datO.X)

    # test the None filter of the dataXy_names class
    M_NoneFilt, labs_NoneFilt = datO.filterInstances(None)
    np.testing.assert_array_equal(M, M_NoneFilt)  # filtering
Example #9
def waveformsLi2DatXy_names(waveformsLi, label, feExFun, nInstances):
    """Extracts features from an waveformlist and returns data object"""
    n_samps = len(waveformsLi)
    stopIdx = None
    if n_samps > nInstances:
        stopIdx = nInstances

    datO = myML.dataXy_names()  # initialise data object
    for waveform in waveformsLi[:stopIdx]:
        M = feExFun(waveform)
        datO.addInstances(np.expand_dims(M.flatten(), axis=0),
                          [np.array(label)])
    return datO
Example #10
def wavAnnCollection2Xy_ensemble_datXy_names(wavAnnColl,
                                             featExtFun,
                                             wavPreprocessingT=None,
                                             ensembleSettings=None):
    """Computes ensemble-augmented features for a collection of annotated
    wav files, applying wavFAnnF2sections_wavsEnsemble_datXy_names to each
    (wavF, annF) pair and pooling the instances."""

    datO = myML.dataXy_names()  # initialise data object
    for wavF, annF in wavAnnColl[:]:
        datO_new = wavFAnnF2sections_wavsEnsemble_datXy_names(
            wavF,
            annF,
            featExtFun=featExtFun,
            wavPreprocessingT=wavPreprocessingT,
            ensembleSettings=ensembleSettings)
        datO.addInstances(datO_new.X, datO_new.y_names)

    return datO
Example #11
def get_DataXy_fromWavFannF(wavF, annF, feExFun, labelsHierarchy):
    """
    extracts features and its labels (ground truth) from wavF and annF files
    and returns its dataXy_names instance
    ----------
    wavF: str
    annF: str
    feExFun: callable
    labelsHierarchy: list
    """
    waveForm, fs = wav2waveform(wavF)
    tf = len(waveForm) / fs

    M0 = feExFun(waveForm)
    m = len(M0)
    y0_names = auf.annotationsFi2instances(annF,
                                           m,
                                           tf,
                                           labelsHierarchy=labelsHierarchy)
    datO = myML.dataXy_names(M0, y0_names)
    return datO
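The alignment this relies on, in numbers: the row count of the feature matrix defines a time grid over the file, and auf.annotationsFi2instances (assumed semantics) labels each row with the annotation overlapping its slot:

tf = 10.0      # file duration in seconds, len(waveForm) / fs
m = 100        # rows of the feature matrix M0
slot = tf / m  # each instance covers 0.1 s of signal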
Example #12
clfStr = 'cv{}-'.format(cv)
settingsStr = "{}-{}".format(Tpipe.string, clfStr)
settingsStr += '-labsHierarchy_' + '_'.join(labsHierarchy)

## write in out file
with open(out_fN, 'a') as out_file:
    out_file.write("#WSD1\n###---------   {}   ---------###\n".format(
        time.strftime("%Y.%m.%d\t\t%H:%M:%S")))
    out_file.write("#" + settingsStr + '\n')

## load collections
train_coll = fex.readCols(collFi_train, colIndexes=(0, 1))
test_coll = np.genfromtxt(collFi_test, dtype=object)

lt = myML.labelTransformer(clf_labs)


def runWSD2Experiment(train_coll,
                      test_coll,
                      lt,
                      T_settings,
                      labsHierarchy,
                      cv,
                      out_fN,
                      testColl_scoreClassLabels,
                      readSections,
                      param=None,
                      predictionsDir=None,
                      keepSections='default',
                      scoring=None):
Example #13
    out_file.write("\n###---------   {}   ---------###\n".format(
        time.strftime("%Y.%m.%d\t\t%H:%M:%S")))
    out_file.write("#" + settingsStr)
    out_file.close()

    #### load collection
    WavAnnCollection = fex.readCols(collFi_train, colIndexes=(0, 1))
    print("\ncollection:", len(WavAnnCollection), "\nlast file:",
          WavAnnCollection[-1])

    #### compute features
    trainDat = fex.wavAnnCollection2datXy(WavAnnCollection, feExFun)
    ## y_names train and test data
    X, y_names = trainDat.filterInstances(labs)  # train
    lt = myML.labelTransformer(labs)
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()
    #### train/test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    with open(out_file_scores,
              'a') as out_file:  # print details to status file
        out_file.write("\n#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

Example #14
def run_experiment_WSD(train_coll, test_coll, test_frac,
                       lt, Tpipe, labsHierarchy, 
                       out_fN,
                       cv, clf_pipe, gs_grid, 
                       class_balance=None, metric=None,
                       predictionsDir=None):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        estimators: list
            for pipline
        gs_grid: list
                    
        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
	class_balance: str
		name of the class to balance for
        metric: string or sklearn.metrics.scorer
    """

    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_frac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    
    ### PRINT
    with open(out_fN, 'a') as out_file:  # print dataset details to the status file
        ## samples per class
        out_file.write(", ".join([str(list(y_names).count(item)) 
                                  for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100*np.mean(cv_sc),
                                                  100*2*np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100*np.mean(cv_acc),
                                                    100*2*np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)
    
    ### Tpipe -- feature extraction params
    with open(out_fN, 'a') as out_file:
        settings_str = Tpipe_settings_and_header(Tpipe)[1]
        out_file.write(", " + settings_str+'\n')
Example #15
# In[18]:


# path to files
train_collection = os.path.join(pDir, 'data/groupB_paths2files.csv')
## load data
df = pd.read_csv(train_collection, usecols=['path_to_file', 'call'])
wavColl = df.values


# ## Extract features

# In[7]:


datO = myML.dataXy_names()
datO_new = fex.wavLCollection2datXy(wavColl, featExtFun=feExFun, fs=fs)
datO.addInstances(datO_new.X, datO_new.y_names)

## label transformer
call_labels = [l[1] for l in wavColl]
lt = myML.labelTransformer(call_labels)

X = datO.X
y_names = datO.y_names
y = lt.nom2num(y_names)
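A quick look at the transformer's mapping, using only the methods that appear in these snippets (nom2num and targetNumNomDict); the label set is illustrative:

lt_demo = myML.labelTransformer(["c", "w"])
y_demo = lt_demo.nom2num(["c", "w", "c"])  # nominal labels -> integer codes
labsD = lt_demo.targetNumNomDict()         # integer code -> nominal label map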


# In[8]:

Example #16
def run_experiment_WSD(
    train_coll,
    test_coll,
    test_frac,
    lt,
    Tpipe,
    labsHierarchy,
    out_fN,
    cv,
    clf_pipe,
    gs_grid,
    class_balance=None,
    metric=None,
    predictionsDir=None,
):
    """Runs clf experiments
    Parameters
    ----------
        train_coll: list
        test_coll: list
        lt: ML.labelTransformer
        T_settings: list of tuples
        labelsHierachy: list of strings
        cv: cv folds
        estimators: list
            for pipline
        gs_grid: list

        out_fN: str
        returnClfs: dict, Flase => clfs are not stored
        predictionsDir: str
    class_balance: str
        name of the class to balance for
        metric: string or sklearn.metrics.scorer
    """

    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X0, y0_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    if class_balance:
        X0, y0_names = myML.balanceToClass(X0, y0_names, class_balance)
    X, y_names = X0, y0_names
    y = lt.nom2num(y_names)
    labsD = lt.targetNumNomDict()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_frac,
                                                        random_state=0)

    #### CLF
    scoring = MLvl.get_scorer(metric)
    pipe = Pipeline(clf_pipe)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)

    ### PRINT
    with open(out_fN, "a") as out_file:  # print dataset details to the status file
        ## samples per class
        out_file.write(",".join(
            [str(list(y_names).count(item)) for item in lt.classes_]))
        ## sizes of the test/train sets
        out_file.write(", {}, {}".format(len(X_train), len(X_test)))

    ## best clf
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        out_file.write(", {:2.2f}, {:.2f}".format(100 * np.mean(cv_sc),
                                                  100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write(", {:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                    100 * 2 * np.std(cv_acc)))

    ## print R, P and f1 for each class
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    ### Tpipe -- feature extraction params
    with open(out_fN, "a") as out_file:
        settings_str = expT.Tpipe_settings_and_header(Tpipe)[1]
        out_file.write("," + settings_str + "\n")

    """
    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(gs.best_estimator_) # clone to create a new classifier with the same parameters
    clf.fit(X,y)
    ### print scores
    callIx = lt.nom2num('c')
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF, annF, feExFun, labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, 'a') as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(100*f1,
                                                                100*P, 100*R))
        if predictionsDir:
            bN = os.path.basename(annF)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(int(f1*100),
                                                             bN))
            pT.predictSoundSections(wavF, clf,  lt, feExFun, annSections=labsHierarchy,
                                    outF=annFile_predict)

    """

    return clf_best
pDir = os.path.dirname(os.path.abspath(__file__))
inF = os.path.join(pDir, "data/annotations_301117.csv")
df0 = pd.read_csv(inF)

# ## Feature extraction settings
#
# Create pipeline for the feature extraction settings
#
# **y** settings

# In[26]:

classes = ["noise"] + clf_id_str.split("-")
lt = myML.labelTransformer(classes)

df = df0[df0["type"].isin(classes)]

# **X** settings

# In[12]:

T_settings = []

#### preprocessing
## band pass filter
filt = "band_pass_filter"
filtDi = {"fs": sr, "lowcut": 10000, "highcut": 100000, "order": 5}
# T_settings.append(('bandFilter', (filt, filtDi)))
## normalisation