예제 #1
0
def featureExtractionInstructions2Xy(wavAnnColl, lt, TpipeSettings, labelSet=None,
                                     **kwargs):
    """Extract features from an annotated collection and return (X, y)

    Params:
    -------
        wavAnnColl : collection handed unchanged to
            fex.wavAnnCollection2Xy_ensemble_datXy_names
        lt : label transformer, its nom2num method converts the label
            names into the returned targets
        TpipeSettings : settings handed to fex.makeTransformationsPipeline,
            the `fun` attribute of the resulting pipeline is used as the
            feature extraction function
        labelSet : handed to filterInstances
            (presumably the subset of labels to keep -- confirm)
        **kwargs : forwarded to fex.wavAnnCollection2Xy_ensemble_datXy_names
            eg. dict(wavPreprocessingT=None, ensembleSettings=ensembleSettings)
    Returns:
    --------
        X_train, y_train : filtered instances and lt.nom2num of their labels
    """
    extract = fex.makeTransformationsPipeline(TpipeSettings).fun
    dataset = fex.wavAnnCollection2Xy_ensemble_datXy_names(
        wavAnnColl, extract, **kwargs)
    instances, label_names = dataset.filterInstances(labelSet)
    return instances, lt.nom2num(label_names)
예제 #2
0
def runWSD2Experiment(train_coll,
                      test_coll,
                      lt,
                      T_settings,
                      labsHierarchy,
                      cv,
                      out_fN,
                      testColl_scoreClassLabels,
                      readSections,
                      param=None,
                      predictionsDir=None,
                      keepSections='default',
                      scoring=None):
    """Runs a grid-searched clf experiment and prints WSD2 scores

    Features are extracted from train_coll and split into train and test
    subsets. A GridSearchCV is fit over the train subset and its scores are
    appended to out_fN. A clone of the best estimator is then fit with all
    the data and evaluated on every file of test_coll.

    Parameters
    ----------
        train_coll: list
            handed to fex.wavAnnCollection2datXy
        test_coll: list of (wavF, annF, template_annF)
        lt: ML.labelTransformer
        T_settings: list of tuples
            settings for fex.makeTransformationsPipeline
        labsHierarchy: list of strings
            handed to fex.wavAnnCollection2datXy
        cv: handed to GridSearchCV as cv
        out_fN: str
            file to which all the scores are appended
        testColl_scoreClassLabels: handed to MLvl.printWSD2_scores
            as scoreClassLabels
        readSections: handed to MLvl.printWSD2_scores (readSectionsWSD2)
            and to pT.WSD2predictAnnotations (readSections)
        param: float or None
            value of the param in experiment, only used for printing and
            for naming the prediction files. None is written as "None"
        predictionsDir: str
            when truthy, predicted annotations of each test file are
            written into this directory
        keepSections: handed to pT.WSD2predictAnnotations
        scoring: string or sklearn.metrics.scorer

    Returns
    -------
        clf: clone of the best estimator, fit with the whole X, y

    Notes
    -----
        collFi_train, testFrac, estimators and param_grid are NOT
        parameters, they are looked up in the enclosing (module) scope.
    """

    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X, y_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    y = lt.nom2num(y_names)
    with open(
            out_fN, 'a'
    ) as out_file:  # print details about the dataset into status file
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            lt.targetNumNomDict(), lt.classes_, Counter(y_names)))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    with open(out_fN, 'a') as out_file:  # print details to status file
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    pipe = Pipeline(estimators)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    ## best clf scores
    with open(out_fN, 'a') as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace('\n', ''), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, 'a') as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        # "{:2.2f}".format(None) raises TypeError, so the default
        # param=None needs its own textual representation
        param_str = "None" if param is None else "{:2.2f}".format(param)
        out_file.write("{}, {:2.2f}, {:.2f}, ".format(
            param_str, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                  100 * 2 * np.std(cv_acc)))
    ## print R, P an f1 for each class
    # the file is closed above before the helper gets out_fN
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(lt.num2nom(y_true),
                                               lt.num2nom(y_pred),
                                               out_fN,
                                               labels=lt.classes_)

    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(
        gs.best_estimator_
    )  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    # NOTE(review): callIx is not used below; the call is kept because
    # nom2num may raise for an unknown label -- confirm before removing
    callIx = lt.nom2num('c')
    for wavF, annF, template_annF in test_coll[:]:
        ### print WSD2 scores, see WSD2_experiment.ipynb
        MLvl.printWSD2_scores(
            wavF,
            true_annF=annF,
            template_annF=template_annF,
            WSD2_clf=clf,
            WSD2_feExFun=feExFun,
            lt=lt,
            scoreClassLabels=testColl_scoreClassLabels,
            outF=out_fN,
            strini=", ",
            strend="",
            m='auto',
            readSectionsWSD2=readSections,  # for WSD2
            labelsHierarchy=['c'])

        if predictionsDir:
            bN = os.path.basename(annF)
            # int(None) raises TypeError, mirror the "None" used in the scores
            param_tag = "None" if param is None else int(param)
            annFile_predict = os.path.join(predictionsDir,
                                           "{}_{}".format(param_tag, bN))
            pT.WSD2predictAnnotations(wavF,
                                      template_annF,
                                      feExFun,
                                      lt,
                                      clf,
                                      outF=annFile_predict,
                                      readSections=readSections,
                                      keepSections=keepSections)

    with open(out_fN, 'a') as out_file:
        out_file.write("\n")

    return clf
예제 #3
0
###################  ASSIGNMENTS  ####################
##### OUTPUT FILES
try:
    os.makedirs(oDir)
except OSError:
    # an already existing directory is fine, any other failure
    # (eg. permissions) must not be silenced
    if not os.path.isdir(oDir):
        raise
out_fN = os.path.join(oDir, "scores.txt")

if savePredictions:
    predictionsDir = os.path.join(oDir, "predictions")
    try:
        os.makedirs(predictionsDir)
    except OSError:
        # same policy as for oDir: only tolerate an existing directory
        if not os.path.isdir(predictionsDir):
            raise

Tpipe = fex.makeTransformationsPipeline(T_settings)

## clf settings
clfStr = 'cv{}-'.format(cv)
settingsStr = "{}-{}".format(Tpipe.string, clfStr)
settingsStr += '-labsHierarchy_' + '_'.join(labsHierarchy)

## write in out file
# context manager so the file is closed even if a write fails
with open(out_fN, 'a') as out_file:
    out_file.write("#WSD1\n###---------   {}   ---------###\n".format(
        time.strftime("%Y.%m.%d\t\t%H:%M:%S")))
    out_file.write("#" + settingsStr + '\n')

## load collections
train_coll = fex.readCols(collFi_train, colIndexes=(0, 1))
예제 #4
0
def runCallClfExperiment(wavColl,
                         lt,
                         T_settings,
                         out_fN,
                         testFrac,
                         cv,
                         pipe_estimators,
                         gs_grid,
                         filterClfClasses,
                         scoring=None,
                         param=None):
    """ . . . . . USE callClfExperiment instead  to be deprecated!!! . . . . .
    Runs a clf experiment in the old way
    using T_settings instead of Tpipe
    Parameters
    ----------
        wavColl: collection handed to fex.wavLCollection2datXy
        lt: ML.labelTransformer
        T_settings: list of tuples
            settings for fex.makeTransformationsPipeline
        out_fN: str
            file to which the scores line is appended
        testFrac: handed to train_test_split as test_size
        cv: cv folds
            handed to GridSearchCV
        pipe_estimators: list
            for pipeline
        gs_grid: list
            param_grid of the grid search
        filterClfClasses: list
            can use lt.classes_
        scoring: string or sklearn.metrics.scorer
        param: float
            value of the param in experiment, for printing
    Returns
    -------
        the best estimator of the grid search, refit with the whole X, y
    """

    transformations = fex.makeTransformationsPipeline(T_settings)
    extract = transformations.fun
    audio_fs = transformations.Audio_features.fs
    #### DATA: wav collection --> X y
    dataset = fex.wavLCollection2datXy(wavColl, fs=audio_fs,
                                       featExtFun=extract)
    X, label_names = dataset.filterInstances(filterClfClasses)
    y = lt.nom2num(label_names)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=testFrac, random_state=0)

    #### CLF: grid search over the train subset
    search = GridSearchCV(estimator=Pipeline(pipe_estimators),
                          param_grid=gs_grid,
                          scoring=scoring,
                          cv=cv,
                          n_jobs=-1)
    search = search.fit(X_tr, y_tr)
    best = search.best_estimator_
    predicted = best.predict(X_te)

    ## one line of scores: param, cv score (mean, 2 std), acc, P, R, f1
    with open(out_fN, 'a') as scores_out:
        ### cv score over the train subset
        cv_scores = cross_val_score(best, X_tr, y_tr, scoring=scoring)
        cv_mean = 100 * np.mean(cv_scores)
        cv_spread = 100 * 2 * np.std(cv_scores)
        scores_out.write("{:}, {:2.2f}, {:.2f}, ".format(
            param, cv_mean, cv_spread))

        ### macro averaged scores over the held out subset
        precision, recall, fscore, _ = mt.precision_recall_fscore_support(
            y_te, predicted, average='macro')
        accuracy = mt.accuracy_score(y_te, predicted)
        scores_out.write("{:.2f}, {:.2f}, {:.2f}, {:.2f}\n".format(
            accuracy * 100, precision * 100, recall * 100, fscore * 100))

    return best.fit(X, y)
예제 #5
0
cv = 6

## clf settings
### initialise clf settings from the svc_l entry of the clf pool
from pylotwhale.MLwhales.clf_pool import svc_l as clf_settings
pipe_estimators = [('clf', clf_settings.fun)]
paramsDi = dict(clf_settings.grid_params_di)
gs_grid = [paramsDi]  # clfSettings.grid_params #

#### Classes
callSet = ['126i', '130', '127', '129', '128i', '131i', '093ii']

##### FILES
import pylotwhale.MLwhales.featureExtraction as fex
# the output dir is named after the transformations pipeline and the clf
settings_str = (fex.makeTransformationsPipeline(T_settings).string
                + clf_settings.clf_name)
filesDi = {
    ## INPUT -> collection files
    'train': '/home/flo/x1-flo/whales/MLwhales/callClassification/data/collections/Vocal-repertoire-catalogue-Pilot-whales-Norway-callsL10.txt',
    ## OUTPUT -> DIR
    'outDir': '/home/flo/x1-flo/whales/MLwhales/callClassification/data/experiments/vocaRepNPW-clf/{}'.format(settings_str),
}





예제 #6
0
def run_iter_clf_experiment(param_grid, paramKey, paramDict,
                            clf_settings, feExParamsDict,
                            #updateParamInDict,
                            wavAnnColl_te, lt,
                            updateTestSet=True,
                            scores_file=None,
                            accum_file=None, ):
    """
    Run one clf experiment for each value in param_grid
    Parameters:
    ----------
    param_grid : array_like
        Values of the experiment parameter, one experiment is run per value.
        Must hold at least one value, param_grid[0] is read up front.
    paramKey : str
        key of paramDict under which the current value is stored
    paramDict : dict
        dictionary updated in place with paramDict[paramKey] = value before
        each experiment
        (presumably referenced by feExParamsDict so that the update reaches
        the feature extraction settings -- confirm)
    clf_settings : dictionary
        clf settings, handed to clf_experimentO
    feExParamsDict : dictionary
        Expanded as keyword arguments of clf_experimentO.
        Its "TpipeSettings" item is used to build the feature extraction
        function applied to the test collection.
    wavAnnColl_te : list,
        test collection
    lt : label transformer
    updateTestSet : bool
        True, the test set features are recomputed for every value
        False, the features computed with param_grid[0] are reused
    scores_file : str
        when not None, clf scores are printed here for every value
    accum_file : str
        when not None, the accumulated predictions are printed here
        after the last experiment
    Returns:
    --------
    True
    """

    ### TEST DATA
    ## the first value of the grid defines the initial test set features
    paramDict[paramKey] = param_grid[0]
    test_fun = fex.makeTransformationsPipeline(
        feExParamsDict["TpipeSettings"]).fun
    test_xy_dict = fex.wavAnnCollection2datXyDict(wavAnnColl_te, test_fun)
    test_xy = fex.XyDict2XyO(test_xy_dict)
    test_X, test_label_names = test_xy.filterInstances(lt.classes_)
    test_y = lt.nom2num(test_label_names)

    accumulated = None

    for value in param_grid:
        # the param must be set before the experiment object is created
        paramDict[paramKey] = value
        print(paramKey, paramDict[paramKey], paramDict)
        experiment = clf_experimentO(clf_settings, **feExParamsDict)

        if updateTestSet:  # True when changing feature extraction instructions
            test_fun = fex.makeTransformationsPipeline(
                feExParamsDict["TpipeSettings"]).fun
            test_xy_dict = updateParamTestSet(wavAnnColl_te, lt,
                                              featExtFun=test_fun,
                                              output_type='dict')
            test_X, test_y = updateParamTestSet(wavAnnColl_te, lt,
                                                featExtFun=test_fun,
                                                output_type='Xy')

        if scores_file is not None:
            experiment.print_scores(scores_file, test_X, test_y, value)

        if test_xy_dict is not None:
            accumulated = experiment.accumPredictions(
                test_xy_dict, value, predictionsDict=accumulated)

    if accum_file is not None:
        # printed with the experiment object of the last value
        experiment.print_predictions(accum_file, accumulated, lt)

    return True
예제 #7
0
def runExperiment(
    train_coll,
    test_coll,
    lt,
    T_settings,
    labsHierarchy,
    out_fN,
    cv,
    pipe_estimators,
    gs_grid,
    scoring=None,
    param=None,
    predictionsDir=None,
):
    """Runs a grid-searched clf experiment and scores a test collection

    Features are extracted from train_coll and split into train and test
    subsets. A GridSearchCV is fit over the train subset and its scores are
    appended to out_fN. A clone of the best estimator is then fit with all
    the data and the f1, P and R of the 'c' class are appended for every
    file of test_coll.

    Parameters
    ----------
        train_coll: list
            handed to fex.wavAnnCollection2datXy
        test_coll: list of (wavF, annF)
        lt: ML.labelTransformer
        T_settings: list of tuples
            settings for fex.makeTransformationsPipeline
        labsHierarchy: list of strings
        out_fN: str
            file to which all the scores are appended
        cv: cv folds
            handed to GridSearchCV
        pipe_estimators: list
            for pipeline
        gs_grid: list
            param_grid of the grid search
        scoring: string or sklearn.metrics.scorer
        param: float or None
            value of the param in experiment, only used for printing and
            for naming the prediction files. None is written as "None"
        predictionsDir: str
            when truthy, predicted annotations of each test file are
            written into this directory

    Returns
    -------
        clf: clone of the best estimator, fit with the whole X, y

    Notes
    -----
        collFi_train and testFrac are NOT parameters, they are looked up
        in the enclosing (module) scope.
    """

    Tpipe = fex.makeTransformationsPipeline(T_settings)
    feExFun = Tpipe.fun
    #### prepare DATA: collections --> X y
    ## compute features
    dataO = fex.wavAnnCollection2datXy(train_coll, feExFun, labsHierarchy)
    ## prepare X y data
    X, y_names = dataO.filterInstances(lt.classes_)  # filter for clf_labs
    y = lt.nom2num(y_names)
    with open(
            out_fN, "a"
    ) as out_file:  # print details about the dataset into status file
        out_file.write("# {} ({})\n".format(collFi_train, len(train_coll)))
        out_file.write("#label_transformer {} {}\t data {}\n".format(
            lt.targetNumNomDict(), lt.classes_, Counter(y_names)))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=testFrac,
                                                        random_state=0)

    with open(out_fN, "a") as out_file:  # print details to status file
        out_file.write("#TRAIN, shape {}\n".format(np.shape(X_train)))
        out_file.write("#TEST, shape {}\n".format(np.shape(X_test)))

    #### CLF
    pipe = Pipeline(pipe_estimators)
    gs = GridSearchCV(estimator=pipe,
                      param_grid=gs_grid,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=-1)

    gs = gs.fit(X_train, y_train)
    ## best clf scores
    with open(out_fN, "a") as out_file:
        out_file.write("#CLF\t{}\tbest score {:.3f}\n".format(
            str(gs.best_params_).replace("\n", ""), gs.best_score_))
    clf_best = gs.best_estimator_

    ## clf scores over test set
    with open(out_fN, "a") as out_file:
        ### cv score
        cv_sc = cross_val_score(clf_best, X_test, y_test, scoring=scoring)
        # "{:2.2f}".format(None) raises TypeError, so the default
        # param=None needs its own textual representation
        param_str = "None" if param is None else "{:2.2f}".format(param)
        out_file.write("{}, {:2.2f}, {:.2f}, ".format(
            param_str, 100 * np.mean(cv_sc), 100 * 2 * np.std(cv_sc)))
        ### cv accuracy
        cv_acc = cross_val_score(clf_best, X_test, y_test)
        out_file.write("{:2.2f}, {:.2f}, ".format(100 * np.mean(cv_acc),
                                                  100 * 2 * np.std(cv_acc)))
    ## print R, P an f1 for each class
    # the file is closed above before the helper gets out_fN
    y_true, y_pred = y_test, clf_best.predict(X_test)
    MLvl.print_precision_recall_fscore_support(y_true, y_pred, out_fN)

    #### TEST collection
    ### train classifier with whole dataset
    clf = skb.clone(
        gs.best_estimator_
    )  # clone to create a new classifier with the same parameters
    clf.fit(X, y)
    ### print scores
    callIx = lt.nom2num("c")
    for wavF, annF in test_coll[:]:
        A, a_names = fex.getXy_fromWavFAnnF(wavF,
                                            annF,
                                            feExFun,
                                            labsHierarchy,
                                            filter_classes=lt.classes_)
        a_true = lt.nom2num(a_names)
        a_pred = clf.predict(A)
        # NOTE(review): indexing the per-class scores with callIx assumes
        # the position in the array equals the numeric label, which needs
        # every smaller label to occur in a_true or a_pred -- confirm
        P = mt.precision_score(a_true, a_pred, average=None)[callIx]
        R = mt.recall_score(a_true, a_pred, average=None)[callIx]
        f1 = mt.f1_score(a_true, a_pred, average=None)[callIx]
        with open(out_fN, "a") as out_file:
            out_file.write(", {:2.2f}, {:2.2f}, {:2.2f}".format(
                100 * f1, 100 * P, 100 * R))
        if predictionsDir:
            bN = os.path.basename(annF)
            # int(None) raises TypeError, mirror the "None" used in the scores
            param_tag = "None" if param is None else int(param)
            annFile_predict = os.path.join(
                predictionsDir, "{}_{}_{}".format(int(f1 * 100), param_tag,
                                                  bN))
            pT.predictSoundSections(wavF,
                                    clf,
                                    lt,
                                    feExFun,
                                    annSections=labsHierarchy,
                                    outF=annFile_predict)

    with open(out_fN, "a") as out_file:
        out_file.write("\n")

    return clf
예제 #8
0
metric = 'f1_macro'  # alternative: 'accuracy'
cv = 5

## clf settings
### initialise clf settings from the svc_rbf entry of the clf pool
from pylotwhale.MLwhales.clf_pool import svc_rbf as clf_settings
pipe_estimators = [('clf', clf_settings.fun)]
paramsDi = dict(clf_settings.grid_params_di)
gs_grid = [paramsDi]  # clfSettings.grid_params #

#### Classes
#callSet = ['126i', '130', '127', '129', '128i', '131i', '093ii']

##### FILES
from pylotwhale.MLwhales.featureExtraction import makeTransformationsPipeline
# the output dir is named after the transformations pipeline, the clf and
# the metric
pipeline_str = makeTransformationsPipeline(T_settings).string
settings_str = pipeline_str + clf_settings.clf_name + '-' + metric
filesDi = {
    ## INPUT -> collection files
    # alternative collection:
    # '/home/florencia/whales/data/orchive/flo/data/wavLabelColl-call-catalog-xsilence.txt'
    'train': '/home/florencia/whales/MLwhales/callClassification/data/collections/Vocal-repertoire-catalogue-Pilot-whales-Norway-callsL10.txt',
    ## OUTPUT -> DIR
    # alternative output dirs:
    # '/home/florencia/whales/data/orchive/flo/data/experiments/{}'.format(settings_str)
    # '/home/florencia/whales/MLwhales/callClassification/data/experiments/fullRep-cutWavs/{}'.format(settings_str)
    'outDir': '/home/florencia/whales/MLwhales/callClassification/data/experiments/fullRep-cutWavs/f1_macro_n_mels_Nslices_NFFT1024/{}'.format(settings_str),
}