Example #1
def segmentClassification(data, model_name, model_type):
    if not os.path.isfile(model_name):
        print("segmentClassification: input model_name not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model_knn(model_name)
    else:
        [classifier, MEAN, STD, classNames, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(model_name)

    Fs = 250000  # hard-coded sampling rate of the incoming buffer
    x = audioBasicIO.stereo2mono(data)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs, mt_step * Fs, round(Fs * st_win), round(Fs * st_step))
    mt_features = mt_features.mean(axis=1)        # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD                # normalization

    [Result, P] = classifierWrapper(classifier, model_type, curFV)    # classification        
    return Result, P, classNames
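A minimal usage sketch for the buffer classifier above; "svmModel" is a placeholder for a model trained with pyAudioAnalysis, and the buffer is assumed to already be sampled at the hard-coded 250 kHz rate:

# result, probs, names = segmentClassification(raw_buffer, "svmModel", "svm")
# if result != -1:                      # -1 signals a missing model file
#     print(names[int(result)], probs)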
Example #2
def process_files(recordings_dir, sub_dirs, file_ext='*.wav'):
    # 193 features come from extract_acoustic_features (librosa) plus 68 from
    # pyAudioAnalysis, giving 261 per file
    features, labels = np.empty((0, 261)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(recordings_dir, sub_dir, file_ext)):
            # get features using pyAudioAnalysis
            [Fs, x] = audioBasicIO.readAudioFile(fn)  # open file for pyAudioAnalysis
            # Mid-term feature extraction - mean, std of the short-term feature
            # sequence (essentially an stft, but we define the window).
            # mtFeatureExtraction(signal, Fs, mtWin, mtStep, stWin, stStep) also
            # extracts short-term features, but we are not using them.
            # 50 ms frames with a 25 ms step (50% overlap) for the stft
            [mtF, stF] = audioFeatureExtraction.mtFeatureExtraction(
                x, Fs, 1 * Fs, 1 * Fs, 0.050 * Fs, 0.025 * Fs)
            mtF = np.reshape(mtF, -1)  # drop the empty dimension so we can np.hstack below
            # get features using librosa via extract_acoustic_features
            mfcc, chroma, mel, contrast, tonnetz = extract_acoustic_features(fn)
            # we need all numbers in one vector, so np.hstack
            get_features = np.hstack([mfcc, chroma, mel, contrast, tonnetz, mtF])
            features = np.vstack([features, get_features])  # each file is one row, so np.vstack
            # class labels come from file names: split by '/', then by '_', and
            # take the third element, e.g. 006_food_1_.wav gives 1
            labels = np.append(labels, fn.split('/')[2].split('_')[2])
            print(fn)  # print file names as they get processed
    return np.array(features), np.array(labels, dtype=int)
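A quick sanity check of the 261-dimensional layout mentioned in the comment above; the per-extractor counts are assumptions based on the common librosa recipe (40 MFCCs, 12 chroma bins, 128 mel bands, 7 contrast bands, 6 tonnetz dimensions) and on pyAudioAnalysis summarizing 34 short-term features with two statistics each:

librosa_dims = 40 + 12 + 128 + 7 + 6  # = 193 from extract_acoustic_features (assumed split)
pyaa_dims = 34 * 2                    # = 68 mid-term statistics (mean + std)
assert librosa_dims + pyaa_dims == 261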
Example #3
def classifyNN(inputFile, modelName):

    [
        Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
        computeBEAT
    ] = loadModel(modelName)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mtWin:
        return (-1, -1, -1)

    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    [Result, P] = classify(Classifier, curFV)
    return Result, P, classNames
Example #4
 def POST(self):
     x = web.input(myfile={})
     filename = 'tmp/' + uuid.uuid4().hex + '.wav'
     file = open(filename, 'w+')
     file.seek(0)
     file.write(x['myfile'].value)
     file.close()
     [Fs, x] = audioBasicIO.readAudioFile(filename)
     #os.remove(filename)
     x = audioBasicIO.stereo2mono(x)
     [F, _] = audioFeatureExtraction.mtFeatureExtraction(
         x, Fs, round(Fs * 1.0), round(Fs * 1.0), round(Fs * 0.050),
         round(Fs * 0.050))
     F = F.transpose()
     # accumulate votes across all mid-term feature vectors
     results = {}
     current_highest = ""
     current_highest_value = 0
     for vec in F:
         vec = numpy.around(vec.astype(numpy.float), 6)
         current = model.getNN(vec)
         result = current[0][1].partition("_")[0]
         if result in results:
             results[result] = results[result] + 1
         else:
             results[result] = 1
         if results[result] > current_highest_value:
             current_highest_value = results[result]
             current_highest = result
     print(results)
     print(current_highest)
     raise web.seeother('/')
Example #5
def perdict(files, file, modelName):
    #read audio file, convert to mono (if needed)
    [Fs, x] = audioBasicIO.readAudioFile(file)
    x = audioBasicIO.stereo2mono(x)

    if modelName:
        mtWin, mtStep, stWin, stStep = 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep
        Classifier, MEAN, STD = loadSVModel(modelName)
    else:
        with open(SVMmodelName, 'rb') as fid:
            Classifier = cPickle.load(fid)
            MEAN = cPickle.load(fid)
            STD = cPickle.load(fid)
            mtWin, mtStep, stWin, stStep = 1.0, 1.0, aT.shortTermWindow, aT.shortTermStep

    #extract features from sample
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    MidTermFeatures = MidTermFeatures.mean(
        axis=1)  # long term averaging of mid-term statistics
    curFV = (MidTermFeatures - MEAN) / STD  # normalization

    #predict
    result = Classifier.predict(curFV.reshape(1, -1))[0]
    prob = Classifier.predict_proba(curFV.reshape(1, -1))[0]
    s = files[int(result)]

    return re.findall(r'\d+', s)[0] + " pills", prob
Example #6
 def POST(self):
     x = web.input(myfile={})
     filename = 'tmp/'+uuid.uuid4().hex+'.wav'
     file = open(filename, 'w+')
     file.seek(0)
     file.write(x['myfile'].value)
     file.close()
     [Fs, x] = audioBasicIO.readAudioFile(filename)
     #os.remove(filename)
     x = audioBasicIO.stereo2mono(x)
     [F, _] = audioFeatureExtraction.mtFeatureExtraction(x, Fs, round(Fs*1.0), round(Fs * 1.0), round(Fs * 0.050), round(Fs * 0.050))
     F = F.transpose()
     # accumulate votes across all mid-term feature vectors
     results = {}
     current_highest = ""
     current_highest_value = 0
     for vec in F:
         vec = numpy.around(vec.astype(numpy.float), 6)
         current = model.getNN(vec)
         result = current[0][1].partition("_")[0]
         if result in results:
             results[result] = results[result]+1
         else:
             results[result] = 1
         if results[result] > current_highest_value:
             current_highest_value = results[result]
             current_highest = result
     print(results)
     print(current_highest)
     raise web.seeother('/')
Example #7
def fileRegression(inputFile, modelName, modelType):
    # Load classifier:

    if not os.path.isfile(inputFile):
        print("fileRegression: wav file not found!")
        return (-1, -1, -1)

    regressionModels = glob.glob(modelName + "_*")
    regressionModels2 = []
    for r in regressionModels:
        if r[-5::] != "MEANS":
            regressionModels2.append(r)
    regressionModels = regressionModels2
    regressionNames = []
    for r in regressionModels:
        regressionNames.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mtWin, etc)
    if modelType == 'svm' or modelType == "svm_rbf":
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadSVModel(regressionModels[0], True)
    elif modelType == 'randomforest':
        [_, _, _, mtWin, mtStep, stWin, stStep,
         computeBEAT] = loadRandomForestModel(regressionModels[0], True)

    # read audio file and convert to mono
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)
    x = audioBasicIO.stereo2mono(x)
    # feature extraction:
    [MidTermFeatures, s] = aF.mtFeatureExtraction(x, Fs,
                                                  mtWin * Fs, mtStep * Fs,
                                                  round(Fs * stWin),
                                                  round(Fs * stStep))
    # long term averaging of mid-term statistics
    MidTermFeatures = MidTermFeatures.mean(axis=1)
    if computeBEAT:
        [beat, beatConf] = aF.beatExtraction(s, stStep)
        MidTermFeatures = numpy.append(MidTermFeatures, beat)
        MidTermFeatures = numpy.append(MidTermFeatures, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regressionModels):
        if not os.path.isfile(r):
            print("fileRegression: input modelName not found!")
            return (-1, -1, -1)
        if modelType == 'svm' or modelType == "svm_rbf":
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadSVModel(r, True)
        elif modelType == 'randomforest':
            [Model, MEAN, STD, mtWin, mtStep, stWin, stStep,
             computeBEAT] = loadRandomForestModel(r, True)
        curFV = (MidTermFeatures - MEAN) / STD  # normalization
        # classification
        R.append(regressionWrapper(Model, modelType, curFV))
    return R, regressionNames
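A hypothetical call for the regression wrapper above; pyAudioAnalysis stores one file per regression target (e.g. "svmEmotion_arousal", "svmEmotion_valence", each with a companion *MEANS file), and the basename below is a placeholder:

# R, names = fileRegression("sample.wav", "svmEmotion", "svm")
# names -> ["arousal", "valence"]; R[i] is the value predicted for names[i]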
Example #8
def hmmSegmentation(wavFileName, hmmModelName, PLOT=False, gtFileName=""):
    [Fs, x] = audioBasicIO.readAudioFile(
        wavFileName)          # read audio data

    try:
        fo = open(hmmModelName, "rb")
    except IOError:
        print("didn't find file")
        return

    try:
        hmm = cPickle.load(fo)
        classesAll = cPickle.load(fo)
        mtWin = cPickle.load(fo)
        mtStep = cPickle.load(fo)
    except:
        fo.close()
        return
    fo.close()

    # Features = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.050*Fs,
    # 0.050*Fs);    # feature extraction
    [Features, _] = aF.mtFeatureExtraction(
        x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))
    flagsInd = hmm.predict(Features.T)                            # apply model
    # for i in range(len(flagsInd)):
    #    if classesAll[flagsInd[i]]=="silence":
    #        flagsInd[i]=classesAll.index("speech")

    # plot results
    if os.path.isfile(gtFileName):
        [segStart, segEnd, segLabels] = readSegmentGT(gtFileName)
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)
        flagsGTNew = []
        # "align" labels with GT
        for j, fl in enumerate(flagsGT):
            if classNamesGT[flagsGT[j]] in classesAll:
                flagsGTNew.append(classesAll.index(classNamesGT[flagsGT[j]]))
            else:
                flagsGTNew.append(-1)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        flagsIndGT = numpy.array(flagsGTNew)
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(
        flagsInd, flagsIndGT, classesAll, mtStep, not PLOT)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classesAll, -1, -1)
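Hypothetical usage of the HMM segmenter above, assuming a model stored by one of the trainHMM_* functions later in this list (file names are placeholders):

# flagsInd, classes, acc, cm = hmmSegmentation("show.wav", "hmmRadioSM",
#                                              PLOT=True, gtFileName="show.segments")
# flagsInd[i] is the class index assigned to the i-th mid-term window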
Example #9
def fileClassification(inputFile, model_name, model_type):
    # Load classifier:
    print("Loading Classifier")

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")
        return (-1, -1, -1)

    if not os.path.isfile(inputFile):
        print("fileClassification: wav file not found!")
        return (-1, -1, -1)

    if model_type == 'knn':
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model(model_name)

    print("Printing Classnames")
    print(classNames)

    [Fs, x] = audioBasicIO.readAudioFile(
        inputFile)  # read audio file and convert to mono
    x = audioBasicIO.stereo2mono(x)

    if isinstance(x, int):  # audio file IO problem
        print('io problem')
        return (-1, -1, -1)
    if x.shape[0] / float(Fs) <= mt_win:
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(classifier, model_type,
                                    curFV)  # classification
    return Result, P, classNames
Example #10
def analyze(vocs):
    for voc in tqdm(vocs, desc="Vocalization Analysis: ", ascii=True):
        data, rate = voc['audio'], voc['framerate']
        mt_feats, _, _ = audioFeatureExtraction.mtFeatureExtraction(
            data, rate, MT_WIN, MT_STEP, ST_WIN, ST_STEP)
        voc['spectral_entropy'] = np.mean(mt_feats[5])
        voc['spectral_centroid'] = np.mean(mt_feats[3])
        voc['rolloff_90'] = np.mean(mt_feats[7])
        voc['rolloff_50'] = np.mean(mt_feats[8])
        voc['rolloff_25'] = np.mean(mt_feats[9])
        voc['rolloff_10'] = np.mean(mt_feats[10])
        voc['zcr'] = np.mean(mt_feats[0])
        voc['spectral_spread'] = np.mean(mt_feats[4])
Example #11
def extractAudioFeatures(audioPath):
    # Check if file exists
    if not os.path.exists(audioPath):
        raise Exception('File not found!')
    # Extract features from audio file
    [Fs, x] = audioBasicIO.readAudioFile(audioPath)
    x = audioBasicIO.stereo2mono(x)
    mF, sF = audioFeatureExtraction.mtFeatureExtraction(
        x, Fs, len(x), len(x), Fs, Fs)
    res = list()
    for item in mF:
        res.append(item[0])
    return res
Example #12
    def extract_mid_features(input_file):
        class_names = [os.path.basename(input_file)]
        features = []
        fs, x = readAudioFile(input_file)
        x = stereo2mono(x)
        mt_size, mt_step, st_win, st_step = 1, 0.4, 0.025, 0.010
        [mt_feats, st_feats, mt_feat_names] = mtFeatureExtraction(
            x, fs, mt_size * fs, mt_step * fs, round(st_win * fs),
            round(st_step * fs))

        mtFeatureExtractionToFile(input_file, mt_size, mt_step, st_win,
                                  st_step, input_file, False, True, True)
        return mt_feats, st_feats, mt_feat_names
Example #13
def hmmSegmentation(wav_file_name,
                    hmm_model_name,
                    plot_res=False,
                    gt_file_name=""):
    [fs, x] = audioBasicIO.readAudioFile(wav_file_name)
    try:
        fo = open(hmm_model_name, "rb")
    except IOError:
        print("didn't find file")
        return

    try:
        hmm = cPickle.load(fo)
        classes_all = cPickle.load(fo)
        mt_win = cPickle.load(fo)
        mt_step = cPickle.load(fo)
    except:
        fo.close()
        return
    fo.close()

    [Features, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * 0.050),
                                              round(fs * 0.050))
    flags_ind = hmm.predict(Features.T)  # apply model
    if os.path.isfile(gt_file_name):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file_name)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)
        flagsGTNew = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in classes_all:
                flagsGTNew.append(
                    classes_all.index(class_names_gt[flags_gt[j]]))
            else:
                flagsGTNew.append(-1)
        cm = numpy.zeros((len(classes_all), len(classes_all)))
        flags_ind_gt = numpy.array(flagsGTNew)
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, classes_all,
                                  mt_step, not plot_res)
    if acc >= 0:
        print("Overall Accuracy: {0:.2f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, classes_all, -1, -1)
Example #14
def trainHMM_fromFile(wavFile, gtFile, hmmModelName, mtWin, mtStep):
    '''
    This function trains an HMM model for segmentation-classification using a single annotated audio file
    ARGUMENTS:
     - wavFile:        the path of the audio filename
     - gtFile:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,<segment end in seconds>,<segment label> in each row)
     - hmmModelName:   the name of the HMM model to be stored
     - mtWin:          mid-term window size
     - mtStep:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - classNames:     a list of classNames

    After training, hmm, classNames, along with the mtWin and mtStep values are stored in the hmmModelName file
    '''

    [segStart, segEnd, segLabels] = readSegmentGT(
        gtFile)                        # read ground truth data
    # convert to fix-sized sequence of flags
    flags, classNames = segs2flags(segStart, segEnd, segLabels, mtStep)

    [Fs, x] = audioBasicIO.readAudioFile(
        wavFile)                                # read audio data
    #F = aF.stFeatureExtraction(x, Fs, 0.050*Fs, 0.050*Fs);
    [F, _] = aF.mtFeatureExtraction(
        x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))    # feature extraction
    # compute HMM statistics (priors, transition matrix, etc)
    startprob, transmat, means, cov = trainHMM_computeStatistics(F, flags)

    hmm = hmmlearn.hmm.GaussianHMM(
        startprob.shape[0], "diag")            # hmm training

    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    # output to file
    fo = open(hmmModelName, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classNames, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classNames
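Loading the stored model back requires mirroring the cPickle.dump order above; this is exactly what the hmmSegmentation function in Example #8 does (the path below is a placeholder for a file written by trainHMM_fromFile):

fo = open("my_hmm_model", "rb")
hmm = cPickle.load(fo)
classNames = cPickle.load(fo)
mtWin = cPickle.load(fo)
mtStep = cPickle.load(fo)
fo.close()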
Example #15
def bufferRegression(audioBuffer, sampleRate, model_name, model_type):
    # Load classifier:
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_") + 1::])

    # FEATURE EXTRACTION
    # LOAD ONLY THE FIRST MODEL (for mt_win, etc)
    if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step,
         compute_beat] = load_model(regression_models[0], True)

    Fs = sampleRate
    x = audioBuffer

    # feature extraction:
    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs,
                                                 mt_step * Fs,
                                                 round(Fs * st_win),
                                                 round(Fs * st_step))
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)

    # REGRESSION
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("bufferRegression: input model_name not found!")
            return (-1, -1, -1)
        if model_type == 'svm' or model_type == "svm_rbf" \
                or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step, compute_beat] = \
                load_model(r, True)
        curFV = (mt_features - MEAN) / STD  # normalization
        R.append(regressionWrapper(model, model_type, curFV))  # classification
    return R, regression_names
Example #16
    def _feature_extractor(self, frames):
        if len(frames) < self.Win:
            frames = np.append(frames, [0] * (self.Win - len(frames)))

        frames = sound.normalize(frames, -20.0)
        frames = sound.apply_filter(self.FilterB,
                                    self.FilterA,
                                    frames)

        assert(len(frames) >= self.Win)
        [_mf, _f] = aF.mtFeatureExtraction(frames,
                                           self.sr,
                                           self.Win,
                                           self.Win/2,
                                           self.Win/4,
                                           self.Win/4)
        # keep the transposed mid-term matrix (the short-term features are discarded)
        _f = _mf.transpose()
        return _f
Example #17
    def get_segments(self, file, part, video_id):

        try:
            audio_file = "aud.part" + str(part) + "." + file + ".wav"
            fs, s = aIO.readAudioFile(audio_file)
            af, _, afn = aF.mtFeatureExtraction(s, fs, int(0.5 * fs),
                                                int(0.5 * fs), int(0.1 * fs),
                                                int(0.1 * fs))

            video_file = "vid.part" + str(part) + "." + file
            vf, t, vfn = self._video_extractor.extract_features(video_file)

        except Exception as e:
            print(e)
            return None
        else:
            segment_features = []
            # construct segmentfeature

            vmean = vf.T.mean(axis=1)
            for i, val in enumerate(vmean):
                if isinstance(val, np.ndarray):
                    val = val[0]

                feature = SegmentFeatures(value=val,
                                          seq_no=1,
                                          feature_id=self._feature_map[vfn[i]])
                segment_features.append(feature)

            amean = af.mean(axis=1)
            for i, val in enumerate(amean):
                feature = SegmentFeatures(value=val,
                                          seq_no=1,
                                          feature_id=self._feature_map[afn[i]])
                segment_features.append(feature)

            # construct segment with its segmentfeatures
            segment = Segment(video_id=video_id,
                              start_sec=part,
                              end_sec=self._splitter.get_segment_end(part),
                              features=segment_features)

            return segment
Example #18
def extractFeatures(fs, signal):
    '''
    spf = wave.open('WaveFiles/test.wav', 'r')
    signal = spf.readframes(-1)
    fs = spf.getframerate()
    signal = np.fromstring(signal, 'int16')
    time = np.linspace(0, len(signal)/fs, num=len(signal))
    '''
    F, Y = audioFeatureExtraction.mtFeatureExtraction(signal, fs, 0.025 * fs,
                                                      0.025 * fs, 0.050 * fs,
                                                      0.025 * fs)
    #meanMFCC = getMeanMFCC(F)
    amplitudePeak = getAmplitudePeak(signal)
    numPeaks = getNumPeak(F[5, :])
    maxPeak = getMaxPeak(signal, fs)
    centroid, spectrum = stSpectralCentroidAndSpread(signal, fs)
    #rolloff = stSpectralRollOff(signal, 0.85, fs)
    #maxFlux = np.amax(F[6])
    #avgFlux = np.mean(F[6])
    return [amplitudePeak, numPeaks, centroid, spectrum]
Example #19
def get_features_from_wav(wav_path, sec):
    """
    Samples audio by given time window

    :param wav_path: path to .wav file
    :param sec: float, sampling frame size in sec
    :return: pandas.DataFrame with sampled audio of shape (n_samples, frames_per_sample)
    """
    rate, audio = wav.read(wav_path)

    short_frame = rate * sec
    mt_features = mtFeatureExtraction(audio, rate, mtWin=short_frame * 10, mtStep=short_frame,
                                      stWin=short_frame, stStep=short_frame)
    big_mat = np.vstack([mt_features[0], mt_features[1]]).T
    big_mat = StandardScaler().fit_transform(big_mat)
    big_df = pd.DataFrame(big_mat)
    colnames = ["pyAA{}".format(i) for i in range(big_mat.shape[1])]
    big_df.columns = colnames

    return big_df
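A hypothetical call for the sampler above ("call.wav" is a placeholder); each row of the returned frame is one mid-term step of standardized pyAudioAnalysis statistics:

# df = get_features_from_wav("call.wav", 0.5)
# df.columns -> ["pyAA0", "pyAA1", ...]; one row per mid-term window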
Example #20
def trainHMM_fromFile(wav_file, gt_file, hmm_model_name, mt_win, mt_step):
    """
    This function trains an HMM model for segmentation-classification
    using a single annotated audio file
    ARGUMENTS:
     - wav_file:        the path of the audio filename
     - gt_file:         the path of the ground truth filename
                       (a csv file of the form <segment start in seconds>,
                       <segment end in seconds>,<segment label> in each row)
     - hmm_model_name:   the name of the HMM model to be stored
     - mt_win:          mid-term window size
     - mt_step:         mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:     a list of class_names

    After training, hmm, class_names, along with the mt_win and mt_step
    values are stored in the hmm_model_name file
    """

    [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
    flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
    [fs, x] = audioBasicIO.readAudioFile(wav_file)
    [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                       round(fs * 0.050), round(fs * 0.050))
    start_prob, transmat, means, cov = trainHMM_computeStatistics(F, flags)
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")

    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat    
    hmm.means_ = means
    hmm.covars_ = cov
    
    fo = open(hmm_model_name, "wb")
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(class_names, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, class_names
Example #21
    def __call__(self, input_file):
        (frame_rate, x) = aIO.readAudioFile(input_file)
        if frame_rate < 0:
            return None

        [feats, s] = aF.mtFeatureExtraction(aIO.stereo2mono(x), frame_rate,
                                            self.mt_win * frame_rate,
                                            self.mt_step * frame_rate,
                                            round(frame_rate * self.st_win),
                                            round(frame_rate * self.st_step))
        feats = feats.mean(axis=1)
        feats = (feats - self.model_mean) / self.model_sd

        p = self.classifier.predict_proba(feats.reshape(1, -1))[0]
        out = dict(zip(self.class_names, map(float, p)))
        out.update({
            "_frame_rate": float(frame_rate),
            "_duration_seconds": (float(x.shape[0]) / frame_rate
                                  if frame_rate > 0 else None)
        })
        return out
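A sketch of the returned payload, assuming an instance of the (unnamed) wrapper class above was constructed with a classifier, its normalization stats, class names and window parameters; the class name below is hypothetical:

# tagger = AudioTagger(...)        # hypothetical constructor
# out = tagger("sample.wav")
# out -> {"music": 0.83, "speech": 0.17,
#         "_frame_rate": 44100.0, "_duration_seconds": 12.7}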
Example #22
def bufferClassification(audioBuffer, sampleRate, model_name, model_type):
    # Load classifier:
    if model_type == 'knn':
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, classNames, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = load_model(model_name)

    if isinstance(audioBuffer, int):  # audio buffer format problem
        print("bufferClassification: bad audio format!")
        return (-1, -1, -1)
    if audioBuffer.shape[0] / float(sampleRate) <= mt_win:
        print(
            "bufferClassification: too little audio to analyze with medium term window",
            mt_win)
        return (-1, -1, -1)

    # feature extraction:
    [mt_features, s,
     _] = aF.mtFeatureExtraction(audioBuffer, sampleRate, mt_win * sampleRate,
                                 mt_step * sampleRate,
                                 round(sampleRate * st_win),
                                 round(sampleRate * st_step))
    mt_features = mt_features.mean(
        axis=1)  # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = numpy.append(mt_features, beat)
        mt_features = numpy.append(mt_features, beatConf)
    curFV = (mt_features - MEAN) / STD  # normalization

    [Result, P] = classifierWrapper(classifier, model_type,
                                    curFV)  # classification
    return Result, P, classNames
Example #23
def trainHMM_fromDir(dirPath, hmm_model_name, mt_win, mt_step):
    '''
    This function trains an HMM model for segmentation-classification using
    a directory where WAV files and .segments (ground-truth files) are stored
    ARGUMENTS:
     - dirPath:        the path of the data directory
     - hmm_model_name:    the name of the HMM model to be stored
     - mt_win:        mid-term window size
     - mt_step:        mid-term window step
    RETURNS:
     - hmm:            an object to the resulting HMM
     - class_names:        a list of class_names

    After training, hmm, class_names, along with the mt_win
    and mt_step values are stored in the hmm_model_name file
    '''

    flags_all = numpy.array([])
    classes_all = []
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        # for each WAV file
        wav_file = f
        gt_file = f.replace('.wav', '.segments')
        if not os.path.isfile(gt_file):
            continue
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags, class_names = segs2flags(seg_start, seg_end, seg_labs, mt_step)
        for c in class_names:
            # update class names:
            if c not in classes_all:
                classes_all.append(c)
        [fs, x] = audioBasicIO.readAudioFile(wav_file)
        [F, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                           round(fs * 0.050),
                                           round(fs * 0.050))

        lenF = F.shape[1]
        lenL = len(flags)
        min_sm = min(lenF, lenL)
        F = F[:, 0:min_sm]
        flags = flags[0:min_sm]

        flagsNew = []
        for j, fl in enumerate(flags):  # append features and labels
            flagsNew.append(classes_all.index(class_names[flags[j]]))

        flags_all = numpy.append(flags_all, numpy.array(flagsNew))

        if i == 0:
            f_all = F
        else:
            f_all = numpy.concatenate((f_all, F), axis=1)
    start_prob, transmat, means, cov = trainHMM_computeStatistics(
        f_all, flags_all)  # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")  # train HMM
    hmm.startprob_ = start_prob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmm_model_name, "wb")  # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classes_all, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_win, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mt_step, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classes_all
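A hypothetical end-to-end use of the directory trainer above together with hmmSegmentation from Example #13 (paths are placeholders; every WAV in the folder needs a matching .segments file):

# hmm, classes = trainHMM_fromDir("annotated_wavs", "hmmModelSM", 1.0, 0.25)
# flags, names, acc, cm = hmmSegmentation("new_show.wav", "hmmModelSM",
#                                         plot_res=True)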
Example #24
#iterate over track
while data != '':
    print("At second: ")
    print(counter * chunk_size)
    counter += 1

    x_data.append(counter * chunk_size)

    #stream.write(data)
    array = _wav2array(wf.getnchannels(), wf.getsampwidth(), data)
    array = audioBasicIO.stereo2mono(array)

    #extract features
    MidTermFeatures = aF.mtFeatureExtraction(array, Fs, mtWin * Fs,
                                             mtStep * Fs, round(Fs * stWin),
                                             round(Fs * stStep))
    MidTermFeatures = MidTermFeatures[0]

    #classify chunks to speech/music
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(MidTermFeatures[0].shape[0]):
        # normalize current feature vector
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        # classify vector
        [Result, P] = aT.classifierWrapper(Classifier, modelType, curFV)
        flagsInd.append(Result)
Example #25
def pyAudioAnalysis_features(x, Fs):
    # [Fs, x] = audioBasicIO.readAudioFile(file_name)
    # stF = audioFeatureExtraction.stFeatureExtraction(x, Fs, 0.05 * Fs, 0.05 * Fs)
    mtF = audioFeatureExtraction.mtFeatureExtraction(x, Fs, 1 * Fs, 1 * Fs,
                                                     0.5 * Fs, 0.5 * Fs)
    return mtF
Example #26
def speakerDiarization(fileName, numOfSpeakers, mtSize=2.0, mtStep=0.2, stWin=0.05, LDAdim=0, PLOT=False):
    '''
    ARGUMENTS:
        - fileName:        the name of the WAV file to be analyzed
        - numOfSpeakers    the number of speakers (clusters) in the recording (<=0 for unknown)
        - mtSize (opt)     mid-term window size
        - mtStep (opt)     mid-term window step
        - stWin  (opt)     short-term window size
        - LDAdim (opt)     LDA dimension (0 for no LDA)
        - PLOT     (opt)   0 for not plotting the results, 1 for plotting
    '''
    [Fs, x] = pyAudioAnalysis.audioBasicIO.readAudioFile(fileName)
    x = pyAudioAnalysis.audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs

    #[Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel(os.path.join("data","knnSpeakerAll"))
    #[Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel(os.path.join("data","knnSpeakerFemaleMale"))
    [Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerAll")
    [Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2] = aT.loadKNNModel("pyAudioAnalysis/data/knnSpeakerFemaleMale")

    [MidTermFeatures, ShortTermFeatures] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs*stWin * 0.5))

    MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

    for i in range(MidTermFeatures.shape[1]):
        curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
        curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2
        [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
        [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
        MidTermFeatures2[0:MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
        MidTermFeatures2[MidTermFeatures.shape[0]:MidTermFeatures.shape[0]+len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1)::, i] = P2 + 0.0001

    MidTermFeatures = MidTermFeatures2    # TODO
    # SELECT FEATURES:
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20];                                                                                         # SET 0A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 99,100];                                                                                 # SET 0B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,
    #   97,98, 99,100];     # SET 0C

    iFeaturesSelect = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]                           # SET 1A
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];                                          # SET 1B
    #iFeaturesSelect = [8,9,10,11,12,13,14,15,16,17,18,19,20,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 1C

    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53];             # SET 2A
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 99,100];     # SET 2B
    #iFeaturesSelect = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53, 68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98, 99,100];     # SET 2C

    #iFeaturesSelect = range(100);                                                                                                    # SET 3
    #MidTermFeatures += numpy.random.rand(MidTermFeatures.shape[0], MidTermFeatures.shape[1]) * 0.000000010

    MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

    (MidTermFeaturesNorm, MEAN, STD) = aT.normalizeFeatures([MidTermFeatures.T])
    MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
    numOfWindows = MidTermFeatures.shape[1]

    # remove outliers:
    DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis=0)
    MDistancesAll = numpy.mean(DistancesAll)
    iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(MidTermFeatures[1,:])
    #EnergyMean = numpy.mean(MidTermFeatures[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #iNonOutLiers = numpy.nonzero(MidTermFeatures[1,:] > Thres)[0]
    #print iNonOutLiers

    perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
    MidTermFeaturesNormOr = MidTermFeaturesNorm
    MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

    # LDA dimensionality reduction:
    if LDAdim > 0:
        #[mtFeaturesToReduce, _] = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, stWin * Fs, round(Fs*stWin), round(Fs*stWin));
        # extract mid-term features with minimum step:
        mtWinRatio = int(round(mtSize / stWin))
        mtStepRatio = int(round(stWin / stWin))
        mtFeaturesToReduce = []
        numOfFeatures = len(ShortTermFeatures)
        numOfStatistics = 2
        #for i in range(numOfStatistics * numOfFeatures + 1):
        for i in range(numOfStatistics * numOfFeatures):
            mtFeaturesToReduce.append([])

        for i in range(numOfFeatures):        # for each of the short-term features:
            curPos = 0
            N = len(ShortTermFeatures[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mtWinRatio
                if N2 > N:
                    N2 = N
                curStFeatures = ShortTermFeatures[i][N1:N2]
                mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
                mtFeaturesToReduce[i+numOfFeatures].append(numpy.std(curStFeatures))
                curPos += mtStepRatio
        mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
        mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
        for i in range(mtFeaturesToReduce.shape[1]):
            curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
            curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
            [Result, P1] = aT.classifierWrapper(Classifier1, "knn", curF1)
            [Result, P2] = aT.classifierWrapper(Classifier2, "knn", curF2)
            mtFeaturesToReduce2[0:mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]:mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
            mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]+len(classNames1)::, i] = P2 + 0.0001
        mtFeaturesToReduce = mtFeaturesToReduce2
        mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
        #mtFeaturesToReduce += numpy.random.rand(mtFeaturesToReduce.shape[0], mtFeaturesToReduce.shape[1]) * 0.0000010
        (mtFeaturesToReduce, MEAN, STD) = aT.normalizeFeatures([mtFeaturesToReduce.T])
        mtFeaturesToReduce = mtFeaturesToReduce[0].T
        #DistancesAll = numpy.sum(distance.squareform(distance.pdist(mtFeaturesToReduce.T)), axis=0)
        #MDistancesAll = numpy.mean(DistancesAll)
        #iNonOutLiers2 = numpy.nonzero(DistancesAll < 3.0*MDistancesAll)[0]
        #mtFeaturesToReduce = mtFeaturesToReduce[:, iNonOutLiers2]
        Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / stWin
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * stWin / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components=LDAdim)
        clf.fit(mtFeaturesToReduce.T, Labels)
        MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

    if numOfSpeakers <= 0:
        sRange = range(2, 10)
    else:
        sRange = [numOfSpeakers]
    clsAll = []
    silAll = []
    centersAll = []
    
    for iSpeakers in sRange:        
        k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers)
        k_means.fit(MidTermFeaturesNorm.T)
        cls = k_means.labels_        
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(MidTermFeaturesNorm.T))
        clsAll.append(cls)
        centersAll.append(means)
        silA = []; silB = []
        for c in range(iSpeakers):                                # for each speaker (i.e. for each extracted cluster)
            clusterPerCent = numpy.nonzero(cls==c)[0].shape[0] / float(len(cls))
            if clusterPerCent < 0.020:
                silA.append(0.0)
                silB.append(0.0)
            else:
                MidTermFeaturesNormTemp = MidTermFeaturesNorm[:,cls==c]            # get subset of feature vectors
                Yt = distance.pdist(MidTermFeaturesNormTemp.T)                # compute average distance between samples that belong to the cluster (a values)
                silA.append(numpy.mean(Yt)*clusterPerCent)
                silBs = []
                for c2 in range(iSpeakers):                        # compute distances from samples of other clusters
                    if c2!=c:
                        clusterPerCent2 = numpy.nonzero(cls==c2)[0].shape[0] / float(len(cls))
                        MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:,cls==c2]
                        Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
                        silBs.append(numpy.mean(Yt)*(clusterPerCent+clusterPerCent2)/2.0)
                silBs = numpy.array(silBs)                            
                silB.append(min(silBs))                            # ... and keep the minimum value (i.e. the distance from the "nearest" cluster)
        silA = numpy.array(silA)
        silB = numpy.array(silB)
        sil = []
        for c in range(iSpeakers):                                # for each cluster (speaker)
            sil.append((silB[c] - silA[c]) / (max(silB[c], silA[c]) + 0.00001))        # compute silhouette

        silAll.append(numpy.mean(sil))                                # keep the AVERAGE SILHOUETTE

    #silAll = silAll * (1.0/(numpy.power(numpy.array(sRange),0.5)))
    imax = numpy.argmax(silAll)                                    # position of the maximum silhouette value
    nSpeakersFinal = sRange[imax]                                    # optimal number of clusters

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows: this is achieved by giving them the value of their nearest non-outlier window)
    cls = numpy.zeros((numOfWindows,))
    for i in range(numOfWindows):
        j = numpy.argmin(numpy.abs(i-iNonOutLiers))        
        cls[i] = clsAll[imax][j]
        
    # Post-process method 1: hmm smoothing
    for i in range(1):
        startprob, transmat, means, cov = trainHMM_computeStatistics(MidTermFeaturesNormOr, cls)
        hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")            # hmm training        
        hmm.startprob_ = startprob
        hmm.transmat_ = transmat            
        hmm.means_ = means; hmm.covars_ = cov
        cls = hmm.predict(MidTermFeaturesNormOr.T)                    
    
    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = silAll[imax]                                        # final silhouette
    classNames = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]


    # load ground-truth if available
    gtFile = fileName.replace('.wav', '.segments')                            # annotated (ground-truth) file
    if os.path.isfile(gtFile):                                    # if ground truth exists
        [segStart, segEnd, segLabels] = readSegmentGT(gtFile)                    # read GT data
        flagsGT, classNamesGT = segs2flags(segStart, segEnd, segLabels, mtStep)            # convert to flags

    if PLOT:
        fig = plt.figure()    
        if numOfSpeakers>0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(classNames))))
        ax1.axis((0, Duration, -1, len(classNames)))
        ax1.set_yticklabels(classNames)
        ax1.plot(numpy.array(range(len(cls)))*mtStep+mtStep/2.0, cls)

    if os.path.isfile(gtFile):
        if PLOT:
            ax1.plot(numpy.array(range(len(flagsGT)))*mtStep+mtStep/2.0, flagsGT, 'r')
        purityClusterMean, puritySpeakerMean = evaluateSpeakerDiarization(cls, flagsGT)
        print("{0:.1f}\t{1:.1f}".format(100*purityClusterMean, 100*puritySpeakerMean))
        if PLOT:
            plt.title("Cluster purity: {0:.1f}% - Speaker purity: {1:.1f}%".format(100*purityClusterMean, 100*puritySpeakerMean) )
    if PLOT:
        plt.xlabel("time (seconds)")
        #print sRange, silAll    
        if numOfSpeakers<=0:
            plt.subplot(212)
            plt.plot(sRange, silAll)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's silhouette")
        plt.show()
    return cls
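A hypothetical call for the diarizer above ("meeting.wav" is a placeholder); cls[i] is the speaker/cluster id of the i-th mid-term window (mtStep = 0.2 s by default):

# cls = speakerDiarization("meeting.wav", 4, LDAdim=0, PLOT=True)
# pass numOfSpeakers <= 0 to let the silhouette criterion choose among 2..9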
Example #27
def speakerDiarization(fileName, sRange = xrange(2, 10), mtSize = 2.0, mtStep = 0.2, stWin = 0.05, LDAdim = 35):
	Fs, x = audioBasicIO.readAudioFile(fileName)
	x = audioBasicIO.stereo2mono(x)
	duration = len(x) / Fs

	Classifier1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1, stStep1, computeBEAT1 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerAll'))
	Classifier2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2, stStep2, computeBEAT2 = aT.loadKNNModel(os.path.join('/home/aaiijmrtt/Code/deepspeech/res/pyAudioAnalysis/data', 'knnSpeakerFemaleMale'))

	MidTermFeatures, ShortTermFeatures = aF.mtFeatureExtraction(x, Fs, mtSize * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stWin * 0.5))
	MidTermFeatures2 = numpy.zeros((MidTermFeatures.shape[0] + len(classNames1) + len(classNames2), MidTermFeatures.shape[1]))

	for i in range(MidTermFeatures.shape[1]):
		curF1 = (MidTermFeatures[:, i] - MEAN1) / STD1
		curF2 = (MidTermFeatures[:, i] - MEAN2) / STD2

		Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
		Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)

		MidTermFeatures2[0: MidTermFeatures.shape[0], i] = MidTermFeatures[:, i]
		MidTermFeatures2[MidTermFeatures.shape[0]: MidTermFeatures.shape[0] + len(classNames1), i] = P1 + 0.0001
		MidTermFeatures2[MidTermFeatures.shape[0] + len(classNames1):, i] = P2 + 0.0001

	MidTermFeatures = MidTermFeatures2
	iFeaturesSelect = range(8, 21) + range(41, 54)
	MidTermFeatures = MidTermFeatures[iFeaturesSelect, :]

	MidTermFeaturesNorm, MEAN, STD = aT.normalizeFeatures([MidTermFeatures.T])
	MidTermFeaturesNorm = MidTermFeaturesNorm[0].T
	numOfWindows = MidTermFeatures.shape[1]

	DistancesAll = numpy.sum(distance.squareform(distance.pdist(MidTermFeaturesNorm.T)), axis = 0)
	MDistancesAll = numpy.mean(DistancesAll)
	iNonOutLiers = numpy.nonzero(DistancesAll < 1.2 * MDistancesAll)[0]

	perOutLier = (100.0 * (numOfWindows - iNonOutLiers.shape[0])) / numOfWindows
	MidTermFeaturesNormOr = MidTermFeaturesNorm
	MidTermFeaturesNorm = MidTermFeaturesNorm[:, iNonOutLiers]

	if LDAdim > 0:
		mtWinRatio, mtStepRatio, mtFeaturesToReduce, numOfFeatures, numOfStatistics = int(round(mtSize / stWin)), int(round(stWin / stWin)), list(), len(ShortTermFeatures), 2
		for i in range(numOfStatistics * numOfFeatures): mtFeaturesToReduce.append(list())

		for i in range(numOfFeatures):
			curPos = 0
			N = len(ShortTermFeatures[i])
			while (curPos < N):
				N1, N2 = curPos, curPos + mtWinRatio
				if N2 > N: N2 = N
				curStFeatures = ShortTermFeatures[i][N1: N2]
				mtFeaturesToReduce[i].append(numpy.mean(curStFeatures))
				mtFeaturesToReduce[i + numOfFeatures].append(numpy.std(curStFeatures))
				curPos += mtStepRatio

		mtFeaturesToReduce = numpy.array(mtFeaturesToReduce)
		mtFeaturesToReduce2 = numpy.zeros((mtFeaturesToReduce.shape[0] + len(classNames1) + len(classNames2), mtFeaturesToReduce.shape[1]))
		for i in range(mtFeaturesToReduce.shape[1]):
			curF1 = (mtFeaturesToReduce[:, i] - MEAN1) / STD1
			curF2 = (mtFeaturesToReduce[:, i] - MEAN2) / STD2
			Result, P1 = aT.classifierWrapper(Classifier1, 'knn', curF1)
			Result, P2 = aT.classifierWrapper(Classifier2, 'knn', curF2)
			mtFeaturesToReduce2[0: mtFeaturesToReduce.shape[0], i] = mtFeaturesToReduce[:, i]
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0]: mtFeaturesToReduce.shape[0] + len(classNames1), i] = P1 + 0.0001
			mtFeaturesToReduce2[mtFeaturesToReduce.shape[0] + len(classNames1):, i] = P2 + 0.0001

		mtFeaturesToReduce = mtFeaturesToReduce2
		mtFeaturesToReduce = mtFeaturesToReduce[iFeaturesSelect, :]
		mtFeaturesToReduce, MEAN, STD = aT.normalizeFeatures([mtFeaturesToReduce.T])
		mtFeaturesToReduce = mtFeaturesToReduce[0].T
	
		Labels = numpy.zeros((mtFeaturesToReduce.shape[1], ))
		LDAstep = 1.0
		LDAstepRatio = LDAstep / stWin

		for i in range(Labels.shape[0]): Labels[i] = int(i * stWin / LDAstepRatio)
		clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(n_components = LDAdim)
		clf.fit(mtFeaturesToReduce.T, Labels)

		MidTermFeaturesNorm = (clf.transform(MidTermFeaturesNorm.T)).T

	clsAll, silAll, centersAll = list(), list(), list()

	for iSpeakers in sRange:
		k_means = sklearn.cluster.KMeans(n_clusters = iSpeakers)
		k_means.fit(MidTermFeaturesNorm.T)
		cls = k_means.labels_
		means = k_means.cluster_centers_

		clsAll.append(cls)
		centersAll.append(means)
		silA, silB = list(), list()
		for c in range(iSpeakers):
			clusterPerCent = numpy.nonzero(cls == c)[0].shape[0] / float(len(cls))
			if clusterPerCent < 0.02:
				silA.append(0.0)
				silB.append(0.0)
			else:
				MidTermFeaturesNormTemp = MidTermFeaturesNorm[:, cls == c]
				Yt = distance.pdist(MidTermFeaturesNormTemp.T)
				silA.append(numpy.mean(Yt) * clusterPerCent)
				silBs = list()
				for c2 in range(iSpeakers):
					if c2 != c:
						clusterPerCent2 = numpy.nonzero(cls == c2)[0].shape[0] / float(len(cls))
						MidTermFeaturesNormTemp2 = MidTermFeaturesNorm[:, cls == c2]
						Yt = distance.cdist(MidTermFeaturesNormTemp.T, MidTermFeaturesNormTemp2.T)
						silBs.append(numpy.mean(Yt) * (clusterPerCent+clusterPerCent2) / 2.0)
				silBs = numpy.array(silBs)
				silB.append(min(silBs))
		silA, silB, sil = numpy.array(silA), numpy.array(silB), list()
		for c in range(iSpeakers): sil.append((silB[c] - silA[c]) / (max(silB[c],  silA[c]) + 0.00001))
		silAll.append(numpy.mean(sil))

	imax = numpy.argmax(silAll)
	nSpeakersFinal = sRange[imax]

	cls = numpy.zeros((numOfWindows, ))
	for i in range(numOfWindows):
		j = numpy.argmin(numpy.abs(i - iNonOutLiers))
		cls[i] = clsAll[imax][j]

	startprob, transmat, means, cov = trainHMM(MidTermFeaturesNormOr, cls)
	hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], 'diag')
	hmm.startprob_ = startprob
	hmm.transmat_ = transmat
	hmm.means_ = means
	hmm.covars_ = cov
	cls = hmm.predict(MidTermFeaturesNormOr.T)
	cls = scipy.signal.medfilt(cls, 13)
	cls = scipy.signal.medfilt(cls, 11)

	sil = silAll[imax]
	classNames = ['SPEAKER{0:d}'.format(c) for c in range(nSpeakersFinal)]

	return cls, classNames, duration, mtStep, silAll
Example #28
def mtFileClassification(inputFile, modelName, modelType, plotResults=False, gtFile=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - inputFile:        path of the input WAV file
        - modelName:        name of the classification model
        - modelType:        svm or knn depending on the classifier type
        - plotResults:      True if results are to be plotted using matplotlib along with a set of statistics

    RETURNS:
          - segs:           a sequence of segment's endpoints: segs[i] is the endpoint of the i-th segment (in seconds)
          - classes:        a sequence of class flags: class[i] is the class ID of the i-th segment
    '''

    if not os.path.isfile(modelName):
        print("mtFileClassificationError: input modelName not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if (modelType == 'svm') or (modelType == 'svm_rbf'):
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadSVModel(modelName)
    elif modelType == 'knn':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadKNNModel(modelName)
    elif modelType == 'randomforest':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadRandomForestModel(modelName)
    elif modelType == 'gradientboosting':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin, stStep,
            computeBEAT] = aT.loadGradientBoostingModel(modelName)
    elif modelType == 'extratrees':
        [Classifier, MEAN, STD, classNames, mtWin, mtStep, stWin,
            stStep, computeBEAT] = aT.loadExtraTreesModel(modelName)

    if computeBEAT:
        print("Model " + modelName +
              " contains long-term music features (beat etc) and cannot be used in segmentation")
        return (-1, -1, -1, -1)
    [Fs, x] = audioBasicIO.readAudioFile(inputFile)        # load input file
    if Fs == -1:                                           # could not read file
        return (-1, -1, -1, -1)
    # convert stereo (if) to mono
    x = audioBasicIO.stereo2mono(x)
    Duration = len(x) / Fs
    # mid-term feature extraction:
    [MidTermFeatures, _] = aF.mtFeatureExtraction(
        x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * stWin), round(Fs * stStep))
    flags = []
    Ps = []
    flagsInd = []
    # for each feature vector (i.e. for each fix-sized segment):
    for i in range(MidTermFeatures.shape[1]):
        # normalize current feature vector
        curFV = (MidTermFeatures[:, i] - MEAN) / STD
        [Result, P] = aT.classifierWrapper(
            Classifier, modelType, curFV)    # classify vector
        flagsInd.append(Result)
        # update class label matrix
        flags.append(classNames[int(Result)])
        # update probability matrix
        Ps.append(numpy.max(P))
    flagsInd = numpy.array(flagsInd)

    # 1-window smoothing
    for i in range(1, len(flagsInd) - 1):
        if flagsInd[i - 1] == flagsInd[i + 1]:
            flagsInd[i] = flagsInd[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mtStep)
    segs[-1] = len(x) / float(Fs)

    # Load ground-truth:
    if os.path.isfile(gtFile):
        [segStartGT, segEndGT, segLabelsGT] = readSegmentGT(gtFile)
        flagsGT, classNamesGT = segs2flags(
            segStartGT, segEndGT, segLabelsGT, mtStep)
        flagsIndGT = []
        for j, fl in enumerate(flagsGT):                    # "align" labels with GT
            if classNamesGT[flagsGT[j]] in classNames:
                flagsIndGT.append(classNames.index(classNamesGT[flagsGT[j]]))
            else:
                flagsIndGT.append(-1)
        flagsIndGT = numpy.array(flagsIndGT)
        CM = numpy.zeros((len(classNamesGT), len(classNamesGT)))
        for i in range(min(flagsInd.shape[0], flagsIndGT.shape[0])):
            CM[int(flagsIndGT[i]), int(flagsInd[i])] += 1
    else:
        CM = []
        flagsIndGT = numpy.array([])
    acc = plotSegmentationResults(
        flagsInd, flagsIndGT, classNames, mtStep, not plotResults)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flagsInd, classNamesGT, acc, CM)
    else:
        return (flagsInd, classNames, acc, CM)
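A minimal usage sketch for the function above; the model path "data/svmSM" and the file name "recording.wav" are assumptions and must point to an existing pre-trained model and WAV file:

# Hypothetical driver; model path and file name are assumptions.
flagsInd, classNames, acc, CM = mtFileClassification(
    "recording.wav", "data/svmSM", "svm", plotResults=False, gtFile="")
print(classNames)       # class labels known to the model
print(flagsInd[:10])    # class index of the first ten mid-term windows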
Example #29
def mtFileClassification(input_file,
                         model_name,
                         model_type,
                         plot_results=False,
                         gt_file=""):
    '''
    This function performs mid-term classification of an audio stream.
    Towards this end, supervised knowledge is used, i.e. a pre-trained classifier.
    ARGUMENTS:
        - input_file:        path of the input WAV file
        - model_name:        name of the classification model
        - model_type:        svm or knn depending on the classifier type
        - plot_results:      True if results are to be plotted using
                             matplotlib along with a set of statistics
        - gt_file:           path of an optional .segments ground-truth
                             file used for evaluation

    RETURNS:
          - flags_ind:      a numpy array of class indices, one per
                            mid-term window
          - class_names:    the list of class names (the ground-truth
                            names if a GT file was evaluated)
          - acc:            overall accuracy against the ground truth
                            (negative if no ground truth was provided)
          - cm:             the confusion matrix ([] if no GT file)
    '''

    if not os.path.isfile(model_name):
        print("mtFileClassificationError: input model_type not found!")
        return (-1, -1, -1, -1)
    # Load classifier:
    if model_type == "knn":
        [classifier, MEAN, STD, class_names, mt_win, mt_step, st_win, st_step, compute_beat] = \
            aT.load_model_knn(model_name)
    else:
        [
            classifier, MEAN, STD, class_names, mt_win, mt_step, st_win,
            st_step, compute_beat
        ] = aT.load_model(model_name)

    if compute_beat:
        print("Model " + model_name + " contains long-term music features "
              "(beat etc) and cannot be used in "
              "segmentation")
        return (-1, -1, -1, -1)
    [fs, x] = audioBasicIO.readAudioFile(input_file)  # load input file
    if fs == -1:  # could not read file
        return (-1, -1, -1, -1)
    x = audioBasicIO.stereo2mono(x)  # convert stereo (if) to mono
    duration = len(x) / fs
    # mid-term feature extraction:
    [mt_feats, _, _] = aF.mtFeatureExtraction(x, fs, mt_win * fs, mt_step * fs,
                                              round(fs * st_win),
                                              round(fs * st_step))
    flags = []
    Ps = []
    flags_ind = []
    for i in range(
            mt_feats.shape[1]
    ):  # for each feature vector (i.e. for each fix-sized segment):
        cur_fv = (mt_feats[:, i] -
                  MEAN) / STD  # normalize current feature vector
        [res, P] = aT.classifierWrapper(classifier, model_type,
                                        cur_fv)  # classify vector
        flags_ind.append(res)
        flags.append(class_names[int(res)])  # update class label matrix
        Ps.append(numpy.max(P))  # update probability matrix
    flags_ind = numpy.array(flags_ind)

    # 1-window smoothing
    for i in range(1, len(flags_ind) - 1):
        if flags_ind[i - 1] == flags_ind[i + 1]:
            flags_ind[i] = flags_ind[i + 1]
    # convert fix-sized flags to segments and classes
    (segs, classes) = flags2segs(flags, mt_step)
    segs[-1] = len(x) / float(fs)

    # Load ground-truth:
    if os.path.isfile(gt_file):
        [seg_start_gt, seg_end_gt, seg_l_gt] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start_gt, seg_end_gt,
                                              seg_l_gt, mt_step)
        flags_ind_gt = []
        for j, fl in enumerate(flags_gt):
            # "align" labels with GT
            if class_names_gt[flags_gt[j]] in class_names:
                flags_ind_gt.append(
                    class_names.index(class_names_gt[flags_gt[j]]))
            else:
                flags_ind_gt.append(-1)
        flags_ind_gt = numpy.array(flags_ind_gt)
        cm = numpy.zeros((len(class_names_gt), len(class_names_gt)))
        for i in range(min(flags_ind.shape[0], flags_ind_gt.shape[0])):
            cm[int(flags_ind_gt[i]), int(flags_ind[i])] += 1
    else:
        cm = []
        flags_ind_gt = numpy.array([])
    acc = plotSegmentationResults(flags_ind, flags_ind_gt, class_names,
                                  mt_step, not plot_results)
    if acc >= 0:
        print("Overall Accuracy: {0:.3f}".format(acc))
        return (flags_ind, class_names_gt, acc, cm)
    else:
        return (flags_ind, class_names, acc, cm)
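Both versions above rely on flags2segs to merge per-window flags into contiguous segments. For reference, a minimal re-implementation sketch of that behavior (an illustration, not the library source):

import numpy

def flags2segs_sketch(flags, window):
    # Merge runs of identical per-window labels into [start, end] segments
    # (in seconds, given the window step) plus one class label per segment.
    segs, classes = [], []
    cur_start = 0
    for i in range(1, len(flags) + 1):
        if i == len(flags) or flags[i] != flags[cur_start]:
            segs.append([cur_start * window, i * window])
            classes.append(flags[cur_start])
            cur_start = i
    return numpy.array(segs), classes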
Example #30
def emotion_from_speech(Fs, x, log, model_name="pyAudioAnalysis/pyAudioAnalysis/data/svmSpeechEmotion", model_type="svm"):
    """

    :param Fs: frame rate
    :param x: data
    :param model_name:
    :param model_type:
    :param log:
    :return:
    """
    regression_models = glob.glob(model_name + "_*")
    regression_models2 = []
    for r in regression_models:
        if r[-5::] != "MEANS":
            regression_models2.append(r)
    regression_models = regression_models2
    regression_names = []
    for r in regression_models:
        regression_names.append(r[r.rfind("_")+1::])

    emotion = {"valence": None, "arousal":None}
    # Feature extraction
    x = np.frombuffer(x, np.int16)  # decode raw int16 PCM (np.fromstring is deprecated)
    if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
        [_, _, _, mt_win, mt_step, st_win, st_step, compute_beat] = aT.load_model(regression_models[0], True)
    else:
        return emotion

    [mt_features, s, _] = aF.mtFeatureExtraction(x, Fs, mt_win * Fs, mt_step * Fs, round(Fs * st_win), round(Fs * st_step))
    mt_features = mt_features.mean(axis=1)        # long term averaging of mid-term statistics
    if compute_beat:
        [beat, beatConf] = aF.beatExtraction(s, st_step)
        mt_features = np.append(mt_features, beat)
        mt_features = np.append(mt_features, beatConf)

    # Regression
    R = []
    for ir, r in enumerate(regression_models):
        if not os.path.isfile(r):
            print("fileClassification: input model_name not found!")
            return emotion
        if model_type == 'svm' or model_type == "svm_rbf" or model_type == 'randomforest':
            [model, MEAN, STD, mt_win, mt_step, st_win, st_step, compute_beat] = aT.load_model(r, True)
        curFV = (mt_features - MEAN) / STD                  # normalization
        R.append(aT.regressionWrapper(model, model_type, curFV))

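    # NOTE: this assumes the glob above returned the valence model first
    # and the arousal model second; regression_names is not consulted.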
    if R[0] > 1:
        log.warning("Valence > 1")
        emotion["valence"] = 1
    elif R[0] < -1:
        log.warning("Valence < -1")
        emotion["valence"] = -1
    else:
        emotion["valence"] = R[0]

    if R[1] > 1:
        log.warning("Arousal > 1")
        emotion["arousal"] = 1
    elif R[1] < -1:
        log.warning("Arousal < -1")
        emotion["arousal"] = -1
    else:
        emotion["arousal"] = R[1]

    return emotion
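A hypothetical driver for emotion_from_speech; the WAV path and logger name are assumptions, and the file is assumed to be mono 16-bit PCM since the function decodes the raw bytes as int16:

import logging
import wave

log = logging.getLogger("emotion")
with wave.open("utterance.wav", "rb") as wf:
    frames = wf.readframes(wf.getnframes())   # raw int16 PCM bytes
    result = emotion_from_speech(wf.getframerate(), frames, log)
print(result)   # e.g. {"valence": 0.12, "arousal": -0.34}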
Example #31
def speakerDiarization(filename,
                       n_speakers,
                       mt_size=2.0,
                       mt_step=0.2,
                       st_win=0.05,
                       lda_dim=35,
                       plot_res=False):
    '''
    ARGUMENTS:
        - filename:        the name of the WAV file to be analyzed
        - n_speakers:      the number of speakers (clusters) in the recording (<=0 for unknown)
        - mt_size (opt):   mid-term window size
        - mt_step (opt):   mid-term window step
        - st_win  (opt):   short-term window size
        - lda_dim (opt):   LDA dimension (0 for no LDA)
        - plot_res (opt):  False for not plotting the results, True for plotting
    '''
    [fs, x] = audioBasicIO.readAudioFile(filename)
    x = audioBasicIO.stereo2mono(x)
    duration = len(x) / fs

    [
        classifier_1, MEAN1, STD1, classNames1, mtWin1, mtStep1, stWin1,
        stStep1, computeBEAT1
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerAll"))
    [
        classifier_2, MEAN2, STD2, classNames2, mtWin2, mtStep2, stWin2,
        stStep2, computeBEAT2
    ] = aT.load_model_knn(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
                     "knnSpeakerFemaleMale"))

    [mt_feats, st_feats, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs,
                                                     mt_step * fs,
                                                     round(fs * st_win),
                                                     round(fs * st_win * 0.5))

    MidTermFeatures2 = numpy.zeros(
        (mt_feats.shape[0] + len(classNames1) + len(classNames2),
         mt_feats.shape[1]))

    for i in range(mt_feats.shape[1]):
        cur_f1 = (mt_feats[:, i] - MEAN1) / STD1
        cur_f2 = (mt_feats[:, i] - MEAN2) / STD2
        [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
        [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
        MidTermFeatures2[0:mt_feats.shape[0], i] = mt_feats[:, i]
        MidTermFeatures2[mt_feats.shape[0]:mt_feats.shape[0] +
                         len(classNames1), i] = P1 + 0.0001
        MidTermFeatures2[mt_feats.shape[0] + len(classNames1)::,
                         i] = P2 + 0.0001

    mt_feats = MidTermFeatures2  # TODO
    iFeaturesSelect = [
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41, 42, 43, 44, 45,
        46, 47, 48, 49, 50, 51, 52, 53
    ]

    mt_feats = mt_feats[iFeaturesSelect, :]

    (mt_feats_norm, MEAN, STD) = aT.normalizeFeatures([mt_feats.T])
    mt_feats_norm = mt_feats_norm[0].T
    n_wins = mt_feats.shape[1]

    # remove outliers:
    dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_norm.T)),
                         axis=0)
    m_dist_all = numpy.mean(dist_all)
    i_non_outliers = numpy.nonzero(dist_all < 1.2 * m_dist_all)[0]

    # TODO: Combine energy threshold for outlier removal:
    #EnergyMin = numpy.min(mt_feats[1,:])
    #EnergyMean = numpy.mean(mt_feats[1,:])
    #Thres = (1.5*EnergyMin + 0.5*EnergyMean) / 2.0
    #i_non_outliers = numpy.nonzero(mt_feats[1,:] > Thres)[0]
    #print i_non_outliers

    perOutLier = (100.0 * (n_wins - i_non_outliers.shape[0])) / n_wins
    mt_feats_norm_or = mt_feats_norm
    mt_feats_norm = mt_feats_norm[:, i_non_outliers]

    # LDA dimensionality reduction:
    if lda_dim > 0:
        #[mt_feats_to_red, _, _] = aF.mtFeatureExtraction(x, fs, mt_size * fs, st_win * fs, round(fs*st_win), round(fs*st_win));
        # extract mid-term features with minimum step:
        mt_win_ratio = int(round(mt_size / st_win))
        mt_step_ratio = int(round(st_win / st_win))  # always 1: the step equals the short-term window
        mt_feats_to_red = []
        num_of_features = len(st_feats)
        num_of_stats = 2
        #for i in range(num_of_stats * num_of_features + 1):
        for i in range(num_of_stats * num_of_features):
            mt_feats_to_red.append([])

        for i in range(
                num_of_features):  # for each of the short-term features:
            curPos = 0
            N = len(st_feats[i])
            while (curPos < N):
                N1 = curPos
                N2 = curPos + mt_win_ratio
                if N2 > N:
                    N2 = N
                curStFeatures = st_feats[i][N1:N2]
                mt_feats_to_red[i].append(numpy.mean(curStFeatures))
                mt_feats_to_red[i + num_of_features].append(
                    numpy.std(curStFeatures))
                curPos += mt_step_ratio
        mt_feats_to_red = numpy.array(mt_feats_to_red)
        mt_feats_to_red_2 = numpy.zeros(
            (mt_feats_to_red.shape[0] + len(classNames1) + len(classNames2),
             mt_feats_to_red.shape[1]))
        for i in range(mt_feats_to_red.shape[1]):
            cur_f1 = (mt_feats_to_red[:, i] - MEAN1) / STD1
            cur_f2 = (mt_feats_to_red[:, i] - MEAN2) / STD2
            [res, P1] = aT.classifierWrapper(classifier_1, "knn", cur_f1)
            [res, P2] = aT.classifierWrapper(classifier_2, "knn", cur_f2)
            mt_feats_to_red_2[0:mt_feats_to_red.shape[0],
                              i] = mt_feats_to_red[:, i]
            mt_feats_to_red_2[
                mt_feats_to_red.shape[0]:mt_feats_to_red.shape[0] +
                len(classNames1), i] = P1 + 0.0001
            mt_feats_to_red_2[mt_feats_to_red.shape[0] + len(classNames1)::,
                              i] = P2 + 0.0001
        mt_feats_to_red = mt_feats_to_red_2
        mt_feats_to_red = mt_feats_to_red[iFeaturesSelect, :]
        #mt_feats_to_red += numpy.random.rand(mt_feats_to_red.shape[0], mt_feats_to_red.shape[1]) * 0.0000010
        (mt_feats_to_red, MEAN,
         STD) = aT.normalizeFeatures([mt_feats_to_red.T])
        mt_feats_to_red = mt_feats_to_red[0].T
        #dist_all = numpy.sum(distance.squareform(distance.pdist(mt_feats_to_red.T)), axis=0)
        #m_dist_all = numpy.mean(dist_all)
        #iNonOutLiers2 = numpy.nonzero(dist_all < 3.0*m_dist_all)[0]
        #mt_feats_to_red = mt_feats_to_red[:, iNonOutLiers2]
        Labels = numpy.zeros((mt_feats_to_red.shape[1], ))
        LDAstep = 1.0
        LDAstepRatio = LDAstep / st_win
        #print LDAstep, LDAstepRatio
        for i in range(Labels.shape[0]):
            Labels[i] = int(i * st_win / LDAstepRatio)
        clf = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=lda_dim)
        clf.fit(mt_feats_to_red.T, Labels)
        mt_feats_norm = (clf.transform(mt_feats_norm.T)).T

    if n_speakers <= 0:
        s_range = range(2, 10)
    else:
        s_range = [n_speakers]
    clsAll = []
    sil_all = []
    centersAll = []

    for iSpeakers in s_range:
        k_means = sklearn.cluster.KMeans(n_clusters=iSpeakers)
        k_means.fit(mt_feats_norm.T)
        cls = k_means.labels_
        means = k_means.cluster_centers_

        # Y = distance.squareform(distance.pdist(mt_feats_norm.T))
        clsAll.append(cls)
        centersAll.append(means)
        sil_1 = []
        sil_2 = []
        for c in range(iSpeakers):
            # for each speaker (i.e. for each extracted cluster)
            clust_per_cent = numpy.nonzero(cls == c)[0].shape[0] / \
                             float(len(cls))
            if clust_per_cent < 0.020:
                sil_1.append(0.0)
                sil_2.append(0.0)
            else:
                # get subset of feature vectors
                mt_feats_norm_temp = mt_feats_norm[:, cls == c]
                # compute average distance between samples
                # that belong to the cluster (a values)
                Yt = distance.pdist(mt_feats_norm_temp.T)
                sil_1.append(numpy.mean(Yt) * clust_per_cent)
                silBs = []
                for c2 in range(iSpeakers):
                    # compute distances from samples of other clusters
                    if c2 != c:
                        clust_per_cent_2 = numpy.nonzero(cls == c2)[0].shape[0] /\
                                           float(len(cls))
                        MidTermFeaturesNormTemp2 = mt_feats_norm[:, cls == c2]
                        Yt = distance.cdist(mt_feats_norm_temp.T,
                                            MidTermFeaturesNormTemp2.T)
                        silBs.append(
                            numpy.mean(Yt) *
                            (clust_per_cent + clust_per_cent_2) / 2.0)
                silBs = numpy.array(silBs)
                # ... and keep the minimum value (i.e.
                # the distance from the "nearest" cluster)
                sil_2.append(min(silBs))
        sil_1 = numpy.array(sil_1)
        sil_2 = numpy.array(sil_2)
        sil = []
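        # Silhouette width per cluster: s = (b - a) / max(a, b), where a is
        # the mean intra-cluster distance (sil_1) and b the mean distance to
        # the nearest other cluster (sil_2).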
        for c in range(iSpeakers):
            # for each cluster (speaker) compute silhouette
            sil.append(
                (sil_2[c] - sil_1[c]) / (max(sil_2[c], sil_1[c]) + 0.00001))
        # keep the AVERAGE SILHOUETTE
        sil_all.append(numpy.mean(sil))

    imax = numpy.argmax(sil_all)
    # optimal number of clusters
    nSpeakersFinal = s_range[imax]

    # generate the final set of cluster labels
    # (important: need to retrieve the outlier windows:
    # this is achieved by giving them the value of their
    # nearest non-outlier window)
    cls = numpy.zeros((n_wins, ))
    for i in range(n_wins):
        j = numpy.argmin(numpy.abs(i - i_non_outliers))
        cls[i] = clsAll[imax][j]

    # Post-process method 1: hmm smoothing
    for i in range(1):
        # hmm training
        start_prob, transmat, means, cov = \
            trainHMM_computeStatistics(mt_feats_norm_or, cls)
        hmm = hmmlearn.hmm.GaussianHMM(start_prob.shape[0], "diag")
        hmm.startprob_ = start_prob
        hmm.transmat_ = transmat
        hmm.means_ = means
        hmm.covars_ = cov
        cls = hmm.predict(mt_feats_norm_or.T)

    # Post-process method 2: median filtering:
    cls = scipy.signal.medfilt(cls, 13)
    cls = scipy.signal.medfilt(cls, 11)

    sil = sil_all[imax]
    class_names = ["speaker{0:d}".format(c) for c in range(nSpeakersFinal)]

    # load ground-truth if available
    gt_file = filename.replace('.wav', '.segments')
    # if a ground-truth file exists
    if os.path.isfile(gt_file):
        [seg_start, seg_end, seg_labs] = readSegmentGT(gt_file)
        flags_gt, class_names_gt = segs2flags(seg_start, seg_end, seg_labs,
                                              mt_step)

    if plot_res:
        fig = plt.figure()
        if n_speakers > 0:
            ax1 = fig.add_subplot(111)
        else:
            ax1 = fig.add_subplot(211)
        ax1.set_yticks(numpy.array(range(len(class_names))))
        ax1.axis((0, duration, -1, len(class_names)))
        ax1.set_yticklabels(class_names)
        ax1.plot(numpy.array(range(len(cls))) * mt_step + mt_step / 2.0, cls)

    if os.path.isfile(gt_file):
        if plot_res:
            ax1.plot(
                numpy.array(range(len(flags_gt))) * mt_step + mt_step / 2.0,
                flags_gt, 'r')
        purity_cluster_m, purity_speaker_m = \
            evaluateSpeakerDiarization(cls, flags_gt)
        print("{0:.1f}\t{1:.1f}".format(100 * purity_cluster_m,
                                        100 * purity_speaker_m))
        if plot_res:
            plt.title("Cluster purity: {0:.1f}% - "
                      "Speaker purity: {1:.1f}%".format(
                          100 * purity_cluster_m, 100 * purity_speaker_m))
    if plot_res:
        plt.xlabel("time (seconds)")
        #print s_range, sil_all
        if n_speakers <= 0:
            plt.subplot(212)
            plt.plot(s_range, sil_all)
            plt.xlabel("number of clusters")
            plt.ylabel("average clustering's sillouette")
        plt.show()
    return cls
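A hypothetical call to the function above; the file name is an assumption. With n_speakers <= 0 the silhouette criterion selects the number of clusters in the range 2..9:

cls = speakerDiarization("meeting.wav", n_speakers=0, plot_res=False)
print(cls[:20])   # per-window speaker ids after HMM and median smoothing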
Example #32
def trainHMM_fromDir(dirPath, hmmModelName, mtWin, mtStep):
    '''
    This function trains an HMM model for segmentation-classification using a
    directory where WAV files and .segments (ground-truth) files are stored.
    ARGUMENTS:
     - dirPath:         the path of the data directory
     - hmmModelName:    the name of the HMM model to be stored
     - mtWin:           mid-term window size
     - mtStep:          mid-term window step
    RETURNS:
     - hmm:             the resulting HMM object
     - classNames:      the list of class names

    After training, hmm and classNames, along with the mtWin and mtStep values, are stored in the hmmModelName file
    '''

    flagsAll = numpy.array([])
    initializedFall = False
    classesAll = []
    # for each WAV file
    for i, f in enumerate(glob.glob(dirPath + os.sep + '*.wav')):
        wavFile = f
        # path of the corresponding annotation (ground-truth) file
        gtFile = f.replace('.wav', '.segments')
        # if current WAV file does not have annotation -> skip
        if not os.path.isfile(gtFile):
            continue
        [segStart, segEnd, segLabels] = readSegmentGT(
            gtFile)                   # read GT data
        flags, classNames = segs2flags(
            segStart, segEnd, segLabels, mtStep)     # convert to flags
        # update classnames:
        for c in classNames:
            if c not in classesAll:
                classesAll.append(c)
        [Fs, x] = audioBasicIO.readAudioFile(
            wavFile)                           # read audio data
        [F, _] = aF.mtFeatureExtraction(
            x, Fs, mtWin * Fs, mtStep * Fs, round(Fs * 0.050), round(Fs * 0.050))     # feature extraction

        lenF = F.shape[1]
        lenL = len(flags)
        MIN = min(lenF, lenL)
        F = F[:, 0:MIN]
        flags = flags[0:MIN]

        flagsNew = []
        for j, fl in enumerate(flags):      # append features and labels
            flagsNew.append(classesAll.index(classNames[flags[j]]))

        flagsAll = numpy.append(flagsAll, numpy.array(flagsNew))

        if not initializedFall:
            Fall = F
            initializedFall = True
        else:
            Fall = numpy.concatenate((Fall, F), axis=1)
    startprob, transmat, means, cov = trainHMM_computeStatistics(
        Fall, flagsAll)        # compute HMM statistics
    hmm = hmmlearn.hmm.GaussianHMM(startprob.shape[0], "diag")      # train HMM
    hmm.startprob_ = startprob
    hmm.transmat_ = transmat
    hmm.means_ = means
    hmm.covars_ = cov

    fo = open(hmmModelName, "wb")   # save HMM model
    cPickle.dump(hmm, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(classesAll, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtWin, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(mtStep, fo, protocol=cPickle.HIGHEST_PROTOCOL)
    fo.close()

    return hmm, classesAll
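A hypothetical training/reload sketch for trainHMM_fromDir; "data/radio" and "hmmRadio" are assumed paths, the directory must contain *.wav files with matching *.segments ground-truth files, and cPickle refers to whatever pickle module the snippet above imports. The pickled objects are read back in the same order they were dumped:

hmm, class_names = trainHMM_fromDir("data/radio", "hmmRadio", mtWin=1.0, mtStep=1.0)

with open("hmmRadio", "rb") as fi:
    hmm_loaded = cPickle.load(fi)       # the GaussianHMM object
    classes_loaded = cPickle.load(fi)   # list of class names
    win_loaded = cPickle.load(fi)       # mid-term window size
    step_loaded = cPickle.load(fi)      # mid-term window step
print(classes_loaded, win_loaded, step_loaded)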