Example #1
def shortTermAnalyses(sound_type, filename, patient_name):
    fs, signal = wavfile.read(filename)
    window.refresh()
    if sound_type == 'speech':
        s = audioSegmentation.silence_removal(signal, fs, 0.5, 0.1, weight=0.2)
        signal2 = np.concatenate([signal[int((i[0]+0.1)*fs):int((i[1]+0.1)*fs)] for i in s])
        wavfile.write("database/{0}/speechFileSegmented.wav".format(patient_name), fs, signal2)
        s1 = ShortTermFeatures.feature_extraction(signal[:, 0], fs, 0.05*fs, 0.025*fs, deltas=True)[0][:8]
        window.refresh()
        s2 = ShortTermFeatures.feature_extraction(signal[:, 1], fs, 0.05*fs, 0.025*fs, deltas=True)[0][:8]
        window.refresh()
        filename = filename[:-4] + "1.wav"
        fs, signal = wavfile.read(filename)
        s = audioSegmentation.silence_removal(signal, fs, 0.5, 0.1, weight=0.2)
        signal2 = np.concatenate([signal[int((i[0]+0.1)*fs):int((i[1]+0.1)*fs)] for i in s])
        wavfile.write("database/{0}/speechFileSegmented1.wav".format(patient_name), fs, signal2)
        s3 = ShortTermFeatures.feature_extraction(signal[:, 0], fs, 0.05*fs, 0.025*fs, deltas=True)[0][:8]
        window.refresh()
        s4 = ShortTermFeatures.feature_extraction(signal[:, 1], fs, 0.05*fs, 0.025*fs, deltas=True)[0][:8]
        window.refresh()
        n = min(s1.shape[0], s2.shape[0], s3.shape[0], s4.shape[0])
        m = min(s1.shape[1], s2.shape[1], s3.shape[1], s4.shape[1])
        return (s1[:n, :m]+s2[:n, :m]+s3[:n, :m]+s4[:n, :m])/4
    else:
        return ShortTermFeatures.feature_extraction(signal, fs, 0.05*fs, 0.025*fs, deltas=True)[0][:8]
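For reference, a minimal standalone sketch of the trim-and-rejoin pattern used above; the file names and the window, step and weight values are illustrative assumptions, not values from this project.

# Hedged sketch: remove silent regions and rejoin the remaining audio.
import numpy as np
from scipy.io import wavfile
from pyAudioAnalysis import audioSegmentation

fs, signal = wavfile.read("input.wav")  # assumed input file
segments = audioSegmentation.silence_removal(signal, fs, 0.5, 0.1, weight=0.2)
voiced = np.concatenate([signal[int(s[0] * fs):int(s[1] * fs)] for s in segments])
wavfile.write("input_trimmed.wav", fs, voiced)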
Example #2
def silence_detection(speech):
    '''
    Returns a list of lists containing the start and end time stamps of speech segments,
    e.g. [[0.05, 0.07], [0.09, 1.12]] --> speech exists in the intervals 0.05-0.07 and
    0.09-1.12.
    Utilises the semi-supervised silence removal in pyAudioAnalysis to determine
    speech activity.

    Parameters:
    speech (array): audio samples, e.g. as read by bob.io.audio.reader

    Returns:
    intervals (list): list of lists containing the intervals (in samples) where speech exists
    '''
    config = configuration()['speech_params']
    intervals = aS.silence_removal(speech,
                                   config['sample_rate'].get(),
                                   config['frame_length'].get(),
                                   config['frame_overlap'].get(),
                                   smooth_window=config['smooth_window'].get(),
                                   weight=config['sil_rem_wt'].get(),
                                   plot=config['sil_rem_plt'].get())

    for i in range(len(intervals)):
        intervals[i] = [
            int(stamp * config['sample_rate'].get()) for stamp in intervals[i]
        ]

    return intervals
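The helper above assumes a project-level configuration() whose entries expose a .get() accessor. A minimal stand-in, with purely illustrative parameter values, might look like this:

# Hedged sketch of the configuration shape silence_detection() expects;
# the values below are assumptions, not the project's real settings.
class _Param:
    def __init__(self, value):
        self._value = value

    def get(self):
        return self._value

def configuration():
    return {
        'speech_params': {
            'sample_rate': _Param(16000),
            'frame_length': _Param(0.025),
            'frame_overlap': _Param(0.010),
            'smooth_window': _Param(1.0),
            'sil_rem_wt': _Param(0.3),
            'sil_rem_plt': _Param(False),
        }
    }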
Example #3
File: vad_simple.py Project: shammur/pyVAD
def silenceRemovalWrapper(inputFile, outFolder, smoothingWindow, weight):
    if not os.path.isfile(inputFile):
        raise Exception("Input audio file not found!")

    [fs, x] = audioBasicIO.read_audio_file(inputFile)
    segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05,
                                       smoothingWindow, weight, True)
    for i, s in enumerate(segmentLimits):
        strOut = outFolder+"{0:s}_{1:.3f}-{2:.3f}.wav".format(inputFile[0:-4], s[0], s[1])
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
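A hedged usage sketch for the wrapper above; the file and folder names and the parameter values are assumptions (the wrapper itself relies on os, scipy.io.wavfile, audioBasicIO and audioSegmentation as aS being imported).

silenceRemovalWrapper("recording.wav", "segments/", smoothingWindow=1.0, weight=0.3)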
Example #4
def silence_detection(utter):
    intervals = aS.silence_removal(utter, 
                                   hp.data.sr, 
                                   0.025, 
                                   0.010, 
                                   smooth_window = 1.0, 
                                   weight = 0.3, 
                                   plot = False)

    for i in range(len(intervals)):
        intervals[i] = [int(stamp*hp.data.sr) for stamp in intervals[i]]

    return intervals
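This variant reads its sample rate from a project hyperparameter object hp. A minimal usage sketch with a stand-in hp; the sample rate and file name are assumptions.

from types import SimpleNamespace
from scipy.io import wavfile

hp = SimpleNamespace(data=SimpleNamespace(sr=16000))  # stand-in for the project's hp
fs, utter = wavfile.read("utterance.wav")             # assumed to be sampled at hp.data.sr
print(silence_detection(utter))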
Example #5
File: utils.py Project: DhivehiAI/DV-Subs
def extractAudio(input_file, output_dir, smoothing_window=1.0, weight=0.1):

    print("Detecting silences...")
    [fs, x] = read_audio_file(input_file)
    segmentLimits = silence_removal(x, fs, 0.05, 0.05, smoothing_window,
                                    weight)
    ifile_name = os.path.basename(input_file)

    os.makedirs(output_dir, exist_ok=True)
    files = []

    print("Writing segments...")
    for i, s in enumerate(segmentLimits):
        strOut = "{0:s}_{1:.3f}-{2:.3f}.wav".format(ifile_name, s[0], s[1])
        strOut = os.path.join(output_dir, strOut)
        wavfile.write(strOut, fs, x[int(fs * s[0]):int(fs * s[1])])
        files.append(strOut)

    return files
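A hedged usage sketch; the input file and output directory are assumptions.

segment_files = extractAudio("episode.wav", "segments", smoothing_window=1.0, weight=0.1)
for f in segment_files:
    print(f)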
Example #6
def segment(path, file):
    Fs, x = audioBasicIO.read_audio_file(path + file)

    segments = audioSegmentation.silence_removal(x,
                                                 Fs,
                                                 0.020,
                                                 0.020,
                                                 smooth_window=1.0,
                                                 weight=0.3,
                                                 plot=False)
    print(segments)

    X = x[0:0]
    for i, segment in enumerate(segments):
        print(i)
        t0 = int(Fs * segment[0])
        t1 = int(Fs * segment[1])
        # wavfile.write( path + 'segments/' + str(i)  + '.wav', Fs, x[t0:t1])
        X = np.concatenate((X, x[t0:t1]), axis=0)

    wavfile.write(path + 'seg_pyAudioAnalysis.wav', Fs, X)
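A hedged usage sketch; the directory layout and file name are assumptions.

segment("audio/", "match.wav")  # writes audio/seg_pyAudioAnalysis.wav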
Example #7
def main(argv):
    if argv[1] == "-shortTerm":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            F = ShortTermFeatures.feature_extraction(
                x, Fs, 0.050 * Fs, 0.050 * Fs)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print "short-term feature extraction: {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-classifyFile":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aT.file_classification("diarizationExample.wav", "svmSM", "svm")
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print "Mid-term feature extraction + classification \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-mtClassify":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [flagsInd, classesAll,
             acc] = aS.mid_term_file_classification("diarizationExample.wav",
                                                    "svmSM", "svm", False, '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print "Fix-sized classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-hmmSegmentation":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            aS.hmm_segmentation('diarizationExample.wav', 'hmmRadioSM', False,
                                '')
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print "HMM-based classification - segmentation \t {0:.1f} x realtime".format(
                perTime1)
    elif argv[1] == "-silenceRemoval":
        for i in range(nExp):
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration = x.shape[0] / float(Fs)
            t1 = time.time()
            [Fs, x] = audioBasicIO.read_audio_file("diarizationExample.wav")
            segments = aS.silence_removal(x,
                                          Fs,
                                          0.050,
                                          0.050,
                                          smooth_window=1.0,
                                          weight=0.3,
                                          plot=False)
            t2 = time.time()
            perTime1 = duration / (t2 - t1)
            print "Silence removal \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-thumbnailing":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("scottish.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            [A1, A2, B1, B2,
             Smatrix] = aS.music_thumbnailing(x1, Fs1, 1.0, 1.0,
                                              15.0)  # find thumbnail endpoints
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print "Thumbnail \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-noLDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav",
                                   4,
                                   LDAdim=0,
                                   PLOT=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
    elif argv[1] == "-diarization-LDA":
        for i in range(nExp):
            [Fs1, x1] = audioBasicIO.read_audio_file("diarizationExample.wav")
            duration1 = x1.shape[0] / float(Fs1)
            t1 = time.time()
            aS.speaker_diarization("diarizationExample.wav", 4, PLOT=False)
            t2 = time.time()
            perTime1 = duration1 / (t2 - t1)
            print "Diarization \t {0:.1f} x realtime".format(perTime1)
Example #8
File: vad_simple.py Project: shammur/pyVAD
def vadFolderWrapperMergedByTh(inputFolder, outFolder, smoothingWindow, weight, model_name, threshold):

    if not os.path.isfile(model_name):
        print("fileClassification: input model_name not found!")



    classifier, mean, std, classes, mid_window, mid_step, short_window, \
    short_step, compute_beat = aT.load_model(model_name)

    types = ('*.wav', '*.mp3')

    wavFilesList = []
    for files in types:
        print(inputFolder + files)
        wavFilesList.extend(glob.glob((inputFolder + files)))
    wavFilesList = sorted(wavFilesList)
    if len(wavFilesList) == 0:
        print("No WAV files found!")
        return
    for wavFile in wavFilesList:
        # print(wavFile)
        if not os.path.isfile(wavFile):
            raise Exception("Input audio file not found!")
        base = os.path.splitext(os.path.basename(wavFile))[0]
        folder = outFolder + base + '/'
        if not os.path.exists(folder):
            os.makedirs(folder)
        segfile = open(os.path.join(folder, 'segments'), 'w+')
        segfile2 = open(os.path.join(folder, 'segments_details'), 'w+')

        stack = deque()

        [fs, x] = audioBasicIO.read_audio_file(wavFile)
        segmentLimits = aS.silence_removal(x, fs, 0.05, 0.05, smoothingWindow, weight, False)
        merge = True

        for i, st in enumerate(segmentLimits):


            signal = audioBasicIO.stereo_to_mono(x[int(fs * st[0]):int(fs * st[1])])
            # print('in here', len(segmentLimits), st[0],st[1],classes, type(st))
            if fs == 0:
                continue
                # audio file IO problem
                # return -1, -1, -1

            if signal.shape[0] / float(fs) < mid_window:
                mid_window = signal.shape[0] / float(fs)

            # feature extraction:
            mid_features, s, _ = \
                aF.mid_feature_extraction(signal, fs,
                                          mid_window * fs,
                                          mid_step * fs,
                                          round(fs * short_window),
                                          round(fs * short_step))
            # long term averaging of mid-term statistics
            mid_features = mid_features.mean(axis=1)
            if compute_beat:
                # print('in here3')
                beat, beat_conf = aF.beat_extraction(s, short_step)
                mid_features = np.append(mid_features, beat)
                mid_features = np.append(mid_features, beat_conf)
            feature_vector = (mid_features - mean) / std  # normalization
            # class_id = -1
            # probability = -1
            class_id = classifier.predict(feature_vector.reshape(1, -1))[0]
            # probability = classifier.predict_proba(feature_vector.reshape(1, -1))[0]
            print(class_id, type(class_id))
            label = classes[int(class_id)]

            print(label)
            if label == 'speech':
                dur = st[1] - st[0]
                # print('in hereas')
                if merge:
                    seg_prev = []
                    # print('in hereasq12')
                    if len(stack) > 0:
                        seg_prev = stack.pop()


                    if len(seg_prev) > 0 and st[1] - seg_prev[0] > threshold:
                        # print('in hereas4')
                        seg = [st[0], st[1], label]
                        stack.append(seg_prev)
                        stack.append(seg)
                        merge = True
                    elif len(seg_prev) > 0:
                        # print('in hereasqw345')
                        seg = [seg_prev[0], st[1], label]
                        stack.append(seg)
                        merge = True
                    else:
                        seg = [st[0], st[1], label]
                        stack.append(seg)
                        merge = True
                else:
                    # print('in hereas2')
                    seg = [st[0], st[1], label]
                    stack.append(seg)
                    merge = True

            else:
                merge = False
            print(i, merge)
        # print(len(segmentLimits), len(stack))
        for sn in stack:
            # print(type(wavFile), sn[0].shape, sn[1].shape, type(sn[0]), type(sn[1]))

            strName = base + "_" + "{:.3f}".format(sn[0]) + "_" + "{:.3f}".format(sn[1])
            if sn[2] == 'speech':
                strOut = folder + base + "_" + "{:.3f}".format(sn[0]) + "_" + "{:.3f}".format(sn[1]) + ".wav"

                wavfile.write(strOut, fs, x[int(fs * sn[0]):int(fs * sn[1])])
                segfile.write(strName + ' ' + base + ' ' + "{:.3f}".format(sn[0]) + ' ' + "{:.3f}".format(sn[1]) + "\n")
            segfile2.write(strName + ' ' + "{:.3f}".format(sn[0]) + ' ' + "{:.3f}".format(sn[1]) + ' ' + sn[2] + "\n")
        segfile.close()
        segfile2.close()
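A hedged usage sketch for the folder wrapper above; the folder paths, model name and threshold are illustrative assumptions (the model is expected to be one trained with pyAudioAnalysis's audioTrainTest).

vadFolderWrapperMergedByTh("recordings/", "vad_out/",
                           smoothingWindow=1.0, weight=0.3,
                           model_name="models/svm_speech", threshold=10.0)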
Example #9
from pyAudioAnalysis import audioBasicIO as aIO
from pyAudioAnalysis import audioSegmentation as aS
audio_path = "../project_files/project_dataset/audio/soccer.wav"
[Fs, x] = aIO.read_audio_file(audio_path)
segments = aS.silence_removal(x,
                              Fs,
                              0.020,
                              0.020,
                              smooth_window=0.1,
                              weight=0.6,
                              plot=False)

for i in range(len(segments)):
    print("[" + str(segments[i][0]) + "," + str(segments[i][1]) + "]")
    segments[i][0] = segments[i][0] // 30
    segments[i][1] = segments[i][1] // 30
Example #10
def audio_based_feature_extraction(input_file,
                                   models_directory,
                                   raudio_features_discard=0,
                                   pyaudio_num_features="all",
                                   mode=0,
                                   pyaudio_params=None):
    """
        Export all features for a wav file (silence based + classifiers based)
        :param input_file: the audio file
        :param models_directory: the directory which contains all trained
        classifiers (models' files + MEANS files)
        :return: features , feature_names , metadata
    """
    # A. silence features
    fs, dur = get_wav_properties(input_file)
    fs, x = aio.read_audio_file(input_file)

    print(input_file)
    print(len(x) / fs)
    # get the silence estimates using pyAudioAnalysis semi-supervised approach
    # for different windows and steps
    if dur < 6.2:
        seg_limits_short = [[0, dur]]
        seg_limits_long = [[0, dur]]
    else:
        seg_limits_short = aS.silence_removal(x, fs, 0.5, 0.25, 0.5)
        seg_limits_long = aS.silence_removal(x, fs, 1.0, 0.25, 0.5)

    # short windows
    silence_features_short, number_of_pauses_short, total_speech_short = \
        silence_features(seg_limits_short, dur)
    # long windows
    silence_features_long, number_of_pauses_long, total_speech_long = \
        silence_features(seg_limits_long, dur)

    features = []
    feature_names = []

    if mode < 2:

        # B. segment model-based features
        # Load classifier:
        dictionaries = []
        for filename in os.listdir(models_directory):
            model_path = os.path.join(models_directory, filename)
            dictionary = predict_audio_labels(input_file, model_path)[0]
            dictionaries.append(dictionary)

        # list of features and feature names
        feature_names = [
            "Average silence duration short (sec)",
            "Average silence duration long (sec)",
            "Silence segments per minute short (segments/min)",
            "Silence segments per minute long (segments/min)", "Std short",
            "Std long", "Speech ratio short (sec)", "Speech ratio long (sec)",
            "Word rate in speech short (words/sec)",
            "Word rate in speech long (words/sec)"
        ]

        for i in range(len(silence_features_short)):
            features.append(silence_features_short[i])
            features.append(silence_features_long[i])
        for dictionary in dictionaries:
            for label in dictionary:
                feature_string = label + "(%)"
                feature_value = dictionary[label]
                feature_names.append(feature_string)
                features.append(feature_value)
        if raudio_features_discard != 0:
            features = features[raudio_features_discard:]
            feature_names = feature_names[raudio_features_discard:]

    # C. pyaudio features
    if mode > 0:
        (segment_features_stats, segment_features,
         pyaudio_feature_names) = aF.mid_feature_extraction(
             x, fs, round(pyaudio_params['mid_window'] * fs),
             round(pyaudio_params['mid_step'] * fs),
             round(fs * pyaudio_params['short_window']),
             round(fs * pyaudio_params['short_step']))
        pyaudio_list = list(segment_features_stats.mean(axis=1))
        if pyaudio_num_features != "all":
            #pyaudio_num_features = int(pyaudio_num_features)
            pyaudio_list = pyaudio_list[:pyaudio_num_features - 1]
            pyaudio_feature_names = pyaudio_feature_names[:pyaudio_num_features
                                                          - 1]
        features = features + pyaudio_list
        feature_names = feature_names + pyaudio_feature_names

    metadata = {
        "Number of pauses short": number_of_pauses_short,
        "Number of pauses long": number_of_pauses_long,
        "Total speech duration short (sec)": total_speech_short,
        "Total speech duration long (sec)": total_speech_long
    }
    return features, feature_names, metadata
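A hedged usage sketch; the file paths and parameter values are assumptions, and the pyaudio_params keys mirror the ones the function reads above.

pyaudio_params = {
    "mid_window": 2.0,
    "mid_step": 1.0,
    "short_window": 0.05,
    "short_step": 0.025,
}
features, feature_names, metadata = audio_based_feature_extraction(
    "interview.wav", "models/", mode=2, pyaudio_params=pyaudio_params)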