Пример #1
0
def ds_process_audio(ds, audio_file, file_handle):
    """Run DeepSpeech inference on one audio segment and write the result.

    The segment is read as int16 samples; when its frame rate differs from
    the model's, ``convert_samplerate`` is used instead. A non-empty
    transcript increments the module-level ``line_count`` and is passed to
    ``write_to_file`` together with the time limits parsed from the file name.

    Args:
        ds : DeepSpeech Model (``sampleRate()`` and ``stt()`` are called)
        audio_file : path of a WAV segment whose base name ends in
            ``_<start>-<end>`` followed by a four-character suffix
            such as ``.wav``
        file_handle : SRT file handle
    """
    global line_count

    # Local import: only the file-name parsing below needs it.
    import os

    # The context manager closes the WAV file even if reading or
    # resampling raises.
    with wave.open(audio_file, 'rb') as fin:
        fs_orig = fin.getframerate()
        desired_sample_rate = ds.sampleRate()

        # Check if sampling rate is required rate (16000)
        if fs_orig != desired_sample_rate:
            print("Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition".format(fs_orig, desired_sample_rate), file=sys.stderr)
            audio = convert_samplerate(audio_file, desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # Perform inference on audio segment
    infered_text = ds.stt(audio)

    # File name contains start and end times in seconds. Extract that.
    # os.path.basename copes with the platform's path separators, unlike a
    # hard-coded "/" split.
    limits = os.path.basename(audio_file)[:-4].split("_")[-1].split("-")

    # Segments that produced no text do not get a subtitle entry.
    if infered_text:
        line_count += 1
        write_to_file(file_handle, infered_text, line_count, limits)
Пример #2
0
def ds_process_audio(ds, filepath, file_handle):
    """Transcribe one WAV file with DeepSpeech and write the text as a line.

    Args:
        ds : DeepSpeech Model (``sampleRate()`` and ``stt()`` are called)
        filepath : WAV file to transcribe
        file_handle : object with a ``write`` method; receives the
            transcript followed by a newline
    """
    # The context manager closes the WAV file even if reading or
    # resampling raises.
    with wave.open(filepath, 'r') as wav_in:
        sample_rate = wav_in.getframerate()
        desired_sample_rate = ds.sampleRate()
        if sample_rate == desired_sample_rate:
            # Frames are only read when they are actually used.
            frames = wav_in.readframes(wav_in.getnframes())
            data16 = np.frombuffer(frames, dtype=np.int16)
        else:
            data16 = convert_samplerate(filepath, desired_sample_rate)

    # Use the DeepSpeech model to perform speech-to-text.
    text = ds.stt(data16)
    file_handle.write(text + "\n")
Пример #3
0
def ds_process_audio(ds, audio_file, file_handle, vtt):
    """Run DeepSpeech inference on one audio segment and write the result.

    The segment is read as int16 samples; when its frame rate differs from
    the model's, ``convert_samplerate`` is used instead. The transcript is
    built from the tokens of the first transcript returned by
    ``ds.sttWithMetadata``. A non-empty transcript increments the
    module-level ``line_count`` and is passed to ``write_to_file`` together
    with the time limits parsed from the file name and the per-word cues.

    Args:
        ds : DeepSpeech Model (``sampleRate()`` and ``sttWithMetadata()``
            are called)
        audio_file : path of a WAV segment whose base name ends in
            ``_<start>-<end>`` followed by a four-character suffix
            such as ``.wav``
        file_handle : SRT file handle
        vtt : forwarded unchanged to ``write_to_file`` (presumably selects
            WebVTT output; confirm against ``write_to_file``)
    """
    global line_count

    # The context manager closes the WAV file even if reading or
    # resampling raises.
    with wave.open(audio_file, 'rb') as fin:
        fs_orig = fin.getframerate()
        desired_sample_rate = ds.sampleRate()

        # Check if sampling rate is required rate (16000)
        # won't be carried out as FFmpeg already converts to 16kHz
        if fs_orig != desired_sample_rate:
            print("Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition".format(fs_orig, desired_sample_rate), file=sys.stderr)
            audio = convert_samplerate(audio_file, desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)

    # Perform inference on audio segment
    metadata = ds.sttWithMetadata(audio)
    tokens = metadata.transcripts[0].tokens
    infered_text = ''.join(token.text for token in tokens)

    # File name contains start and end times in seconds. Extract that.
    # os.path.basename copes with the platform's path separators.
    limits = os.path.basename(audio_file)[:-4].split("_")[-1].split("-")
    segment_start = float(limits[0])

    # Get time cues for each word: the segment start, then the start time of
    # every space token shifted by the segment start.
    cues = [segment_start] + [
        token.start_time + segment_start for token in tokens if token.text == " "
    ]

    # Segments that produced no text do not get a subtitle entry.
    if infered_text:
        line_count += 1
        write_to_file(file_handle, infered_text, line_count, limits, vtt, cues)