def retrieve_transcript(identifier, language, speaker_type, service_config):
    gcs_uri = f"gs://{identifier}/audio.wav"
    audio = speech.RecognitionAudio(uri=gcs_uri)

    if speaker_type == 'both':
        recognition_config = speech.RecognitionConfig(
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
            language_code=language)
    elif speaker_type in ['interviewee', 'interviewer']:
        recognition_config = speech.RecognitionConfig(
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=False,
            language_code=language)
    else:
        raise ValueError(f'unknown speaker type: {speaker_type}')
    speech_client = get_google_client(type="speech",
                                      service_config=service_config)
    operation = speech_client.long_running_recognize(config=recognition_config,
                                                     audio=audio)
    response = operation.result()
    response_dict = MessageToDict(response.__class__.pb(response))
    return response_dict
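A minimal usage sketch (the bucket name and empty service_config are placeholders, and get_google_client is assumed to return an authenticated client): because MessageToDict converts field names to camelCase, diarized speaker tags appear as speakerTag under the last result's word list.

response_dict = retrieve_transcript(
    identifier="my-audio-bucket",   # assumed bucket holding audio.wav
    language="en-US",
    speaker_type="both",
    service_config={})              # whatever get_google_client expects

# With diarization on, the last result aggregates all tagged words.
last_result = response_dict["results"][-1]
for word in last_result["alternatives"][0]["words"]:
    print(word["word"], word.get("speakerTag"))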
Example 2
def transcribe_file_with_spoken_punctuation_and_emojis():
    """Transcribe the given audio file with spoken punctuation and emojis enabled."""
    # [START speech_transcribe_spoken_punctuation_emojis_beta]
    import io

    from google.cloud import speech_v1p1beta1 as speech
    from google.protobuf import wrappers_pb2

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        # Enable spoken punctuation
        enable_spoken_punctuation=wrappers_pb2.BoolValue(value=True),
        # Enable spoken emojis
        enable_spoken_emojis=wrappers_pb2.BoolValue(value=True),
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(u"First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
Example 3
def transcribe_file_with_auto_punctuation():
    """Transcribe the given audio file with auto punctuation enabled."""
    # [START speech_transcribe_auto_punctuation_beta]
    import io

    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/commercial_mono.wav'

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        # Enable automatic punctuation
        enable_automatic_punctuation=True)

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}'.format(i))
        print(u'Transcript: {}'.format(alternative.transcript))
Example 4
def get_stt_response(audio_path: str, client: Any, stt_provider: str) -> Any:
    """sends a call to the STT specified by the client for the input audio_path"""

    with open(audio_path, "rb") as fid:
        content = fid.read()

    if stt_provider == "google":
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_word_confidence=True,
            model="default",
        )
        response = client.recognize(config=config, audio=audio)

    elif stt_provider == "ibm":
        response = client.recognize(audio=content,
                                    content_type='audio/wav',
                                    model="en-US_BroadbandModel",
                                    word_confidence=True).get_result()

    elif stt_provider == "azure":
        audio_input = speechsdk.AudioConfig(filename=audio_path)
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=client, audio_config=audio_input)
        result = speech_recognizer.recognize_once_async().get()

    else:
        raise ValueError(
            f"stt provider: {stt_provider} is unacceptable. Use 'google' or 'ibm'."
        )

    return response
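For context, a sketch of how the three provider "clients" this function expects might be built; all keys, URLs, and regions below are placeholders, and note that for Azure the object passed in is a SpeechConfig rather than a recognizer.

from google.cloud import speech
from ibm_watson import SpeechToTextV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import azure.cognitiveservices.speech as speechsdk

# Google: picks up GOOGLE_APPLICATION_CREDENTIALS from the environment.
google_client = speech.SpeechClient()

# IBM: the API key and service URL are placeholders.
ibm_client = SpeechToTextV1(authenticator=IAMAuthenticator("YOUR_IBM_API_KEY"))
ibm_client.set_service_url(
    "https://api.us-south.speech-to-text.watson.cloud.ibm.com")

# Azure: the SpeechConfig acts as the "client" argument above.
azure_client = speechsdk.SpeechConfig(subscription="YOUR_AZURE_KEY",
                                      region="eastus")

response = get_stt_response("sample.wav", google_client, "google")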
Example 5
def mp3ToYML(fileName):
    inputFile = AudioSegment.from_mp3(fileName)
    inputFile.export(fileName + ".wav", format="wav")
    AUDIO_FILE = fileName + ".wav"
    r = sr.Recognizer()
    with sr.AudioFile(AUDIO_FILE) as source:
        r.adjust_for_ambient_noise(source)
        audio_data = r.record(source)
        client = speech.SpeechClient()
        # AudioData is not a file path, so take its WAV bytes directly
        # instead of trying to open() it as a file.
        content = audio_data.get_wav_data()

        audio = speech.RecognitionAudio(content=content)

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            # The exported WAV keeps the MP3's sample rate, which the API
            # reads from the WAV header; omit sample_rate_hertz rather than
            # hard-coding 8000, which may not match the file.
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
        response = client.recognize(config=config, audio=audio)
        result = response.results[-1]

        words_info = result.alternatives[0].words

        for word_info in words_info:
            print(
                u"word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag)
            )

        #converstations = [{'converstations' : ['soccer', 'football']}]
        #with open(r'E:\data\store_file.yaml', 'w') as file:
            #documents = yaml.dump(converstations, file)
Example 6
def initialize_recognition_config():
    """Load the configuration from the config.ini file and return a speech
    recognition config object covering the sample rate, language code, type
    of encoding, and a list of words and phrases that are more likely to
    occur (e.g. barkod, potvrda, lokacija, vozilo)."""
    
    config_file = configparser.ConfigParser()
    config_file.read('config.ini')
    
    # Initialize the speech recognition
    config = speech.RecognitionConfig()
    
    # Set the configurations
    #config.sample_rate_hertz = int(config_file.get('config','sample_rate'))
    config.language_code = str(config_file.get('config','language_code'))
    config.enable_speaker_diarization = True
    config.encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16  
    # Extract the phrases list from the config.ini
    phrases_list = (config_file.get('speech_context','phrases_list'))
    
    # Set the speech context to match the phrases list from the config.ini
    #speech_context = speech.SpeechContext(phrases=phrases_list, boost=20)
    #config.speech_contexts = [speech_context]
    config.metadata = initialize_metadata()

    # Uncomment to get confidences for each word
    #config.enable_word_confidence = True
    
    return config
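For reference, a sketch that generates a config.ini with the sections and options this function reads (names inferred from the config_file.get(...) calls above; the values are illustrative placeholders only).

import configparser

cfg = configparser.ConfigParser()
cfg["config"] = {
    "sample_rate": "16000",    # read by the commented-out line above
    "language_code": "hr-HR",  # matches the Croatian example phrases
}
cfg["speech_context"] = {
    "phrases_list": "barkod, potvrda, lokacija, vozilo",
}
with open("config.ini", "w") as f:
    cfg.write(f)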
Example 7
    def transcribe_from_file(self, speech_file, frameRate=None):
        """
        :param speech_file: str
                    relative/ full path of the speech file
        :param frameRate: int, optional
                    sample rate of the speech file
        :return: dictionary
                    transcript and confidence level
        """
        self.speech_file = speech_file
        client = speech.SpeechClient()
        with io.open(speech_file, "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            self._get_recognition_config_params(frameRate))
        operation = client.long_running_recognize(config=config, audio=audio)
        # print("Waiting for operation to complete...")
        response = operation.result()
        # print(f'result length: {len(response.results)}')

        if len(response.results) >= 1:
            result = {
                'Transcript': response.results[0].alternatives[0].transcript,
                'Confidence': response.results[0].alternatives[0].confidence
            }
        else:
            result = {'Transcript': None, 'Confidence': None}
        return result
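The helper _get_recognition_config_params is not shown in this example. A hypothetical version returning a mapping that speech.RecognitionConfig accepts positionally might look like this; the field choices are assumptions, not the original implementation.

def _get_recognition_config_params(self, frameRate):
    # Hypothetical sketch: build a dict of RecognitionConfig fields.
    params = {
        "encoding": speech.RecognitionConfig.AudioEncoding.LINEAR16,
        "language_code": "en-US",
    }
    if frameRate is not None:
        params["sample_rate_hertz"] = frameRate
    return params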
Example 8
def google_transcribe(audio_file_path):

    file_name = audio_file_path
    # mp3_to_wav(file_name)

    # The name of the audio file to transcribe
    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    with io.open(file_name, "rb") as audio_file:
        content = audio_file.read()

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_word_confidence=True)

    # Detects speech in the audio file
    response = client.recognize(config=config, audio=audio)
    # print(response)

    return response
Example 9
def transcribe_file_with_multiple_channels():
    """Transcribe the given audio file synchronously with multiple channels"""
    # [START speech_transcribe_audio_with_multiple_channels]
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/voice_tom2.wav'

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        #sample_rate_hertz=44100,
        language_code="th-TH",
        audio_channel_count=2,
        enable_separate_recognition_per_channel=True,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print("First alternative of result {}".format(i))
        print(u"Transcript: {}".format(alternative.transcript))
        print(u"Channel Tag: {}".format(result.channel_tag))
Example 10
def transcribe_file():
    client = speech.SpeechClient()
    speech_file = "output.wav"

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        audio_channel_count=2,
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(u"word: '{}', speaker_tag: {}".format(word_info.word,
                                                    word_info.speaker_tag))
Example 11
def transcribe_file_with_multichannel():
    """Transcribe the given audio file synchronously with
      multi channel."""
    # [START speech_transcribe_multichannel_beta]
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/Google_Gnome.wav'

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        audio_channel_count=1,
        enable_separate_recognition_per_channel=True)

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print('First alternative of result {}'.format(i))
        print(u'Transcript: {}'.format(alternative.transcript))
        print(u'Channel Tag: {}'.format(result.channel_tag))
Example 12
def transcribe_file_with_diarization():
    """Transcribe the given audio file synchronously with diarization."""
    # [START speech_transcribe_diarization_beta]
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/commercial_mono.wav'

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    print('Waiting for operation to complete...')
    response = client.recognize(config=config, audio=audio)

    # The transcript within each result is separate and sequential per result.
    # However, the words list within an alternative includes all the words
    # from all the results thus far. Thus, to get all the words with speaker
    # tags, you only have to take the words list from the last result:
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(u"word: '{}', speaker_tag: {}".format(word_info.word,
                                                    word_info.speaker_tag))
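Because the tagged word list interleaves speakers word by word, a small helper (a sketch operating on the words_info list from above) can fold it into readable speaker turns:

def group_by_speaker(words_info):
    """Collapse a diarized word list into (speaker_tag, text) turns."""
    turns = []
    for word_info in words_info:
        if turns and turns[-1][0] == word_info.speaker_tag:
            turns[-1] = (turns[-1][0], turns[-1][1] + " " + word_info.word)
        else:
            turns.append((word_info.speaker_tag, word_info.word))
    return turns

for tag, text in group_by_speaker(words_info):
    print("speaker {}: {}".format(tag, text))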
Example 13
def transcribe_file_with_multilanguage():
    """Transcribe the given audio file synchronously with
      multi language."""
    # [START speech_transcribe_multilanguage_beta]
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/multi.wav'
    first_lang = 'en-US'
    second_lang = 'es'

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        audio_channel_count=2,
        language_code=first_lang,
        alternative_language_codes=[second_lang])

    print('Waiting for operation to complete...')
    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}: {}'.format(i, alternative))
        print(u'Transcript: {}'.format(alternative.transcript))
Example 14
def transcribe_file_with_word_level_confidence():
    """Transcribe the given audio file synchronously with
      word level confidence."""
    # [START speech_transcribe_word_level_confidence_beta]
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/Google_Gnome.wav'

    with open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_confidence=True)

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print('First alternative of result {}'.format(i))
        print(u'Transcript: {}'.format(alternative.transcript))
        print(u'First Word and Confidence: ({}, {})'.format(
            alternative.words[0].word, alternative.words[0].confidence))
Example 15
def transcribe_file_with_enhanced_model():
    """Transcribe the given audio file using an enhanced model."""
    # [START speech_transcribe_enhanced_model_beta]
    import io

    from google.cloud import speech_v1p1beta1 as speech

    client = speech.SpeechClient()

    speech_file = "resources/commercial_mono.wav"

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code="en-US",
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model="phone_call",
    )

    response = client.recognize(request={"config": config, "audio": audio})

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 20)
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")
Example 16
def transcribe_file_with_enhanced_model():
    """Transcribe the given audio file using an enhanced model."""
    # [START speech_transcribe_enhanced_model_beta]
    import io

    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/commercial_mono.wav'

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call')

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print('-' * 20)
        print(u'First alternative of result {}'.format(i))
        print(u'Transcript: {}'.format(alternative.transcript))
Example 17
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    # mp3_to_wav(file_name)

    # The name of the audio file to transcribe
    
    frame_rate, channels = frame_rate_channel(file_name)
    
    if channels > 1:
        stereo_to_mono(file_name)
    
    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    
    upload_blob(bucket_name, source_file_name, destination_blob_name)
    
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
        
    credential_path = "/home/asheeshg01/Speech-f22e193c0063.json"
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
    
    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    # Detects speech in the audio file
    operation = client.long_running_recognize(
        request={"config": config, "audio": audio})
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)
    #for result in response.results:
        #transcript += result.alternatives[0].transcript

    delete_blob(bucket_name, destination_blob_name)
    return transcript
Example 18
def google_word_details(audio_file_name):
    file_name = filepath + audio_file_name
    second_lang = "hi-IN"
    frame_rate, channels = frame_rate_channel(file_name)
    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    upload_blob(bucket_name, source_file_name, destination_blob_name)
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
    word_details = ''
    credential_path = s.get("credential_path")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        alternative_language_codes=[second_lang],
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_time_offsets=True)

    # Detects speech in the audio file
    #operation = client.long_running_recognize(config, audio)

    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words


    for word_info in words_info:
        word = word_info.word
        start_time = word_info.start_time
        end_time = word_info.end_time
        speaker_tag = word_info.speaker_tag
        word_details += " Word: {} : start_time: {}: end_time: {}: speaker {}".format(
            word, start_time.total_seconds(), end_time.total_seconds(),
            speaker_tag)

    storage_client = storage.Client()
    # don't rebind bucket_name to the Bucket object; keep the name intact
    bucket = storage_client.get_bucket(bucket_name)
    word_details_filename = audio_file_name.split(
        '.')[0] + '_word_details' + '.txt'
    blob_word_details_file = bucket.blob(word_details_filename)
    blob_word_details_file.upload_from_string(word_details)

    #delete_blob(bucket_name, destination_blob_name)
    return word_details
Example 19
def get_transcripts_json(gcstorage_path,
                         lang,
                         phrase_hints=None,
                         speaker_count=1,
                         enhanced_model=None):
    # transcribes audio files
    # avoid a mutable default argument for phrase_hints
    phrase_hints = phrase_hints or []
    def _jsonify(res):
        # helper func for simplifying gcp speech client response
        json = []
        for section in res.results:
            data = {
                'transcript': section.alternatives[0].transcript,
                'words': []
            }
            for word in section.alternatives[0].words:
                data['words'].append({
                    'word':
                    word.word,
                    'start_time':
                    word.start_time.total_seconds(),
                    'end_time':
                    word.end_time.total_seconds(),
                    'speaker_tag':
                    word.speaker_tag
                })
            json.append(data)

        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcstorage_path)
    diarize = speaker_count > 1
    print(f"Diarizing: {diarize}")
    diarization_config = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=diarize,
        min_speaker_count=speaker_count,
        max_speaker_count=speaker_count)

    # if eng only, can use the optimized video model
    if lang == 'en':
        enhanced_model = 'video'

    config = speech.RecognitionConfig(
        language_code='en-US' if lang == 'en' else lang,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            'phrases': phrase_hints,
            'boost': 15
        }],
        diarization_config=diarization_config,
        profanity_filter=True,
        use_enhanced=bool(enhanced_model),
        model=enhanced_model if enhanced_model else 'default')

    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
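A hypothetical invocation (the gs:// path and phrase hints are placeholders), dumping the simplified structure _jsonify returns to a JSON file:

import json

transcripts = get_transcripts_json(
    "gs://my-bucket/interview.wav",
    lang="en",
    phrase_hints=["barista", "macchiato"],
    speaker_count=2)

with open("transcripts.json", "w") as f:
    json.dump(transcripts, f, indent=2)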
Example 20
def transcribe_file_with_multilanguage(files_path=r'D:/dirname'):
    client = speech.SpeechClient()

    first_lang = "fr-FR"
    #second_lang = "cmn-Hans-CN"

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        #language_code='fr-FR',
        model='command_and_search',
        enable_automatic_punctuation=True,
        sample_rate_hertz=16000,
        #audio_channel_count=2,
        #enable_speaker_diarization=True,
        language_code=first_lang,
        #alternative_language_codes=[second_lang],
        #model="video",
    )

    for f in os.listdir(files_path):
        speech_file = os.path.join(files_path, f)
        outputfile = os.path.splitext(f)[0] + '.txt'
        outputfile = os.path.join(files_path, outputfile)
        if os.path.splitext(speech_file)[-1] != '.mp3':
            continue

        if os.path.exists(outputfile):
            print(speech_file + ' already transcribed in ' + outputfile)
            continue

        print(speech_file)
        with open(speech_file, "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)
        #gcs_uri = "gs://pathname.mp3"
        #audio = speech.RecognitionAudio(uri=gcs_uri)

        # recognize() is synchronous; use long_running_recognize for
        # audio longer than about one minute.
        response = client.recognize(config=config, audio=audio)
        #print(response.results)

        print('saving to ' + outputfile)
        # use a distinct name so the loop variable f is not shadowed
        with open(outputfile, 'w', encoding='utf-8') as out:
            for i, result in enumerate(response.results):
                alternative = result.alternatives[0]

                print("-" * 20)
                print(u"First alternative of result {}: {}".format(
                    i, alternative))
                print(u"Transcript: {}".format(alternative.transcript))
                out.write(alternative.transcript)
                out.write('\n')
Example 21
def transcribe_file(speech_file, num_speakers):
    """Transcribe the given audio file asynchronously."""

    # Imports the Google Cloud client library
    import io

    #from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech

    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()
    
    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing" 
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        # Attach the metadata built above; without this it goes unused.
        metadata=metadata,
    )

    # Detects speech in the audio file -- short audio file
    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)
    result = response.results[-1]

    words_info = result.alternatives[0].words

    # Printing out the output:
    for word_info in words_info:
        print(
            u"word: '{}', speaker_tag: {}, start_time: {}, end_time: {}".format(
                word_info.word, word_info.speaker_tag,
                word_info.start_time.total_seconds(),
                word_info.end_time.total_seconds()))
Example 22
    def testLoadAudioData_succeeds(self):
        audio_path = os.path.join(self.get_temp_dir(), "a1.wav")
        wavfile.write(audio_path, 16000, np.zeros(16000 * 1, dtype=np.int16))
        buffer = audio_asr.load_audio_data(
            audio_path,
            speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=16000,
                audio_channel_count=1,
                language_code="en-US"))
        self.assertLen(buffer, 16000 * 2)
Example 23
    def testLoadAudioData_incorrectSampleRate_raisesValueError(self):
        audio_path = os.path.join(self.get_temp_dir(), "a1.wav")
        wavfile.write(audio_path, 16000, np.zeros(16000 * 1, dtype=np.int16))
        with self.assertRaises(ValueError):
            audio_asr.load_audio_data(
                audio_path,
                speech.RecognitionConfig(
                    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                    sample_rate_hertz=44100,
                    audio_channel_count=1,
                    language_code="en-US"))
Example 24
def speech_to_text(gcs_URI, keypath):
    # Reference: https://cloud.google.com/speech-to-text/docs/async-recognize
    # Set up credentials from local keypath
    # Example episode: https://www.listennotes.com/e/p/ea09b575d07341599d8d5b71f205517b/
    credentials = service_account.Credentials.from_service_account_file(
        keypath)
    audio = speech.RecognitionAudio(uri=gcs_URI)
    config = speech.RecognitionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=16000,
    )

    client = speech.SpeechClient(credentials=credentials)
    operation = client.long_running_recognize(config=config, audio=audio)
    print("Waiting for operation to complete...")
    response = operation.result()
    sentence = ''
    transcript_all = ''
    start_time_offset = []
    # Building a python dict (contains start time and words) from the response:
    for result in response.results:
        best_alternative = result.alternatives[0]
        transcript = best_alternative.transcript
        if transcript_all:
            transcript_all += " " + transcript
        else:
            transcript_all = transcript
        # Getting timestamps
        for word_info in best_alternative.words:
            start_s = word_info.start_time.total_seconds()
            word = word_info.word
            if sentence == '':
                sentence = word
                sentence_start_time = start_s
            else:
                sentence += ' ' + word
                if '.' in word:
                    start_time_offset.append({
                        'time': sentence_start_time,
                        'sentence': sentence
                    })
                    sentence = ''
    speech_to_text_data = {
        'transcript': transcript_all,
        'timestamps': start_time_offset
    }
    print('Finish transcription.')
    return speech_to_text_data
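A sketch of calling it (the URI and key path are placeholders) and printing the per-sentence timestamps it collects:

data = speech_to_text("gs://my-bucket/podcast.mp3", "service-account.json")
print(data["transcript"][:200])
for entry in data["timestamps"]:
    print("{:8.2f}s  {}".format(entry["time"], entry["sentence"]))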
Example 25
def transcribe_gcs(gcs_uri):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    print('Process', gcs_uri)
    from google.cloud import speech_v1p1beta1 as speech
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = osp.abspath(
        configs['google_ca_dir'])

    client = speech.SpeechClient()

    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.MP3,
        sample_rate_hertz=44100,
        language_code="ja-jp",
        enable_word_time_offsets=True,
    )

    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.

    res = []
    for result in response.results:
        alternative = result.alternatives[0]
        # The first alternative is the most likely one for this portion.
        print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))
        words = []
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            words.append({
                'word': word,
                'start_time': start_time,
                'end_time': end_time
            })
            print(
                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
            )
        res.append({
            "Transcript": alternative.transcript,
            "Confidence": alternative.confidence,
            'word': words
        })

    with open(osp.join('res', gcs_uri[-7:-4]), 'wb') as f:
        pickle.dump(res, f)
Example 26
    def testTwoFiles(self):
        audio_path_1 = os.path.join(self.get_temp_dir(), "a1.wav")
        wavfile.write(audio_path_1, 16000, np.zeros(16000 * 1, dtype=np.int16))
        audio_path_2 = os.path.join(self.get_temp_dir(), "a2.wav")
        wavfile.write(audio_path_2, 16000, np.zeros(16000 * 1, dtype=np.int16))
        audio_paths = [audio_path_1, audio_path_2]
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            audio_channel_count=1,
            language_code="en-US")
        generator = audio_asr.audio_data_generator(audio_paths, config)
        self.assertLen(list(generator), 2)
Example 27
def transcribe_audio_to_tsv(input_audio_paths,
                            output_tsv_path,
                            sample_rate,
                            language_code,
                            begin_sec=0.0):
    """Transcribe speech in input audio files and write results to .tsv file."""
    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        audio_channel_count=1,
        language_code=language_code)
    streaming_config = speech.StreamingRecognitionConfig(config=config,
                                                         interim_results=False)
    requests = audio_data_generator(input_audio_paths, config)
    responses = client.streaming_recognize(streaming_config, requests)

    with open(output_tsv_path, "w" if not begin_sec else "a") as f:
        if not begin_sec:
            # Write the TSV header.
            f.write(tsv_data.HEADER + "\n")

        for response in responses:
            if not response.results:
                continue
            results = [
                result for result in response.results if result.is_final
            ]
            max_confidence = -1
            best_transcript = None
            result_end_time = None
            for result in results:
                for alt in result.alternatives:
                    if alt.confidence > max_confidence:
                        max_confidence = alt.confidence
                        best_transcript = alt.transcript.strip()
                        result_end_time = result.result_end_time
            if not best_transcript:
                continue
            end_time_sec = result_end_time.total_seconds()
            # TODO(cais): The default transcript result doesn't include the start
            # time stamp, so we currently pretend that each recognizer output phrase
            # is exactly 1 second.
            # TODO(cais): Should we use absolute timestamps such as epoch time, instead of
            # time relative to the beginning of the first file?
            start_time_sec = end_time_sec - 1
            line = "%.3f\t%.3f\t%s\t%s" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, best_transcript)
            print(line)
            f.write(line + "\n")
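The TODOs above note that only an end timestamp is available per phrase. One way around the 1-second approximation (a sketch, not part of the original pipeline) is to request word-level time offsets and read the first and last word of each final result:

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=sample_rate,
    audio_channel_count=1,
    language_code=language_code,
    enable_word_time_offsets=True)  # adds start_time/end_time per word

# ...then, for each final result:
# words = result.alternatives[0].words
# start_time_sec = words[0].start_time.total_seconds()
# end_time_sec = words[-1].end_time.total_seconds()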
Example 28
def my_transcribe():
    from google.cloud import speech_v1p1beta1 as speech
    client = speech.SpeechClient()

    speech_file = 'resources/voice_tom2.wav'
    # speech_file = 'resources/voice_tom_southern.wav'

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        #sample_rate_hertz=44100,
        language_code="th-TH",
        audio_channel_count=2,  # 2 (stereo), 1 (mono)
        enable_word_confidence=True,
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        model="default",
    )

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        alternative = result.alternatives[0]
        print("-" * 30)
        #print(u"Transcript: {}".format(alternative.transcript))
        print("Confidence: {}".format(alternative.confidence))
        print(u"Channel Tag: {}".format(result.channel_tag))
        ground_truth = get_ground_truth_text()
        hypothesis = str(alternative.transcript)
        print("Ground Truth: ", ground_truth)
        print("Hypothesis: ", hypothesis)

        atta = Tokenizer(model="attacut-sc")
        gt_word_tokenize = atta.tokenize(ground_truth)
        hp_word_tokenize = atta.tokenize(hypothesis)

        # gt_word_tokenize = word_tokenize(ground_truth, engine="newmm") # default=newmm, longest
        # hp_word_tokenize = word_tokenize(hypothesis, engine="newmm")

        print("Ground Truth Word Tokenize:", gt_word_tokenize)
        print("Hypothesis Word Tokenize:", hp_word_tokenize)
        error = evaluation.util.word_error_rate(hp_word_tokenize,
                                                gt_word_tokenize)
        print("WER: ", error)
Example 29
def get_speaker_diarization_results(source_file_name, speaker_count):
    client = speech.SpeechClient()

    gcs_uri = "gs://ami_corpus/meeting_files/" + source_file_name
    audio = speech.RecognitionAudio(uri=gcs_uri)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code="en-US",
        enable_speaker_diarization=True,
        diarization_speaker_count=speaker_count,
    )
    operation = client.long_running_recognize(config=config, audio=audio)
    result = operation.result().results[-1]
    return result.alternatives[0].words
Example 30
def google():
    if request.method == 'POST':
        if os.path.exists("speechtotext.wav"):
            os.remove("speechtotext.wav")
        if os.path.exists("monosound.wav"):
            os.remove("monosound.wav")

        f = request.files['file']
        content = f.read()

        with open('speechtotext.wav', mode='bx') as file:
            file.write(content)

        client = speech.SpeechClient()
        speech_file = "speechtotext.wav"

        rate, data = wf.read(speech_file)
        data0 = data[:, 0]

        # keep the file's own sample rate instead of hard-coding 48000
        wf.write("monosound.wav", rate, data0)

        with io.open("monosound.wav", "rb") as audio_file:
            content = audio_file.read()

        audio = speech.RecognitionAudio(content=content)

        first_lang = "en-US"
        second_lang = "es-US"
        third_lang = "zh-cmn-Hans-CN"
        fourth_lang = "hi-IN"

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=rate,  # already read above via wf.read
            language_code=first_lang,
            alternative_language_codes=[second_lang, third_lang, fourth_lang])

        response = client.recognize(config=config, audio=audio)

        text = ""
        for i, result in enumerate(response.results):
            alternative = result.alternatives[0]
            text = text + alternative.transcript + "\n"

        return jsonify({'text': text})
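A hypothetical client for this view, assuming the Flask route is mounted at /google on a local development server (URL and filename are placeholders):

import requests

with open("sample_stereo.wav", "rb") as f:
    resp = requests.post("http://localhost:5000/google",
                         files={"file": f})
print(resp.json()["text"])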