Example #1
    def diarization(self, wavfile, bucket_name, project_name):
        if not os.path.exists(project_name):
            os.mkdir(project_name)
        print("Uploading {} to google cloud storage bucket".format(wavfile))
        set_value("label_wav_file_transcribe",
                  "Uploading file to cloud storage bucket...")
        self.upload_blob(bucket_name, wavfile, "temp_audio.wav")
        gcs_uri = "gs://{}/temp_audio.wav".format(bucket_name)
        set_value("label_wav_file_transcribe", "Finished uploading.")

        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri=gcs_uri)
        info = mediainfo(wavfile)
        sample_rate = info['sample_rate']
        print("Transcribing {} with audio rate {}".format(
            wavfile, sample_rate))

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=int(sample_rate),
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_speaker_diarization=True,
            diarization_speaker_count=int(get_value("input_diarization_num")),
        )

        operation = client.long_running_recognize(config=config, audio=audio)
        print(
            "Waiting for operation to complete, this may take several minutes..."
        )
        set_value(
            "label_wav_file_transcribe",
            "Waiting for operation to complete, this may take several minutes..."
        )
        response = operation.result(timeout=28800)

        result = response.results[-1]
        words = result.alternatives[0].words

        active_speaker = 1
        transcript = []
        current_cut = 0
        previous_cut = 0
        speaker_wavs = []

        for x in range(int(get_value("input_diarization_num"))):
            speaker_wavs.append(AudioSegment.empty())
            transcript.append("")

        w = AudioSegment.from_wav(wavfile)

        for word in words:
            if word.speaker_tag == active_speaker:
                end_time = word.end_time
                current_cut = end_time.total_seconds() * 1e3
                #print(current_cut)
                transcript[active_speaker - 1] += word.word + ' '
            else:
                # Speaker changed: close out the previous speaker's audio cut...
                w_cut = w[previous_cut:current_cut]
                speaker_wavs[active_speaker -
                             1] = speaker_wavs[active_speaker - 1] + w_cut
                previous_cut = current_cut
                active_speaker = word.speaker_tag
                # ...then credit this word and its end time to the new speaker.
                transcript[active_speaker - 1] += word.word + ' '
                current_cut = word.end_time.total_seconds() * 1e3

        #finish last wav cut
        w_cut = w[previous_cut:current_cut]
        speaker_wavs[active_speaker -
                     1] = speaker_wavs[active_speaker - 1] + w_cut

        for i, speaker_wav in enumerate(speaker_wavs):
            speaker_wav.export("{}/speaker_{}.wav".format(project_name, i + 1),
                               format="wav")

        for i, text in enumerate(transcript):
            with open("{}/speaker_{}.txt".format(project_name, i + 1), 'w') as f:
                f.write(text)

        set_value("label_wav_file_transcribe", "Done!")
        print("Done with diarization!")
        print('\a')  #system beep
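A minimal invocation sketch for the method above. The instance name, WAV path, bucket, and project name below are placeholders, and the surrounding GUI helper class (which supplies upload_blob, set_value, and get_value) is assumed to exist:

# Hypothetical usage; `builder` is an instance of the class defining diarization().
builder.diarization("interview.wav", "my-speech-bucket", "my_project")
# The project folder should then contain speaker_1.wav / speaker_1.txt,
# speaker_2.wav / speaker_2.txt, and so on, one pair per diarized speaker.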
Example #2
def transcribe_audio_to_tsv_with_diarization(input_audio_paths,
                                             output_tsv_path,
                                             sample_rate,
                                             language_code,
                                             speaker_count,
                                             begin_sec=0.0):
    """Transcribe speech in input audio files and write results to .tsv file.

  This method differs from transcribe_audio_to_tsv() in that it performs speaker
  diarization and uses the word-level speaker indices to regroup the transcripts.
  """
    client = speech.SpeechClient()
    enable_speaker_diarization = speaker_count > 0
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        audio_channel_count=1,
        enable_separate_recognition_per_channel=False,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization,
        diarization_speaker_count=speaker_count)
    streaming_config = speech.StreamingRecognitionConfig(config=config,
                                                         interim_results=False)
    requests = audio_data_generator(input_audio_paths, config)
    responses = client.streaming_recognize(streaming_config, requests)

    with open(output_tsv_path, "w" if not begin_sec else "a") as f:
        if not begin_sec:
            # Write the TSV header.
            f.write(tsv_data.HEADER + "\n")
        utterances = []
        for response in responses:
            if not response.results:
                continue
            results = [
                result for result in response.results if result.is_final
            ]
            max_confidence = -1
            best_transcript = None
            result_end_time = None
            for result in results:
                for alt in result.alternatives:
                    if alt.confidence > max_confidence:
                        max_confidence = alt.confidence
                        best_transcript = alt.transcript.strip()
                        diarized_words = [(word.word, word.speaker_tag,
                                           word.start_time.total_seconds(),
                                           word.end_time.total_seconds())
                                          for word in alt.words]
                        result_end_time = result.result_end_time
            if not best_transcript:
                continue
            end_time_sec = result_end_time.total_seconds()
            utterances.append(best_transcript)

        regrouped_utterances = regroup_utterances(utterances, diarized_words)
        utterance_counter = 0
        for (regrouped_utterance, speaker_index, start_time_sec,
             end_time_sec) in regrouped_utterances:
            utterance_counter += 1
            line = "%.3f\t%.3f\t%s\t%s [U%d] [Speaker #%d]" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, regrouped_utterance,
                utterance_counter, speaker_index)
            print(line)
            f.write(line + "\n")
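The snippet above relies on a regroup_utterances() helper and a tsv_data module that are not shown here, so their exact behavior is unknown. As a rough, illustrative sketch only, a helper with a compatible return shape (text, speaker index, start seconds, end seconds) could group consecutive diarized words by speaker tag like this:

def regroup_utterances_sketch(utterances, diarized_words):
    """Illustrative only: turn word-level diarization output into
    (text, speaker_tag, start_sec, end_sec) tuples, one per speaker turn."""
    del utterances  # the diarized word list alone is enough for this sketch
    regrouped = []
    current_words, current_speaker, start_sec, end_sec = [], None, None, None
    for word, speaker_tag, word_start, word_end in diarized_words:
        if current_speaker is None:
            current_speaker, start_sec = speaker_tag, word_start
        if speaker_tag != current_speaker:
            regrouped.append(
                (" ".join(current_words), current_speaker, start_sec, end_sec))
            current_words, current_speaker, start_sec = [], speaker_tag, word_start
        current_words.append(word)
        end_sec = word_end
    if current_words:
        regrouped.append(
            (" ".join(current_words), current_speaker, start_sec, end_sec))
    return regrouped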
Example #3
def get_transcript(speech_file,content_type):
    # google authentication
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/andrewfung/Programming/Multiple Speaker Detection/multiple-speaker-detection-3ed65d50eff1.json'

    # wget -nc https://realenglishconversations.com/...

    # instantiate a speech client and declare an audio file
    client = speech.SpeechClient()

    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    if 'wav' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    elif 'mpeg' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    elif 'flac' in content_type:
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_speaker_diarization=True,
            diarization_speaker_count=2,
        )
    else:
        # Avoid calling the API with an undefined config for unknown types.
        raise ValueError("Unsupported content type: {}".format(content_type))

    print("Waiting for operation to complete...")
    response = client.recognize(config=config, audio=audio)

    result = response.results[-1]
    words_info = result.alternatives[0].words

    words_list = []
    # Printing out the output:
    for word_info in words_info:
        words_list.append(
            {
                'word': word_info.word,
                'speaker_tag': word_info.speaker_tag,
                'start_time': word_info.start_time,
                'end_time': word_info.end_time,
            }
        )
    # print(words_list)

    # create a script based on the words_list
    current_speaker = words_list[0]['speaker_tag']
    current_line = []
    script = []

    for item in words_list:
        if item['speaker_tag'] != current_speaker:
            # speaker changed, end of line
            script.append(
                {
                    'speaker': current_speaker,
                    'line': current_line
                }
            )
            # start the new line with this word so it isn't dropped
            current_line = [item['word']]
            current_speaker = item['speaker_tag']
        else:
            # same speaker, add to the current line
            current_line.append(item['word'])

    script.append(
        {
            'speaker': current_speaker,
            'line': current_line
        }
    )

    script = [(f"Speaker {line['speaker']}: " + " ".join(line['line']) + "\n") for line in script]
    return script
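A usage sketch for get_transcript(). The file name and content type below are placeholders, and the hard-coded credentials path inside the function would need to point at a valid service-account key:

# Hypothetical call; "conversation.wav" is a placeholder local file.
lines = get_transcript("conversation.wav", content_type="audio/wav")
for line in lines:
    print(line, end="")  # e.g. "Speaker 1: hello how are you"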
Example #4
def get_transcripts_json(gcsPath,
                         langCode,
                         phraseHints=[],
                         speakerCount=1,
                         enhancedModel=None):
    """Transcribes audio files.
    Args:
        gcsPath (String): path to file in cloud storage (i.e. "gs://audio/clip.mp4")
        langCode (String): language code (i.e. "en-US", see https://cloud.google.com/speech-to-text/docs/languages)
        phraseHints (String[]): list of words that are unusual but likely to appear in the audio file.
        speakerCount (int, optional): Number of speakers in the audio. Only works for English. Defaults to 1.
        enhancedModel (String, optional): Option to use an enhanced speech model, i.e. "video"
    Returns:
        list | Operation.error
    """

    # Helper function for simplifying Google speech client response
    def _jsonify(result):
        json = []
        for section in result.results:
            data = {
                "transcript": section.alternatives[0].transcript,
                "words": []
            }
            for word in section.alternatives[0].words:
                data["words"].append({
                    "word":
                    word.word,
                    "start_time":
                    word.start_time.total_seconds(),
                    "end_time":
                    word.end_time.total_seconds(),
                    "speaker_tag":
                    word.speaker_tag
                })
            json.append(data)
        return json

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcsPath)

    diarize = speakerCount > 1
    print(f"Diarizing: {diarize}")
    # enable_speaker_diarization expects a bool; the speaker count goes in
    # min/max_speaker_count instead.
    diarizationConfig = speech.SpeakerDiarizationConfig(
        enable_speaker_diarization=diarize,
        min_speaker_count=speakerCount,
        max_speaker_count=speakerCount,
    )

    # In English only, we can use the optimized video model
    if langCode == "en":
        enhancedModel = "video"

    config = speech.RecognitionConfig(
        language_code="en-US" if langCode == "en" else langCode,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            "phrases": phraseHints,
            "boost": 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=bool(enhancedModel),
        model=enhancedModel if enhancedModel else None)
    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
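A usage sketch for get_transcripts_json(). The gs:// path and phrase hints below are placeholders:

# Hypothetical call; the bucket path and hints are placeholders.
sections = get_transcripts_json("gs://my-audio-bucket/clip.mp4",
                                "en",
                                phraseHints=["diarization", "Speech-to-Text"],
                                speakerCount=2)
for section in sections:
    print(section["transcript"])
    for w in section["words"]:
        print(w["word"], w["speaker_tag"], w["start_time"], w["end_time"])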
Example #5
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    #print('Using ', speech_file, ', with the below config:')
    #print("")
    #print("importing speech_v1p1beta1")
    #print("language_code='en-US'")
    #print("use_enhanced=True")
    #print("enable_automatic_punctuation=False")
    #print("enable_word_time_offsets=False")
    #print("profanity_filter=True")
    #print("sample_rate=48000hz")
    #print("")
    #print("Transcript is as follows")

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=int(args.samplerate),
        language_code='en-US',
        # alternative_language_codes='yue-Hant-HK',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        enable_speaker_diarization=True,
        speech_contexts=[
            speech_v1p1beta1.types.SpeechContext(phrases=[
                keyword1, keyword2, keyword3, keyword4, keyword5, keyword6,
                keyword7, keyword8, keyword9, keyword10
            ], )
        ],
    )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]
    os.chdir("..")
    os.chdir("Trans_Output_" + foldernametime)

    with open("output_transcription.txt", "a") as myfile:
        myfile.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
        #myfile.write(' - Starting a new transcription.......\n')

        #print('Waiting for operation to complete...')
        response = operation.result(timeout=90)

        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        for result in response.results:
            # The first alternative is the most likely one for this portion.
            print(('Transcript: {}'.format(result.alternatives[0].transcript)))
            print(('Confidence: {}'.format(result.alternatives[0].confidence)))
            with open("output_transcription.txt", "a") as myfile:
                myfile.write(('Transcript: {}'.format(
                    result.alternatives[0].transcript)) + "\n")
                myfile.write(('Confidence: {}'.format(
                    result.alternatives[0].confidence)) + "\n")
        with open("output_transcription.txt", "a") as myfile:
            myfile.write('')
Example #6
script_config = configparser.ConfigParser()

try:
    script_config.read("config.ini")
except Exception as e:
    print("Error reading config file. Exiting.")
    print(e)
    exit()

# Auth
credentials = service_account.Credentials.from_service_account_file(
    PurePath(Path(__file__).resolve().parent).joinpath(
        Path(str(script_config["OPTS"]["Credentials"]))))

# Instantiate GC Speech client
client = speech.SpeechClient(credentials=credentials)

if str(script_config["OPTS"]["Mode"]) == "local":
    # Read-in audio from local file (60s limit, gs is recommended Mode)
    with io.open(
            PurePath(Path(__file__).resolve().parent).joinpath(
                str(script_config["OPTS"]["Path"])), "rb") as audio_file:
        content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
else:
    # Read-in audio from GS
    print(str(script_config["OPTS"]["Path"]))
    audio = speech.RecognitionAudio(uri=str(script_config["OPTS"]["Path"]))

# Config request
req_config = speech.RecognitionConfig(
Example #7
    def transcribe_streaming(self, stream_file, configuration):
        """Streams transcription of the given audio file."""
        import io
        client = speech.SpeechClient()
        output = ''

        with io.open(stream_file, 'rb') as audio_file:
            audio_content = audio_file.read()

        config = {
            "model":
            configuration.get_model(),
            "use_enhanced":
            configuration.get_use_enhanced(),
            "encoding":
            configuration.get_encoding(),
            "sample_rate_hertz":
            configuration.get_sample_rate_hertz(),
            "language_code":
            configuration.get_language_code(),
            "alternative_language_codes":
            configuration.get_alternative_language_codes(),
            "audio_channel_count":
            configuration.get_audio_channel_count(),
            "enable_separate_recognition_per_channel":
            configuration.get_enable_separate_recognition_per_channel(),
            "enable_speaker_diarization":
            configuration.get_enableSpeakerDiarization(),
            "diarization_speaker_count":
            configuration.get_diarizationSpeakerCount(),
            "enable_automatic_punctuation":
            configuration.get_enableAutomaticPunctuation(),
            "speech_contexts":
            configuration.get_speech_context()
        }

        streaming_config = speech.types.StreamingRecognitionConfig(
            config=config, interim_results=True)

        # The commented-out version below was the bug: streaming_recognize
        # expects an iterable of requests, not a single request object.
        #requests = speech.types.StreamingRecognizeRequest(
        #    audio_content=audio_content)

        # Wrap the audio in a (single-chunk) stream of requests instead.
        stream = [audio_content]
        requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in stream)

        responses = client.streaming_recognize(streaming_config, requests)

        #import pdb; pdb.set_trace()
        for response in responses:
            # Once the transcription has settled, the first result will contain the
            # is_final result. The other results will be for subsequent portions of
            # the audio.
            for result in response.results:
                alternatives = result.alternatives
                # The alternatives are ordered from most likely to least.
                for alternative in alternatives:
                    output = alternative.transcript

        return output
Example #8
def analyze_audio(ogg_file_path, speaker_count=3):
    """Takes an audio file and outputs meeting statistics as a dictionary.

    Args:
        ogg_file_path (str): Path to the audio file in ogg-format.
        speaker_count (int): Number of people participating in the meeting.

    Returns:
        Dict[str, Any]

    """

    # Convert audio files to flac
    if ogg_file_path.split(".")[-1] != "flac":
        ogg_file_path = convert_ogg2flac(ogg_file_path)

    speech_client = speech_v1p1beta1.SpeechClient(credentials=CREDENTIALS)

    config = {
        "enable_speaker_diarization": True,
        "diarization_speaker_count": speaker_count,
        "language_code": "en-US",
        "encoding":
        speech_v1p1beta1.enums.RecognitionConfig.AudioEncoding.FLAC,
        "max_alternatives": 1,
        "use_enhanced": True,
        "sample_rate_hertz": 48000,
    }

    # Upload file to GCS Storage bucket
    client = storage.Client(credentials=CREDENTIALS)
    bucket = client.get_bucket(GCS_BUCKET_NAME)
    blob = bucket.blob(ogg_file_path)
    blob.upload_from_filename(ogg_file_path)
    audio = {"uri": f"gs://{GCS_BUCKET_NAME}/{ogg_file_path}"}

    operation = speech_client.long_running_recognize(config, audio)
    response = operation.result()

    # Empty response when speech to text failed
    if not response.results:
        json_out = {
            "google_transcript": "",
            "raw_transcript": "",
            "transcript": [],
            "speakers": [],
            "topics": [],
            "sentiment": {
                "score": 0,
                "magnitude": 0
            },
        }
        return json_out

    result = response.results[-1]
    alternative = result.alternatives[0]

    json_out = {
        "google_transcript": alternative.transcript,
        "raw_transcript": ' '.join([word.word for word in alternative.words])
    }

    # Get transcript distributed by speakers
    transcript = []
    sentence = []
    last_speaker = alternative.words[0].speaker_tag
    for word in alternative.words:
        current_speaker = word.speaker_tag
        if current_speaker == last_speaker:
            sentence.append(word.word)
        else:
            transcript.append({
                "speaker_id": last_speaker,
                "line": ' '.join(sentence)
            })
            sentence = [word.word]
            last_speaker = current_speaker
    transcript.append({"speaker_id": last_speaker, "line": ' '.join(sentence)})
    json_out["transcript"] = transcript

    # Analyze speakers
    speaker_tags = [word.speaker_tag for word in alternative.words]
    unique_speakers = set(speaker_tags)
    speaker_ratios = []
    for speaker in unique_speakers:
        speaker_ratios.append({
            "speaker_id":
            speaker,
            "ratio":
            round(speaker_tags.count(speaker) / len(speaker_tags), 2)
        })
    json_out["speakers"] = speaker_ratios

    # Analyze sentiment and topics
    sentiment, topics = analyze_text(json_out["raw_transcript"])
    json_out["sentiment"] = sentiment
    json_out["topics"] = topics
    # Include speaker sentiment
    speaker_sentiment = analyze_speaker_sentiment(json_out['transcript'])
    for line in json_out['speakers']:
        line.update({'sentiment_score': speaker_sentiment[line['speaker_id']]})

    return json_out
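A usage sketch for analyze_audio(). The file path below is a placeholder, and CREDENTIALS, GCS_BUCKET_NAME, convert_ogg2flac, analyze_text, and analyze_speaker_sentiment are assumed to be defined elsewhere in the original project:

# Hypothetical call; "standup_meeting.ogg" is a placeholder path.
stats = analyze_audio("standup_meeting.ogg", speaker_count=3)
print(stats["google_transcript"])
for turn in stats["transcript"]:
    print("Speaker {}: {}".format(turn["speaker_id"], turn["line"]))
for speaker in stats["speakers"]:
    print(speaker["speaker_id"], speaker["ratio"], speaker.get("sentiment_score"))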
Example #9
def get_transcript_audio_file(audio_path, langs):
    if 'en-US' in langs:
        main_lang = 'en-US'
    else:
        main_lang = langs[0]

    parent_dir = '/'.join(audio_path.split('/')[:-1])
    temp_dir = parent_dir + '/temp'
    fname = audio_path.split('/')[-1]
    try:
        os.mkdir(temp_dir)
    except FileExistsError:
        pass

    full_audio = pydub.AudioSegment.from_wav(audio_path)
    transcript = {}
    for t in range(0, len(full_audio), SIZE):
        try:
            chunk = full_audio[t:t + SIZE]
        except IndexError:
            chunk = full_audio[t:]

        chunk = chunk.set_sample_width(2)
        chunk_path = '{}/{}_{}'.format(temp_dir, int(t / SIZE), fname)
        chunk.export(chunk_path, format='wav', bitrate='16k')
        chunk_info = mediainfo(chunk_path)

        config = {
            "language_code": main_lang,
            "sample_rate_hertz": int(chunk_info['sample_rate']),
            "encoding": enums.RecognitionConfig.AudioEncoding.LINEAR16,
            "profanity_filter": False,
            "audio_channel_count": int(chunk_info['channels'])
        }

        if len(langs) > 1:
            config["alternative_language_codes"] = langs[1:]

        with io.open(chunk_path, 'rb') as f:
            content = f.read()
        audio = {"content": content}

        try:
            client = speech_v1p1beta1.SpeechClient.from_service_account_json(
                'api_keys/google_cloud.json')
        except Exception:
            # Fall back to default application credentials.
            client = speech_v1p1beta1.SpeechClient()

        response = client.recognize(config, audio)

        for result in response.results:
            if int(t / 1000) not in transcript.keys():
                transcript[int(t / 1000)] = []

            alternative = result.alternatives[0]
            transcript[int(t / 1000)].append(alternative.transcript)

        os.remove(chunk_path)

    os.rmdir(temp_dir)

    return transcript
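A usage sketch for get_transcript_audio_file(). SIZE is a module-level chunk length in milliseconds in the original project (not shown here); the WAV path and language list below are placeholders:

# Hypothetical call; the path and languages are placeholders.
transcript = get_transcript_audio_file("recordings/session.wav",
                                       langs=["en-US", "es-ES"])
for offset_sec in sorted(transcript):
    print(offset_sec, " ".join(transcript[offset_sec]))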
Example #10
def main(new_connection_index):
    """start bidirectional streaming from microphone input to speech API"""

    # diarization_config = {
    #     "enable_speaker_diarization": True,
    #     "min_speaker_count": 1,
    #     "max_speaker_count": 6,
    # }

    # alternative_language_codes = ['zh'],
    client = speech_v1p1beta1.SpeechClient()
    config = speech_v1p1beta1.RecognitionConfig(
        encoding=speech_v1p1beta1.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=SAMPLE_RATE,
        language_code="en-US",
        max_alternatives=1,
        enable_word_time_offsets=True,
        use_enhanced=True,
        model="default",
    )
    # model="video",
    # diarization_config=diarization_config,

    streaming_config = speech_v1p1beta1.StreamingRecognitionConfig(
        config=config, interim_results=True, single_utterance=False)

    # send data of the start of the speech recognition
    import requests
    url = "http://localhost:3000/api/zoom/recog_start"
    now = datetime.datetime.now()
    recog_start = now.strftime('%Y-%m-%d %H:%M:%S')
    offset_time_start = time.time()
    data = {"recog_start": recog_start}
    res = requests.post(url, json=data)

    mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
    print(mic_manager.chunk_size)
    sys.stdout.write(YELLOW)
    sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
    sys.stdout.write("End (ms)       Transcript Results/Status\n")
    sys.stdout.write("=====================================================\n")

    with mic_manager as stream:

        while not stream.closed:
            new_connection_index += 1
            sys.stdout.write(YELLOW)
            sys.stdout.write("\n" +
                             str(STREAMING_LIMIT * stream.restart_counter) +
                             ": NEW REQUEST\n")
            print("#### new_connection_index: {}".format(new_connection_index))

            stream.audio_input = []
            audio_generator = stream.generator()

            # Use a distinct name so this generator does not shadow the
            # `requests` HTTP module imported above.
            stream_requests = (speech_v1p1beta1.StreamingRecognizeRequest(
                audio_content=content) for content in audio_generator)

            responses = client.streaming_recognize(streaming_config,
                                                   stream_requests)

            # Now, put the transcription responses to use.
            listen_print_loop(responses, stream, new_connection_index)

            if stream.result_end_time > 0:
                stream.final_request_end_time = stream.is_final_end_time
            stream.result_end_time = 0
            stream.last_audio_input = []
            stream.last_audio_input = stream.audio_input
            stream.audio_input = []
            stream.restart_counter = stream.restart_counter + 1

            if not stream.last_transcript_was_final:
                sys.stdout.write("\n")
            stream.new_stream = True
Example #11
    def __init__(self, config, link_db):
        self.client = speech.SpeechClient()
Example #12
def speech_to_text(audio_path, SPEECHTOTEXT_SPEAKER_COUNT):
    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(audio_path, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # encoding=enums.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
        sample_rate_hertz=16000,
        language_code='en-US',
        enable_word_time_offsets=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=SPEECHTOTEXT_SPEAKER_COUNT,
        audio_channel_count=2,
        # model='video',
    )

    # async longrunnning audio file to text
    operation = client.long_running_recognize(config, audio)

    # Detects speech in the audio file
    print("Waiting for operation to complete...")
    # response = client.recognize(config, audio)
    response = operation.result(timeout=90)

    '''
    Each result is for a consecutive portion of the audio. Iterate through
    them to get the transcripts for the entire audio file.
    '''
    words_list = []

    # for result in response.results:
    result = response.results[-1]
    alternative = result.alternatives[0]
    print(u'Transcript: {}'.format(alternative.transcript))
    print('Confidence: {}'.format(alternative.confidence))

    for word_info in alternative.words:
        word = word_info.word
        start_time = word_info.start_time
        start_secs = start_time.seconds + start_time.nanos * 1e-9
        end_time = word_info.end_time
        end_secs = end_time.seconds + end_time.nanos * 1e-9

        print('Word: {}, start_time: {}, end_time: {}, speaker_tag: {}'.format(
            word, start_secs, end_secs, word_info.speaker_tag))

        words_list.append({
            'value': word_info.word,
            'start_secs': start_secs,
            'end_secs': end_secs,
            'speaker_tag': word_info.speaker_tag,
        })
    return words_list
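A usage sketch for speech_to_text(). The path below is a placeholder; the flat word list is then regrouped into speaker turns, much like the earlier examples:

# Hypothetical call; "call_recording.wav" is a placeholder path.
words = speech_to_text("call_recording.wav", SPEECHTOTEXT_SPEAKER_COUNT=2)

# Regroup the flat word list into per-speaker turns.
turns = []
for w in words:
    if turns and turns[-1]["speaker_tag"] == w["speaker_tag"]:
        turns[-1]["text"] += " " + w["value"]
    else:
        turns.append({"speaker_tag": w["speaker_tag"], "text": w["value"]})
for turn in turns:
    print("Speaker {}: {}".format(turn["speaker_tag"], turn["text"]))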
Example #13
File: google.py  Project: zevaverbach/tatt
class Transcriber(TranscriberBaseClass):

    name = NAME
    SUPPORTED_FORMATS = ["flac"]
    cost_per_15_seconds = [0.004, 0.006, 0.009]
    no_config_error_message = (
        "Please sign up for the Google Speech-to-Text API "
        "and put the path to your credentials in an "
        'environment variable "GOOGLE_APPLICATION_CREDENTIALS"')
    transcript_type = TRANSCRIPT_TYPE
    # https://cloud.google.com/speech-to-text/docs/languages
    # Array.from(document.querySelector('.devsite-table-wrapper').querySelectorAll('table tr')).slice(1).map(row => row.children[1].innerText)
    _language_list = [
        "af-ZA",
        "am-ET",
        "hy-AM",
        "az-AZ",
        "id-ID",
        "ms-MY",
        "bn-BD",
        "bn-IN",
        "ca-ES",
        "cs-CZ",
        "da-DK",
        "de-DE",
        "en-AU",
        "en-CA",
        "en-GH",
        "en-GB",
        "en-IN",
        "en-IE",
        "en-KE",
        "en-NZ",
        "en-NG",
        "en-PH",
        "en-SG",
        "en-ZA",
        "en-TZ",
        "en-US",
        "es-AR",
        "es-BO",
        "es-CL",
        "es-CO",
        "es-CR",
        "es-EC",
        "es-SV",
        "es-ES",
        "es-US",
        "es-GT",
        "es-HN",
        "es-MX",
        "es-NI",
        "es-PA",
        "es-PY",
        "es-PE",
        "es-PR",
        "es-DO",
        "es-UY",
        "es-VE",
        "eu-ES",
        "fil-PH",
        "fr-CA",
        "fr-FR",
        "gl-ES",
        "ka-GE",
        "gu-IN",
        "hr-HR",
        "zu-ZA",
        "is-IS",
        "it-IT",
        "jv-ID",
        "kn-IN",
        "km-KH",
        "lo-LA",
        "lv-LV",
        "lt-LT",
        "hu-HU",
        "ml-IN",
        "mr-IN",
        "nl-NL",
        "ne-NP",
        "nb-NO",
        "pl-PL",
        "pt-BR",
        "pt-PT",
        "ro-RO",
        "si-LK",
        "sk-SK",
        "sl-SI",
        "su-ID",
        "sw-TZ",
        "sw-KE",
        "fi-FI",
        "sv-SE",
        "ta-IN",
        "ta-SG",
        "ta-LK",
        "ta-MY",
        "te-IN",
        "vi-VN",
        "tr-TR",
        "ur-PK",
        "ur-IN",
        "el-GR",
        "bg-BG",
        "ru-RU",
        "sr-RS",
        "uk-UA",
        "he-IL",
        "ar-IL",
        "ar-JO",
        "ar-AE",
        "ar-BH",
        "ar-DZ",
        "ar-SA",
        "ar-IQ",
        "ar-KW",
        "ar-MA",
        "ar-TN",
        "ar-OM",
        "ar-PS",
        "ar-QA",
        "ar-LB",
        "ar-EG",
        "fa-IR",
        "hi-IN",
        "th-TH",
        "ko-KR",
        "zh-TW",
        "yue-Hant-HK",
        "ja-JP",
        "zh-HK",
        "zh",
    ]

    if _check_for_config():
        speech_client = speech.SpeechClient()
        storage_client = storage.Client()
        transcript_bucket = storage_client.get_bucket(BUCKET_NAME_TRANSCRIPT)

    def __init__(self, filepath):
        super().__init__(filepath)

    @classmethod
    def _setup(cls):
        super()._setup()
        if not shutil.which("gsutil"):
            raise exceptions.DependencyRequired(
                "Please install gcloud using the steps here:"
                "https://cloud.google.com/storage/docs/gsutil_install")

        cls._make_bucket_if_doesnt_exist(BUCKET_NAME_TRANSCRIPT)

    @classmethod
    def _make_bucket_if_doesnt_exist(cls, bucket_name):
        try:
            cls.storage_client.create_bucket(bucket_name)
        except gc_exceptions.Conflict:
            # this might fail if a bucket by the name exists *anywhere* on GCS?
            return
        else:
            print("made Google Cloud Storage Bucket for transcripts")

    def convert_file_format_if_needed(self):
        if self.file_format not in self.SUPPORTED_FORMATS:
            if not shutil.which("ffmpeg"):
                raise exceptions.DependencyRequired("please install ffmpeg")
            self.filepath = helpers.convert_file(self.filepath, "flac")

    @property
    def file_format(self):
        return pathlib.Path(self.filepath).suffix[1:].lower()

    @staticmethod
    def check_for_config() -> bool:
        return _check_for_config()

    def upload_file_if_too_big(self):
        """10MB limit as of Mar 7, 2019"""
        pass

    def transcribe(self, **kwargs) -> str:
        self.convert_file_format_if_needed()
        self.upload_file_if_too_big()
        return self._request_transcription(**kwargs)

    def _check_if_transcript_exists(self, transcript_name=None):
        return storage.Blob(bucket=self.transcript_bucket,
                            name=transcript_name
                            or self.basename).exists(self.storage_client)

    def _request_transcription(
        self,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        num_speakers=2,
        model="phone_call",
        use_enhanced=True,
    ) -> str:
        """Returns the job_name"""
        if self._check_if_transcript_exists():
            raise exceptions.AlreadyExistsError(
                f"{self.basename} already exists on {NAME}")
        num_audio_channels = helpers.get_num_audio_channels(self.filepath)
        sample_rate = helpers.get_sample_rate(self.filepath)

        with io.open(self.filepath, "rb") as audio_file:
            content = audio_file.read()
            audio = speech.types.RecognitionAudio(content=content)

        if language_code != "en-US":
            model = None

        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=sample_rate,
            audio_channel_count=num_audio_channels,
            enable_separate_recognition_per_channel=True,
            enable_word_confidence=True,
            enable_word_time_offsets=True,
            language_code=language_code,
            enable_automatic_punctuation=enable_automatic_punctuation,
            enable_speaker_diarization=enable_speaker_diarization,
            diarization_speaker_count=num_speakers,
            model=model,
            use_enhanced=use_enhanced,
        )

        self.operation = self.speech_client.long_running_recognize(
            config, audio)

        print("transcribing...")
        while not self.operation.done():
            sleep(1)
            print(".")

        result_list = []

        for result in self.operation.result().results:
            result_list.append(str(result))

        print("saving transcript")
        transcript_path = "/tmp/transcript.txt"
        with open(transcript_path, "w") as fout:
            fout.write("\n".join(result_list))
        print("uploading transcript")
        self.upload_file(BUCKET_NAME_TRANSCRIPT, transcript_path)
        os.remove(transcript_path)

        return self.basename

    @classmethod
    def retrieve_transcript(cls,
                            transcription_job_name: str) -> TRANSCRIPT_TYPE:
        """Get transcript from BUCKET_NAME_TRANSCRIPT"""
        if not cls._check_if_transcript_exists(
                cls, transcript_name=transcription_job_name):
            raise exceptions.DoesntExistError("no such transcript!")
        blob = cls.transcript_bucket.blob(transcription_job_name)
        f = tempfile.NamedTemporaryFile(delete=False)
        f.close()

        blob.download_to_filename(f.name)
        with open(f.name) as fin:
            transcript_text = fin.read()

        os.remove(f.name)
        return transcript_text

    def upload_file(self, bucket_name, path):
        blob = self.transcript_bucket.blob(self.basename)
        blob.upload_from_filename(path)

    @classmethod
    def get_transcription_jobs(cls,
                               job_name_query=None,
                               status=None) -> List[dict]:

        if status and status.lower() != "completed":
            return []

        jobs = []

        for t in cls.transcript_bucket.list_blobs():
            if job_name_query is not None and t.name != job_name_query:
                continue
            jobs.append({"name": t.name, "status": "COMPLETED"})

        return jobs
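A usage sketch for the Transcriber class above. It assumes GOOGLE_APPLICATION_CREDENTIALS is set, that the module-level constants (NAME, BUCKET_NAME_TRANSCRIPT) and the helpers module exist as in the original project, and that the base class provides self.basename; the audio path is a placeholder:

# Hypothetical usage; the audio path is a placeholder.
transcriber = Transcriber("interviews/episode1.mp3")
job_name = transcriber.transcribe(num_speakers=2, model="phone_call")

# Later, fetch the stored transcript blob by its job name.
print(Transcriber.retrieve_transcript(job_name))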
Example #14
    def build_dataset(self):
        print("running")
        output_wav_path = "{}/wavs/".format(self.project_name)

        if not os.path.exists(self.project_name):
            os.mkdir(self.project_name)

        if not os.path.exists(output_wav_path):
            os.mkdir(output_wav_path)

        if self.split_method == 0:
            #Google API mode
            if not get_value("input_project_name") or not get_value(
                    "label_wav_file_path"):
                print("Error, please choose text and/or audio files.")
                return

            set_value("label_build_status",
                      "Detecting silences. This may take several minutes...")
            audio_name = self.wav_file_path
            w = AudioSegment.from_wav(audio_name)

            s_len = 1000

            silence_cuts = silence.split_on_silence(w,
                                                    min_silence_len=s_len,
                                                    silence_thresh=-45,
                                                    keep_silence=True)

            cuts = []
            final_cuts = []

            def split_wav(wav, min_sil_len):
                # Already below the maximum cut length, no need to split.
                if (wav.duration_seconds * 1000) < (self.cut_length * 1000):
                    return [wav]

                too_long = False
                while True:
                    min_sil_len -= 50
                    if min_sil_len <= 0:
                        print(
                            "Error, could not find small enough silence period for split, giving up"
                        )
                        return [wav]
                    splits = silence.split_on_silence(wav,
                                                      min_silence_len=min_sil_len,
                                                      silence_thresh=-45,
                                                      keep_silence=True)
                    print("Trying resplit...")
                    for s in splits:
                        if (s.duration_seconds * 1000) > (self.cut_length *
                                                          1000):
                            too_long = True
                    if too_long:
                        too_long = False
                    else:
                        return splits

            # Keep splitting until all cuts are under max len

            for i, c in enumerate(silence_cuts):
                print(f"Checking phrase {i}...")
                c_splits = split_wav(c, 1000)
                for s in c_splits:
                    cuts.append(s)

            # c_split_len = 1
            # s_len_temp = s_len - 100

            # for c in silence_cuts:
            #     if (c.duration_seconds * 1000) > (self.cut_length * 1000):
            #         # cut again, too long
            #         #print("cutting again...")
            #         while c_split_len == 1:
            #             #print(s_len_temp)
            #             c_split = split_wav(c, s_len_temp)
            #             c_split_len = len(c_split)
            #             s_len_temp -= 100   #reduce split time for hopefully more cuts
            #         c_split_len = 1
            #         s_len_temp = s_len - 100
            #         for i in c_split:
            #             cuts.append(i)
            #     else:
            #         cuts.append(c)

            # rebuild small cuts into larger, but below split len
            temp_cuts = AudioSegment.empty()
            prev_cuts = AudioSegment.empty()

            for i, c in enumerate(cuts):
                prev_cuts = temp_cuts
                temp_cuts = temp_cuts + c

                if i == (len(cuts) - 1):
                    #on final entry
                    if (temp_cuts.duration_seconds * 1000) > (self.cut_length *
                                                              1000):
                        final_cuts.append(prev_cuts)
                        final_cuts.append(c)
                    else:
                        final_cuts.append(temp_cuts)
                else:
                    if ((temp_cuts.duration_seconds * 1000) +
                        (cuts[i + 1].duration_seconds * 1000)) > (
                            self.cut_length * 1000):
                        # combine failed, too long, add what has already been concatenated
                        final_cuts.append(temp_cuts)
                        temp_cuts = AudioSegment.empty()

            if not os.path.exists("{}/wavs".format(self.project_name)):
                os.mkdir("{}/wavs".format(self.project_name))

            for i, w in enumerate(final_cuts):
                w.export("{}/wavs/{}.wav".format(
                    self.project_name,
                    i + int(get_value("input_starting_index"))),
                         format="wav")

            # Process each cut into google API and add result to csv
            with open("{}/output.csv".format(self.project_name), 'w') as f:
                bucket_name = get_value("input_storage_bucket")
                newline = ''
                for i, c in enumerate(final_cuts):
                    x = i + int(get_value("input_starting_index"))
                    print(f"Transcribing entry {x}")
                    self.upload_blob(
                        bucket_name,
                        "{}/wavs/{}.wav".format(self.project_name,
                                                x), "temp_audio.wav")
                    gcs_uri = "gs://{}/temp_audio.wav".format(bucket_name)

                    client = speech.SpeechClient()

                    audio = speech.RecognitionAudio(uri=gcs_uri)

                    info = mediainfo("{}/wavs/{}.wav".format(
                        self.project_name, x))
                    sample_rate = info['sample_rate']

                    if get_value("input_use_videomodel") == 1:
                        print("Using enchanced google model...")
                        config = speech.RecognitionConfig(
                            encoding=speech.RecognitionConfig.AudioEncoding.
                            LINEAR16,
                            sample_rate_hertz=int(sample_rate),
                            language_code="en-US",
                            enable_automatic_punctuation=True,
                            enable_word_time_offsets=False,
                            enable_speaker_diarization=False,
                            # enhanced model for better performance?
                            use_enhanced=True,
                            model="video",  #"phone_call or video"
                        )
                    else:
                        config = speech.RecognitionConfig(
                            encoding=speech.RecognitionConfig.AudioEncoding.
                            LINEAR16,
                            sample_rate_hertz=int(sample_rate),
                            language_code="en-US",
                            enable_automatic_punctuation=True,
                            enable_word_time_offsets=False,
                            enable_speaker_diarization=False,
                        )

                    operation = client.long_running_recognize(config=config,
                                                              audio=audio)
                    response = operation.result(timeout=28800)

                    for result in response.results:
                        text = result.alternatives[0].transcript

                    # replace some symbols and google API word choice
                    text = text.replace("%", " percent")
                    text = text.replace("cuz", "cause")
                    text = text.replace("-", " ")
                    text = text.replace("&", "and")
                    print(text)
                    set_value("label_build_status", text)

                    f.write("{}wavs/{}.wav|{}".format(newline, x, text))
                    newline = '\n'
            print('\a')  #system beep
            set_value("label_build_status", "Done!")
            print("Done running builder!")

        else:
            # Aeneas mode
            if not get_value("input_project_name") or not get_value(
                    "label_speaker_text_path") or not get_value(
                        "label_wav_file_path"):
                print("Error, please choose text and/or audio files.")
                return

            if not os.path.exists("aeneas_out"):
                os.mkdir("aeneas_out")
            else:
                shutil.rmtree("aeneas_out")
                os.mkdir("aeneas_out")

            if not os.path.exists("aeneas_prepped"):
                os.mkdir("aeneas_prepped")
            else:
                shutil.rmtree("aeneas_prepped")
                os.mkdir("aeneas_prepped")

            audio_name = self.wav_file_path

            with open(self.speaker_text_path, 'r', encoding="utf8") as f:
                text = f.read()
                text = text.replace(';', '.')
                text = text.replace(':', '.')
                text = text.replace('-', ' ')
                text = text.replace('”', '')
                text = text.replace('“', '')
                text = text.replace('"', '.')
                text = text.replace('—', ' ')
                text = text.replace('’', '\'')
                text = text.replace(' –', '.')
                text = text.strip('\n')

                if self.contains_punc:
                    #remove any duplicate whitespace between words
                    text = " ".join(text.split())
                    phrase_splits = re.split(
                        r'(?<=[\.\!\?])\s*',
                        text)  #split on white space between sentences
                    phrase_splits = list(filter(
                        None, phrase_splits))  #remove empty splits
                else:
                    #no punctuation from speech to text, so we must divide the text by word count
                    phrase_splits = []
                    temp_line = []
                    text_split = text.split()
                    word_count_limit = 16

                    while len(text_split) > 0:
                        while len(temp_line) < word_count_limit and len(
                                text_split) > 0:
                            temp_line.append(text_split.pop(0))
                        phrase_splits.append(" ".join(temp_line))
                        temp_line = []

                with open('aeneas_prepped/split_text', 'w') as f:
                    newline = ''
                    for s in phrase_splits:
                        if s:
                            stripped = s.strip()  #remove whitespace
                            f.write(newline + stripped)
                            newline = '\n'
                #os.system('python -m aeneas.tools.execute_task ' + audio_name  + ' aeneas_prepped/split_text "task_adjust_boundary_percent_value=50|task_adjust_boundary_algorithm=percent|task_language=en|is_text_type=plain|os_task_file_format=csv" ' + 'aeneas_out/' + audio_name_no_ext + '.csv')
                os.system(
                    'python -m aeneas.tools.execute_task ' + audio_name +
                    ' aeneas_prepped/split_text "task_adjust_boundary_percent_value=50|task_adjust_boundary_algorithm=percent|task_language=en|is_text_type=plain|os_task_file_format=csv" '
                    + 'aeneas_out/' + self.project_name + '.csv')

                output_exists = False
                if os.path.exists("{}/output.csv".format(self.project_name)):
                    #if file exists then prepare for append
                    output_exists = True

                new_csv_file = open("{}/output.csv".format(self.project_name),
                                    'a')
                if output_exists:
                    new_csv_file.write("\n")

                with open('aeneas_out/' + self.project_name + '.csv',
                          'r') as csv_file:

                    index_count = int(self.index_start)
                    csv_reader = csv.reader(csv_file, delimiter=',')
                    csv_reader = list(csv_reader)  #convert to list
                    row_count = len(csv_reader)

                    newline = ""

                    for row in csv_reader:
                        beginning_cut = float(row[1])
                        end_cut = float(row[2])
                        text_out = row[3]
                        text_out = text_out.strip()
                        print("{} {} {} ".format(beginning_cut, end_cut,
                                                 text_out))
                        c_length = end_cut - beginning_cut

                        #if cut is longer than cut length then split it even more
                        cut_length = float(self.cut_length)
                        if c_length > cut_length:

                            more_cuts = open("aeneas_prepped/temp.csv", 'w')

                            #save the current cut wav file to run on aeneas again
                            w = AudioSegment.from_wav(audio_name)
                            wav_cut = w[(beginning_cut * 1000):(end_cut *
                                                                1000)]
                            wav_cut.export("aeneas_prepped/tempcut.wav",
                                           format="wav")

                            split_list = []
                            num_cuts = math.ceil(c_length / cut_length)
                            text_list = text_out.split()
                            text_list_len = len(text_list)
                            split_len = math.ceil(text_list_len / num_cuts)
                            print(
                                "too long, making extra {} cuts. with length {}"
                                .format(num_cuts, split_len))
                            for i in range(1, num_cuts + 1):
                                words = []
                                for j in range(0, split_len):
                                    if not text_list:
                                        break
                                    words.append(text_list.pop(0))
                                split_list.append(" ".join(words))
                            print(split_list)
                            print()

                            newline_splits = ''
                            for phrase in split_list:
                                more_cuts.write(newline_splits + phrase)
                                newline_splits = '\n'
                            more_cuts.close()

                            os.system(
                                'python -m aeneas.tools.execute_task ' +
                                "aeneas_prepped/tempcut.wav" +
                                ' aeneas_prepped/temp.csv "task_adjust_boundary_percent_value=50|task_adjust_boundary_algorithm=percent|task_language=en|is_text_type=plain|os_task_file_format=csv" '
                                + 'aeneas_out/temp_out.csv')

                            csv_file_temp = open('aeneas_out/temp_out.csv',
                                                 'r')
                            csv_reader_temp = csv.reader(csv_file_temp,
                                                         delimiter=',')
                            csv_reader_temp = list(
                                csv_reader_temp)  #convert to list
                            row_count = len(csv_reader_temp)

                            w = AudioSegment.from_wav(
                                "aeneas_prepped/tempcut.wav")

                            for row in csv_reader_temp:
                                beginning_cut = float(row[1])
                                end_cut = float(row[2])
                                text_out = row[3]
                                text_out = text_out.strip()

                                wav_cut = w[(beginning_cut * 1000):(end_cut *
                                                                    1000)]
                                new_wav_filename = "wavs/" + str(
                                    index_count) + ".wav"
                                new_csv_file.write("{}{}|{}".format(
                                    newline, new_wav_filename, text_out))
                                wav_cut.export("{}/{}".format(
                                    self.project_name, new_wav_filename),
                                               format="wav")
                                index_count += 1
                                newline = '\n'

                            csv_file_temp.close()

                        else:
                            w = AudioSegment.from_wav(audio_name)
                            wav_cut = w[(beginning_cut * 1000):(end_cut *
                                                                1000)]
                            new_wav_filename = "wavs/" + str(
                                index_count) + ".wav"
                            new_csv_file.write("{}{}|{}".format(
                                newline, new_wav_filename, text_out))
                            wav_cut.export("{}/{}".format(
                                self.project_name, new_wav_filename),
                                           format="wav")
                            index_count += 1
                            newline = '\n'

                new_csv_file.close()
                set_value("label_build_status", "Building dataset done!")
                #Remove temporary directories
                shutil.rmtree("aeneas_prepped")
                shutil.rmtree("aeneas_out")
                print('\a')  #system beep
                print("Done with Aeneas!")
Example #15
def gen_transcript(filename: str, script_path: str, to_lang: str):
    """generates a transcript"""
    client = speech.SpeechClient()
    #upload to gcp
    uri_ = upload_to_gcp(filename)
    audio = speech.types.RecognitionAudio(uri=uri_)

    characters, sentences = script_sanitzer.santize(script_path,
                                                    ['*,*', '[,]', '(,)'])
    phrases_ = [x[0] if len(x[0]) < 100 else x[0][:100] for x in sentences]
    config = speech.types.RecognitionConfig(encoding='FLAC',
                                            language_code='en-US',
                                            model='video',
                                            sample_rate_hertz=16000,
                                            enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    result_data = operation.result(timeout=1000)
    merged_transcript = ""
    merged_words = []

    for result in result_data.results:
        alternative = result.alternatives[0]
        merged_transcript += alternative.transcript
        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            word_tup = (word, start_time.seconds + start_time.nanos * 1e-9,
                        end_time.seconds + end_time.nanos * 1e-9)
            #        print('Word: {}, start_time: {}, end_time: {}'.format(
            #           word,
            #          start_time.seconds + start_time.nanos * 1e-9,
            #         end_time.seconds + end_time.nanos * 1e-9))
            merged_words.append(word_tup)
    #sanitize the script
    print(merged_words)
    empty_queue = []
    transcript_ptr = 0
    start = -1.1
    end = -1.1
    prev_start = start
    prev_end = end
    for sentence in sentences:
        actualSize = findSize(sentence[0])
        print(transcript_ptr)
        prev_start = start
        prev_end = end
        start = -1.0
        end = -1.0
        found = False
        for word in sentence[0].split(" "):
            if word.isspace():
                continue
            if (found):
                break
            for word2 in merged_words[transcript_ptr:transcript_ptr +
                                      actualSize]:
                #find start
                if check_words_equal(word, word2[0]):
                    start = word2[1]
                    found = True
                    break

        found = False
        for word in sentence[0].split(" ")[::-1]:
            print("WORD: " + str(word))
            if word.isspace():
                continue
            if (found):
                break
            for word2 in range(transcript_ptr + actualSize, transcript_ptr - 1,
                               -1):
                if (word2 >= len(merged_words)):
                    continue
                print(actualSize)
                print(sentence[0].split(" "))
                #find start
                print(word2)
                print(len(merged_words))
                print("WORD 2:     " + str(merged_words[word2][0]))
                if check_words_equal(word, merged_words[word2][0]):
                    end = merged_words[word2][2]
                    transcript_ptr = word2 + 1
                    found = True
                    break
        #Could not find the correct start or end times for the first and last words.
        #Time to estimate!
        if start < 0 or end < 0:
            '''
               If all previous sentences were matched correctly, the start of this
               sentence must come after the end time of the previous sentence
               (somewhere near the first word after it), or at 0 if this is the first
               sentence. Once we have the start, we calculate the characters' average
               talking speed and use it as a low-ball estimate of how long the
               unmatched sentence takes, which gives us the end time. If this is the
               first sentence we fall back to the average person's speaking rate
               (roughly 150 wpm). A sketch of this estimate follows this function.
            '''

            #No previous sentences
            if len(empty_queue) == 0:
                start = merged_words[0][1]
                end = actualSize * (14 / 6)
                transcript_ptr = actualSize - int(actualSize * 1 / 4)
            else:
                start = merged_words[transcript_ptr][1]
                avg_wpm = findAverageWPM(empty_queue)
                end = actualSize * avg_wpm
                transcript_ptr += actualSize - int(actualSize * 1 / 4)

        else:
            #create nodes
            node_to_add = Node(
                sentence[1], translate.translate_phrase(sentence[0], to_lang),
                start, end)
            empty_queue.append(node_to_add)
    print(empty_queue)
    return empty_queue
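
# The estimation branch above relies on findAverageWPM, which is referenced but not
# shown in this example. Below is a minimal sketch of such a helper; the Node layout
# (speaker, text, start/end seconds) and the seconds-per-word convention are
# assumptions, since the real definitions are not part of this snippet.
from dataclasses import dataclass


@dataclass
class Node:
    speaker: str
    text: str
    start: float
    end: float


def findAverageWPM(nodes):
    """Average seconds per word over previously matched sentences.

    gen_transcript multiplies the returned value by a sentence's word count,
    so despite the name it is used as seconds-per-word, not words-per-minute.
    """
    total_words = sum(len(n.text.split()) for n in nodes)
    total_seconds = sum(max(n.end - n.start, 0.0) for n in nodes)
    if total_words == 0:
        return 60.0 / 150.0  # fall back to roughly 150 wpm, i.e. 0.4 s per word
    return total_seconds / total_words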
예제 #16
0
    def __init__(self):
        self._sound_to_text = speech.SpeechClient()
        self._text_to_sound = texttospeech.TextToSpeechClient()
        self._audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3)
        self._translator = translate.Client()
    def __init__(self):
        threading.Thread.__init__(self)
        # Speech adaptation (a compact way to build these entries is sketched after this snippet)
        boost = 4  # probability boost for recognizing the listed phrases; recommended range [0, 20]
        speech_contexts = [{
            "phrases": "ホロライブ",
            "boost": boost
        }, {
            "phrases": "しらかみ",
            "boost": boost
        }, {
            "phrases": "ふぶき",
            "boost": boost
        }, {
            "phrases": "うさだ",
            "boost": boost
        }, {
            "phrases": "ぺこら",
            "boost": boost
        }, {
            "phrases": "ぺこ",
            "boost": boost
        }, {
            "phrases": "よし",
            "boost": boost
        }, {
            "phrases": "よしょ",
            "boost": boost
        }, {
            "phrases": "えしょう",
            "boost": boost
        }, {
            "phrases": "ARK",
            "boost": boost
        }, {
            "phrases": "やめろ",
            "boost": boost
        }, {
            "phrases": "マリン",
            "boost": boost
        }, {
            "phrases": "まつり",
            "boost": boost
        }, {
            "phrases": "せんちょう",
            "boost": boost
        }]

        speaker_diarization_config = speech.types.SpeakerDiarizationConfig(  # speaker diarization config
            enable_speaker_diarization=True,
            min_speaker_count=1,
            max_speaker_count=2)

        self.config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=44100,
            language_code='ja-JP',
            max_alternatives=1,
            enable_automatic_punctuation=True,  # enable punctuation
            diarization_config=speaker_diarization_config,  # speaker diarization
            speech_contexts=speech_contexts  # speech adaptation
        )
        self.streaming_config = speech.types.StreamingRecognitionConfig(
            config=self.config, interim_results=True)
        self.translate_client = translate.Client()
        self.speech_client = speech.SpeechClient()
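
# Each speech_contexts entry in the class above boosts the chance that one phrase is
# recognized. When the phrase list grows, the same structure can be built
# programmatically. A minimal sketch, reusing the phrases and boost value from the
# snippet above; wrapping each phrase in a list is an assumption about how the
# repeated "phrases" field is meant to be filled:
PHRASES = ["ホロライブ", "しらかみ", "ふぶき", "うさだ", "ぺこら", "ぺこ", "よし",
           "よしょ", "えしょう", "ARK", "やめろ", "マリン", "まつり", "せんちょう"]
BOOST = 4  # recommended range [0, 20]
speech_contexts = [{"phrases": [phrase], "boost": BOOST} for phrase in PHRASES]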
예제 #18
0
    def __init__(self, sample_rate):
        self.client = speech_v1p1beta1.SpeechClient()
        self.sample_rate = sample_rate
    def get_hypothesis(self, uri, configuration):
        """Asynchronously transcribes the audio at the GCS URI given by `uri`."""
        import time
        client = speech.SpeechClient()
        config = {
            "model": configuration.get_model(),
            "use_enhanced": configuration.get_use_enhanced(),
            "encoding": configuration.get_encoding(),
            "sample_rate_hertz": configuration.get_sample_rate_hertz(),
            "language_code": configuration.get_language_code(),
            "alternative_language_codes": configuration.get_alternative_language_codes(),
            "audio_channel_count": configuration.get_audio_channel_count(),
            "enable_separate_recognition_per_channel": configuration.get_enable_separate_recognition_per_channel(),
            "enable_speaker_diarization": configuration.get_enableSpeakerDiarization(),
            "diarization_speaker_count": configuration.get_diarizationSpeakerCount(),
            "enable_automatic_punctuation": configuration.get_enableAutomaticPunctuation(),
            "speech_contexts": configuration.get_speech_context(),
        }

        audio = {"uri": uri}
        try:
            operation = client.long_running_recognize(config=config,
                                                      audio=audio)
        except google.api_core.exceptions.InvalidArgument as e:
            raise e
        count = 0
        sleep_time = 5
        while not operation.done() and count != 30000:
            print(
                f"{operation.metadata.progress_percent}% complete - updates every {sleep_time} seconds"
            )
            if count == 29999:
                raise TimeoutError("Time out processing audio")
            count += 1
            time.sleep(sleep_time)
        print(
            f"{operation.metadata.progress_percent}% complete - updates every {sleep_time} seconds"
        )

        response = operation.result(timeout=1200)

        transcript = str()
        for result in response.results:
            # First alternative is the most probable result
            transcript += " " + result.alternatives[0].transcript
        if not transcript:
            logger.debug('No transcript returned')
        utilities = Utilities()
        t = utilities.strip_puc(text=transcript)
        return t.lower()
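
# get_hypothesis above pulls every recognition setting from a `configuration` object
# whose class is not shown in this example. Below is a hypothetical stand-in that
# illustrates the getters it must provide; every value is a placeholder, not taken
# from the original project:
class StubConfiguration:
    def get_model(self): return "phone_call"
    def get_use_enhanced(self): return True
    def get_encoding(self): return "LINEAR16"
    def get_sample_rate_hertz(self): return 8000
    def get_language_code(self): return "en-US"
    def get_alternative_language_codes(self): return []
    def get_audio_channel_count(self): return 1
    def get_enable_separate_recognition_per_channel(self): return False
    def get_enableSpeakerDiarization(self): return True
    def get_diarizationSpeakerCount(self): return 2
    def get_enableAutomaticPunctuation(self): return True
    def get_speech_context(self): return []

# hypothesis = transcriber.get_hypothesis("gs://my-bucket/call.wav", StubConfiguration())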
예제 #20
0
def google_transcribe(audio_file_name):
    file_name = filepath + audio_file_name
    second_lang = "hi-IN"

    # The name of the audio file to transcribe

    frame_rate, channels = frame_rate_channel(file_name)

    if channels > 1:
        stereo_to_mono(file_name)

    bucket_name = bucketname
    source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name

    upload_blob(bucket_name, source_file_name, destination_blob_name)

    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''

    credential_path = s.get("credential_path")
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        language_code='en-US',
        alternative_language_codes=[second_lang],
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    operation = client.long_running_recognize(request={
        "config": config,
        "audio": audio
    })
    response = operation.result(timeout=10000)
    result = response.results[-1]
    words_info = result.alternatives[0].words

    tag = 1
    speaker = ""

    for word_info in words_info:
        if word_info.speaker_tag == tag:
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag, speaker) + '\n'
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag, speaker)
    #for result in response.results:
    #transcript += result.alternatives[0].transcript

    storage_client = storage.Client()
    bucket_name = storage_client.get_bucket(bucket_name)
    transcript_filename = audio_file_name.split(
        '.')[0] + '_transcript' + '.txt'
    blob_transcript_file = bucket_name.blob(transcript_filename)
    blob_transcript_file.upload_from_string(transcript)

    #delete_blob(bucket_name, destination_blob_name)
    return transcript
예제 #21
0
def transcriberDetail(blob_name, main):

    # check if already Inserted to ssrDictionary using audio name/blobname
    flagDntInst = 0
    # mycursor = mydb.cursor()
    mydb._open_connection()
    sql = "select  audioName from ssrDictionary where audioName='" + blob_name + "' LIMIT 2"
    mycursor.execute(sql)
    myresult = mycursor.fetchall()
    for x in myresult:
        flagDntInst = 1

    posts = []

    # urll = 'gs://bucketgcssr/SSR_8102019114925.wav'
    urll = 'gs://bucketgcssr/' + blob_name
    from google.cloud import speech_v1p1beta1 as speech  # GCP api
    client = speech.SpeechClient()

    audio = speech.types.RecognitionAudio(uri=urll)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='ur-PK',  # language code
        # speaker diarization not working for Urdu for now
        enable_speaker_diarization=True,
        diarization_speaker_count=2,  # speaker count not working for Urdu for now
        sample_rate_hertz=48000,  # audio sample rate
        audio_channel_count=1)  # number of channels used in the audio

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    transcrip = ""
    confidence = 0
    for result in response.results:
        alternative = result.alternatives[0]
        transcrip = format(alternative.transcript)
        confidence = alternative.confidence
        main.append({
            'transcrip': transcrip,
            'blob_name': blob_name,
            'confidence': confidence
        })
        for word_info in alternative.words:
            confidence = format(word_info.confidence)
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            posts.append({
                'word': word,
                'start_time': start_time.seconds + start_time.nanos * 1e-9,
                'end_time': end_time.seconds + end_time.nanos * 1e-9,
                'confidence': confidence
            })
            # insertion to Mysql For WordDictionary here
            if flagDntInst == 0:
                sql = "INSERT INTO ssrDictionary (words,audioName, confidance,endTime,startTime) VALUES (%s, %s, %s, %s, %s)"
                val = (word, blob_name, confidence,
                       end_time.seconds + end_time.nanos * 1e-9,
                       start_time.seconds + start_time.nanos * 1e-9)
                mycursor.execute(sql, val)
                mydb.commit()

    mydb.close()
    return posts
예제 #22
0
def transcribe_gcs(gcs_uri, num_speakers):
    """Asynchronously transcribes the audio file specified by the gcs_uri."""

    # Imports the Google Cloud client library
    #from google.cloud import speech
    from google.cloud import speech_v1p1beta1 as speech


    # Instantiates a client
    client = speech.SpeechClient()
    
    # Construct a recognition metadata object
    metadata = speech.RecognitionMetadata()
    metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION
    metadata.recording_device_type = (
        speech.RecognitionMetadata.RecordingDeviceType.OTHER_INDOOR_DEVICE
    )
    metadata.audio_topic = "court trial hearing" 
    metadata.original_mime_type = "audio/mp3"

    audio = speech.RecognitionAudio(uri=gcs_uri)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=num_speakers,
        # Enhanced models cost more than standard models.
        use_enhanced=True,
        model="video",
        enable_word_time_offsets=True,
        # Attach the recognition metadata constructed above.
        metadata=metadata,
    )

    # Detects speech in the audio file -- long audio file
    operation = client.long_running_recognize(config=config, audio=audio)

    print("Waiting for operation to complete...")
    response = operation.result(timeout=300)

    # Writing results to json

    result_counter = 0 
    word_counter = 0 
    output_json = {}

    for result in response.results:
        alternative = result.alternatives[0]
        output_json[f"{result_counter}_Transcript"] =  alternative.transcript
        output_json[f"{result_counter}_Confidence"] =  alternative.confidence
        result_counter += 1

        for word_info in alternative.words:
            word = word_info.word
            start_time = word_info.start_time
            end_time = word_info.end_time
            speaker_tag = word_info.speaker_tag

            output_json[f"{word_counter}_Word"] =  word
            output_json[f"{word_counter}_start_time"] =  start_time.total_seconds()
            output_json[f"{word_counter}_end_time"] =  end_time.total_seconds()
            output_json[f"{word_counter}_speaker_tag"] =  speaker_tag

            word_counter += 1

    with open("{}.json".format(gcs_uri.split('/')[-1][:-5]) , "w+") as file:
        json.dump(output_json, file)
    

    print("Dirized and transcribed {}".format(gcs_uri.split('/')[-1]))
예제 #23
0
def manage_stream(mic, finals, STREAMING_LIMIT):
    client = speech.SpeechClient()
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US',
        max_alternatives=1,
        enable_speaker_diarization=True,
        enable_automatic_punctuation=True)
    streaming_config = speech.types.StreamingRecognitionConfig(
        config=config, interim_results=True)

    with mic as stream:
        while not stream.to_close:
            audio_generator = stream.generator()

            requests = (speech.types.StreamingRecognizeRequest(
                audio_content=content) for content in audio_generator)

            responses = client.streaming_recognize(streaming_config, requests)

            logging.info("Started new stream")

            for response in responses:
                logging.info("new response")

                if stream.to_close:
                    break

                if get_current_time() - stream.start_time > STREAMING_LIMIT:
                    stream.start_time = get_current_time()
                    break

                if not response.results:
                    continue

                result = response.results[0]

                if not result.alternatives:
                    continue

                if result.is_final:
                    transcript = result.alternatives[0].transcript
                    logging.info(transcript)

                    result_seconds = 0
                    result_nanos = 0

                    if result.result_end_time.seconds:
                        result_seconds = result.result_end_time.seconds

                    if result.result_end_time.nanos:
                        result_nanos = result.result_end_time.nanos

                    stream.end_time = int((result_seconds * 1000) +
                                          (result_nanos / 1000000))

                    finals.append(
                        f"Speaker {get_main_speaker(result)}: {transcript}<br/><br/>"
                    )
                    stream.last_interim = ""
                else:
                    stream.last_interim = result.alternatives[0].transcript

            stream.next_stream()
def transcribe_file(speech_file):
    """Transcribe the given audio file asynchronously."""
    from google.cloud import speech_v1p1beta1
    from google.cloud.speech_v1p1beta1 import enums
    from google.cloud.speech_v1p1beta1 import types
    client = speech_v1p1beta1.SpeechClient()

    # [START migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()


    #convert to mono

    from pydub import AudioSegment
    AudioSegment.converter = r'C:\FFMpeg'
    sound = AudioSegment.from_wav(args.path)
    sound = sound.set_frame_rate(44100)
    sound = sound.set_channels(1)
    sound.export(args.path, format="wav")

    #from pydub.utils import mediainfo
    #info = mediainfo(args.path)
    #print (info['sample_rate'])

    import scipy.io.wavfile
    rate = scipy.io.wavfile.read(args.path)
    print(rate)

    print('Using ', args.path, ', with the below config:')
    print("")
    print("importing speech_v1p1beta1")
    print("language_code='en-US'")
    print("use_enhanced=True")
    print("enable_automatic_punctuation=False")
    print("enable_word_time_offsets=False")
    print("profanity_filter=True")
    print("sample_rate=48000hz")
    print("")
    print("Transcript is as follows")

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='en-US',
        use_enhanced=True,
        # A model must be specified to use enhanced model.
        model='phone_call',
        enable_automatic_punctuation=False,
        enable_word_time_offsets=False,
        profanity_filter=True,
        #speech_contexts=[speech.types.SpeechContext(
        #    phrases=['Andy', 'Wisy', 'EEP', 'Project', 'Tom', 'Jeff'],
        #    )],
        )

    # [START migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(('Transcript: {}'.format(result.alternatives[0].transcript)))
        print(('Confidence: {}'.format(result.alternatives[0].confidence)))
예제 #25
0
File: main.py  Project: rasalt/audio_demo
#Reference: https://cloud.google.com/speech-to-text/docs/async-recognize
#https://google-cloud-python.readthedocs.io/en/0.32.0/storage/blobs.html

from google.cloud import speech_v1p1beta1 as speech
from google.cloud.storage import Blob
from google.cloud import storage

client = speech.SpeechClient()


def diarize(data, context):

    speech_file = data['name']
    bucket = data['bucket']
    print('Bucket {}'.format(bucket))
    print('File {}'.format(speech_file))
    filename_uri = "gs://" + bucket + "/" + speech_file
    print('File name uri {}'.format(filename_uri))
    dest_file = speech_file + ".txt"

    audio = speech.types.RecognitionAudio(uri=filename_uri)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        language_code='en-US',
        enable_speaker_diarization=True,
        diarization_speaker_count=2)

    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    response = operation.result(timeout=900)  # wait up to 15 minutes

    # Collect the transcript and write it next to the source audio in the bucket.
    transcript = ""
    for result in response.results:
        transcript += result.alternatives[0].transcript + "\n"

    out_bucket = storage.Client().get_bucket(bucket)
    Blob(dest_file, out_bucket).upload_from_string(transcript)
    print('Wrote transcript to gs://{}/{}'.format(bucket, dest_file))
예제 #26
0
def main(input_queue, q_conversation, realTime=True, speech_file=None):
    global stop_loop
    global queueQA
    global arr
    global queueTranscripts
    global queueSpeakerTags
    global q_convo
    global queueThread
    q_convo = q_conversation

    max_conv_length = 1000
    stop_loop = multiprocessing.Value('i')  # used to stop all processes/loops when 'stop recording' is said
    stop_loop.value = 0
    queueQA = multiprocessing.Queue()
    queueTranscripts = multiprocessing.Queue()
    queueSpeakerTags = multiprocessing.Queue()
    queueThread = multiprocessing.Queue()
    arr = multiprocessing.Array('i', max_conv_length)

    process = multiprocessing.Process(target=calculation,
                                      args=(arr, queueQA, queueTranscripts, queueSpeakerTags, input_queue, stop_loop,
                                            queueThread,))
    process.start()

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/RedLine/Desktop/Semester 8/FYP/FYP_final/FYP-key.json"

    if realTime == True:
        print("Starting real time process")

        client = speech.SpeechClient()
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=SAMPLE_RATE,
            language_code='en-US',
            enable_speaker_diarization=True,
            enable_automatic_punctuation=True,
            max_alternatives=1
            # enable_word_time_offsets=True
        )
        streaming_config = speech.types.StreamingRecognitionConfig(
            config=config,
            interim_results=True)

        mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)

        print('Say "Quit" or "Exit" to terminate the program.')

        with mic_manager as stream:

            while not stream.closed:
                sys.stdout.write('\n' + str(
                    STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n')

                stream.audio_input = []
                audio_generator = stream.generator()

                requests = (speech.types.StreamingRecognizeRequest(
                    audio_content=content) for content in audio_generator)

                responses = client.streaming_recognize(streaming_config,
                                                       requests)

                # Now, put the transcription responses to use.
                listen_print_loop(responses, stream)

                if stream.result_end_time > 0:
                    stream.final_request_end_time = stream.is_final_end_time
                stream.result_end_time = 0
                stream.last_audio_input = []
                stream.last_audio_input = stream.audio_input
                stream.audio_input = []
                stream.restart_counter = stream.restart_counter + 1

                if not stream.last_transcript_was_final:
                    sys.stdout.write('\n')
                stream.new_stream = True

    else:
        print("Starting Non real time process")

        client = speech.SpeechClient()

        storage_uri = 'gs://fyp_1/BEP313-Scrum-Meetings1_1.wav'

        # Sample rate in Hertz of the audio data sent
        # sample_rate_hertz = 16000

        # The language of the supplied audio
        language_code = "en-US"

        # Encoding of audio data sent. This sample sets this explicitly.
        # This field is optional for FLAC and WAV audio formats.
        encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
        config = {
            # "sample_rate_hertz": sample_rate_hertz,
            "enable_speaker_diarization": True,
            "enable_automatic_punctuation": True,
            "language_code": language_code,
            "encoding": encoding,
            "audio_channel_count": 2,
        }
        audio = {"uri": storage_uri}

        operation = client.long_running_recognize(config, audio)

        print(u"Waiting for operation to complete...")
        response = operation.result()

        result = response.results[-1]

        x = result.alternatives[0]
        words_info = x.words

        tag = 1
        tag_prev = 1
        speaker = ""
        transcript = ""

        for word_info in words_info:
            if tag_prev == tag:
                tag_prev = tag
                tag = word_info.speaker_tag
                speaker = speaker + " " + word_info.word
            elif not (speaker[-1] == "." or speaker[-1] == "?"):
                speaker = speaker + " " + word_info.word
            else:
                transcript += "speaker {}: {}".format(tag_prev, speaker) + '\n'
                tag_prev = tag
                tag = word_info.speaker_tag
                speaker = "" + word_info.word

        transcript += "speaker {}: {}".format(tag_prev, speaker)
        print("transcript_1\n", transcript)
        f = open("transcript_1.txt", "w")
        f.write(transcript)
        f.close()

        f = open("transcript_1.txt")
        transcript = f.readlines()
        print("transcript_2\n", transcript)
        f.close()
        output = []
        for i in transcript:
            x = i.split(': ')
            sentence = x[-1]
            speaker_tag = x[0][-1]
            sentences = re.split(r', |\. |\?  ', sentence)
            for j in sentences:
                output.append([j.rstrip(), speaker_tag])

        print('x: ', output)
        print(process_transcripts(output)[:])
예제 #27
0
def async_transcribe(audio_file_paths,
                     bucket_name,
                     output_tsv_path,
                     sample_rate,
                     language_code,
                     speaker_count=0,
                     begin_sec=0.0):
    """Transcribe a given audio file using the async GCloud Speech-to-Text API.

  The async API has the advantage of being able to handle longer audio without
  state reset. Empirically, we've observed that the async calls lead to slightly
  better accuracy than streaming calls.

  Args:
    audio_file_paths: Paths to the audio files as a list of strings in the
      correct order.
    bucket_name: Name of GCS bucket used for holding objects temporarily.
    output_tsv_path: Path to the output TSV file.
    sample_rate: Audio sample rate.
    language_code: Language code for recognition.
    speaker_count: Number of speakers. If 0, speaker diarization will be
      disabled.
    begin_sec: Transcript begin timestamp in seconds.
  """
    tmp_audio_file = tempfile.mktemp(suffix=".flac")
    print("Temporary audio file: %s" % tmp_audio_file)
    audio_duration_s = concatenate_audio_files(audio_file_paths,
                                               tmp_audio_file)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    destination_blob_name = os.path.basename(tmp_audio_file)
    blob = bucket.blob(destination_blob_name)
    print("Uploading %s to GCS bucket %s" % (tmp_audio_file, bucket_name))
    blob.upload_from_filename(tmp_audio_file)
    gcs_uri = "gs://%s/%s" % (bucket_name, destination_blob_name)
    print("Uploaded to GCS URI: %s" % gcs_uri)

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcs_uri)
    enable_speaker_diarization = speaker_count > 0
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=sample_rate,
        language_code=language_code,
        enable_speaker_diarization=enable_speaker_diarization,
        diarization_speaker_count=speaker_count)

    operation = client.long_running_recognize(config=config, audio=audio)
    timeout_s = int(audio_duration_s * 0.25)
    print("Waiting for async ASR operation to complete "
          "(audio duration: %.3f s; ASR timeout: %d s)..." %
          (audio_duration_s, timeout_s))
    response = operation.result(timeout=timeout_s)
    blob.delete()
    os.remove(tmp_audio_file)

    utterances = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        alt = result.alternatives[0]
        utterances.append(alt.transcript)
        print(u"Transcript: {}".format(alt.transcript))
        diarized_words = [(word.word, word.speaker_tag,
                           word.start_time.total_seconds(),
                           word.end_time.total_seconds())
                          for word in alt.words]
        # print("Confidence: {}".format(result.alternatives[0].confidence))

    regrouped_utterances = regroup_utterances(utterances, diarized_words)
    with open(output_tsv_path, "w" if not begin_sec else "a") as f:
        if not begin_sec:
            # Write the TSV header.
            f.write(tsv_data.HEADER + "\n")
        utterance_counter = 0
        for (regrouped_utterance, speaker_index, start_time_sec,
             end_time_sec) in regrouped_utterances:
            utterance_counter += 1
            line = "%.3f\t%.3f\t%s\t%s [U%d] [Speaker #%d]" % (
                start_time_sec + begin_sec, end_time_sec + begin_sec,
                tsv_data.SPEECH_TRANSCRIPT_TIER, regrouped_utterance,
                utterance_counter, speaker_index)
            print(line)
            f.write(line + "\n")
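
# One way async_transcribe above might be called; the audio paths, bucket name, and
# output path are placeholders rather than values from the original project.
# speaker_count=0 would disable diarization entirely.
async_transcribe(
    audio_file_paths=["session_part1.wav", "session_part2.wav"],
    bucket_name="my-temp-asr-bucket",
    output_tsv_path="session_transcript.tsv",
    sample_rate=16000,
    language_code="en-US",
    speaker_count=2,
    begin_sec=0.0,
)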
예제 #28
0
def sample_long_running_recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
    """
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/RedLine/Desktop/Semester 8/FYP/FYP_final/FYP-key.json"

    client = speech.SpeechClient()

    # storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'

    # Sample rate in Hertz of the audio data sent
    # sample_rate_hertz = 16000

    # The language of the supplied audio
    language_code = "en-US"

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        # "sample_rate_hertz": sample_rate_hertz,
        "enable_speaker_diarization": True,
        "enable_automatic_punctuation": True,
        "language_code": language_code,
        "encoding": encoding,
        "audio_channel_count": 2,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()

    result = response.results[-1]

    words_info = result.alternatives[0].words

    tag = 1
    tag_prev = 1
    speaker = ""
    transcript = ""

    for word_info in words_info:
        if tag_prev == tag:
            tag_prev = tag
            tag = word_info.speaker_tag
            speaker = speaker + " " + word_info.word
        elif not (speaker[-1] == "." or speaker[-1] == "?"):
            speaker = speaker + " " + word_info.word
        else:
            transcript += "speaker {}: {}".format(tag_prev, speaker) + '\n'
            tag_prev = tag
            tag = word_info.speaker_tag
            speaker = "" + word_info.word

    transcript += "speaker {}: {}".format(tag_prev, speaker)
    print("transcript : ", transcript)
    f = open("transcript.txt", "a")
    f.write(transcript)
    f.close()
예제 #29
0
import io
import os

# Imports the Google Cloud client library
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types

# Instantiates a client
client = speech_v1p1beta1.SpeechClient()

print "Using testwav1m, 8k, with the below config:"
print ""
print "importing speech_v1p1beta1"
print "language_code='en-GB'"
print "use_enhanced=True"
print "enable_automatic_punctuation=True"
print "enable_word_time_offsets=True"
print ""
print "Transcript is as follows"
# The name of the audio file to transcribe
file_name = os.path.join(os.path.dirname(__file__), 'resources',
                         'test_8k_mocktheweek.wav')

# Loads the audio into memory
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
def sample_long_running_recognize(storage_uri):
    """
    Transcribe a long audio file using asynchronous speech recognition

    Args:
      local_file_path Path to local audio file, e.g. /path/audio.wav
    """
    # standard speech client
    #client = speech_v1.SpeechClient()

    # if utilizing speaker diarization
    client = speech_v1p1beta1.SpeechClient()

    # local_file_path = 'resources/brooklyn_bridge.raw'

    # The language of the supplied audio
    language_code = "en-US"

    # Enhanced model to use
    model = "phone_call"

    # Sample rate in Hertz of the audio data sent
    sample_rate_hertz = int(sys.argv[2])

    # Optional. Specifies the estimated number of speakers in the conversation.
    diarization_speaker_count = 2

    # Encoding of audio data sent. This sample sets this explicitly.
    # This field is optional for FLAC and WAV audio formats.
    encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
    config = {
        "model": model,
        "use_enhanced": True,
        "language_code": language_code,
        "sample_rate_hertz": sample_rate_hertz,
        "encoding": encoding,
        "enable_automatic_punctuation": True,
        "enable_speaker_diarization": True,
        "diarization_speaker_count": diarization_speaker_count,
    }
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    response = operation.result()
    outtext = list()
    out_text_speaker = list()
    out_text_speaker_label = list()
    for result in response.results:
        # First alternative is the most probable result
        alternative = result.alternatives[0]
        #        print(u"Transcript: {}".format(alternative.transcript))
        outtext.append(alternative.transcript)

        for word in alternative.words:
            print(u"Speaker: {}, Word: {}".format(word.speaker_tag, word.word))
            out_text_speaker.append(word.word)
            out_text_speaker_label.append(word.speaker_tag)

    return outtext, out_text_speaker, out_text_speaker_label
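
# The function above returns the per-result transcripts plus parallel word and
# speaker-tag lists. A small sketch of consuming them; the gs:// URI is a
# placeholder and sys.argv[2] is assumed to hold the audio sample rate:
transcripts, words, speaker_tags = sample_long_running_recognize(
    "gs://my-bucket/phone_call.wav")

# Group consecutive words by speaker tag for a simple per-speaker printout.
current_tag, line = None, []
for word, tag in zip(words, speaker_tags):
    if tag != current_tag and line:
        print(u"Speaker {}: {}".format(current_tag, " ".join(line)))
        line = []
    current_tag = tag
    line.append(word)
if line:
    print(u"Speaker {}: {}".format(current_tag, " ".join(line)))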