示例#1
0
def get_transcripts_json(gcstorage_path,
                         lang,
                         phrase_hints=None,
                         speaker_count=1,
                         enhanced_model=None):
    """Transcribe an audio file with the Google Cloud Speech client.

    Args:
        gcstorage_path (str): URI of the audio file; it is passed as the
            ``uri`` of the recognition audio.
        lang (str): Language code. The shorthand ``'en'`` is sent as
            ``'en-US'`` and always selects the ``'video'`` model.
        phrase_hints (list[str], optional): Phrases sent as a boosted speech
            context. Defaults to no phrases.
        speaker_count (int, optional): Number of speakers. Diarization is
            only enabled when this is greater than 1. Defaults to 1.
        enhanced_model (str, optional): Name of the enhanced model to use.
            Any other truthy value selects ``'video'``, as before.

    Returns:
        list[dict]: One dict per result section with the keys
        ``'transcript'`` (str) and ``'words'`` (list of dicts with
        ``'word'``, ``'start_time'``, ``'end_time'`` in seconds, and
        ``'speaker_tag'``).
    """

    def _jsonify(res):
        # Flatten the speech client response into plain dicts and lists.
        sections = []
        for section in res.results:
            if not section.alternatives:
                # No hypothesis for this section, so there is nothing to report.
                continue
            best = section.alternatives[0]
            sections.append({
                'transcript': best.transcript,
                'words': [{
                    'word': word.word,
                    'start_time': word.start_time.total_seconds(),
                    'end_time': word.end_time.total_seconds(),
                    'speaker_tag': word.speaker_tag,
                } for word in best.words],
            })
        return sections

    # Copy the hints so the request never shares a list with the caller.
    phrase_hints = list(phrase_hints) if phrase_hints else []

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcstorage_path)

    diarize = speaker_count if speaker_count > 1 else False
    print(f"Diarizing: {diarize}")
    if diarize:
        # enable_speaker_diarization is a boolean flag; the expected number
        # of speakers has to go through the min/max speaker count fields.
        diarization_config = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=True,
            min_speaker_count=speaker_count,
            max_speaker_count=speaker_count)
    else:
        diarization_config = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=False)

    # if eng only, can use the optimized video model
    if lang == 'en':
        enhanced_model = 'video'

    if not enhanced_model:
        model = None
    elif isinstance(enhanced_model, str):
        # Honour the model name the caller asked for.
        model = enhanced_model
    else:
        # Non-string truthy flag: keep the historical 'video' choice.
        model = 'video'

    config = speech.RecognitionConfig(
        language_code='en-US' if lang == 'en' else lang,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            'phrases': phrase_hints,
            'boost': 15
        }],
        diarization_config=diarization_config,
        profanity_filter=True,
        use_enhanced=bool(enhanced_model),
        model=model)

    # Blocks until the long-running operation finishes.
    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
示例#2
0
def get_transcripts_json(gcsPath,
                         langCode,
                         phraseHints=None,
                         speakerCount=1,
                         enhancedModel=None):
    """Transcribes audio files.

    Args:
        gcsPath (String): path to file in cloud storage (i.e. "gs://audio/clip.mp4")
        langCode (String): language code (i.e. "en-US", see https://cloud.google.com/speech-to-text/docs/languages).
            The shorthand "en" is sent as "en-US" and always selects the "video" model.
        phraseHints (String[], optional): list of words that are unusual but likely to appear in the audio file.
            Defaults to no phrases.
        speakerCount (int, optional): Number of speakers in the audio. Diarization is only enabled when this is
            greater than 1. Defaults to 1.
        enhancedModel (String, optional): Name of the enhanced speech model to use, i.e. "video".
            Any other truthy value selects "video", as before.

    Returns:
        list: one dict per result section with the keys "transcript" and "words"; each word dict holds
        "word", "start_time", "end_time" (seconds) and "speaker_tag".

    Raises:
        Any exception raised by the speech client or by waiting on the operation result is not caught here.
    """

    # Helper function for simplifying Google speech client response
    def _jsonify(result):
        sections = []
        for section in result.results:
            if not section.alternatives:
                # No hypothesis for this section, so there is nothing to report.
                continue
            best = section.alternatives[0]
            sections.append({
                "transcript": best.transcript,
                "words": [{
                    "word": word.word,
                    "start_time": word.start_time.total_seconds(),
                    "end_time": word.end_time.total_seconds(),
                    "speaker_tag": word.speaker_tag,
                } for word in best.words],
            })
        return sections

    # Copy the hints so the request never shares a list with the caller.
    phraseHints = list(phraseHints) if phraseHints else []

    client = speech.SpeechClient()
    audio = speech.RecognitionAudio(uri=gcsPath)

    diarize = speakerCount if speakerCount > 1 else False
    print(f"Diarizing: {diarize}")
    if diarize:
        # enable_speaker_diarization is a boolean flag; the expected number
        # of speakers has to go through the min/max speaker count fields.
        diarizationConfig = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=True,
            min_speaker_count=speakerCount,
            max_speaker_count=speakerCount)
    else:
        diarizationConfig = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=False)

    # In English only, we can use the optimized video model
    if langCode == "en":
        enhancedModel = "video"

    if not enhancedModel:
        model = None
    elif isinstance(enhancedModel, str):
        # Honour the model name the caller asked for.
        model = enhancedModel
    else:
        # Non-string truthy flag: keep the historical "video" choice.
        model = "video"

    config = speech.RecognitionConfig(
        language_code="en-US" if langCode == "en" else langCode,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        speech_contexts=[{
            "phrases": phraseHints,
            "boost": 15
        }],
        diarization_config=diarizationConfig,
        profanity_filter=True,
        use_enhanced=bool(enhancedModel),
        model=model)

    # Blocks until the long-running operation finishes.
    res = client.long_running_recognize(config=config, audio=audio).result()

    return _jsonify(res)
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

# Local WAV clip to transcribe. The literal must hold the bare path: quote
# characters inside the string become part of the file name and make open()
# fail.
speech_file = "/home/serkhane/Repositories/AI/DATA/youtube_data_taflowtron/en/jocko_podcast_shortlist/v4_concate_removesilence/1HhXDprzf5I/clips/1HhXDprzf5I_trim_0.0_6180.0.wav"

# Read the whole clip into memory; the bytes are sent inline with the request.
with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

# Ask for speaker labels, allowing between 2 and 10 distinct speakers.
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    max_speaker_count=10,
)

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    diarization_config=diarization_config,
)

print("Waiting for operation to complete...")
# NOTE(review): this is the synchronous recognize() call with inline content;
# the file name suggests a long clip - confirm the audio fits the service's
# limits for synchronous requests, otherwise long_running_recognize with a
# Cloud Storage uri is presumably needed.
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result: