Example #1
def convert_sound_file(filename,
                       language='en-US',
                       wait=1200,
                       keep_on_gs=False):
    """
    Convert a sound file to a transcript using Google API.
    Files must be converted to FLAC encoding if they are not already.
    Big files have to be stored on a gs bucket to avoid time out.

    Args:
        filename (string): name of the file to convert
        language (string): language of voices in file (default: en-US)
        wait (int): seconds to wait for the long-running operation (if 0, do not wait and return the operation name)
        keep_on_gs (bool): if True, keep the uploaded file on the gs bucket after transcription

    Returns:
        Sound file transcript (first alternative) if wait > 0
        Operation name if wait == 0
    """
    working_filename = filename

    # Convert file if needed
    _, file_extension = os.path.splitext(filename)
    if file_extension != '.flac':
        working_filename = _convert_with_ffmpeg(filename)

    # Upload file if necessary
    # Optimal size to be determined, always upload for now
    size = os.path.getsize(working_filename)
    uploaded = (size > 0)
    if uploaded:
        upload_uri = _upload_to_gs(working_filename,
                                   delete=(filename != working_filename))

    # Instantiates a Speech client using credentials
    client = speech.SpeechClient(credentials=_get_credentials())

    # Loads the audio into memory
    if uploaded:
        audio = types.RecognitionAudio(uri=upload_uri)
    else:
        with io.open(working_filename, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language)

    print("URI:", upload_uri)
    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    operation_name = operation.operation.name
    if wait > 0:
        print("Operation:", operation_name)
        retry_count = wait // 10 + 1
        while retry_count > 0 and not operation.done():
            retry_count -= 1
            time.sleep(10)
            progress = operation.metadata.progress_percent
            print("Progress:", progress)

        if not operation.done():
            raise TimeoutError(
                "Conversion not completed before end of retries")

        response = operation.result()
        transcript = ''
        for result in response.results:
            # Several alternatives could be proposed, but generally only one is available
            transcript += result.alternatives[0].transcript + '\n'

        if uploaded and not keep_on_gs:
            _delete_from_gs(upload_uri)

        return transcript

    else:
        return operation_name
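
# A brief usage sketch for convert_sound_file, assuming the private helpers it
# calls (_convert_with_ffmpeg, _upload_to_gs, _get_credentials, _delete_from_gs)
# are defined elsewhere in the same module; the file name is illustrative only.

# Blocking call: wait up to 20 minutes for the long-running operation.
text = convert_sound_file('meeting.mp3', language='en-US', wait=1200)
print(text)

# Fire and forget: with wait=0 only the operation name is returned,
# so the result can be polled later.
op_name = convert_sound_file('meeting.mp3', wait=0)
print('Pending operation:', op_name)
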
def main(name):
    # [START speech_quickstart]
    import io
    import os
    from pydub import AudioSegment

    # Imports the Google Cloud client library
    # [START migration_import]
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    # AudioSegment.ffmpeg = "C:\\ffmpeg\\ffmpeg\\bin\\ffmpeg.exe"
    # AudioSegment.ffprobe = "C:\\ffmpeg\\ffmpeg\\bin\\ffprobe.exe"
    # [END migration_import]

    # Instantiates a client
    # [START migration_client]
    client = speech.SpeechClient()
    # [END migration_client]

    # The name of the audio file to transcribe
    # file_name = os.path.join(
    #     os.path.dirname(__file__),
    #     '.',
    #     'file.wav')
    # print("++++++++++======")
    # print(file_name)
    # print("++++++++++======")
    file_name = name
    print(file_name)
    # file_name = argv
    str_name = str(file_name).split('.')[-1]
    str_firstname = str(file_name).split('.')[-2]

    filename1 = "C:/Users/multicampus/Documents/s03p23d107/backend/AI/captioning/test/images/" + str(
        file_name)

    if str_name == "m4a":
        print("m4a")
        AudioSegment.converter = "C:/ffmpeg-4.3.1-2020-09-21-full_build/bin/ffmpeg.exe"
        sound1 = AudioSegment.from_file(filename1, "m4a")
        sound1.export(str_firstname + "_trans_mp4.wav", format="wav")
        os.remove(filename1)
        filename_t = "C:/Users/multicampus/Documents/s03p23d107/backend/AI/captioning/test/images/" + str_firstname + "trans_mp4.wav"
        sound = sound1.set_channels(1)
        sound.export(filename_t, format="wav")
        sound = AudioSegment.from_wav(filename_t)
        frames_per_second = sound.frame_rate

        print(frames_per_second)

        # Loads the audio into memory
        with io.open(filename_t, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frames_per_second,
            language_code='en')

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        os.remove(filename_t)
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            stt = result.alternatives[0].transcript
            return stt
        # [END speech_quickstart]

    if str_name == "mp4":
        print("mp4")
        AudioSegment.converter = "C:/ffmpeg-4.3.1-2020-09-21-full_build/bin/ffmpeg.exe"
        sound1 = AudioSegment.from_file(filename1, "mp4")
        sound1.export(str_firstname + "_trans_mp4.wav", format="wav")
        os.remove(filename1)
        filename_t = "C:/Users/multicampus/Documents/s03p23d107/backend/AI/captioning/test/images/" + str_firstname + "trans_mp4.wav"
        sound = sound1.set_channels(1)
        sound.export(filename_t, format="wav")
        sound = AudioSegment.from_wav(filename_t)
        frames_per_second = sound.frame_rate

        print(frames_per_second)

        # Loads the audio into memory
        with io.open(filename_t, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frames_per_second,
            language_code='en')

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        os.remove(filename_t)
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            stt = result.alternatives[0].transcript
            return stt
        # [END speech_quickstart]

    if str_name == "mp3":
        print("mp3")
        AudioSegment.converter = "C:/ffmpeg-4.3.1-2020-09-21-full_build/bin/ffmpeg.exe"
        sound1 = AudioSegment.from_mp3(filename1)
        sound1.export(str_firstname + "_trans_mp3.wav", format="wav")
        os.remove(filename1)
        filename_t = "C:/Users/multicampus/Documents/s03p23d107/backend/AI/captioning/test/images/" + str_firstname + "_trans_mp3.wav"
        sound = sound1.set_channels(1)
        sound.export(filename_t, format="wav")
        sound = AudioSegment.from_wav(filename_t)
        frames_per_second = sound.frame_rate

        print(frames_per_second)

        # Loads the audio into memory
        with io.open(filename_t, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frames_per_second,
            language_code='en')

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        os.remove(filename_t)
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            stt = result.alternatives[0].transcript
            return stt
        # [END speech_quickstart]

    if str_name == "wav":
        print("wav")
        sound = AudioSegment.from_wav(filename1)
        sound = sound.set_channels(1)
        sound.export(filename1, format="wav")
        sound = AudioSegment.from_wav(filename1)
        frames_per_second = sound.frame_rate

        print(frames_per_second)

        # Loads the audio into memory
        with io.open(filename1, 'rb') as audio_file:
            content = audio_file.read()
            audio = types.RecognitionAudio(content=content)

        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frames_per_second,
            language_code='en')

        # Detects speech in the audio file
        response = client.recognize(config, audio)

        os.remove(filename1)
        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))
            stt = result.alternatives[0].transcript
            return stt
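
The four format branches above differ only in how pydub loads the source file; a consolidated helper along these lines (a sketch that reuses the same hard-coded ffmpeg path) could cover all of them:

import os
from pydub import AudioSegment

def to_mono_wav(filename, ext):
    # Sketch: convert any pydub-readable file to a mono WAV and return its path.
    AudioSegment.converter = "C:/ffmpeg-4.3.1-2020-09-21-full_build/bin/ffmpeg.exe"
    sound = AudioSegment.from_wav(filename) if ext == "wav" else AudioSegment.from_file(filename, ext)
    out_path = os.path.splitext(filename)[0] + "_mono.wav"
    sound.set_channels(1).export(out_path, format="wav")
    return out_path
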
Example #3
def run_google_speech(filename):

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
        CONFIG('project_directory'), 'testing_webpage',
        'google_cloud_speech_key.json')
    client = speech.SpeechClient()

    # The name of the audio file to transcribe
    filename = common.get_media_path(filename)

    # TODO: remove?
    """
    test_module.down_sample_wave(filename,
                                 filename,
                                 inrate=sample_rate,
                                 outrate=ideal_sample_rate,
                                 inchannels=num_channels,
                                 outchannels=1)


    """
    """
    import librosa
    audio_time_series, _ = librosa.load(filename, sr=None)
    audio_time_series = librosa.core.resample(audio_time_series,
                                              orig_sr=sample_rate,
                                              target_sr=ideal_sample_rate)

    filename = filename + '.wav'
    librosa.output.write_wav(filename, audio_time_series, ideal_sample_rate)
    """

    # With scipy
    """
    print('BEFORE:')
    in_rate, _ = describe_wav(filename)

    ds = test_module.DownSample(in_rate=in_rate, out_rate=ideal_sample_rate)
    opens = ds.open_file(filename)
    if opens:
        ds.resample(filename)
    """
    """
    freq, audio = wavfile.read(filename)

    print(audio)
    print(freq)

    wavfile.write(filename, 16000, audio)
    """

    print('AFTER:')
    describe_wav(filename)

    # Loads the audio into memory
    with io.open(filename, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='es-CO')

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))

    return response.results[0].alternatives[0].transcript
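
describe_wav() is used above but not shown; a plausible stand-in based on the standard wave module (an assumption, not the original helper) might be:

import wave

def describe_wav(filename):
    # Hypothetical helper: report a WAV file's sample rate and channel count.
    with wave.open(filename, 'rb') as w:
        rate, channels = w.getframerate(), w.getnchannels()
    print('sample rate: {} Hz, channels: {}'.format(rate, channels))
    return rate, channels
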
Example #4
def transcribe_file(speech_file, sample_rate, parser):
    # authenticate with google using credentials in JSON file
    credentials = GoogleCredentials.get_application_default()

    client = speech.SpeechClient()

    # open audio file
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    # send audio file to recognizer
    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate,
        language_code='en-US')

    utterances = []
    marker_coordinates = []

    # send audio to recognizer
    response = client.recognize(config, audio)

    # Each result is for a consecutive portion of the audio.
    # Iterate through them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        application.logger.info('Transcript: {}'.format(
            result.alternatives[0].transcript))

        # send transcribed text to geo parse service
        lat, long, address, geo_json = geo_parse(
            result.alternatives[0].transcript, parser)

        # if we get back any geo parse results store as markers for map and append to utterance text
        if bool(address) and lat is not None:
            my_point = Point((float(long), float(lat)))
            my_feature = Feature(geometry=my_point,
                                 properties={
                                     'title':
                                     'Geo Location: {}'.format(address),
                                     'description':
                                     'Transcript: {}'.format(
                                         result.alternatives[0].transcript),
                                     'marker-size':
                                     'large',
                                     'marker-color':
                                     '#FF0000',
                                     'marker-symbol':
                                     'police'
                                 })

            # Insert record into DB
            row = [
                str(uuid.uuid4()),
                float(long),
                float(lat), 'Geo Location: {}'.format(address),
                'Transcript: {}'.format(result.alternatives[0].transcript),
                str(datetime.datetime.now())
            ]
            db.InsertRow(tablename='security_events', row=row)
            #use this line to temp export the table for debugging
            #db.ExportCSV(tablename='security_events')

            # store lat long as marker coordinates for map
            marker_coordinates.append(my_feature)
            # store utterance with geo parsed address appended, for display
            utterances.append(
                'Transcript: {}'.format(result.alternatives[0].transcript) +
                ' ( ' + '<em style="color:LightGray;">' +
                'Geo Location: {}'.format(address) + '</em>' + ' )')

        elif bool(address) and lat is None:
            utterances.append(
                'Transcript: {}'.format(result.alternatives[0].transcript) +
                ' ( ' + '<em style="color:LightGray;">' +
                'Geo Location: {}'.format(address) + '</em>' + ' )')
        # if there are no geo parsed results just add the text without an address
        else:
            utterances.append('Transcript: {}'.format(
                result.alternatives[0].transcript))

    return utterances, marker_coordinates
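
The marker_coordinates returned above are geojson Feature objects; one way to serialize them for a map layer (a sketch using the geojson package, which also provides the Feature and Point used above):

import geojson

# markers: the list of Feature objects returned as marker_coordinates above
feature_collection = geojson.FeatureCollection(markers)
with open('markers.geojson', 'w') as f:
    geojson.dump(feature_collection, f)
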
Example #5
def text_recognition(path, config):
    root, ext = os.path.splitext(path)
    txt_path = root + ".txt"

    if os.path.exists(txt_path):
        with open(txt_path) as f:
            out = json.loads(f.read())
            return out

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    out = {}
    error_count = 0

    tmp_path = os.path.splitext(path)[0] + ".tmp.wav"
    client = speech.SpeechClient()  # Fixed

    while True:
        try:
            # client= speech.SpeechClient() # Causes 10060 max retries exceeded -to OAuth -HK

            content = load_audio(
                path,
                pre_silence_length=config.pre_silence_length,
                post_silence_length=config.post_silence_length)

            max_duration = config.max_duration - \
                    config.pre_silence_length - config.post_silence_length
            audio_duration = get_duration(content)

            if audio_duration >= max_duration:
                print(" [!] Skip {} because of duration: {} > {}". \
                        format(path, audio_duration, max_duration))
                return {}

            content = resample_audio(content, config.sample_rate)
            save_audio(content, tmp_path, config.sample_rate)

            with io.open(tmp_path, 'rb') as f:
                audio = types.RecognitionAudio(content=f.read())

            # use a separate name so the function's config argument is not
            # overwritten and stays available on retries
            rec_config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=config.sample_rate,
                language_code='ko-KR')

            response = client.recognize(rec_config, audio)
            if len(response.results) > 0:
                alternatives = response.results[0].alternatives

                results = [
                    alternative.transcript for alternative in alternatives
                ]
                assert len(results) == 1, "More than 1 results: {}".format(
                    results)

                out = {path: "" if len(results) == 0 else results[0]}
                print(path, results[0])
                break
            break
        except Exception as err:
            # retry on transient errors, giving up after five attempts
            error_count += 1
            print("OS error: {0}".format(err))
            print("Skip warning for {} for {} times". \
                    format(path, error_count))

            if error_count > 5:
                break
            else:
                continue

    remove_file(tmp_path)
    with open(txt_path, 'w') as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    return out
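
A hypothetical call to text_recognition, assuming load_audio, get_duration, resample_audio, save_audio and remove_file are provided elsewhere; the config object only needs the attributes read above:

from argparse import Namespace

# Illustrative settings; the field names mirror the attributes used in text_recognition.
cfg = Namespace(pre_silence_length=0.5,
                post_silence_length=0.5,
                max_duration=60,
                sample_rate=16000)
print(text_recognition('clips/utterance_0001.wav', cfg))
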
Example #6
def transcribe(gcs_uri, apikey, language='en-US', confidences=False):
    """Function to asynchronously translate audio file uploaded
    to Google Cloud Platform.

    Parameters
    -----------
      gcs_uri: str
        URI file path consisting of bucket name and filename
        See: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage
      apikey: str 
        Path to the .json file with Google Cloud API key.
        See: https://cloud.google.com/docs/authentication/api-keys
      language: str (default: 'en-US')
        Passes language code argument to client. Many languages available.
        See: https://cloud.google.com/speech-to-text/docs/languages
      confidences: bool (default: False)
        If True, also write a json file with the transcription and confidence levels.
      
    Returns
    -----------
    None
      Writes a text file with the transcription and (optionally) a json file
      with the transcription and confidence levels.
    """

    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = apikey
    client = speech.SpeechClient()

    # For optimal results, file sample hertz rate
    # should be at least 16000Hz
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        language_code=language)

    operation = client.long_running_recognize(config, audio)

    print('PROCESSING...')

    response = operation.result(timeout=None)

    transcript_list = []
    result_dict = {}

    for n, result in enumerate(response.results):
        # Note that results returns alternative translations
        # with varying degrees of confidence, with the zeroth
        # alternative as the most likely.
        transcript = result.alternatives[0].transcript
        confidence = result.alternatives[0].confidence

        result_value = {'transcript': transcript, 'confidence': confidence}

        result_key = 'result_{}'.format(n)
        result_dict[result_key] = result_value

        transcript_list.append(transcript)

    transcript_str = ''.join(transcript_list)

    # write files to os
    audio_name = gcs_uri.split('/')[3]
    writer(transcript_str, '{}-transcript.txt'.format(audio_name))
    if confidences:
        writer(result_dict,
               '{}-transcript_confidences.json'.format(audio_name))

    return None
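
A hedged usage example for transcribe; the bucket, object and key path are illustrative, and writer() is assumed to be a small helper (defined elsewhere) that writes text or json to disk:

transcribe('gs://my-bucket/audio/interview.flac',
           apikey='keys/google_cloud_key.json',
           language='en-US',
           confidences=True)
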
def speech_to_text_in_a_min(
        doc_title='範例1_一分鐘內雲端運算',
        title_pattern='nlpno.wav',
        wd='/home/slave1/git/Speech2Text_workshop/record',
        json_os='/home/slave1/git/Speech2Text_workshop/speech2text-3de4444fd46a.json',
        sample_rate_hertz=48000):
    '''
    * json_os: path to the credentials file
    * title_pattern: filename pattern of the recordings
    * sample_rate_hertz: sampling rate of the recordings
    * doc_title: name of the docx document
    * wd: working directory

    '''

    # Start the timer
    start_time = time.time()
    # Authenticate the Python client against the cloud speech2text service
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = json_os
    client = speech.SpeechClient()

    file_list = os.listdir(wd)

    # Pick out the recordings matching title_pattern
    select_wav = []
    for i in file_list:
        if title_pattern in i:
            select_wav.append(wd + '/' + i)

    aa = pd.DataFrame()

    for music in select_wav:

        # Read in the audio recording
        with io.open(music, 'rb') as audio_file:
            content = audio_file.read()

        # Convert the recording into a format Google understands
        audio = types.RecognitionAudio(content=content)

        # Set up the recognition config for the recording
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate_hertz,
            language_code='cmn-Hant-TW',
            enable_word_time_offsets=True)

        # Machine-learning speech recognition (speech2text)
        print('')
        response = client.recognize(config, audio)

        transcript_list = []
        transcript_confidence = []
        timerecored = []
        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        for result in response.results:
            alternative = result.alternatives[0]
            # The first alternative is the most likely one for this portion.
            transcript_list.append(alternative.transcript)
            transcript_confidence.append(alternative.confidence)
            print('Transcript: {}'.format(alternative.transcript))
            print('Confidence: {}'.format(alternative.confidence))

            # beginning and end time of a sentence
            sentence_start_time = alternative.words[0].start_time
            sentence_end_time = alternative.words[len(alternative.words) -
                                                  1].end_time

            # make time
            sentence_start_time = round(sentence_start_time.seconds +
                                        sentence_start_time.nanos * 1e-9)
            sentence_end_time = round(sentence_end_time.seconds +
                                      sentence_end_time.nanos * 1e-9)

            # make min
            sentence_start_time = str(
                datetime.timedelta(seconds=sentence_start_time))
            sentence_end_time = str(
                datetime.timedelta(seconds=sentence_end_time))
            timerecored.append([sentence_start_time, sentence_end_time])

        # Build the confidence table with pandas
        # make df
        transcript_df = pd.DataFrame(transcript_list, columns=['文章段句'])
        confidence_df = pd.DataFrame(transcript_confidence,
                                     columns=['機器認字信心水準'])
        confidence_df['機器認字信心水準'] = round(confidence_df['機器認字信心水準'], 2)
        time_df = pd.DataFrame(timerecored, columns=['start', 'end'])
        correctness_summary_df = pd.concat(
            [transcript_df, confidence_df, time_df], axis=1)
        correctness_summary_df = correctness_summary_df.sort_values(
            ['機器認字信心水準'])
        correctness_summary_df['改善順序'] = range(1,
                                               len(correctness_summary_df) + 1)

        timer_translist = []
        for hah, timer in zip(transcript_list, timerecored):
            timer_translist.append(hah + '  ' + '【' + ' to '.join(timer) + '】')

        aa = pd.concat([aa, correctness_summary_df])

    # Build the word cloud
    from speech2text import make_worldcould_report, text_freq
    cut_text = make_worldcould_report(data=aa,
                                      pd_text_col='文章段句',
                                      mask_pic=False,
                                      filename='wordcloud',
                                      pic_name='test.png')
    words_counts = text_freq(cut_text)

    # Compute word importance
    max = words_counts['counts'].describe()['max']
    mean = words_counts['counts'].describe()['mean']

    # Keep only words whose counts fall between the mean and the max
    words_counts = words_counts[(words_counts['counts'] <= max)
                                & (words_counts['counts'] >= mean)]

    df_count_all = pd.DataFrame()
    for index, i in words_counts.iterrows():

        df_count = correctness_summary_df[
            correctness_summary_df['文章段句'].str.contains(i['word'])]

        if not df_count.empty:
            df_count['重要性'] = i['counts']
            df_count_all = pd.concat([df_count_all, df_count])

    # group by
    correctness_summary_df = df_count_all.groupby(
        ['文章段句', '機器認字信心水準', 'start', 'end', '改善順序'],
        as_index=False)['重要性'].mean().round(2)

    # save to docx
    document = Document()
    document.add_heading(doc_title, 0)
    document.add_paragraph(
        '機器認字信心水準' + str(round(correctness_summary_df['機器認字信心水準'].mean(), 2)) +
        '\n\n' + '\n\n'.join(timer_translist))
    document.add_picture('wordcloud.png', width=Cm(15), height=Cm(13))
    document.save(doc_title + '_文章逐字稿.docx')
    print('Done')
    print('請看工作目錄檔案中有沒有兩個檔案,一格個是完整的docx檔案,一個是xlsx檔案')
    print("--- %s seconds ---" % (round(time.time() - start_time, 2)))
    return correctness_summary_df.to_excel(doc_title + '_文章認字信心矩陣.xlsx')
Example #8
    print "finished recording"

    # stop Recording
    stream.stop_stream()
    stream.close()
    audio1.terminate()

    file = open("newfile.raw", "w")
    file.write(b''.join(frames))
    file.close()

    client = speech.SpeechClient()

    with io.open('newfile.raw', 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code='en-US')

    response = client.recognize(config, audio)

    for result in response.results:
        print('Transcript: {}'.format(result.alternatives[0].transcript))
        if result.alternatives[0].transcript == "lights on":
            s = "1"
        elif result.alternatives[0].transcript == "orange":
            s = "2"
        elif result.alternatives[0].transcript == "kitchen":
Example #9
def startSTT(end):
    form_1 = pyaudio.paInt16  # 16-bit resolution
    chans = 1  # 1 channel
    samp_rate = 44100  # 44.1kHz sampling rate
    chunk = 4096  # 2^12 samples for buffer
    record_secs = 3600  # seconds to record
    dev_index = 2  # device index found by p.get_device_info_by_index(ii)

    threshold = 15000
    sliding_window = deque(maxlen=15)

    client = speech.SpeechClient()
    #send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    #send_socket.connect(('10.0.0.194', 8002))

    audio = pyaudio.PyAudio()  # create pyaudio instantiation

    # create pyaudio stream
    stream = audio.open(format = form_1,rate = samp_rate,channels = chans, \
                    input_device_index = dev_index, input = True, \
                    frames_per_buffer=chunk)
    print("recording")

    predata = deque(maxlen=10)

    while end.value == 0:
        frames = []
        started = False
        # loop through stream and append audio chunks to frame array
        for ii in range(0, int((samp_rate / chunk) * record_secs)):
            data = stream.read(chunk, exception_on_overflow=False)
            predata.append(data)
            rms = audioop.rms(data, 2)
            print(rms)
            if rms > threshold and started is False:
                started = True
                print('started')
            if started:
                frames.append(data)
                sliding_window.append(rms)
                if sum(ii < threshold for ii in sliding_window) >= 15:
                    print("ending")
                    break

        print("finished recording")

        for i in range(len(predata)):
            print('added a frame')
            frames.insert(0, predata.pop())

        audio = types.RecognitionAudio(content=b''.join(frames))
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=44100,
            language_code='en-US')

        response = client.recognize(config, audio)

        for result in response.results:
            print('Transcript: {}'.format(result.alternatives[0].transcript))

        # send result to server
        #send_socket.send((result.alternatives[0].transcript).encode())
        print("sent")

    # stop the stream, close it, and terminate the pyaudio instantiation
    stream.stop_stream()
    stream.close()
    audio.terminate()
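
startSTT polls end.value to decide when to stop; a launch sketch using multiprocessing (a guess at the intended wiring, with the module-level imports used above assumed to be in place):

import multiprocessing as mp
import time

if __name__ == '__main__':
    end_flag = mp.Value('i', 0)                      # shared stop flag read by startSTT
    worker = mp.Process(target=startSTT, args=(end_flag,))
    worker.start()

    time.sleep(60)                                   # capture for about a minute
    end_flag.value = 1                               # the loop exits after the current utterance
    worker.join()
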
music = [i for i in music
         if 'wav' in i]  # On mac this differs; a filename pattern should be used. Remember indices start at 0, so the second value is index 1

## The above picks up nlpno.wav
music = music[0]  # successfully pull out the second value

# Read in the audio recording
with io.open(music, 'rb') as audio_file:
    content = audio_file.read()

# Take a look at how the data is represented once read into Python
content[0:100]

# Convert the recording into a format Google understands
audio = types.RecognitionAudio(content=content)

# If you think your machine is fast enough, try evaluating audio below
#audio

##################################################################
# Google speech analysis
###################################################################

# Set up the recognition config for the recording
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,  # LINEAR16 for audio
    sample_rate_hertz=sample_rate_hertz,
    language_code='cmn-Hant-TW',  # Taiwan locale
    enable_word_time_offsets=True)  # whether to split into timed segments
Example #11
def validate_dataset(yt_uri, matching, in_stage, out_stage):
    # Use vid as the directory name for download and processing
    vids = parse_qs(urlparse(yt_uri).query, keep_blank_values=True).get('v')
    vid = None if vids is None else vids[0]

    v_dir = os.path.join(data_path, vid)
    in_dir = os.path.join(v_dir, in_stage)
    out_dir = os.path.join(v_dir, out_stage)
    ext_dir = os.path.join(v_dir, out_stage + 'ext')

    # Get information on the YouTube content
    try:
        yt = YouTube(yt_uri)
    except:
        e = sys.exc_info()[0]
        print("Exception: {}".format(e))
        sys.exit(1)

    # Creating array of wav files
    files = []
    for file in os.listdir(in_dir):
        if file.endswith('.wav'):
            files.append(file)
    files.sort()

    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(ext_dir, exist_ok=True)

    # Speech client
    client = speech.SpeechClient()

    for file in files:
        event_no = os.path.splitext(os.path.basename(file))[0]
        subtitle = os.path.join(in_dir, event_no + '.txt')
        transcript = os.path.join(in_dir, event_no + 't.txt')

        if not Path(subtitle).exists():
            continue

        # Printing process and testing files
        try:
            file_path = os.path.join(in_dir, file)
            print(file_path)
            audio_file = io.open(file_path, 'rb')
            audio_content = audio_file.read()
            audio_file.close()

            audio = types.RecognitionAudio(content=audio_content)
            config = types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                speech_contexts=[{
                    "phrases": build_phrase_hint(subtitle)
                }],
                language_code='ko-kr')

            response = client.recognize(config, audio)

            subtitle_file = io.open(subtitle, 'r')
            transcript_file = io.open(transcript, 'w')

            # Determining appropriateness of existing subtitle
            result_script = ""
            print(u"Subtitle: {}".format(subtitle_file.read()))
            for result in response.results:
                print(u"Response: {}".format(
                    result.alternatives[0].transcript))
                print("Confidence: {}".format(
                    result.alternatives[0].confidence))
                result_script += result.alternatives[0].transcript

            print(u"Transcript: {}".format(result_script))

            try:
                transcript_file.write(result_script)
            except:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                exc_file = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, exc_file, exc_tb.tb_lineno)
                sys.exit(1)

            subtitle_file.close()
            transcript_file.close()

            score = similarity_score(subtitle, transcript)

            # Moving appropriate files to output pipeline stage
            if matching == 'exact':
                result = exact_match(subtitle, transcript)
            elif matching == 'similarity':
                result = score >= 0.9
            else:  # matching == 'subs' or else
                result = substring_match(subtitle, transcript)

            if result:
                shutil.move(file_path, out_dir)
                shutil.move(subtitle, out_dir)
                shutil.move(transcript, out_dir)
                message = "Matched"
            elif score >= 0.95:
                shutil.move(file_path, ext_dir)
                shutil.move(subtitle, ext_dir)
                shutil.move(transcript, ext_dir)
                message = "Matched (Similar)"
            else:
                message = "Not Matched"

            print("Result: {}, Score: {}".format(message, score))
            print("")

        except:
            exc_type, exc_obj, exc_tb = sys.exc_info()
            exc_file = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            print(exc_type, exc_file, exc_tb.tb_lineno)
            sys.exit(1)
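
similarity_score, exact_match, substring_match and build_phrase_hint are assumed helpers; a plausible similarity_score based on difflib (an illustration only, not the original implementation) could look like:

import difflib

def similarity_score(subtitle_path, transcript_path):
    # Hypothetical helper: similarity ratio (0.0-1.0) between two text files.
    with open(subtitle_path, encoding='utf-8') as f:
        subtitle_text = f.read()
    with open(transcript_path, encoding='utf-8') as f:
        transcript_text = f.read()
    return difflib.SequenceMatcher(None, subtitle_text, transcript_text).ratio()
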
Example #12
def transcript(content):
    audio = types.RecognitionAudio(content=content)
    response = client.recognize(config, audio)
    return response.results
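
transcript() relies on a module-level client and config; a minimal setup it assumes (values illustrative) would be something like:

import io
from google.cloud import speech
from google.cloud.speech import enums, types

client = speech.SpeechClient()
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=16000,
    language_code='en-US')

with io.open('sample.wav', 'rb') as f:   # hypothetical file
    for result in transcript(f.read()):
        print(result.alternatives[0].transcript)
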
Example #13
#! pip install webapp2
#! pip install cloudstorage
#! pip install GoogleAppEngineCloudStorageClient

from google.cloud import storage
from google.cloud import speech_v1p1beta1 as speech

storage_client = storage.Client()

bucket = storage_client.get_bucket('storagexxx')

# Upload the local recording. upload_from_filename() returns None, so the
# audio is referenced afterwards through its gs:// URI rather than through
# the return value.
blob = bucket.blob('Info/1-NotSolved_No_Silence.wav')
blob.upload_from_filename(filename='1-NotSolved_No_Silence.wav')

audio = speech.types.RecognitionAudio(
    uri='gs://storagexxx/Info/1-NotSolved_No_Silence.wav')

client = speech.SpeechClient()

config = speech.types.RecognitionConfig(
    encoding=speech.enums.RecognitionConfig.AudioEncoding.MULAW,
    sample_rate_hertz=8000,
    enable_word_time_offsets=True,
    language_code='pt-BR',
    enable_automatic_punctuation=True,
    use_enhanced=True,
    speech_contexts=[speech.types.SpeechContext(phrases=['computador', 'wi-fi'])],
    enable_speaker_diarization=True,
    diarization_speaker_count=2)
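
The snippet ends at the config; a sketch of how it would typically continue (the timeout value is illustrative):

operation = client.long_running_recognize(config, audio)
response = operation.result(timeout=600)

for result in response.results:
    print(result.alternatives[0].transcript)
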
Example #14
def speech_to_text():

    print('button pressed')
    os.system('python D:/Hackathons/VirtualPatient/src/record.py')
    file_name = "D:/Hackathons/VirtualPatient/output.wav"

    # X, sample_rate = librosa.load(file_name, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
    # sample_rate = np.array(sample_rate)
    # mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
    # featurelive = mfccs
    # livedf2 = featurelive

    # livedf2 = pd.DataFrame(data=livedf2)

    # livedf2 = livedf2.stack().to_frame().T

    # twodim = np.expand_dims(livedf2, axis=2)

    # livepreds = loaded_model.predict(twodim,
    #                          batch_size=32,
    #                          verbose=1)

    # livepreds = np.array([0,0,0,0,0,0,0,0,0,0])

    # Instantiates a client
    client = speech.SpeechClient()

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
        audio = types.RecognitionAudio(content=content)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code='en-US')

    try:
        # Detects speech in the audio file
        response = client.recognize(config, audio)
        print(response)

        for result in response.results:
            data = {'msg': result.alternatives[0].transcript, 'counter': 1}
            resp = Response(json.dumps(data),
                            status=200,
                            mimetype='application/json')
            resp.headers['Access-Control-Allow-Origin'] = '*'

        return resp

    except:
        data = {
            'msg': "Your message was not picked up, please try again.",
            'counter': 0
        }
        resp = Response(json.dumps(data),
                        status=200,
                        mimetype='application/json')
        resp.headers['Access-Control-Allow-Origin'] = '*'

        return resp
def speech_to_text(
        gcs_uri='gs://speechfashion/Acc.wav',
        doc_title='範例2_一分鐘以上雲端運算',
        timeout=None,
        json_os='/home/slave1/git/Speech2Text/damnhow-db8d83229dd4.json',
        sample_rate_hertz=96000):
    '''
    1. Produce a recognition-confidence matrix (xlsx) that tells the editor which sentences to fix first
    2. Produce a docx transcript, together with the overall recognition confidence, for the editor to revise

    '''
    # Start the timer
    start_time = time.time()
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = json_os

    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code='cmn-Hant-TW',
        enable_word_time_offsets=True)

    #    config=types.StreamingRecognitionConfig(config=config)
    #    stream = [audio]
    #    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
    #                for chunk in stream)
    #    responses = client.streaming_recognize(config, requests)

    operation = client.long_running_recognize(config, audio)

    print('機器學習文字辨識中...')
    response = operation.result(timeout=timeout)  #

    aa = pd.DataFrame()
    transcript_list = []
    transcript_confidence = []
    timerecored = []
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        alternative = result.alternatives[0]
        # The first alternative is the most likely one for this portion.
        transcript_list.append(alternative.transcript)
        transcript_confidence.append(alternative.confidence)
        print('Transcript: {}'.format(alternative.transcript))
        print('Confidence: {}'.format(alternative.confidence))

        # beginning and end time of a sentence
        sentence_start_time = alternative.words[0].start_time
        sentence_end_time = alternative.words[len(alternative.words) -
                                              1].end_time

        # make time
        sentence_start_time = round(sentence_start_time.seconds +
                                    sentence_start_time.nanos * 1e-9)
        sentence_end_time = round(sentence_end_time.seconds +
                                  sentence_end_time.nanos * 1e-9)

        # make min
        sentence_start_time = str(
            datetime.timedelta(seconds=sentence_start_time))
        sentence_end_time = str(datetime.timedelta(seconds=sentence_end_time))
        timerecored.append([sentence_start_time, sentence_end_time])

        # Build the confidence table with pandas
        # make df
        transcript_df = pd.DataFrame(transcript_list, columns=['文章段句'])
        confidence_df = pd.DataFrame(transcript_confidence,
                                     columns=['機器認字信心水準'])
        confidence_df['機器認字信心水準'] = round(confidence_df['機器認字信心水準'], 2)
        time_df = pd.DataFrame(timerecored, columns=['start', 'end'])
        correctness_summary_df = pd.concat(
            [transcript_df, confidence_df, time_df], axis=1)
        correctness_summary_df = correctness_summary_df.sort_values(
            ['機器認字信心水準'])
        correctness_summary_df['改善順序'] = range(1,
                                               len(correctness_summary_df) + 1)

        timer_translist = []
        for hah, timer in zip(transcript_list, timerecored):
            timer_translist.append(hah + '  ' + '【' + ' to '.join(timer) + '】')

        aa = pd.concat([aa, correctness_summary_df])

    # Build the word cloud
    from speech2text import make_worldcould_report, text_freq
    cut_text = make_worldcould_report(data=aa,
                                      pd_text_col='文章段句',
                                      mask_pic=False,
                                      filename='wordcloud',
                                      pic_name='test.png')
    words_counts = text_freq(cut_text)

    # Compute word importance
    max = words_counts['counts'].describe()['max']
    mean = words_counts['counts'].describe()['mean']

    # Keep only words whose counts fall between the mean and the max
    words_counts = words_counts[(words_counts['counts'] <= max)
                                & (words_counts['counts'] >= mean)]

    df_count_all = pd.DataFrame()
    for index, i in words_counts.iterrows():

        df_count = correctness_summary_df[
            correctness_summary_df['文章段句'].str.contains(i['word'])]

        if not df_count.empty:
            df_count['重要性'] = i['counts']
            df_count_all = pd.concat([df_count_all, df_count])

    # group by
    correctness_summary_df = df_count_all.groupby(
        ['文章段句', '機器認字信心水準', 'start', 'end', '改善順序'],
        as_index=False)['重要性'].mean().round(2)

    # save to docx
    document = Document()
    document.add_heading(doc_title, 0)
    document.add_paragraph(
        '機器認字信心水準' + str(round(correctness_summary_df['機器認字信心水準'].mean(), 2)) +
        '\n\n' + '\n\n'.join(timer_translist))
    document.add_picture('wordcloud.png', width=Cm(15), height=Cm(13))
    document.save(doc_title + '_文章逐字稿.docx')
    print('Done')
    print('請看工作目錄檔案中有沒有兩個檔案,一格個是完整的docx檔案,一個是xlsx檔案')
    print("--- %s seconds ---" % (round(time.time() - start_time, 2)))
    return correctness_summary_df.to_excel(doc_title + '_文章認字信心矩陣.xlsx')
Example #16
storage_client = storage.Client()
client = speech.SpeechClient()  # used by long_running_recognize below

#variables
source_bucket_name = '911-calls'
source_bucket = storage_client.bucket(source_bucket_name)
bucket_prefix = 'audio-recordings-wav'

#create a csv file
with open('data.csv', 'a') as csvfile:
    csvfile.write('audio_gcs_uri, transcript' + '\n')

# for each audio file ....
for file in (list(source_bucket.list_blobs(prefix=bucket_prefix))):
    audio_gcs_uri = "gs://" + source_bucket_name + "/" + file.name

    audio = types.RecognitionAudio(uri=audio_gcs_uri)
    print(audio)

    config = types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # use_enhanced=True, # for phone audio
        # model='phone_call', # model must be specified for enhanced model
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=3600)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
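    # A possible continuation (a sketch): join the per-result transcripts and
    # append one row per audio file to the CSV created above.
    transcript = ''.join(result.alternatives[0].transcript
                         for result in response.results)

    with open('data.csv', 'a') as csvfile:
        csvfile.write(audio_gcs_uri + ', ' + transcript + '\n')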
Example #17
client = speech.SpeechClient()

# The name of the audio file to transcribe
audio_file = sys.argv[1]  # 'SchoolOfAI.wav'
file_to_mono = 'audio_mono.wav'
sound = AudioSegment.from_wav(audio_file)
sound = sound.set_channels(1)  # convert audio file to mono
sound.export(file_to_mono, format="wav")

# Loads the audio into memory - for when not storing file on Google Cloud Storage
# with io.open(file_to_mono, 'rb') as audio_file:
#     content = audio_file.read()
# audio = types.RecognitionAudio(content=content)

# Loads the audio from Google Cloud Storage
audio = types.RecognitionAudio(uri='gs://ctrlfboilermake/' + audio_file)

config = types.RecognitionConfig(
    # encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    # sample_rate_hertz=16000,
    language_code='en-US',
    enable_word_time_offsets=True,
    enable_automatic_punctuation=True)

# Detects speech in the audio file
# response = client.recognize(config, audio)
operation = client.long_running_recognize(config, audio)
result = operation.result(timeout=90)

transcript = ""
list_of_times = []
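
The example stops after initializing transcript and list_of_times; a hedged continuation that fills them from the word time offsets requested in the config might read:

for res in result.results:
    alternative = res.alternatives[0]
    transcript += alternative.transcript
    for word_info in alternative.words:
        # start_time is a Duration-like proto with seconds and nanos fields
        start = word_info.start_time
        list_of_times.append((word_info.word, start.seconds + start.nanos * 1e-9))
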
Example #18
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    print(speech_file)
    client = speech.SpeechClient()
    now = datetime.datetime.now()
    speech_file = os.getcwd() + '/' + speech_file
    # [START speech_python_migration_sync_request]
    # [START speech_python_migration_config]
    outstr = ''
    try:
        #print(os.path.isfile(speech_file) )
        if (debug):
            print('Python::opening file ' + speech_file)

        with io.open(speech_file, 'rb') as audio_file:
            content = audio_file.read()
        audio = types.RecognitionAudio(content=content)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='ko-KR')
        # [END speech_python_migration_config]

        # [START speech_python_migration_sync_response]
        if (debug):
            print('GCP::requesting : ' + speech_file)
        response = client.recognize(config, audio)
        # [END speech_python_migration_sync_request]
        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        # The first alternative is the most likely one for this portion.
        #output_transcription = response.results[0].alternatives[0].transcript
        if (debug):
            print('opening response file')
        outfile = open('response.txt', 'a')
        for result in response.results:

            if (os.name == 'posix'):
                outstr = (result.alternatives[0].transcript)
            else:
                #                outstr =(result.alternatives[0].transcript).encode("utf-8")
                outstr = (result.alternatives[0].transcript)

#            outstr = result.alternatives[0].transcript
            if (debug):
                print('GCP::response : ' + outstr)
            #outstr = result.alternatives[0].transcript
            #print(u'Transcript: {}'.format(result.alternatives[0].transcript))
            #print(outstr.format(result.alternatives[0].transcript))
            outfile.write(now.strftime("%Y-%m-%d %H:%M") + " : " + outstr)
            outfile.write('\n')
            #outfile.write(outstr)
        print("GCP-response::" + outstr)
        #outfile.write(output_transcription)
        #outfile.write(response.results[0].alternatives[0].transcript)
        outfile.close()
#        print('closing response file')
    except:
        #outfile = open('output.txt', 'w')
        outfile = open('response.txt', 'a')
        outfile.write(now.strftime("%Y-%m-%d %H:%M") + " : " + "ERROR\n")
        #outfile.write('recognition error occured')
        outfile.close()
        outstr = 'ERROR::exeption occured in GCP-Speech'
        print(outstr)
        #outfile.write("error")
        #outfile.close()
        # [END speech_python_migration_sync_response]
    # [END speech_transcribe_sync]
#    print('got ' + outstr)

    if (outstr == ""):
        return "<내용 없음>"
    return outstr


def get_args():
    parser = argparse.ArgumentParser(
        description='Prints the language from an Image')
    parser.add_argument(dest="audio", help='URL of an image')

    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    # api_key = os.environ.get('GCP_API_KEY')
    # project_name = os.environ.get('PROJECT_NAME')
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'service_account.json'

    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=args.audio)
    config = types.RecognitionConfig(language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))