Exemplo n.º 1
0
class Translator:
    """Speech-to-text helper backed by Google Cloud Speech and a GCS bucket.

    Audio files are uploaded to (and deleted from) the ``audios/`` prefix of
    the ``cross-culture-audios`` bucket; recognition runs against ``gs://``
    URIs with FLAC encoding and the ``en-US`` language code.
    """

    def __init__(self):
        """Create the Speech client and look up the storage bucket."""
        self.client = SpeechClient()
        gcs_client = storage.Client()
        self.bucket_name = 'cross-culture-audios'
        self.bucket = gcs_client.get_bucket(self.bucket_name)

    @staticmethod
    def _seconds_from_offset(offset):
        """Return ``offset`` (an object with ``seconds``/``nanos``) as seconds.

        The nanosecond part is rounded to one decimal place of a second
        before being added to the whole seconds.
        """
        return offset.seconds + round(offset.nanos * 1e-9, 1)

    def translate_long(self, gs_uri):
        """Transcribe the audio at ``gs_uri`` with a long-running operation.

        Args:
            gs_uri: URI of the audio, passed as ``uri`` to RecognitionAudio.

        Returns:
            The stripped top transcript of every result that has at least
            one alternative, joined with newlines.
        """
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding='FLAC',
            language_code='en-US',
            sample_rate_hertz=44100,
        )
        response = self.client.long_running_recognize(
            config=config, audio=audio).result()

        transcripts = []
        for chunk in response.results:
            # Results without any alternative contribute nothing.
            if not chunk.alternatives:
                continue
            transcripts.append(str.strip(chunk.alternatives[0].transcript))
        return '\n'.join(transcripts)

    def translate_with_timestamps(self, gs_uri):
        """Transcribe the audio at ``gs_uri`` and return per-word timings.

        Unlike ``translate_long`` no sample rate is sent in the config, and
        word time offsets are requested.

        Args:
            gs_uri: URI of the audio, passed as ``uri`` to RecognitionAudio.

        Returns:
            A list of ``[word, start_seconds, end_seconds]`` lists taken from
            the first alternative of each result, in result order.
        """
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding='FLAC',
            language_code='en-US',
            enable_word_time_offsets=True,
        )
        response = self.client.long_running_recognize(
            config=config, audio=audio).result()

        to_seconds = self._seconds_from_offset
        timed_words = []
        for chunk in response.results:
            if not chunk.alternatives:
                continue
            timed_words.extend(
                [info.word,
                 to_seconds(info.start_time),
                 to_seconds(info.end_time)]
                for info in chunk.alternatives[0].words)
        return timed_words

    def upload_to_gcs(self, filepath):
        """Upload a local file under ``audios/`` and return its ``gs://`` URI.

        Only the base name of ``filepath`` (as computed by ``ntpath``) is
        used for the object name.
        """
        object_name = 'audios/%s' % ntpath.basename(filepath)
        self.bucket.blob(object_name).upload_from_filename(filepath)
        return self.generate_uri(object_name)

    def delete_from_gcs(self, filename):
        """Delete ``audios/<filename>`` from the bucket."""
        object_name = 'audios/%s' % filename
        self.bucket.delete_blob(object_name)

    def generate_uri(self, filepath):
        """Return the ``gs://<bucket>/<filepath>`` URI for an object path."""
        location = (self.bucket_name, filepath)
        return 'gs://%s/%s' % location
Exemplo n.º 2
0
    def transcribe_gcs(self, gcs_uri):
        """Asynchronously transcribes the audio file specified by the gcs_uri.

        Starts a long-running recognition (FLAC, 44100 Hz, 'en-GB', word time
        offsets enabled), polls it once per second while reporting progress
        through ``printmsg``, then converts the response into plain dicts.

        args:
            gcs_uri - URI with format 'gs://<bucket>/<path_to_audio>'
        returns:
            trans - a list of transcribed sections; each section is a dict
                with keys 'text', 'confidence' and 'words', where 'words' is
                a list of dict(word=..., tstamp=dict(min=, sec=, msec=))
                built from each word's start time. Results that carry no
                alternatives are skipped.
        """
        def to_timestamp(offset):
            # Split a seconds/nanos offset into minutes, seconds, milliseconds.
            return dict(min=offset.seconds // 60,
                        sec=offset.seconds % 60,
                        msec=offset.nanos // (10**6))

        printmsg.begin('Initiating Google Cloud Speech operation')
        client = SpeechClient()

        audio = types.RecognitionAudio(uri=gcs_uri)
        config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
            sample_rate_hertz=44100,
            language_code='en-GB',
            enable_word_time_offsets=True)

        operation = client.long_running_recognize(config, audio)
        printmsg.end()

        # Format the initial message too; passing the raw template would
        # show a literal '%%' instead of '%'.
        progress_template = 'Waiting for operation to complete [%s%%]'
        printmsg.begin(progress_template % 0)
        while not operation.done():
            time.sleep(1)
            # NOTE(review): operation.metadata can be None until the service
            # has reported progress (see google-api-core Operation.metadata),
            # so fall back to 0 instead of raising AttributeError.
            progress = getattr(operation.metadata, 'progress_percent', 0)
            printmsg.begin(progress_template % progress)
        response = operation.result(timeout=10)
        printmsg.end()

        # Each result is for a consecutive portion of the audio. Iterate through
        # them to get the transcripts for the entire audio file.
        trans = []
        for result in response.results:
            # A result without alternatives has nothing to transcribe;
            # indexing [0] on it would raise IndexError.
            if not result.alternatives:
                continue
            # The first alternative is the most likely one for this portion.
            best = result.alternatives[0]
            words = [
                dict(word=word_info.word,
                     tstamp=to_timestamp(word_info.start_time))
                for word_info in best.words
            ]
            trans.append(dict(text=best.transcript,
                              confidence=best.confidence,
                              words=words))

        return trans
Exemplo n.º 3
0
def recognize_audio_from_uri(
    uri: str,
    credential: Union[str, os.PathLike, None] = None,
    language_code: str = 'en-US',
    encoding: enums.RecognitionConfig.AudioEncoding = (
        enums.RecognitionConfig.AudioEncoding.FLAC),
    sampling_rate_hertz: int = 44100,
) -> types.RecognizeResponse:
    """Recognize speech in the audio located at ``uri``.

    A synchronous ``recognize`` call is tried first. If it raises
    ``exceptions.InvalidArgument``, a notice is printed and the request is
    retried with ``long_running_recognize``, whose result is returned instead.

    Args:
        uri (str) : Location of the audio, passed as ``uri`` to
            ``types.RecognitionAudio``.
        credential (str, os.PathLike, None) : Service-account file used to
            build the client credentials. When None, ``SpeechClient`` is
            created without explicit credentials.
        language_code: Language code placed in the recognition config.
        encoding (enums.RecognitionConfig.AudioEncoding) : Audio encoding
            placed in the recognition config.
        sampling_rate_hertz (int) : Sent as ``sample_rate_hertz`` in the
            recognition config.

    Returns:
        types.RecognizeResponse: The response of the synchronous call, or the
        result of the long-running operation when the fallback was taken.
    """
    client_kwargs = {}
    if credential is not None:
        client_kwargs['credentials'] = Credentials.from_service_account_file(
            filename=credential)
    speech_client = SpeechClient(**client_kwargs)

    # Both recognition calls take the same config/audio pair.
    request = dict(
        config=types.RecognitionConfig(encoding=encoding,
                                       language_code=language_code,
                                       sample_rate_hertz=sampling_rate_hertz),
        audio=types.RecognitionAudio(uri=uri),
    )

    try:
        return speech_client.recognize(**request)
    except exceptions.InvalidArgument:
        print(
            'cannot synchronize recognition. switched asynchronized recognition'
        )
    return speech_client.long_running_recognize(**request).result()
def get_raw(file_name: str,
            client: speech.SpeechClient,
            *,
            language_code: str = "de-DE",
            sample_rate_hertz: int = 44100,
            timeout: float = 900) -> str:
    """
    Get the raw Speech to text result from Google Cloud API

    Runs a long-running recognition with FLAC encoding and word time offsets
    enabled, waits for it to finish and serialises the response to JSON.

    :param file_name: Location of the audio; it is passed as ``uri`` to
                      ``types.RecognitionAudio`` (presumably a ``gs://`` URI
                      rather than a local path -- confirm against callers)
    :param client:    Google Cloud API Speech client
    :param language_code:     Language code sent in the recognition config
    :param sample_rate_hertz: Sample rate sent in the recognition config
    :param timeout:   Value passed as ``timeout`` to ``operation.result``

    :return: str JSON encoded response
    """
    audio = types.RecognitionAudio(uri=file_name)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=timeout)

    return MessageToJson(response)