class Translator:
    """Transcribe FLAC audio stored in a Google Cloud Storage bucket.

    Holds a Cloud Speech client used for long-running recognition and a
    handle to the storage bucket that audio files are uploaded to and
    deleted from.
    """

    def __init__(self):
        """Create the Speech client and look up the audio bucket."""
        self.client = SpeechClient()
        storage_client = storage.Client()
        self.bucket_name = 'cross-culture-audios'
        self.bucket = storage_client.get_bucket(self.bucket_name)

    def _recognize(self, gs_uri, **config_options):
        """Run a long-running recognition and return its ``results``.

        ``config_options`` are forwarded to ``types.RecognitionConfig`` on
        top of the shared ``encoding='FLAC'`` / ``language_code='en-US'``.
        """
        audio = types.RecognitionAudio(uri=gs_uri)
        config = types.RecognitionConfig(
            encoding='FLAC',
            language_code='en-US',
            **config_options
        )
        operation = self.client.long_running_recognize(config=config,
                                                       audio=audio)
        # Blocks until the operation has finished.
        return operation.result().results

    @staticmethod
    def _to_seconds(offset):
        """Convert a seconds/nanos offset to seconds (nanos rounded to 0.1)."""
        return offset.seconds + round(offset.nanos * 1e-9, 1)

    @staticmethod
    def _blob_path(filename):
        """Return the object name used inside the bucket for ``filename``."""
        return 'audios/%s' % filename

    def translate_long(self, gs_uri):
        """Transcribe the audio at ``gs_uri`` into plain text.

        Args:
            gs_uri: URI of the audio, passed to ``RecognitionAudio(uri=...)``.

        Returns:
            The stripped top transcript of every result that has at least
            one alternative, joined with newlines.
        """
        results = self._recognize(gs_uri, sample_rate_hertz=44100)
        return '\n'.join(
            result.alternatives[0].transcript.strip()
            for result in results
            if result.alternatives
        )

    def translate_with_timestamps(self, gs_uri):
        """Transcribe the audio at ``gs_uri`` word by word with time offsets.

        Args:
            gs_uri: URI of the audio, passed to ``RecognitionAudio(uri=...)``.

        Returns:
            A list of ``[word, start_time, end_time]`` lists taken from the
            first alternative of each result; times are in seconds.
        """
        # NOTE(review): unlike translate_long, no sample_rate_hertz is sent
        # here -- presumably the rate is taken from the FLAC header; confirm.
        results = self._recognize(gs_uri, enable_word_time_offsets=True)

        words = []
        for result in results:
            if not result.alternatives:
                continue
            for word_info in result.alternatives[0].words:
                words.append([
                    word_info.word,
                    self._to_seconds(word_info.start_time),
                    self._to_seconds(word_info.end_time),
                ])
        return words

    def upload_to_gcs(self, filepath):
        """Upload a local file into the bucket's ``audios/`` folder.

        Args:
            filepath: Path of the local file; only its base name is used for
                the object name.

        Returns:
            The ``gs://`` URI of the uploaded object.
        """
        # ntpath.basename accepts both '/' and '\\' as separators.
        gs_filepath = self._blob_path(ntpath.basename(filepath))
        blob = self.bucket.blob(gs_filepath)
        blob.upload_from_filename(filepath)
        return self.generate_uri(gs_filepath)

    def delete_from_gcs(self, filename):
        """Delete ``audios/<filename>`` from the bucket."""
        self.bucket.delete_blob(self._blob_path(filename))

    def generate_uri(self, filepath):
        """Return the ``gs://<bucket>/<filepath>`` URI for an object path."""
        return 'gs://%s/%s' % (self.bucket_name, filepath)
def transcribe_gcs(self, gcs_uri):
    """Asynchronously transcribe the audio file specified by ``gcs_uri``.

    Starts a long-running recognition (FLAC, 44100 Hz, 'en-GB', word time
    offsets enabled), polls it once per second while reporting progress
    through ``printmsg``, and converts the response into plain dicts.

    args:
        gcs_uri - URI with format 'gs://<bucket>/<path_to_audio>'
    returns:
        trans - a list of transcribed sections; each section is a dict with
                'text', 'confidence' and 'words', where 'words' is a list of
                dict(word=..., tstamp=dict(min=..., sec=..., msec=...))
                built from each word's start time
    """

    def to_timestamp(offset):
        # Split a seconds/nanos offset into minutes, seconds, milliseconds.
        return dict(min=offset.seconds // 60,
                    sec=offset.seconds % 60,
                    msec=offset.nanos // (10**6))

    progress_msg = 'Waiting for operation to complete [%s%%]'

    printmsg.begin('Initiating Google Cloud Speech operation')
    client = SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code='en-GB',
        enable_word_time_offsets=True)
    operation = client.long_running_recognize(config=config, audio=audio)
    printmsg.end()

    # Format the initial message the same way as the updates below so it
    # reads "[0%]" rather than a literal "[0%%]".
    printmsg.begin(progress_msg % 0)
    while not operation.done():
        time.sleep(1)
        # Guard against metadata not being populated yet (it may be None
        # before the service reports progress).
        metadata = operation.metadata
        percent = metadata.progress_percent if metadata is not None else 0
        printmsg.begin(progress_msg % percent)
    response = operation.result(timeout=10)
    printmsg.end()

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    trans = []
    for result in response.results:
        # A result without alternatives carries no transcript; skip it
        # instead of failing on alternatives[0].
        if not result.alternatives:
            continue
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        words = [
            dict(word=word_info.word,
                 tstamp=to_timestamp(word_info.start_time))
            for word_info in best.words
        ]
        trans.append(dict(text=best.transcript,
                          confidence=best.confidence,
                          words=words))
    return trans
def recognize_audio_from_uri(
    uri: str,
    credential: Union[str, os.PathLike, None] = None,
    language_code: str = 'en-US',
    encoding: enums.RecognitionConfig.AudioEncoding = enums.RecognitionConfig.
    AudioEncoding.FLAC,
    sampling_rate_hertz: int = 44100,
) -> types.RecognizeResponse:
    """Recognize speech in the audio at ``uri``.

    A synchronous ``recognize`` call is tried first. If it raises
    ``exceptions.InvalidArgument``, a notice is printed and the same request
    is retried through ``long_running_recognize``.

    Args:
        uri (str) : Location of the audio, passed as
            ``RecognitionAudio(uri=...)``.
        credential (str, os.PathLike, None) : Service-account file handed to
            ``Credentials.from_service_account_file``. When None, the client
            is created without explicit credentials.
        language_code: Value for ``RecognitionConfig.language_code``.
        encoding (enums.RecognitionConfig.AudioEncoding) : Value for
            ``RecognitionConfig.encoding``.
        sampling_rate_hertz (int) : Value for
            ``RecognitionConfig.sample_rate_hertz``.

    Returns:
        The response of the synchronous call, or the result of the
        long-running operation when the fallback is taken.
        NOTE(review): the fallback result is presumably a
        LongRunningRecognizeResponse rather than the annotated
        types.RecognizeResponse -- confirm.
    """
    client_kwargs = {}
    if credential is not None:
        client_kwargs['credentials'] = Credentials.from_service_account_file(
            filename=credential)
    client = SpeechClient(**client_kwargs)

    # Both call styles take the same config/audio pair.
    request = dict(
        config=types.RecognitionConfig(encoding=encoding,
                                       language_code=language_code,
                                       sample_rate_hertz=sampling_rate_hertz),
        audio=types.RecognitionAudio(uri=uri),
    )

    try:
        return client.recognize(**request)
    except exceptions.InvalidArgument:
        print(
            'cannot synchronize recognition. switched asynchronized recognition'
        )
        operation = client.long_running_recognize(**request)
        return operation.result()
def get_raw(file_name: str,
            client: speech.SpeechClient,
            language_code: str = "de-DE",
            timeout: float = 900) -> str:
    """
    Get the raw Speech to text result from Google Cloud API

    The audio is recognized as FLAC at 44100 Hz with word time offsets
    enabled, using a long-running operation.

    :param file_name: Location of the audio. It is passed to the API as
        ``RecognitionAudio(uri=...)``, so it presumably has to be a Cloud
        Storage URI (``gs://<bucket>/<object>``) rather than a local
        path -- confirm against callers.
    :param client: Google Cloud API Speech client
    :param language_code: Language of the audio (defaults to "de-DE", the
        previously hard-coded value)
    :param timeout: Seconds to wait for the operation result (defaults to
        900, the previously hard-coded value)
    :return: str JSON encoded response
    """
    audio = types.RecognitionAudio(uri=file_name)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=44100,
        language_code=language_code,
        enable_word_time_offsets=True)

    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result(timeout=timeout)
    return MessageToJson(response)