def main():
    """Command-line entry point: transcribe one or more audio files with Cheetah.

    Builds an argument parser for the Cheetah library/model/license paths plus a
    comma-separated list of audio files, then prints one transcript per file.
    Raises ValueError when a file's sample rate differs from Cheetah's.
    """

    def _abs_path(rel_path):
        # Resolve a path relative to the repository root (two directories up).
        return os.path.join(os.path.dirname(__file__), '../..', rel_path)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--library_path',
        help="absolute path to Cheetah's dynamic library",
        type=str,
        default=_abs_path('lib/linux/x86_64/libpv_cheetah.so'))
    parser.add_argument(
        '--acoustic_model_path',
        help='absolute path to acoustic model parameter file',
        type=str,
        default=_abs_path('lib/common/acoustic_model.pv'))
    parser.add_argument(
        '--language_model_path',
        help='absolute path to language model parameter file',
        type=str,
        default=_abs_path('lib/common/language_model.pv'))
    parser.add_argument(
        '--license_path',
        help='absolute path to license file',
        type=str,
        default=_abs_path('resources/license/cheetah_eval_linux_public.lic'))
    parser.add_argument(
        '--audio_paths',
        help='comma-separated absolute paths to audio files to be transcribed',
        type=str,
        required=True)
    args = parser.parse_args()

    cheetah = Cheetah(
        library_path=args.library_path,
        acoustic_model_path=args.acoustic_model_path,
        language_model_path=args.language_model_path,
        license_path=args.license_path)

    audio_files = (os.path.expanduser(p.strip()) for p in args.audio_paths.split(','))
    for audio_path in audio_files:
        audio, sample_rate = soundfile.read(audio_path, dtype='int16')
        if sample_rate != cheetah.sample_rate:
            raise ValueError(
                'Cheetah can only process audio data with sample rate of %d' %
                cheetah.sample_rate)

        frame_length = cheetah.frame_length
        # Feed whole frames only; any trailing partial frame is dropped, as in
        # the original integer-division framing.
        pieces = []
        for start in range(0, (len(audio) // frame_length) * frame_length,
                           frame_length):
            partial_transcript, _ = cheetah.process(
                audio[start:start + frame_length])
            pieces.append(partial_transcript)
        pieces.append(cheetah.flush())
        print(''.join(pieces))
def test_transcribe(self):
    """End-to-end check: transcribe a known sample and compare to reference text.

    NOTE(review): `abs_path` is effectively a no-op here — os.path.join discards
    its earlier components when handed an absolute path, so the hard-coded
    /home/nithin/... paths are used verbatim; confirm that is intended.
    """

    def abs_path(rel_path):
        return os.path.join(os.path.dirname(__file__), '../..', rel_path)

    cheetah = Cheetah(
        library_path=abs_path('/home/nithin/project1/libpv_cheetah.so'),
        acoustic_model_path=abs_path('/home/nithin/project1/acoustic_model.pv'),
        language_model_path=abs_path('/home/nithin/project1/language_model.pv'),
        license_path=abs_path(
            '/home/nithin/project1/cheetah_eval_linux_public.lic'))

    audio, sample_rate = soundfile.read(
        abs_path('/home/nithin/project1/audio_samples/test.wav'),
        dtype='int16')
    assert sample_rate == cheetah.sample_rate

    # Process the sample one whole frame at a time, accumulating the partial
    # transcripts, then flush to finalize the remainder.
    frame_length = cheetah.frame_length
    chunks = []
    for start in range(0, (len(audio) // frame_length) * frame_length,
                       frame_length):
        partial_transcript, _ = cheetah.process(
            audio[start:start + frame_length])
        chunks.append(partial_transcript)
    chunks.append(cheetah.flush())

    self.assertEqual(
        ''.join(chunks),
        "MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL"
    )
def run(self):
    """Capture microphone audio and print the live transcript until stopped.

    Frames are read from a PyAudio input stream and fed to Cheetah; partial
    transcripts stream to stdout, and each detected endpoint flushes the
    finalized text. If an output path was configured, the raw recording is
    also written out as 16-bit PCM on shutdown.
    """
    cheetah = None
    pa = None
    audio_stream = None
    sample_rate = None

    try:
        cheetah = Cheetah(
            library_path=self._library_path,
            acoustic_model_path=self._acoustic_model_path,
            language_model_path=self._language_model_path,
            license_path=self._license_path,
            endpoint_duration_sec=1)
        # Capture the rate now: `cheetah` is deleted in the finally block
        # before the recorded audio is written, so reading it later would be
        # a use-after-delete (this was a bug in the original).
        sample_rate = cheetah.sample_rate

        pa = pyaudio.PyAudio()
        audio_stream = pa.open(
            rate=cheetah.sample_rate,
            channels=1,
            format=pyaudio.paInt16,
            input=True,
            frames_per_buffer=cheetah.frame_length,
            input_device_index=self._input_device_index)

        while True:
            pcm = audio_stream.read(cheetah.frame_length)
            pcm = struct.unpack_from("h" * cheetah.frame_length, pcm)

            if self._output_path is not None:
                self._recorded_frames.append(pcm)

            partial_transcript, is_endpoint = cheetah.process(pcm)
            print(partial_transcript, end='', flush=True)
            if is_endpoint:
                print(cheetah.flush())
    except Exception as e:
        # NOTE(review): broad catch keeps the demo from crashing on engine or
        # audio errors; the error is reported and cleanup still runs below.
        print(e)
    finally:
        if cheetah is not None:
            try:
                # Emit whatever buffered audio had not yet been finalized.
                print(cheetah.flush())
            finally:
                # Ensure the engine is released even if the final flush raises.
                cheetah.delete()

        if audio_stream is not None:
            audio_stream.close()

        if pa is not None:
            pa.terminate()

        if self._output_path is not None and len(self._recorded_frames) > 0:
            recorded_audio = np.concatenate(
                self._recorded_frames, axis=0).astype(np.int16)
            soundfile.write(
                self._output_path,
                recorded_audio,
                samplerate=sample_rate,
                subtype='PCM_16')
# Script tail (older Cheetah API variant: `*_file_path` keywords and a single
# `transcribe()` call instead of per-frame partial transcripts): register the
# audio-list argument, build the engine, and print one transcript per file.
parser.add_argument(
    '--audio_paths',
    help='comma-separated absolute paths to audio files to be transcribed',
    type=str,
    required=True)
args = parser.parse_args()

cheetah = Cheetah(
    library_path=args.library_path,
    acoustic_model_file_path=args.acoustic_model_file_path,
    language_model_file_path=args.language_model_file_path,
    license_file_path=args.license_file_path)

audio_files = [os.path.expanduser(p.strip()) for p in args.audio_paths.split(',')]
for audio_path in audio_files:
    audio, sample_rate = soundfile.read(audio_path, dtype='int16')
    if sample_rate != cheetah.sample_rate:
        raise ValueError(
            'Cheetah can only process audio data with sample rate of %d' %
            cheetah.sample_rate)

    # Feed whole frames; the trailing partial frame (if any) is dropped, as in
    # the original integer-division framing.
    frame_length = cheetah.frame_length
    for start in range(0, (len(audio) // frame_length) * frame_length,
                       frame_length):
        cheetah.process(audio[start:start + frame_length])
    print(cheetah.transcribe())
type=str, required=True) args = parser.parse_args() cheetah = Cheetah(library_path=args.library_path, acoustic_model_path=args.acoustic_model_path, language_model_path=args.language_model_path, license_path=args.license_path) for audio_path in [ os.path.expanduser(x.strip()) for x in args.audio_paths.split(',') ]: audio, sample_rate = soundfile.read(audio_path, dtype='int16') if sample_rate != cheetah.sample_rate: raise ValueError( 'Cheetah can only process audio data with sample rate of %d' % cheetah.sample_rate) num_frames = len(audio) // cheetah.frame_length transcript = '' for i in range(num_frames): frame = audio[i * cheetah.frame_length:(i + 1) * cheetah.frame_length] partial_transcript, _ = cheetah.process(frame) transcript += partial_transcript transcript += cheetah.flush() print(transcript)