Example No. 1
 def get_audio_data(self):
     frames = collections.deque()
     chunk = True  # seed the loop; read() returns b'' once the stream is exhausted
     while chunk:
         chunk = self.read()
         frames.append(chunk)
     return AudioData(frame_data=b''.join(frames), sample_rate=self.sample_rate, sample_width=self.sample_width)
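The deque-of-chunks pattern above yields an AudioData that can be handed straight to any speech_recognition recognizer. A minimal usage sketch, assuming a hypothetical source object exposing the method above and the standard speech_recognition package:

import speech_recognition as sr

recognizer = sr.Recognizer()
audio = source.get_audio_data()  # 'source' is a hypothetical instance of the class above
try:
    text = recognizer.recognize_google(audio)  # online Google Web Speech API
except sr.UnknownValueError:
    text = ""  # no intelligible speech in the recording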
Example No. 2
 def _get_stt_from_file(self, wav_file: str,
                        lang: str = None) -> (AudioData, dict, list):
     """
     Performs STT and audio processing on the specified wav_file
     :param wav_file: wav audio file to process
     :param lang: language of passed audio
     :return: (AudioData object, extracted context, transcriptions)
     """
     from neon_utils.file_utils import get_audio_file_stream
     lang = lang or 'en-us'  # TODO: read default from config
     segment = AudioSegment.from_file(wav_file)
     audio_data = AudioData(segment.raw_data, segment.frame_rate,
                            segment.sample_width)
     audio_stream = get_audio_file_stream(wav_file)
     if self.lock.acquire(True, 30):
         try:
             LOG.info(f"Starting STT processing (lang={lang}): {wav_file}")
             self.api_stt.stream_start(lang)
             while True:
                 try:
                     data = audio_stream.read(1024)
                     self.api_stt.stream_data(data)
                 except EOFError:
                     break
             transcriptions = self.api_stt.stream_stop()
         finally:
             # always release the lock, even if STT raises, so later calls don't deadlock
             self.lock.release()
     else:
         LOG.error(f"Timed out acquiring lock, not processing: {wav_file}")
         transcriptions = []
     if isinstance(transcriptions, str):
         LOG.warning("Transcriptions is a str, no alternatives provided")
         transcriptions = [transcriptions]
     audio, audio_context = (self.loop.responsive_recognizer
                             .audio_consumers.transform(audio_data))
     LOG.info(f"Transcribed: {transcriptions}")
     return audio, audio_context, transcriptions
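The pydub-to-AudioData conversion at the top of this example is useful on its own. A minimal standalone sketch of that step, assuming pydub and speech_recognition are installed (the filename is hypothetical):

from pydub import AudioSegment
import speech_recognition as sr

segment = AudioSegment.from_file("utterance.wav")
audio = sr.AudioData(segment.raw_data, segment.frame_rate, segment.sample_width)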
Example No. 3
 def __create_sample_from_test_file(self, sample_name):
     root_dir = dirname(dirname(dirname(__file__)))
     filename = join(root_dir, 'unittests', 'client', 'data',
                     sample_name + '.wav')
     wavfile = WavFile(filename)
     with wavfile as source:
         return AudioData(source.stream.read(), wavfile.SAMPLE_RATE,
                          wavfile.SAMPLE_WIDTH)
Example No. 4
    def process(self, recording: AudioRecording,
             passThrough: PushPipe.PassThrough):
     print('Sample width:', recording.METADATA.getFormatByteSize())
     data = AudioData(recording.data, recording.METADATA.Rate,
                      recording.METADATA.getFormatByteSize())
     try:
         return self.sr.recognize_sphinx(data, 'en-IN')
     except UnknownValueError:
         self.setErrored("Couldn't identify speech.")
         return ''
Example No. 5
def convert_AudioData_to_Numpy_array_and_fs(audio_data: sr.AudioData):
    """
    Converts a speech_recognition AudioData into the NumPy array and sample
    rate (fs) used by our own simple_audio and by the external soundfile and
    sounddevice packages.
    :param audio_data: the AudioData instance to convert
    :return: dict with "NumpyArray" (sample data) and "fs" (sample rate in Hz)
    """
    flac = io.BytesIO(audio_data.get_flac_data())
    data, fs = soundfile.read(flac)
    return {"NumpyArray": data, "fs": fs}
Example No. 6
def sphinx(config, data, sample_rate):
    """Perform speech recognition using sphinx."""
    audio_data = AudioData(data, sample_rate, 2)
    try:
        text = Recognizer().recognize_sphinx(audio_data,
                                             language="en-US",
                                             keyword_entries=None,
                                             show_all=False)
    except UnknownValueError:
        text = ""
        _LOGGER.warning("No speech found in audio.")
    return text
Example No. 7
def _save_audio_data_to_disk(audiodata: AudioData):
    name       = str(uuid.uuid4())

    # GStreamer audio converters may be lost during installation, so we use the
    # raw audio format; it is crucial to resample at the proper rate (mfcc.conf)
    audio_data = audiodata.get_raw_data(convert_rate=8000)
    path       = name + ".raw"

    with open(path, mode="wb") as f:
        f.write(audio_data)
        path = os.path.abspath(f.name)

    return path
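To consume the saved file later, the raw PCM can be wrapped back into an AudioData. A sketch assuming the original capture was 16-bit mono (the resample above already fixed the rate at 8000 Hz):

import speech_recognition as sr

with open(path, "rb") as f:
    audio = sr.AudioData(f.read(), sample_rate=8000, sample_width=2)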
Example No. 8
def google_cloud(config, data, sample_rate):
    """Perform speech recognition using Google Cloud."""
    audio_data = AudioData(data, sample_rate, 2)
    try:
        text = Recognizer().recognize_google_cloud(audio_data,
                                                   credentials_json=None,
                                                   language="en-GB",
                                                   preferred_phrases=None,
                                                   show_all=False)
    except UnknownValueError:
        text = ""
        _LOGGER.warning("No speech found in audio.")
    return text
Example No. 9
    def run(self, voice_bytes, conn):
        audio = AudioData(frame_data=voice_bytes,
                          sample_rate=44100,
                          sample_width=2)
        speech_rec = sr.Recognizer()
        try:
            # Hand the audio to speech_recognition's pattern recognizer
            frase = speech_rec.recognize_google(audio, language='pt-BR')
            # After a few seconds, send back the transcribed phrase
            conn.sendall(bytes(frase, encoding='utf-8'))

        except Exception:
            # If the speech could not be recognized, report it
            print("could not transcribe the message")
Example No. 10
    def record_phrase(self) -> AudioData:
        """Records until a period of silence"""
        log.info('Recording...')
        raw_audio = b'\0' * self.sample_width
        self.integral = 0
        self.noise_level = 0
        total_sec = 0
        while total_sec < self.recording_timeout:
            self._check_intercept()
            chunk = self.stream.read(self.chunk_size)
            raw_audio += chunk
            total_sec += self.chunk_sec
            energy = self._calc_energy(chunk)
            self.update_energy(energy)
            if self.integral > self.required_integral and self.noise_level == 0:
                break

        log.info('Done recording.')
        return AudioData(raw_audio, self.sample_rate, self.sample_width)
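The _calc_energy helper is not shown. A plausible implementation, assuming 16-bit samples, is the same audioop RMS measure the listen() examples elsewhere on this page use:

import audioop

def _calc_energy(chunk, sample_width=2):
    # root-mean-square amplitude of the PCM fragment
    return audioop.rms(chunk, sample_width)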
Example No. 11
	def listen(self):
		while not self.talking:
			pass

		while self.talking:
			pass

		speech_data = []
		list_buffer = list(self.microphone_buffer)
		for i in list_buffer[list_buffer.index(self.first_speech_frame) - 10:]:
			speech_data += list(i)

		output = struct.pack('<' + ('h' * len(speech_data)), *speech_data)
		if self.play_back:
			self.output_stream.write(output)
		self.first_speech_frame = None

		# struct's 'h' packs 16-bit samples, so the sample width here is 2 bytes
		audio_data_new = AudioData(output, 96000, 2)

		return audio_data_new
Example No. 12
    def __init__(self, audio_data: AudioData, url='http://127.0.0.1:8085'):
        self._text = None
        wav_data = audio_data.get_wav_data(convert_rate=16000, convert_width=2)
        request = Request('{}/stt'.format(url),
                          data=wav_data,
                          headers={'Content-Type': 'audio/wav'})
        try:
            response = urlopen(request)
        except HTTPError as e:
            raise RuntimeError('Request failed: {}'.format(e.reason))
        except URLError as e:
            raise RuntimeError('Connection failed: {}'.format(e.reason))
        response_text = response.read().decode('utf-8')
        try:
            result = json.loads(response_text)
        except (json.JSONDecodeError, ValueError) as e:
            raise RuntimeError('Json decode error: {}'.format(e))

        if 'code' not in result or 'text' not in result or result['code']:
            raise RuntimeError('Server error: {}: {}'.format(
                result.get('code', 'None'), result.get('text', 'None')))
        self._text = result['text']
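A usage sketch for this constructor; the wrapper class name is hypothetical since only __init__ is shown, and audio is assumed to be an sr.AudioData instance:

stt = RemoteSTT(audio, url='http://127.0.0.1:8085')  # hypothetical class name
print(stt._text)  # the recognized text stored by __init__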
Example No. 13
 def get_audio_data_before(self):
     byte_data = self.audio.frame_data[0:self.begin] + self.silence_data
     return AudioData(byte_data, self.audio.sample_rate,
                      self.audio.sample_width)
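The silence_data attribute is not shown. For linear PCM, silence is just zero bytes, so a plausible half-second buffer matching the clip's format would be:

silence_data = b'\x00' * (int(0.5 * self.audio.sample_rate) * self.audio.sample_width)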
Example No. 14
    def listen(self,
               source,
               timeout=None,
               phrase_time_limit=None,
               snowboy_configuration=None):
        """
        Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
        This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.
        The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout.
        The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_time_limit`` is ``None``, there will be no phrase time limit.
        The ``snowboy_configuration`` parameter allows integration with `Snowboy <https://snowboy.kitt.ai/>`__, an offline, high-accuracy, power-efficient hotword recognition engine. When used, this function will pause until Snowboy detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Snowboy support, or a tuple of the form ``(SNOWBOY_LOCATION, LIST_OF_HOT_WORD_FILES)``, where ``SNOWBOY_LOCATION`` is the path to the Snowboy root directory, and ``LIST_OF_HOT_WORD_FILES`` is a list of paths to Snowboy hotword configuration files (`*.pmdl` or `*.umdl` format).
        This operation will always complete within ``timeout + phrase_time_limit`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception.
        """
        print('ReRecognizer listen!')
        assert isinstance(source,
                          AudioSource), "Source must be an audio source"
        assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
        assert self.pause_threshold >= self.non_speaking_duration >= 0
        if snowboy_configuration is not None:
            assert os.path.isfile(
                os.path.join(snowboy_configuration[0], "snowboydetect.py")
            ), "``snowboy_configuration[0]`` must be a Snowboy root directory containing ``snowboydetect.py``"
            for hot_word_file in snowboy_configuration[1]:
                assert os.path.isfile(
                    hot_word_file
                ), "``snowboy_configuration[1]`` must be a list of Snowboy hot word configuration files"

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        print('seconds_per_buffer:', seconds_per_buffer)
        pause_buffer_count = int(
            math.ceil(self.pause_threshold / seconds_per_buffer)
        )  # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
        phrase_buffer_count = int(
            math.ceil(self.phrase_threshold / seconds_per_buffer)
        )  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
        non_speaking_buffer_count = int(
            math.ceil(self.non_speaking_duration / seconds_per_buffer)
        )  # maximum number of buffers of non-speaking audio to retain before and after a phrase

        # read audio input for phrases until there is a phrase that is long enough
        elapsed_time = 0  # number of seconds of audio read
        buffer = b""  # an empty buffer means that the stream has ended and there is no data left to read
        while True:
            frames = collections.deque()

            if snowboy_configuration is None:
                # store audio input until the phrase starts
                while True:
                    if self.current_status:
                        self.current_status = False
                        if self.status_cb:
                            self.status_cb(self.current_status)
                    # handle waiting too long for phrase by raising an exception
                    elapsed_time += seconds_per_buffer
                    if timeout and elapsed_time > timeout:
                        raise WaitTimeoutError(
                            "listening timed out while waiting for phrase to start"
                        )

                    buffer = source.stream.read(source.CHUNK)
                    if len(buffer) == 0: break  # reached end of the stream
                    frames.append(buffer)
                    # ensure we only keep the needed amount of non-speaking buffers
                    if len(frames) > non_speaking_buffer_count:
                        frames.popleft()

                    # detect whether speaking has started on audio input
                    energy = audioop.rms(
                        buffer,
                        source.SAMPLE_WIDTH)  # energy of the audio signal
                    #if energy > self.energy_threshold: break

                    # don't start listening when we are talking
                    if not (self.talking_get and self.talking_get()):
                        if is_speech(): break

                    # dynamically adjust the energy threshold using asymmetric weighted average
                    if self.dynamic_energy_threshold:
                        damping = self.dynamic_energy_adjustment_damping**seconds_per_buffer  # account for different chunk sizes and rates
                        target_energy = energy * self.dynamic_energy_ratio
                        self.energy_threshold = self.energy_threshold * damping + target_energy * (
                            1 - damping)
            else:
                # read audio input until the hotword is said
                snowboy_location, snowboy_hot_word_files = snowboy_configuration
                buffer, delta_time = self.snowboy_wait_for_hot_word(
                    snowboy_location, snowboy_hot_word_files, source, timeout)
                elapsed_time += delta_time
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)

            # read audio input until the phrase ends
            pause_count, phrase_count = 0, 0
            phrase_start_time = elapsed_time
            angles = []
            while True:
                if not self.current_status:
                    self.current_status = True
                    if self.status_cb:
                        self.status_cb(self.current_status)

                # handle phrase being too long by cutting off the audio
                elapsed_time += seconds_per_buffer
                if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit:
                    break

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                phrase_count += 1

                # check if speaking has stopped for longer than the pause threshold on the audio input
                #energy = audioop.rms(buffer, source.SAMPLE_WIDTH)  # unit energy of the audio signal within the buffer
                #if energy > self.energy_threshold:
                #    pause_count = 0
                #else:
                #    pause_count += 1
                if is_speech():
                    pause_count = 0
                    angles.append(dev.direction)
                else:
                    pause_count += 1
                if pause_count > pause_buffer_count:  # end of the phrase
                    break

            # check how long the detected phrase is, and retry listening if the phrase is too short
            phrase_count -= pause_count  # exclude the buffers for the pause before the phrase
            if phrase_count >= phrase_buffer_count or len(buffer) == 0:
                break  # phrase is long enough or we've reached the end of the stream, so stop listening

        # obtain frame data
        for i in range(pause_count - non_speaking_buffer_count):
            frames.pop()  # remove extra non-speaking frames at the end
        frame_data = b"".join(frames)

        return AudioData(frame_data, source.SAMPLE_RATE,
                         source.SAMPLE_WIDTH), median(angles)
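A usage sketch for this variant: unlike the stock Recognizer.listen, it also returns the median direction-of-arrival angle collected during the phrase. Assuming the surrounding ReRecognizer class is instantiable and the standard speech_recognition Microphone is available:

import speech_recognition as sr

recognizer = ReRecognizer()
with sr.Microphone() as source:
    audio, angle = recognizer.listen(source, timeout=5, phrase_time_limit=10)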
Example No. 15
 def get_audio_data_after(self):
     byte_data = self.silence_data + self.get_audio_segment(
         self.range.end, self.AUDIO_SIZE)
     return AudioData(byte_data, self.audio_data.sample_rate,
                      self.audio_data.sample_width)
Example No. 16
 def get_audio_data_before(self):
     byte_data = self.get_audio_segment(
         0, self.range.begin) + self.silence_data
     return AudioData(byte_data, self.audio_data.sample_rate,
                      self.audio_data.sample_width)
Example No. 17
    def listen_hotword(self,
                       source,
                       timeout=None,
                       phrase_time_limit=None,
                       keywords=pvporcupine.KEYWORDS):
        """
        Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
        This is done by waiting until Porcupine detects one of the given hotwords, and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.
        The ``timeout`` parameter is the maximum number of seconds that this will wait for a phrase to start before giving up and throwing a ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, there will be no wait timeout.
        The ``phrase_time_limit`` parameter is the maximum number of seconds that this will allow a phrase to continue before stopping and returning the part of the phrase processed before the time limit was reached. The resulting audio will be the phrase cut off at the time limit. If ``phrase_time_limit`` is ``None``, there will be no phrase time limit.
        This operation will always complete within ``timeout + phrase_time_limit`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception.
        """
        assert isinstance(source,
                          AudioSource), "Source must be an audio source"
        assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
        assert self.pause_threshold >= self.non_speaking_duration >= 0

        seconds_per_buffer = float(source.CHUNK) / source.SAMPLE_RATE
        pause_buffer_count = int(
            math.ceil(self.pause_threshold / seconds_per_buffer)
        )  # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
        phrase_buffer_count = int(
            math.ceil(self.phrase_threshold / seconds_per_buffer)
        )  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
        non_speaking_buffer_count = int(
            math.ceil(self.non_speaking_duration / seconds_per_buffer)
        )  # maximum number of buffers of non-speaking audio to retain before and after a phrase

        # read audio input for phrases until there is a phrase that is long enough
        elapsed_time = 0  # number of seconds of audio read
        buffer = b""  # an empty buffer means that the stream has ended and there is no data left to read
        while True:
            frames = collections.deque()

            # read audio input until the hotword is said
            buffer, delta_time = self.porcupine_wait_for_hotword(
                source, keywords, timeout)
            elapsed_time += delta_time
            if len(buffer) == 0: break  # reached end of the stream
            frames.append(buffer)
            self._found_hotword_callback()

            # read audio input until the phrase ends
            pause_count, phrase_count = 0, 0
            phrase_start_time = elapsed_time
            while True:
                # handle phrase being too long by cutting off the audio
                elapsed_time += seconds_per_buffer
                if phrase_time_limit and elapsed_time - phrase_start_time > phrase_time_limit:
                    break

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                phrase_count += 1

                # check if speaking has stopped for longer than the pause threshold on the audio input
                energy = audioop.rms(
                    buffer, source.SAMPLE_WIDTH
                )  # unit energy of the audio signal within the buffer
                if energy > self.energy_threshold:
                    pause_count = 0
                else:
                    pause_count += 1
                if pause_count > pause_buffer_count:  # end of the phrase
                    break

            # check how long the detected phrase is, and retry listening if the phrase is too short
            phrase_count -= pause_count  # exclude the buffers for the pause before the phrase
            if phrase_count >= phrase_buffer_count or len(buffer) == 0:
                break  # phrase is long enough or we've reached the end of the stream, so stop listening

        # obtain frame data
        for i in range(pause_count - non_speaking_buffer_count):
            frames.pop()  # remove extra non-speaking frames at the end
        frame_data = b"".join(frames)

        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
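The porcupine_wait_for_hotword helper is not shown. A minimal sketch of the underlying pvporcupine detection loop (note that newer pvporcupine releases also require an access_key argument to create()):

import struct
import pvporcupine

porcupine = pvporcupine.create(keywords=["porcupine"])
while True:
    pcm = source.stream.read(porcupine.frame_length)
    frame = struct.unpack_from("h" * porcupine.frame_length, pcm)
    if porcupine.process(frame) >= 0:
        break  # hotword detected
porcupine.delete()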
Example No. 18
 def getAudioData(self):
     # np.ndarray.tostring() was removed in newer NumPy releases; tobytes() is the replacement
     return AudioData(
         np.array(self.data, np.int16).tobytes(), SAMPLERATE, 2)
Example No. 19
 def _get_audio(self, audio_data: AudioData):
     return audio_data.get_wav_data(self._convert_rate, self._convert_width)
Example No. 20
 def get_audio_data_after(self):
     byte_data = self.silence_data + self.audio.frame_data[self.end:self.audio_size]
     return AudioData(byte_data, self.audio.sample_rate,
                      self.audio.sample_width)
Example No. 21
    def listen(self, source, timeout=None):
        """
        Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.

        This is done by waiting until the audio has an energy above ``recognizer_instance.energy_threshold`` (the user has started speaking), and then recording until it encounters ``recognizer_instance.pause_threshold`` seconds of non-speaking or there is no more audio input. The ending silence is not included.

        The ``timeout`` parameter is the maximum number of seconds that it will wait for a phrase to start before giving up and throwing an ``speech_recognition.WaitTimeoutError`` exception. If ``timeout`` is ``None``, it will wait indefinitely.
        """
        assert isinstance(source,
                          AudioSource), "Source must be an audio source"
        assert self.pause_threshold >= self.non_speaking_duration >= 0

        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
        pause_buffer_count = int(
            math.ceil(self.pause_threshold / seconds_per_buffer)
        )  # number of buffers of non-speaking audio before the phrase is complete
        phrase_buffer_count = int(
            math.ceil(self.phrase_threshold / seconds_per_buffer)
        )  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
        non_speaking_buffer_count = int(
            math.ceil(self.non_speaking_duration / seconds_per_buffer)
        )  # maximum number of buffers of non-speaking audio to retain before and after

        # read audio input for phrases until there is a phrase that is long enough
        elapsed_time = 0  # number of seconds of audio read
        while True:
            frames = collections.deque()

            # store audio input until the phrase starts
            while True:
                elapsed_time += seconds_per_buffer
                if timeout and elapsed_time > timeout:  # handle timeout if specified
                    raise WaitTimeoutError("listening timed out")

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                # ensure we only keep the needed amount of non-speaking buffers
                if len(frames) > non_speaking_buffer_count:
                    frames.popleft()

                # detect whether speaking has started on audio input
                energy = audioop.rms(
                    buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
                if energy > self.energy_threshold: break

                # dynamically adjust the energy threshold using an asymmetric weighted average
                # do not adjust dynamic energy level for this sample if it is muted audio (energy == 0)
                self.adjust_energy_threshold(energy, seconds_per_buffer)
            # read audio input until the phrase ends
            pause_count, phrase_count = 0, 0
            while True:
                elapsed_time += seconds_per_buffer

                buffer = source.stream.read(source.CHUNK)
                if len(buffer) == 0: break  # reached end of the stream
                frames.append(buffer)
                phrase_count += 1

                # check if speaking has stopped for longer than the pause threshold on the audio input
                energy = audioop.rms(
                    buffer, source.SAMPLE_WIDTH)  # energy of the audio signal
                if energy > self.energy_threshold:
                    pause_count = 0
                else:
                    pause_count += 1
                if pause_count > pause_buffer_count:  # end of the phrase
                    break

                if len(frames) * seconds_per_buffer >= self.max_audio_length_sec:
                    # if we hit the end of the audio length, readjust energy_threshold
                    for frame in frames:
                        energy = audioop.rms(frame, source.SAMPLE_WIDTH)
                        self.adjust_energy_threshold(energy,
                                                     seconds_per_buffer)
                    break

            # check how long the detected phrase is, and retry listening if the phrase is too short
            phrase_count -= pause_count
            if phrase_count >= phrase_buffer_count:
                break  # phrase is long enough, stop listening

        # obtain frame data
        for i in range(pause_count - non_speaking_buffer_count):
            frames.pop()  # remove extra non-speaking frames at the end
        frame_data = b"".join(list(frames))

        return AudioData(frame_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
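The adjust_energy_threshold method called above is not part of stock speech_recognition. A plausible implementation mirroring the dynamic-threshold update in Example No. 14, skipping muted audio per the comment above:

def adjust_energy_threshold(self, energy, seconds_per_buffer):
    if not self.dynamic_energy_threshold or energy == 0:
        return  # do not adapt on muted audio
    damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer
    target_energy = energy * self.dynamic_energy_ratio
    self.energy_threshold = (self.energy_threshold * damping
                             + target_energy * (1 - damping))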
Example No. 22
 def _create_audio_data(raw_data, source):
     """
     Constructs an AudioData instance with the same parameters
     as the source and the specified frame_data
     """
     return AudioData(raw_data, source.SAMPLE_RATE, source.SAMPLE_WIDTH)
Example No. 23
def predict_word(audio_data: AudioData, model_map: ModelMap):
    try:
        if not os.path.exists(BG_WAV_PATH):
            print("bg audio is not ready.")
            return
        try:
            os.remove(INPUT_WAV_PATH)
        except OSError:
            pass

        # execute noise reduction
        with open(INPUT_WAV_PATH + '.tmp', 'wb') as f:
            f.write(audio_data.get_wav_data())
        with noisered.SEMAPHORE:
            try:
                os.remove(INPUT_WAV_PATH)
            except OSError:
                pass
            os.rename(INPUT_WAV_PATH + '.tmp', INPUT_WAV_PATH)
        if not noisered.create_noisered_wav(INPUT_WAV_PATH, NOISERED_WAV_PATH,
                                            BG_WAV_PATH):
            return

        # load or get model
        if threading.get_ident() not in model_map.models:
            print(f"load model. tid:{threading.get_ident()}")
            model_map.models[threading.get_ident()] = load_model()
        model = model_map.models[threading.get_ident()]

        # create input from wav data
        # io_obj = BytesIO(audio_data.get_wav_data())
        # x = create_mfcc_from_io(io_obj)
        x = create_features(NOISERED_WAV_PATH, FEATURE_TYPE)
        # x = create_mfcc_from_file(INPUT_WAV_PATH)

        # pad the features up to the expected input length (Tx)
        print(f"x:{x.shape},{x.dtype} framedata:{len(audio_data.frame_data)}")
        if x.shape[0] < Tx:
            # min_val = np.amin(x, axis=0)
            # print(f"min_val:{min_val.shape}")
            # calc remaining space size
            empty_space_size = Tx - x.shape[0]
            # create remaining space
            # empty_space = np.tile(min_val, (empty_space_size, 1))
            empty_space = np.zeros((empty_space_size, n_freq),
                                   dtype=np.float32)
            # append the zero padding to fill out the sequence
            print(f"emptysp:{empty_space.shape}")
            x = np.concatenate((x, empty_space), axis=0)
        # frames = np.array(data)
        if x.shape[0] > Tx:
            eprint(f"trim input. from={x.shape[0]} to={Tx}")
            x = x[:Tx]
        x = np.float32(np.array([x]))
        print(f"x:{x.shape},{x.dtype}")

        # do predict
        start = timer()
        predicted = model.predict(x)
        end = timer()
        print(f"predicted:{predicted} time:{end - start}")
        summarize_prediction(predicted[0])
    except:
        traceback.print_exc()
        raise