def test_init_min_3_init_max_silence_0(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20, max_continuous_silence=4, init_min = 3, init_max_silence = 0, mode=0) data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA") # ^ ^ ^ ^ # 18 30 33 37 tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens))) tok1, tok2 = tokens[0], tokens[1] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual(data, "AAAAAAAAAaaaa", msg="wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data)) self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start)) self.assertEqual(end, 30, msg="wrong end frame for token 1, expected: 30, found: {0} ".format(end)) data = ''.join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual(data, "AAAAA", msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) self.assertEqual(start, 33, msg="wrong start frame for token 2, expected: 33, found: {0} ".format(start)) self.assertEqual(end, 37, msg="wrong end frame for token 2, expected: 37, found: {0} ".format(end))
def test_init_min_0_init_max_silence_0(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20, max_continuous_silence=4, init_min = 0, init_max_silence = 0, mode=0) data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA") # ^ ^ ^ ^ # 2 16 20 27 tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format(len(tokens))) tok1, tok2 = tokens[0], tokens[1] # tok1[0]: data # tok1[1]: start frame (included) # tok1[2]: end frame (included) data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual(data, "AaaaAaAaaAaAaaaa", msg="wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', found: {0} ".format(data)) self.assertEqual(start, 1, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start)) self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end)) data = ''.join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual(data, "AAAAAAAA", msg="wrong data for token 1, expected: 'AAAAAAAA', found: {0} ".format(data)) self.assertEqual(start, 20, msg="wrong start frame for token 2, expected: 20, found: {0} ".format(start)) self.assertEqual(end, 27, msg="wrong end frame for token 2, expected: 27, found: {0} ".format(end))
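# A minimal sketch showing how the (data, start, end) token triples used in
# the tests above can be unpacked; it assumes nothing beyond the tuple
# structure documented in the comments (data frames, inclusive start frame,
# inclusive end frame).
def summarize_tokens(tokens):
    """Print each token's joined data and its inclusive frame bounds."""
    for i, (frames, start, end) in enumerate(tokens, start=1):
        print("token {0}: data={1!r} start={2} end={3}".format(i, "".join(frames), start, end))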
def split(filename='g1238-20181214-081712-1544750232.37681.wav'):
    sr, samples = wavfile.read(filename=filename, mmap=True)
    #print(len(samples))
    plt.plot(samples)
    asource = ADSFactory.ads(filename=filename, record=False)
    validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=50)
    # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate())
    # min_length=100 : minimum length of a valid audio activity is 100 * 10 == 1000 ms
    # max_length=500 : maximum length of a valid audio activity is 500 * 10 == 5000 ms == 5 seconds
    # max_continuous_silence=50 : maximum length of a tolerated silence within a valid audio activity is 50 * 10 == 500 ms
    tokenizer = StreamTokenizer(validator=validator, min_length=100, max_length=500, max_continuous_silence=50)
    asource.open()
    tokens = tokenizer.tokenize(asource)
    stack = []
    durations = []  # renamed from `sum`, which shadowed the built-in
    for i, t in enumerate(tokens):
        #print("Token [{0}] starts at {1} and ends at {2}".format(i+1, t[1], t[2]))
        stack.append([t[1] * 80, t[2] * 80])  # 80 samples per 10 ms frame at 8000 Hz
        durations.append((t[2] * 80 - t[1] * 80) / 8000)
        wavfile.write('token_' + str(i) + '.wav', sr, samples[t[1] * 80:t[2] * 80])  # write token to file
    asource.close()
    print(durations)
    return stack
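# The hard-coded "* 80" above assumes 8000 Hz audio and auditok's default
# 10 ms analysis window (8000 * 0.01 == 80 samples per frame). A small,
# hedged helper that makes that conversion explicit; the default rate and
# window length here are assumptions, not values read from the input file.
def frames_to_samples(frame_index, sampling_rate=8000, window_sec=0.01):
    """Convert a tokenizer frame index into a sample offset."""
    return int(frame_index * sampling_rate * window_sec)

# e.g. samples[frames_to_samples(t[1]):frames_to_samples(t[2] + 1)]
# (the end frame reported by StreamTokenizer is inclusive, hence the +1)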
def test_callback(self): tokens = [] def callback(data, start, end): tokens.append((data, start, end)) tokenizer = StreamTokenizer( self.A_validator, min_length=5, max_length=8, max_continuous_silence=3, init_min=3, init_max_silence=3, mode=0, ) data_source = StringDataSource("aaAAAAAAAAAAAAa") # ^ ^^ ^ # 2 910 14 tokenizer.tokenize(data_source, callback=callback) self.assertEqual( len(tokens), 2, msg="wrong number of tokens, expected: 1, found: {0} ".format( len(tokens)), )
def transcribe_audio(self, stereo_path, channels_to_process): if not os.path.isfile(stereo_path): raise Exception("Audio file does not exist.") data = self.split_to_mono(stereo_path) a_leg = data['a_leg'] b_leg = data['b_leg'] data['a_leg'] = None data['b_leg'] = None validator = AudioEnergyValidator(sample_width=data['frame_width'], energy_threshold=45) trimmer = StreamTokenizer(validator, min_length=self.min_segment_length, max_length=self.max_segment_length, max_continuous_silence=self.max_continuous_silence, mode=StreamTokenizer.DROP_TAILING_SILENCE) segments = [] if channels_to_process in ['A', 'AB']: a_source = ADSFactory.ads(audio_source=a_leg, record=True, block_size=data['frame_rate'] / self.divisor) a_source.open() trimmer.tokenize(a_source, callback=lambda data, start, end: segments.append(("A", data, start, end))) if channels_to_process in ['B', 'AB']: b_source = ADSFactory.ads(audio_source=b_leg, record=True, block_size=data['frame_rate'] / self.divisor) b_source.open() trimmer.tokenize(b_source, callback=lambda data, start, end: segments.append(("B", data, start, end))) segments = sorted(segments, key=lambda x: x[3]) self.batch(segments, data['duration'], data['frame_rate'], data['frame_width'], data['nchannels'])
def getSplitAudioDurationListBetweenSilence(fileName, eachAudioLen, silencePeriod, energyThreshold=55):
    tokens = []  # ensure a defined return value even if tokenization fails
    try:
        # `record` is False here because we never rewind the source
        asource = ADSFactory.ads(filename=fileName, record=False)
        validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=energyThreshold)
        # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate())
        # min_length=400 : minimum length of a valid audio activity is 400 * 10 == 4000 ms
        # max_length=eachAudioLen*100 : maximum length of a valid audio activity is eachAudioLen seconds
        # max_continuous_silence=silencePeriod*100 : maximum tolerated silence within a valid audio activity is silencePeriod seconds
        tokenizer = StreamTokenizer(validator=validator, min_length=400, max_length=eachAudioLen*100, max_continuous_silence=silencePeriod*100)
        asource.open()
        tokens = tokenizer.tokenize(asource)
        # Play detected regions back
        #player = player_for(asource)
        # Rewind and read the whole signal
        #asource.rewind()
        #original_signal = []
        #while True:
        #    w = asource.read()
        #    if w is None:
        #        break
        #    original_signal.append(w)
        #original_signal = b''.join(original_signal)
        #player.play(original_signal)
        #print("\n ** playing detected regions...\n")
        #for i,t in enumerate(tokens):
        #    print("Token [{0}] starts at {1} and ends at {2}".format(i+1, t[1], t[2]))
        #    data = b''.join(t[0])
        #    player.play(data)
        #assert len(tokens) == 8
        asource.close()
        #player.stop()
    except KeyboardInterrupt:
        #player.stop()
        asource.close()
        #sys.exit(0)
    except Exception as e:
        sys.stderr.write(str(e) + "\n")
        #sys.exit(1)
    return tokens
def test_min_length_1_init_max_length_1(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 1, max_length=1, max_continuous_silence=0, init_min = 0, init_max_silence = 0, mode=0) data_source = StringDataSource("AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0} ".format(len(tokens)))
def test_min_length_4_init_max_length_5(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 4, max_length=5, max_continuous_silence=4, init_min = 3, init_max_silence = 3, mode=0) data_source = StringDataSource("aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa") # ^ ^^ ^ ^ ^ ^ ^ # 18 2223 27 32 36 42 46 tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0} ".format(len(tokens))) tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual(data, "AAAAA", msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) self.assertEqual(start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ".format(start)) self.assertEqual(end, 22, msg="wrong end frame for token 1, expected: 22, found: {0} ".format(end)) data = ''.join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual(data, "AAAaa", msg="wrong data for token 1, expected: 'AAAaa', found: '{0}' ".format(data)) self.assertEqual(start, 23, msg="wrong start frame for token 1, expected: 23, found: {0} ".format(start)) self.assertEqual(end, 27, msg="wrong end frame for token 1, expected: 27, found: {0} ".format(end)) data = ''.join(tok3[0]) start = tok3[1] end = tok3[2] self.assertEqual(data, "AAAAA", msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ".format(data)) self.assertEqual(start, 32, msg="wrong start frame for token 1, expected: 1, found: {0} ".format(start)) self.assertEqual(end, 36, msg="wrong end frame for token 1, expected: 7, found: {0} ".format(end)) data = ''.join(tok4[0]) start = tok4[1] end = tok4[2] self.assertEqual(data, "AAaaA", msg="wrong data for token 2, expected: 'AAaaA', found: '{0}' ".format(data)) self.assertEqual(start, 42, msg="wrong start frame for token 2, expected: 17, found: {0} ".format(start)) self.assertEqual(end, 46, msg="wrong end frame for token 2, expected: 22, found: {0} ".format(end))
def read_split_dir(file):
    f = sf.SoundFile(file)
    #duration of file in seconds
    duration = len(f) / f.samplerate
    if duration <= 4:
        print(file, 'untouched')
    else:
        #Get original filename
        name = os.path.splitext(file)[0]
        tempsound = AudioSegment.from_wav(file)
        tempsound = tempsound.set_channels(1)
        tempsound.export('0wavtmp_' + file, format="wav")
        tmpfile = '0wavtmp_' + file
        # We set the `record` argument to True so that we can rewind the source
        asource = ADSFactory.ads(filename=tmpfile, record=True)
        validator = AudioEnergyValidator(
            sample_width=asource.get_sample_width(), energy_threshold=50)
        # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate())
        # min_length=500 : minimum length of a valid audio activity is 500 * 10 == 5000 ms == 5 seconds
        # max_length=4000 : maximum length of a valid audio activity is 4000 * 10 == 40000 ms == 40 seconds
        # max_continuous_silence=100 : maximum length of a tolerated silence within a valid audio activity is 100 * 10 == 1000 ms
        tokenizer = StreamTokenizer(validator=validator, min_length=500, max_length=4000, max_continuous_silence=100)
        asource.open()
        tokens = tokenizer.tokenize(asource)
        for index, t in enumerate(tokens):
            #print("Token starts at {0} and ends at {1}".format(t[1], t[2]))
            newAudio = AudioSegment.from_wav(file)
            # token bounds are 10 ms analysis frames (end inclusive); pydub slices in milliseconds
            newAudio = newAudio[t[1] * 10:(t[2] + 1) * 10]
            chunk_name = "{}_clip{}.wav".format(name, index)
            print("Generating", chunk_name)
            newAudio.export(chunk_name, format="wav")  #Exports to a wav file in the current path.
        #Remove the temporary file we made earlier
        os.remove(tmpfile)
        #Remove the original file to avoid confusion
        os.remove(file)
def __init__(self): self.asource = ADSFactory.ads(record=True, max_time=4) self.validator = AudioEnergyValidator(sample_width=2, energy_threshold=50) self.tokenizer = StreamTokenizer(validator=self.validator, min_length=20, max_length=1000, max_continuous_silence=30) self.player = player_for(self.asource) self.model = self.load_cnn('../model/final_cnn_model.json', '../model/weights_final_cnn.h5') self.model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self): tokenizer = StreamTokenizer( self.A_validator, min_length=5, max_length=8, max_continuous_silence=3, init_min=3, init_max_silence=3, mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE, ) data_source = StringDataSource("aaAAAAAAAAAAAAaa") # ^ ^ # 2 8 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format( len(tokens)), ) tok1 = tokens[0] data = "".join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AAAAAAAA", msg=("wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' " ).format(data), ) self.assertEqual( start, 2, msg=("wrong start frame for token 1, expected: 2, found: {0} " ).format(start), ) self.assertEqual( end, 9, msg=("wrong end frame for token 1, expected: 9, found: {0} " ).format(end), )
def make_auditok_detector(sample_rate=100): bytes_per_frame = 2 frames_per_window = FRAME_RATE // sample_rate validator = AudioEnergyValidator(sample_width=bytes_per_frame, energy_threshold=50) tokenizer = StreamTokenizer(validator=validator, min_length=0.2 * sample_rate, max_length=int(5 * sample_rate), max_continuous_silence=0.25 * sample_rate) def _detect(asegment): asource = BufferAudioSource(data_buffer=asegment, sampling_rate=FRAME_RATE, sample_width=bytes_per_frame, channels=1) ads = ADSFactory.ads(audio_source=asource, block_dur=1. / sample_rate) ads.open() tokens = tokenizer.tokenize(ads) length = (len(asegment) // bytes_per_frame + frames_per_window - 1) // frames_per_window media_bstring = np.zeros(length + 1, dtype=int) for token in tokens: media_bstring[token[1]] += 1 media_bstring[token[2] + 1] -= 1 return np.cumsum(media_bstring)[:-1] > 0 return _detect
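# A hedged usage sketch for the detector returned by make_auditok_detector()
# above. It assumes `pcm_bytes` holds 16-bit mono PCM at FRAME_RATE and that
# "speech.raw" is a hypothetical file name; neither comes from the original code.
if __name__ == "__main__":
    detect = make_auditok_detector(sample_rate=100)
    with open("speech.raw", "rb") as f:
        pcm_bytes = f.read()
    speech_mask = detect(pcm_bytes)  # one boolean per 10 ms analysis window
    print("speech windows: {0} of {1}".format(int(speech_mask.sum()), len(speech_mask)))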
def test_callback(self): tokens = [] def callback(data, start, end): tokens.append((data, start, end)) tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8, max_continuous_silence=3, init_min = 3, init_max_silence = 3, mode=0) data_source = StringDataSource("aaAAAAAAAAAAAAa") # ^ ^^ ^ # 2 910 14 tokenizer.tokenize(data_source, callback=callback) self.assertEqual(len(tokens), 2, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens)))
def test_min_5_max_10_max_continuous_silence_1(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=10, max_continuous_silence=1, init_min = 3, init_max_silence = 3, mode=0) data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") # ^ ^^ ^ ^ ^ # 3 12131517 26 # (12 13 15 17) tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens))) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual(data, "AAAAAaAAAA", msg="wrong data for token 1, expected: 'AAAAAaAAAA', found: '{0}' ".format(data)) self.assertEqual(start, 3, msg="wrong start frame for token 1, expected: 3, found: {0} ".format(start)) self.assertEqual(end, 12, msg="wrong end frame for token 1, expected: 10, found: {0} ".format(end)) data = ''.join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual(data, "AAa", msg="wrong data for token 1, expected: 'AAa', found: '{0}' ".format(data)) self.assertEqual(start, 13, msg="wrong start frame for token 1, expected: 9, found: {0} ".format(start)) self.assertEqual(end, 15, msg="wrong end frame for token 1, expected: 14, found: {0} ".format(end)) data = ''.join(tok3[0]) start = tok3[1] end = tok3[2] self.assertEqual(data, "AAAAAAAAAa", msg="wrong data for token 1, expected: 'AAAAAAAAAa', found: '{0}' ".format(data)) self.assertEqual(start, 17, msg="wrong start frame for token 1, expected: 17, found: {0} ".format(start)) self.assertEqual(end, 26, msg="wrong end frame for token 1, expected: 26, found: {0} ".format(end))
def calibrate(self):
    '''
    This method calibrates the MinMaxScaler, self.scaler, by capturing
    10 seconds of audio and applying the MinMaxScaler fit method.
    See sklearn.preprocessing.MinMaxScaler for details.
    Note: this step is redundant; scaling is not strictly necessary.
    '''
    a = raw_input(
        "Calibrate normalisation, press return then make noises from your mouth hole."
    )
    if self.audioPath is None:
        asource = ADSFactory.ads(sampling_rate=self.sr, max_time=10)
    else:
        asource = ADSFactory.ads(filename=self.audioPath, sampling_rate=self.sr, max_time=10)
    validator = AudioEnergyValidator(
        sample_width=asource.get_sample_width(), energy_threshold=self.energy)
    tokenizer = StreamTokenizer(validator=validator, min_length=self.min_len,
                                max_length=self.max_len,
                                max_continuous_silence=self.max_con_si)

    def calib_callback(data, start, end):
        audio = np.fromstring(data[0], dtype=np.int8)
        self.scaler.fit_transform(np.swapaxes(np.asarray([audio]), 0, 1))
        print "Audio sample found {0}--{1}".format(start, end)

    asource.open()
    tokenizer.tokenize(asource, callback=calib_callback)
    print "Scaler parameters found: min: {0} max: {1}".format(
        self.scaler.data_min_, self.scaler.data_max_)
    print "calibration done"
    self.mini = self.scaler.data_min_
    self.maxi = self.scaler.data_max_
def test_min_length_1_init_max_length_1(self): tokenizer = StreamTokenizer(self.A_validator, min_length=1, max_length=1, max_continuous_silence=0, init_min=0, init_max_silence=0, mode=0) data_source = StringDataSource( "AAaaaAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaAAAAA") tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 21, msg="wrong number of tokens, expected: 21, found: {0} ".format( len(tokens)))
def test_init_min_3_init_max_silence_2(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=20, max_continuous_silence=4, init_min = 3, init_max_silence = 2, mode=0) data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA") # ^ ^ ^ ^ ^ ^ # 5 16 19 31 35 39 tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format(len(tokens))) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual(data, "AaAaaAaAaaaa", msg="wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' ".format(data)) self.assertEqual(start, 5, msg="wrong start frame for token 1, expected: 5, found: {0} ".format(start)) self.assertEqual(end, 16, msg="wrong end frame for token 1, expected: 16, found: {0} ".format(end)) data = ''.join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual(data, "AAAAAAAAAaaaa", msg="wrong data for token 2, expected: 'AAAAAAAAAaaaa', found: '{0}' ".format(data)) self.assertEqual(start, 19, msg="wrong start frame for token 2, expected: 19, found: {0} ".format(start)) self.assertEqual(end, 31, msg="wrong end frame for token 2, expected: 31, found: {0} ".format(end)) data = ''.join(tok3[0]) start = tok3[1] end = tok3[2] self.assertEqual(data, "AAAAA", msg="wrong data for token 3, expected: 'AAAAA', found: '{0}' ".format(data)) self.assertEqual(start, 35, msg="wrong start frame for token 2, expected: 35, found: {0} ".format(start)) self.assertEqual(end, 39, msg="wrong end frame for token 2, expected: 39, found: {0} ".format(end))
def test_DROP_TAILING_SILENCE(self): tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=10, max_continuous_silence=2, init_min=3, init_max_silence=3, mode=StreamTokenizer.DROP_TAILING_SILENCE) data_source = StringDataSource("aaAAAAAaaaaa") # ^ ^ # 2 6 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format( len(tokens))) tok1 = tokens[0] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AAAAA", msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ". format(data)) self.assertEqual( start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ". format(start)) self.assertEqual( end, 6, msg="wrong end frame for token 1, expected: 6, found: {0} ".format( end))
def __init__(self, _useGui):
    # audio parameters
    max_length = 1000000
    max_interval = 12000
    max_continuous_silence = 500
    min_length = 150
    self.sample_rate = 48000
    self.asource = ADSFactory.ads(record=True, max_time=max_length, sampling_rate=self.sample_rate)
    self.sample_width = self.asource.get_sample_width()
    self.channels = self.asource.get_channels()
    # START VALIDATOR
    self.validator = AudioEnergyValidator(
        sample_width=self.sample_width, energy_threshold=energy_threshold)
    self.tokenizer = StreamTokenizer(
        validator=self.validator,
        min_length=min_length,
        max_length=max_length,
        max_continuous_silence=max_continuous_silence)
    self.audio_folder = 'recordings/' + '{:%Y-%m-%d_%H-%M-%S}'.format(
        datetime.datetime.now()) + '/'
    if not os.path.exists(os.path.dirname(self.audio_folder)):
        try:
            os.makedirs(os.path.dirname(self.audio_folder))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise  # was `raiseRec`, a typo that would raise NameError instead of re-raising
    os.chmod('recordings', 0o777)
    os.chmod(self.audio_folder, 0o777)
    self.MODE = 'ECHO'
    self.useGui = _useGui
    if self.useGui:
        root = Tk()
        self.display = GUI(root, True)
        self.display.display_image()
def configure(self, rf): # Setting up rpc port self.portsList["rpc"] = yarp.Port() self.portsList["rpc"].open("/sentence_tokenizer/rpc:i") self.attach(self.portsList["rpc"]) self.portsList["audio_out"] = yarp.BufferedPortBottle() self.portsList["audio_out"].open("/sentence_tokenizer/audio:o") # Setting up hotword detection self.hotword_detector = snowboydecoder.HotwordDetector(self.hotword_model, sensitivity=self.hotword_sensitivity) # Setting up audio tokenizer to split sentences self.audio_source = ADSFactory.ads(record=True, max_time=self.tok_record_duration, block_dur=self.tok_window) self.tok_validator = AudioEnergyValidator(sample_width=self.audio_source.get_sample_width(), energy_threshold=self.tok_energy_threshold) self.tokenizer_mode = StreamTokenizer.DROP_TRAILING_SILENCE self.tokenizer = StreamTokenizer(validator=self.tok_validator, min_length=self.tok_min_len, max_length=self.tok_max_len, max_continuous_silence=self.tok_max_silence_duration, mode=self.tokenizer_mode) if self.echo_enabled: self.echo_thread = threading.Thread(target=self.replayAudio) self.echo_thread.start() if self.hotword_enabled: print("Waiting for hotword to start interaction") # self.hotword_detector.start(detected_callback=self.detected_callback, # interrupt_check=self.interrupt_callback, # sleep_time=self.hotword_loop_time) print("Hotword detected. Starting tokenizer thread") else: print "Starting tokenizer thread" self.asr = sr.Recognizer() with open('google_credentials.json', 'r') as credentials: self.google_credentials = credentials.read() return True
def _make_auditok_detector( sample_rate: int, frame_rate: int, non_speech_label: float ) -> Callable[[bytes], np.ndarray]: try: from auditok import ( BufferAudioSource, ADSFactory, AudioEnergyValidator, StreamTokenizer, ) except ImportError as e: logger.error( """Error: auditok not installed! Consider installing it with `pip install auditok`. Note that auditok is GPLv3 licensed, which means that successfully importing it at runtime creates a derivative work that is GPLv3 licensed. For personal use this is fine, but note that any commercial use that relies on auditok must be open source as per the GPLv3!* *Not legal advice. Consult with a lawyer. """ ) raise e bytes_per_frame = 2 frames_per_window = frame_rate // sample_rate validator = AudioEnergyValidator(sample_width=bytes_per_frame, energy_threshold=50) tokenizer = StreamTokenizer( validator=validator, min_length=0.2 * sample_rate, max_length=int(5 * sample_rate), max_continuous_silence=0.25 * sample_rate, ) def _detect(asegment: bytes) -> np.ndarray: asource = BufferAudioSource( data_buffer=asegment, sampling_rate=frame_rate, sample_width=bytes_per_frame, channels=1, ) ads = ADSFactory.ads(audio_source=asource, block_dur=1.0 / sample_rate) ads.open() tokens = tokenizer.tokenize(ads) length = ( len(asegment) // bytes_per_frame + frames_per_window - 1 ) // frames_per_window media_bstring = np.zeros(length + 1) for token in tokens: media_bstring[token[1]] = 1.0 media_bstring[token[2] + 1] = non_speech_label - 1.0 return np.clip(np.cumsum(media_bstring)[:-1], 0.0, 1.0) return _detect
def test_STRICT_MIN_LENGTH_and_DROP_TAILING_SILENCE(self): tokenizer = StreamTokenizer(self.A_validator, min_length = 5, max_length=8, max_continuous_silence=3, init_min = 3, init_max_silence = 3, mode=StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TAILING_SILENCE) data_source = StringDataSource("aaAAAAAAAAAAAAaa") # ^ ^ # 2 8 tokens = tokenizer.tokenize(data_source) self.assertEqual(len(tokens), 1, msg="wrong number of tokens, expected: 1, found: {0} ".format(len(tokens))) tok1 = tokens[0] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual(data, "AAAAAAAA", msg="wrong data for token 1, expected: 'AAAAAAAA', found: '{0}' ".format(data)) self.assertEqual(start, 2, msg="wrong start frame for token 1, expected: 2, found: {0} ".format(start)) self.assertEqual(end, 9, msg="wrong end frame for token 1, expected: 9, found: {0} ".format(end))
def _get_asr_result_whole(folder,prefix): asource = ADSFactory.ads(filename='./temp/{}.wav'.format(prefix), block_size=160) validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=65) tokenizer = StreamTokenizer(validator=validator, min_length=300, max_length=1000, max_continuous_silence=50) asource.open() from pocketsphinx_decoder import decoder tokens = tokenizer.tokenize(asource) d = defaultdict(list) past = 0 for content,start,end in tokens: save_audio_data(data=b''.join(content), filename='tmp.wav', filetype='wav', sr=asource.get_sampling_rate(),sw = asource.get_sample_width(),ch = asource.get_channels()) decoder.start_utt() decoder.process_raw(open('tmp.wav','rb').read(),False,False) decoder.end_utt() seg = list(decoder.seg()) print(' '.join([s.word for s in seg])) def add_feature(name,add=None): if add is None: d[name].extend(list(map(attrgetter(name),seg))) else: d[name].extend([attrgetter(name)(x)+add for x in seg]) add_feature('start_frame',past) add_feature('end_frame',past) add_feature('word') add_feature('ascore') add_feature('lscore') add_feature('lback') add_feature('prob') past += len(content) df = pd.DataFrame(d) df = df[['start_frame','end_frame','ascore','lscore','lback','prob','word']] df.to_csv(path.join(folder ,'{}.csv'.format(prefix)), index=None)
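# `save_audio_data` is called above but not defined in this excerpt. A minimal,
# hedged sketch of what such a helper might look like, built only on the
# standard-library wave module; the signature mirrors the call site above, but
# the implementation is an assumption, not the original helper.
import wave

def save_audio_data(data, filename, filetype='wav', sr=16000, sw=2, ch=1):
    if filetype != 'wav':
        raise ValueError("only wav output is sketched here")
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(ch)
        wf.setsampwidth(sw)
        wf.setframerate(sr)
        wf.writeframes(data)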
class AudioDetection: def __init__(self, _useGui): # parametros de áudio max_length = 1000000 max_interval = 12000 max_continuous_silence = 500 min_length = 150 self.sample_rate = 48000 self.asource = ADSFactory.ads(record=True, max_time=max_length, sampling_rate=self.sample_rate) self.sample_width = self.asource.get_sample_width() self.channels = self.asource.get_channels() # START VALIDATOR self.validator = AudioEnergyValidator( sample_width=self.sample_width, energy_threshold=energy_threshold) self.tokenizer = StreamTokenizer( validator=self.validator, min_length=min_length, max_length=max_length, max_continuous_silence=max_continuous_silence) self.audio_folder = 'recordings/' + '{:%Y-%m-%d_%H-%M-%S}'.format( datetime.datetime.now()) + '/' if not os.path.exists(os.path.dirname(self.audio_folder)): try: os.makedirs(os.path.dirname(self.audio_folder)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raiseRec os.chmod('recordings', 0o777) os.chmod(self.audio_folder, 0o777) self.MODE = 'ECHO' self.useGui = _useGui if self.useGui: root = Tk() self.display = GUI(root, True) self.display.display_image() def start(self): self.listener = keyboard.Listener(on_press=self.on_press, on_release=self.on_release) self.listener.start() self.listen() def listen(self): ## abrir microfone self.asource.open() print("\n ** Listening!!!") ## começar tokenizer self.tokenizer.tokenize(self.asource, callback=self.onDetection) self.asource.close() def on_press(self, key): return '' def on_release(self, key): if hasattr(key, 'char'): if key.char == 'e': self.MODE = 'ECHO' print('set mode to', self.MODE) if key.char == 'r': self.MODE = 'RANDOM' print('set mode to', self.MODE) if key.char == 'q': self.asource.close() sys.exit(0) def onDetection(self, data, start, end): name = "{0}-{1}".format(start, end) + '.wav' print(name) filename = self.savefile(data, start, end) print('current mode', self.MODE) if self.MODE == 'RANDOM': randomfile = player.getRandomFile(play_folder) player.play(randomfile) if self.MODE == 'ECHO': player.play(filename) self.display.display_image() print("finished playing") def savefile(self, data, start, end): name = "{0}-{1}".format(start, end) + '.wav' filename = self.audio_folder + name # save wav file waveFile = wave.open(filename, 'wb') waveFile.setnchannels(self.channels) waveFile.setsampwidth(self.sample_width) waveFile.setframerate(self.sample_rate) waveFile.writeframes(b''.join(data)) waveFile.close() # normalize volume sound = AudioSegment.from_file(filename, "wav") normalized_sound = self.match_target_amplitude(sound, -15.0) with_fade = normalized_sound.fade_in(200).fade_out(200) with_fade.export(filename, format="wav") print('audio saved at', filename) return filename def match_target_amplitude(self, sound, target_dBFS): change_in_dBFS = target_dBFS - sound.dBFS return sound.apply_gain(change_in_dBFS)
duration = float(sys.argv[2])

# record = True so that we'll be able to rewind the source.
# max_time = duration: read `duration` seconds from the microphone
asource = ADSFactory.ads(record=True, max_time=duration, sampling_rate=sample_rate)

# params
# sample_rate = asource.get_sampling_rate()
sample_width = asource.get_sample_width()
channels = asource.get_channels()
chunk = 1024

print(sample_width, sample_rate)

validator = AudioEnergyValidator(sample_width=sample_width, energy_threshold=energy_threshold)
tokenizer = StreamTokenizer(validator=validator, min_length=70, max_length=20000, max_continuous_silence=100)

p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    dev = p.get_device_info_by_index(i)
    print((i, dev['name'], dev['maxInputChannels']))

def savefile(data, start, end):
    print('-----------------------')
    print("Acoustic activity at: {0}--{1}".format(start, end))
    filename = "teste_{0}_{1}.wav".format(start, end)
    waveFile = wave.open(filename, 'wb')
    waveFile.setnchannels(channels)
    waveFile.setsampwidth(sample_width)
    waveFile.setframerate(sample_rate)  # was setframerate(channels), which wrote WAVs with a bogus frame rate
def test_min_5_max_10_max_continuous_silence_1(self): tokenizer = StreamTokenizer( self.A_validator, min_length=5, max_length=10, max_continuous_silence=1, init_min=3, init_max_silence=3, mode=0, ) data_source = StringDataSource("aaaAAAAAaAAAAAAaaAAAAAAAAAa") # ^ ^^ ^ ^ ^ # 3 12131517 26 # (12 13 15 17) tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format( len(tokens)), ) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] data = "".join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AAAAAaAAAA", msg=("wrong data for token 1, expected: 'AAAAAaAAAA', " "found: '{0}' ").format(data), ) self.assertEqual( start, 3, msg=("wrong start frame for token 1, expected: 3, found: {0} " ).format(start), ) self.assertEqual( end, 12, msg=("wrong end frame for token 1, expected: 10, found: {0} " ).format(end), ) data = "".join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual( data, "AAa", msg=("wrong data for token 1, expected: 'AAa', found: '{0}' " ).format(data), ) self.assertEqual( start, 13, msg=("wrong start frame for token 1, expected: 9, found: {0} " ).format(start), ) self.assertEqual( end, 15, msg=("wrong end frame for token 1, expected: 14, found: {0} " ).format(end), ) data = "".join(tok3[0]) start = tok3[1] end = tok3[2] self.assertEqual( data, "AAAAAAAAAa", msg=("wrong data for token 1, expected: 'AAAAAAAAAa', " "found: '{0}' ").format(data), ) self.assertEqual( start, 17, msg=("wrong start frame for token 1, expected: 17, found: {0} " ).format(start), ) self.assertEqual( end, 26, msg=("wrong end frame for token 1, expected: 26, found: {0} " ).format(end), )
energy_threshold = 45 duration = 10 # seconds if len(sys.argv) > 1: energy_threshold = float(sys.argv[1]) if len(sys.argv) > 2: duration = float(sys.argv[2]) # record = True so that we'll be able to rewind the source. # max_time = 10: read 10 seconds from the microphone asource = ADSFactory.ads(record=True, max_time=duration) validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=energy_threshold) tokenizer = StreamTokenizer(validator=validator, min_length=20, max_length=250, max_continuous_silence=30) player = player_for(asource) def echo(data, start, end): print("Acoustic activity at: {0}--{1}".format(start, end)) player.play(b"".join(data)) asource.open() print("\n ** Make some noise (dur:{}, energy:{})...".format(duration, energy_threshold)) tokenizer.tokenize(asource, callback=echo) asource.close() player.stop()
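# A hedged variant of the echo() callback above: instead of playing each
# detected region back, write it to its own WAV file. The file-naming scheme
# and the use of the standard-library wave module are assumptions.
import wave

def save_token(data, start, end):
    filename = "token_{0}-{1}.wav".format(start, end)
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(asource.get_channels())
        wf.setsampwidth(asource.get_sample_width())
        wf.setframerate(asource.get_sampling_rate())
        wf.writeframes(b"".join(data))
    print("saved {0}".format(filename))

# tokenizer.tokenize(asource, callback=save_token)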
if w is None: break original_signal.append(w) original_signal = b''.join(original_signal) # rewind source asource.rewind() # Create a validator with an energy threshold of 50 validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=50) # Create a tokenizer with an unlimited token length and continuous silence within a token # Note the DROP_TRAILING_SILENCE mode that will ensure removing trailing silence trimmer = StreamTokenizer(validator, min_length = 20, max_length=99999999, max_continuous_silence=9999999, mode=StreamTokenizer.DROP_TRAILING_SILENCE, init_min=3, init_max_silence=1) tokens = trimmer.tokenize(asource) # Make sure we only have one token assert len(tokens) == 1, "Should have detected one single token" trimmed_signal = b''.join(tokens[0][0]) player = player_for(asource) print("\n ** Playing original signal (with leading and trailing silence)...") player.play(original_signal) print("\n ** Playing trimmed signal...") player.play(trimmed_signal)
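# A hedged follow-up to the trimming example above: persist the trimmed signal
# to disk with the standard-library wave module. The output file name is an
# assumption, not part of the original example.
import wave

with wave.open("trimmed.wav", "wb") as wf:
    wf.setnchannels(asource.get_channels())
    wf.setsampwidth(asource.get_sample_width())
    wf.setframerate(asource.get_sampling_rate())
    wf.writeframes(trimmed_signal)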
class StreamReader(): categories = [ 'None', 'akatambi', 'app', 'bluetooth', 'damu', 'fayilo', 'folder', 'galawo', 'gulawo', 'koma awo', 'lekera awo', 'menu', 'task_manager', 'vaako', 'webake', 'wezeeko', 'wifi', 'wumula', 'zanya', 'zikira', ] commands = [ '', '', '', '', '', '', '', 'nautilus', 'nautilus', 'pkill,-KILL -u $USER', '', '', 'gnome-system-monitor,', 'shutdown,-rf,now', 'shutdown,-f,now', '', '', '', '', 'shutdown,-f,now' ] ''' categories = ['None', 'damu', 'galawo', 'menu', 'task_manager', 'wuumula'] commands = ['', 'Unset', 'pkill,-KILL -u $USER', 'nautilus,', 'gnome-system-monitor,', 'shutdown,-f']''' recognized_keyword = None command = None def __init__(self): self.asource = ADSFactory.ads(record=True, max_time=4) self.validator = AudioEnergyValidator(sample_width=2, energy_threshold=50) self.tokenizer = StreamTokenizer(validator=self.validator, min_length=20, max_length=1000, max_continuous_silence=30) self.player = player_for(self.asource) self.model = self.load_cnn('../model/final_cnn_model.json', '../model/weights_final_cnn.h5') self.model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy']) #CALLBACK FOR NEXT PROCESSING def echo(self, data, start, end): if (self.audio_file is not None): recording = self.asource.read() else: recording = b''.join(data) print("Acoustic Activity at: {0}--{1}".format(start, end)) print(recording) #data = np.array(data) #serialized = np.frombuffer(data) #print(len(hex_data)) #print(len(recording)) normalize(recording) pad_tokens('tmp.wav2') sample_rate, normalized_signal = wavfile.read('tmp.wav2') print(sample_rate) print(len(normalized_signal)) banks = convert_to_mel(normalized_signal) banks = np.array(banks) Banks = banks.reshape(1, 98, 40, 1) #np.save('test.npy', Banks) z = self.model.predict(Banks) p = z[0].tolist().index(max(z[0])) self.recognized_keyword = self.categories[p] print(self.recognized_keyword) ''' frame_length, step_size = 16000, 64000 no_of_shifts = int(64000 / step_size) - int(frame_length / step_size) print(no_of_shifts) #keyword = None prob = 0 for i in range(0, no_of_shifts): l = int(i*320) banks = convert_to_mel(normalized_signal[l:l+frame_length]) banks = np.array(banks) np.save('test.npy', banks) #banks = np.load('sd.npy') Banks = banks.reshape(1, 98, 40, 1) z = self.model.predict(Banks) print(z) p = z[0].tolist().index(max(z[0])) prob += p self.recognized_keyword = self.categories[p] #self.recognized_keyword = self.categories[np.argmax(z[0], -1)] #com = self.commands[p] print(self.recognized_keyword) final = int(np.ceil(prob/no_of_shifts)) print(prob/no_of_shifts) print(final) self.recognized_keyword = self.categories[final]''' K.clear_session() os.remove('tmp.wav2') os.remove('tmp.wav') #self.command = re.split(',', com) #print(max(z[0])) #print(self.recognized_keyword) #subprocess.call([self.command[0], self.command[1]]) #print(normalized_signal.get_array_of_samples()) #print(normalized_signal.get_array_of_samples()) #frame = np.array() #with open(datetime.today().isoformat() + 'XX.txt', 'w') as f: # f.write(recording) #player.play(recording) def load_cnn(self, name, weights): with open(name, 'r') as json_file: loaded_model_json = json_file.read() loaded_model = model_from_json(loaded_model_json) # Load weights into new model loaded_model.load_weights(weights) print("loaded model from disk") return loaded_model def main(self, audio_file=None): self.audio_file = audio_file if (self.audio_file is None): print("RECORDING STARTED") self.asource.open() self.tokenizer.tokenize(self.asource, 
callback=self.echo) else: print("AUDIO PROCESSING STARTED") #DELETE TOKENS FOLDER shutil.rmtree('../../tokens', ignore_errors=True) shutil.rmtree('../../data', ignore_errors=True) #CREATE tokens/tmp FOLDER os.makedirs('../../tokens/temp') #CREATE tmp/test AND tmp/train os.makedirs('../../tokens/temp/test') os.makedirs('../../tokens/temp/train') #COPY AUDIO FILE TO tokens/tmp/test as test.wave shutil.copyfile(self.audio_file, '../../tokens/temp/test/tf.wav') shutil.copyfile(self.audio_file, '../../tokens/temp/train/tf2.wav') #CALL SUBPROCESS TO RUN ./dump.sh with cd('../../'): subprocess.call(['./dump.sh', '']) #OPEN FILE IN /data/test/*.npy files = [file for file in os.listdir('./data/temp/test/')] #PREDICT DATA data = np.load('./data/temp/test/' + files[0]) banks = data.reshape(1, 98, 40, 1) z = self.model.predict(banks) p = z[0].tolist().index(max(z[0])) self.recognized_keyword = self.categories[p] print(self.recognized_keyword) K.clear_session()
def test_init_min_3_init_max_silence_2(self): tokenizer = StreamTokenizer( self.A_validator, min_length=5, max_length=20, max_continuous_silence=4, init_min=3, init_max_silence=2, mode=0, ) data_source = StringDataSource( "aAaaaAaAaaAaAaaaaaaAAAAAAAAAaaaaaaaAAAAA") # ^ ^ ^ ^ ^ ^ # 5 16 19 31 35 39 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 3, msg="wrong number of tokens, expected: 3, found: {0} ".format( len(tokens)), ) tok1, tok2, tok3 = tokens[0], tokens[1], tokens[2] data = "".join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AaAaaAaAaaaa", msg=("wrong data for token 1, expected: 'AaAaaAaA', found: '{0}' " ).format(data), ) self.assertEqual( start, 5, msg=("wrong start frame for token 1, expected: 5, found: {0} " ).format(start), ) self.assertEqual( end, 16, msg=("wrong end frame for token 1, expected: 16, found: {0} " ).format(end), ) data = "".join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual( data, "AAAAAAAAAaaaa", msg=("wrong data for token 2, expected: 'AAAAAAAAAaaaa', " "found: '{0}' ").format(data), ) self.assertEqual( start, 19, msg=("wrong start frame for token 2, expected: 19, found: {0} " ).format(start), ) self.assertEqual( end, 31, msg=("wrong end frame for token 2, expected: 31, found: {0} " ).format(end), ) data = "".join(tok3[0]) start = tok3[1] end = tok3[2] self.assertEqual( data, "AAAAA", msg=("wrong data for token 3, expected: 'AAAAA', found: '{0}' " ).format(data), ) self.assertEqual( start, 35, msg=("wrong start frame for token 2, expected: 35, found: {0} " ).format(start), ) self.assertEqual( end, 39, msg=("wrong end frame for token 2, expected: 39, found: {0} " ).format(end), )
def test_init_min_0_init_max_silence_0(self): tokenizer = StreamTokenizer( self.A_validator, min_length=5, max_length=20, max_continuous_silence=4, init_min=0, init_max_silence=0, mode=0, ) data_source = StringDataSource("aAaaaAaAaaAaAaaaaaaaAAAAAAAA") # ^ ^ ^ ^ # 2 16 20 27 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format( len(tokens)), ) tok1, tok2 = tokens[0], tokens[1] # tok1[0]: data # tok1[1]: start frame (included) # tok1[2]: end frame (included) data = "".join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AaaaAaAaaAaAaaaa", msg=("wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', " "found: {0} ").format(data), ) self.assertEqual( start, 1, msg=("wrong start frame for token 1, expected: 1, found: {0} " ).format(start), ) self.assertEqual( end, 16, msg=("wrong end frame for token 1, expected: 16, found: {0} " ).format(end), ) data = "".join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual( data, "AAAAAAAA", msg=("wrong data for token 1, expected: 'AAAAAAAA', found: {0} " ).format(data), ) self.assertEqual( start, 20, msg=("wrong start frame for token 2, expected: 20, found: {0} " ).format(start), ) self.assertEqual( end, 27, msg=("wrong end frame for token 2, expected: 27, found: {0} " ).format(end), )
def extractEvents(path, patientID): yname = os.path.basename(path) yname = yname[:len(yname) - 4] dest_path = '/home/pi/recordings/' + patientID + '/' + yname + '/' if not os.path.exists(dest_path): os.makedirs(dest_path) fsoriginal, y = wavfile.read(path) # read audio file try: r, c = np.shape(y) if c > 1: y = np.delete(y, 1, axis=1) # print("audio file shape: ", numpy.shape(y)) except: print(' ') wavfile.write('/home/pi/coughanalysis_ann/sample.wav', data=y, rate=44100) asource = ADSFactory.ads( filename='/home/pi/coughanalysis_ann/sample.wav', record=True) validator = AudioEnergyValidator( sample_width=asource.get_sample_width(), energy_threshold=65) # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate()) # min_length=20 : minimum length of a valid audio activity is 20 * 10 == 200 ms # max_length=4000 : maximum length of a valid audio activity is 400 * 10 == 4000 ms == 4 seconds # max_continuous_silence=30 : maximum length of a tolerated silence within a valid audio activity is 30 * 30 == 300 ms # For a sampling rate of 16KHz (16000 samples per second), we have 160 samples for 10 ms. tokenizer = StreamTokenizer(validator=validator, min_length=10, max_length=1000, max_continuous_silence=40) asource.open() tokens = tokenizer.tokenize(asource) # Play detected regions back # player = player_for(asource) # Rewind and read the whole signal asource.rewind() original_signal = [] while True: w = asource.read() if w is None: break original_signal.append(w) original_signal = ''.join(original_signal) # print("Playing the original file...") # player.play(original_signal) # print("playing detected regions...") count = 0 for t in tokens: # print("Token starts at {0} and ends at {1}".format(t[1], t[2])) data = ''.join(t[0]) # player.play(data) fp = wave.open(dest_path + yname + str(count) + '.wav', "w") fp.setnchannels(asource.get_channels()) fp.setsampwidth(asource.get_sample_width()) fp.setframerate(asource.get_sampling_rate()) fp.writeframes(data) fp.close() count += 1 return dest_path
#check os system and set sample rate 48000 for Linux (Raspberry Pi) _os = platform.system() if (_os == 'Darwin') or (_os == 'Windows'): # macOs sample_rate = asource.get_sampling_rate() # get sample width and channels from ads factory sample_width = asource.get_sample_width() channels = asource.get_channels() # START VALIDATOR validator = AudioEnergyValidator(sample_width=sample_width, energy_threshold=energy_threshold) tokenizer = StreamTokenizer( validator=validator, min_length=min_length, max_length=max_length, max_continuous_silence=max_continuous_silence) # # LOAD PYAUDIO p = pyaudio.PyAudio() # start classe memoria _memoria = memoria.Memoria() # gui vars if GUI: root = Tk() display = GUI(root) if TRANSCRIPTION:
class sentence_tokenizer(yarp.RFModule): def __init__(self): yarp.RFModule.__init__(self) self.interrupted = False self.portsList = dict() self.hotword_detector = None self.hotword_model = None self.token_out_port = None self.audio_source = None self.tok_validator = None self.tokenizer = None self.player = None self.pause_tokenizer = False self.echo_enabled = False self.trigger_echo = False self.echo_thread = None self.hotword_enabled = True # Hotword settings self.hotword_sensitivity = 0.5 self.hotword_loop_time = 0.03 self.hotword_model = os.environ['HOTWORD_MODEL'] # Tokenizer Settings # self.tok_record_duration = None means indefinite self.tok_record_duration = None self.tok_energy_threshold = 40 #60 self.tok_window = 0.01 # 0.01 self.tok_window_rate = 1. / self.tok_window self.tok_min_len = 0.5 * self.tok_window_rate self.tok_max_len = int(5 * self.tok_window_rate) self.tok_max_silence_duration = 0.7 * self.tok_window_rate self.tokenizer_mode = None self.bdata = None self.google_credentials = None # Google ASR self.use_google = True self.asr = None self.time_total = 0 self.num_recs = 0 self.phrases = ["Hello i cub", "Goodbye i cub", "i cub", "Tony", "Daniel"] def configure(self, rf): # Setting up rpc port self.portsList["rpc"] = yarp.Port() self.portsList["rpc"].open("/sentence_tokenizer/rpc:i") self.attach(self.portsList["rpc"]) self.portsList["audio_out"] = yarp.BufferedPortBottle() self.portsList["audio_out"].open("/sentence_tokenizer/audio:o") # Setting up hotword detection self.hotword_detector = snowboydecoder.HotwordDetector(self.hotword_model, sensitivity=self.hotword_sensitivity) # Setting up audio tokenizer to split sentences self.audio_source = ADSFactory.ads(record=True, max_time=self.tok_record_duration, block_dur=self.tok_window) self.tok_validator = AudioEnergyValidator(sample_width=self.audio_source.get_sample_width(), energy_threshold=self.tok_energy_threshold) self.tokenizer_mode = StreamTokenizer.DROP_TRAILING_SILENCE self.tokenizer = StreamTokenizer(validator=self.tok_validator, min_length=self.tok_min_len, max_length=self.tok_max_len, max_continuous_silence=self.tok_max_silence_duration, mode=self.tokenizer_mode) if self.echo_enabled: self.echo_thread = threading.Thread(target=self.replayAudio) self.echo_thread.start() if self.hotword_enabled: print("Waiting for hotword to start interaction") # self.hotword_detector.start(detected_callback=self.detected_callback, # interrupt_check=self.interrupt_callback, # sleep_time=self.hotword_loop_time) print("Hotword detected. 
Starting tokenizer thread") else: print "Starting tokenizer thread" self.asr = sr.Recognizer() with open('google_credentials.json', 'r') as credentials: self.google_credentials = credentials.read() return True def detected_callback(self): print("Hotword 'Hello iCub' detected") self.interrupted = True def tok_callback(self, data, start, end, starting=False): if data is None: audio_bottle = self.portsList["audio_out"].prepare() audio_bottle.clear() audio_bottle.addString("speaking") if starting: print "Speaking start" audio_bottle.addString("start") else: print "Speaking stop" audio_bottle.addString("stop") self.portsList["audio_out"].write() else: print("Acoustic activity at: {0}--{1}".format(start, end)) # print "Chunk segmented", time.time() # print "Pause value is: ", self.pause_tokenizer if not self.pause_tokenizer: self.bdata = b''.join(data) if self.use_google: audio = sr.AudioData(self.bdata, self.audio_source.get_sampling_rate(), self.audio_source.get_sample_width()) t3 = time.time() try: sentence = self.asr.recognize_google_cloud(audio_data=audio, credentials_json=self.google_credentials, language="en-UK", preferred_phrases=self.phrases) t4 = time.time() dur = t4 - t3 self.time_total += dur self.num_recs += 1 print sentence, " | Time taken=", dur, " | Mean Time=", self.time_total/self.num_recs audio_bottle = self.portsList["audio_out"].prepare() audio_bottle.clear() audio_bottle.addString("spoken") audio_bottle.addString(str(sentence)) self.portsList["audio_out"].write() except sr.UnknownValueError: print("Google Speech Recognition could not understand audio") except sr.RequestError as e: print("Could not request results from Google Speech Recognition service; {0}".format(e)) else: audio_bottle = self.portsList["audio_out"].prepare() audio_bottle.clear() audio_bottle.addString("classify") audio_bottle.addString(self.bdata) audio_bottle.addInt(self.audio_source.get_sampling_rate()) self.portsList["audio_out"].write() if self.echo_enabled: self.trigger_echo = True def tokenizerThread(self): self.audio_source.open() self.tokenizer.tokenize(self.audio_source, callback=self.tok_callback) def replayAudio(self): self.player = player_for(self.audio_source) while True: if self.trigger_echo: self.player.play(self.bdata) self.trigger_echo = False time.sleep(2) def close(self): print('Exiting ...') time.sleep(2) self.hotword_detector.terminate() self.audio_source.close() if self.echo_enabled: self.player.stop() for j in self.portsList.keys(): self.close_port(self.portsList[j]) return True @staticmethod def close_port(j): j.interrupt() time.sleep(1) j.close() def respond(self, command, reply): reply.clear() action = command.get(0).asString() if action == "heartbeat": reply.addString('ack') elif action == "pause": self.pause_tokenizer = True print "pausing tokenizer sending" reply.addString('ack') elif action == "resume": self.pause_tokenizer = False print "resuming tokenizer sending" reply.addString('ack') # ------------------------------------------------- elif action == "EXIT": reply.addString('ack') self.close() # ------------------------------------------------- else: reply.addString("nack") reply.addString("Command not recognized") return True def interruptModule(self): print "Interrupting" self.close() return True def getPeriod(self): return 0.1 def updateModule(self): self.tokenizerThread() print "starting again" time.sleep(0.05) return True
def find_voice_segments(audio_file, music_time_list): segments = [] formats = {1: numpy.int8, 2: numpy.int16, 4: numpy.int32} #[Fs_cr, x_cr] = aIO.readAudioFile(input_audio_audio_file) #[Fs_ce, x_ce] = aIO.readAudioFile(callee_audio_file) #segments = aS.silenceRemoval(x_cr, Fs_cr, 0.010, 0.010, smoothWindow=3,Weight=0.3,plot=False) #print(segments) #callee_segments = aS.silenceRemoval(x_ce, Fs_ce, 0.010, 0.010, smoothWindow=5,Weight=0.3,plot=False) #print(callee_segments) test_source = ADSFactory.ads(filename=audio_file, record=False) test_source.open() i = 0 max_value = 0.0 a = numpy.empty([], dtype=numpy.float64) b = numpy.empty([], dtype=numpy.float64) while True: frame = test_source.read() if frame is None: break signal = numpy.array(numpy.frombuffer( frame, dtype=formats[test_source.get_sample_width()]), dtype=numpy.float64) energy = float(numpy.dot(signal, signal)) / len(signal) max_value = max(max_value, energy) i += 1 b = numpy.append(b, [energy]) #diff = max_value - numpy.mean(b) #print(10. * numpy.log10(0.3*diff)) log_max = 10. * numpy.log10(max_value) log_mean = 10. * numpy.log10(numpy.mean(b)) tmp = log_max - log_mean threshold = log_mean + 0.4 * tmp #print(threshold) test_source.close() asource = ADSFactory.ads(filename=audio_file, record=False) validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=threshold) tokenizer = StreamTokenizer(validator=validator, min_length=300, max_length=99999999, max_continuous_silence=300) player = player_for(asource) asource.open() tokens = tokenizer.tokenize(asource) for i, t in enumerate(tokens): segment_begin = t[1] * 10 segment_end = t[2] * 10 if len(music_time_list) > 0: for item in music_time_list: # if segment end includes music begin if segment_end > item[0]: #include segment before music segments.append([segment_begin, item[0]]) #save stamps for incluing segment after music segment_begin = item[1] # remove music segment from list # to not use it in further music_time_list.remove(item) segments.append([segment_begin, segment_end]) asource.close() return segments
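# The adaptive threshold above is derived from per-window log energies:
# threshold = log_mean + 0.4 * (log_max - log_mean), i.e. 40% of the way from
# the mean log energy to the peak log energy. A hedged, self-contained sketch
# of that computation on a toy energy array; the 0.4 weight simply mirrors the
# constant used above.
import numpy as np

def adaptive_energy_threshold(energies, weight=0.4):
    """Return a dB threshold between the mean and max of per-window energies."""
    log_max = 10.0 * np.log10(np.max(energies))
    log_mean = 10.0 * np.log10(np.mean(energies))
    return log_mean + weight * (log_max - log_mean)

# e.g. adaptive_energy_threshold([1e4, 2e4, 8e6, 5e3]) -> a threshold in dB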
from auditok import ADSFactory, AudioEnergyValidator, StreamTokenizer, player_for, dataset
import sys

try:
    # We set the `record` argument to True so that we can rewind the source
    asource = ADSFactory.ads(filename=dataset.one_to_six_arabic_16000_mono_bc_noise, record=True)

    validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=65)

    # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate())
    # min_length=20 : minimum length of a valid audio activity is 20 * 10 == 200 ms
    # max_length=400 : maximum length of a valid audio activity is 400 * 10 == 4000 ms == 4 seconds
    # max_continuous_silence=30 : maximum length of a tolerated silence within a valid audio activity is 30 * 10 == 300 ms
    tokenizer = StreamTokenizer(validator=validator, min_length=20, max_length=400, max_continuous_silence=30)

    asource.open()
    tokens = tokenizer.tokenize(asource)

    # Play detected regions back
    player = player_for(asource)

    # Rewind and read the whole signal
    asource.rewind()
    original_signal = []

    while True:
        w = asource.read()
        if w is None:
            break
def test_init_min_3_init_max_silence_0(self): tokenizer = StreamTokenizer(self.A_validator, min_length=5, max_length=20, max_continuous_silence=4, init_min=3, init_max_silence=0, mode=0) data_source = StringDataSource( "aAaaaAaAaaAaAaaaaaAAAAAAAAAaaaaaaAAAAA") # ^ ^ ^ ^ # 18 30 33 37 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format( len(tokens))) tok1, tok2 = tokens[0], tokens[1] data = ''.join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AAAAAAAAAaaaa", msg= "wrong data for token 1, expected: 'AAAAAAAAAaaaa', found: '{0}' ". format(data)) self.assertEqual( start, 18, msg="wrong start frame for token 1, expected: 18, found: {0} ". format(start)) self.assertEqual( end, 30, msg="wrong end frame for token 1, expected: 30, found: {0} ". format(end)) data = ''.join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual( data, "AAAAA", msg="wrong data for token 1, expected: 'AAAAA', found: '{0}' ". format(data)) self.assertEqual( start, 33, msg="wrong start frame for token 2, expected: 33, found: {0} ". format(start)) self.assertEqual( end, 37, msg="wrong end frame for token 2, expected: 37, found: {0} ". format(end))
def test_min_length_10_init_max_length_20(self): tokenizer = StreamTokenizer( self.A_validator, min_length=10, max_length=20, max_continuous_silence=4, init_min=3, init_max_silence=3, mode=0, ) data_source = StringDataSource( "aAaaaAaAaaAaAaaaaaaAAAAAaaaaaaAAAAAaaAAaaAAA") # ^ ^ ^ ^ # 1 16 30 45 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 2, msg="wrong number of tokens, expected: 2, found: {0} ".format( len(tokens)), ) tok1, tok2 = tokens[0], tokens[1] data = "".join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AaaaAaAaaAaAaaaa", msg=("wrong data for token 1, expected: 'AaaaAaAaaAaAaaaa', " "found: '{0}' ").format(data), ) self.assertEqual( start, 1, msg=("wrong start frame for token 1, expected: 1, found: {0} " ).format(start), ) self.assertEqual( end, 16, msg=("wrong end frame for token 1, expected: 16, found: {0} " ).format(end), ) data = "".join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual( data, "AAAAAaaAAaaAAA", msg=("wrong data for token 2, expected: 'AAAAAaaAAaaAAA', " "found: '{0}' ").format(data), ) self.assertEqual( start, 30, msg=("wrong start frame for token 2, expected: 30, found: {0} " ).format(start), ) self.assertEqual( end, 43, msg=("wrong end frame for token 2, expected: 43, found: {0} " ).format(end), )
try:
    # We set the `record` argument to True so that we can rewind the source
    asource = ADSFactory.ads(
        filename=dataset.one_to_six_arabic_16000_mono_bc_noise, record=True)

    validator = AudioEnergyValidator(sample_width=asource.get_sample_width(),
                                     energy_threshold=65)

    # Default analysis window is 10 ms (float(asource.get_block_size()) / asource.get_sampling_rate())
    # min_length=20 : minimum length of a valid audio activity is 20 * 10 == 200 ms
    # max_length=400 : maximum length of a valid audio activity is 400 * 10 == 4000 ms == 4 seconds
    # max_continuous_silence=30 : maximum length of a tolerated silence within a valid audio activity is 30 * 10 == 300 ms
    tokenizer = StreamTokenizer(validator=validator, min_length=20, max_length=400, max_continuous_silence=30)

    asource.open()
    tokens = tokenizer.tokenize(asource)

    # Play detected regions back
    player = player_for(asource)

    # Rewind and read the whole signal
    asource.rewind()
    original_signal = []

    while True:
        w = asource.read()
        if w is None:
def test_min_length_4_init_max_length_5(self): tokenizer = StreamTokenizer( self.A_validator, min_length=4, max_length=5, max_continuous_silence=4, init_min=3, init_max_silence=3, mode=0, ) data_source = StringDataSource( "aAaaaAaAaaAaAaaaaaAAAAAAAAaaaaaaAAAAAaaaaaAAaaAaa") # ^ ^^ ^ ^ ^ ^ ^ # 18 2223 27 32 36 42 46 tokens = tokenizer.tokenize(data_source) self.assertEqual( len(tokens), 4, msg="wrong number of tokens, expected: 4, found: {0} ".format( len(tokens)), ) tok1, tok2, tok3, tok4 = tokens[0], tokens[1], tokens[2], tokens[3] data = "".join(tok1[0]) start = tok1[1] end = tok1[2] self.assertEqual( data, "AAAAA", msg=("wrong data for token 1, expected: 'AAAAA', found: '{0}' " ).format(data), ) self.assertEqual( start, 18, msg=("wrong start frame for token 1, expected: 18, found: {0} " ).format(start), ) self.assertEqual( end, 22, msg=("wrong end frame for token 1, expected: 22, found: {0} " ).format(end), ) data = "".join(tok2[0]) start = tok2[1] end = tok2[2] self.assertEqual( data, "AAAaa", msg=("wrong data for token 1, expected: 'AAAaa', found: '{0}' " ).format(data), ) self.assertEqual( start, 23, msg=("wrong start frame for token 1, expected: 23, found: {0} " ).format(start), ) self.assertEqual( end, 27, msg=("wrong end frame for token 1, expected: 27, found: {0} " ).format(end), ) data = "".join(tok3[0]) start = tok3[1] end = tok3[2] self.assertEqual( data, "AAAAA", msg=("wrong data for token 1, expected: 'AAAAA', found: '{0}' " ).format(data), ) self.assertEqual( start, 32, msg=("wrong start frame for token 1, expected: 1, found: {0} " ).format(start), ) self.assertEqual( end, 36, msg=("wrong end frame for token 1, expected: 7, found: {0} " ).format(end), ) data = "".join(tok4[0]) start = tok4[1] end = tok4[2] self.assertEqual( data, "AAaaA", msg=("wrong data for token 2, expected: 'AAaaA', found: '{0}' " ).format(data), ) self.assertEqual( start, 42, msg=("wrong start frame for token 2, expected: 17, found: {0} " ).format(start), ) self.assertEqual( end, 46, msg=("wrong end frame for token 2, expected: 22, found: {0} " ).format(end), )
'''
# record = True so that we'll be able to rewind the source.
# max_time = 10: read 10 seconds from the microphone
asource = ADSFactory.ads(record=True)
validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=50)
tokenizer = StreamTokenizer(validator=validator, min_length=20, max_length=250, max_continuous_silence=30)
player = player_for(asource)
asource.open()
tokenizer.tokenize(asource, callback=echo)
'''

asource = ADSFactory.ads(sampling_rate=16000, sample_width=2, channels=1,
                         frames_per_buffer=128, record=False, block_dur=0.01)
validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=50)
# max_length is a required StreamTokenizer argument; a very large value (as in
# the trimming examples elsewhere in this collection) effectively leaves the
# token length uncapped
tokenizer = StreamTokenizer(validator=validator, min_length=100, max_length=99999999, max_continuous_silence=500)
asource.open()
tokenizer.tokenize(asource, callback=echo)
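# The snippet above passes an `echo` callback that is not defined in this
# excerpt. A hedged sketch of what such a callback might look like (it mirrors
# the echo() callback used in the microphone example earlier in this
# collection); the original behavior is unknown.
def echo(data, start, end):
    print("Acoustic activity at: {0}--{1}".format(start, end))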
if __name__ == "__main__": # Delete all .wav files for file in glob.glob("*.wav"): remove(file) # Command utilities: handler = CommandHandler() # Auditok utilities: asource = ADSFactory.ads(sampling_rate=16000, sample_width=2, channels=1, frames_per_buffer=512, record=False, block_dur=0.01) validator = AudioEnergyValidator(sample_width=asource.get_sample_width(), energy_threshold=50) tokenizer = StreamTokenizer(validator=validator, min_length=100, max_length=500, max_continuous_silence=30) asource.open() # Main program loop log("Waiting for a command...") tokenizer.tokenize(asource, callback=listen_for_commands)
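# `listen_for_commands` is referenced above but not defined in this excerpt.
# A hedged, hypothetical sketch: save the detected audio to a WAV file, log it,
# and hand it to the CommandHandler. The handler method name (handle_wav) is an
# assumption, not the real API.
import wave

def listen_for_commands(data, start, end):
    filename = "command_{0}-{1}.wav".format(start, end)
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(asource.get_sample_width())
        wf.setframerate(16000)
        wf.writeframes(b"".join(data))
    log("Detected possible command at {0}--{1}".format(start, end))
    # handler.handle_wav(filename)  # hypothetical CommandHandler method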