def featurize_wav(wav_file, label, featurizer_path, sample_rate, window_size, shift, dump):
    """ Featurize a single wav file and save the result as a <wav_name>.npz dataset
    in the current directory.

    Arguments:
    wav_file        -- path to the .wav file to featurize
    label           -- the label to apply to every feature row extracted from this file
    featurizer_path -- path to the compiled ELL featurizer model
    sample_rate     -- sample rate (Hz) to resample the audio to before featurizing
    window_size     -- number of featurizer output frames grouped into one classifier window
    shift           -- how far to shift the window between feature rows
    dump            -- when True, also write "<wav_file>.txt" (featurizer log) and
                       "<wav_file>.features.txt" (raw feature values) for debugging
    """
    transform = featurizer.AudioTransform(featurizer_path, 0)
    input_size = transform.input_size
    output_size = transform.model.output_shape.Size()
    # NOTE: the original also computed feature_size and feature_shape here but never
    # used them; those dead locals have been removed.
    wav_log = None
    if dump:
        wav_log = open(wav_file + ".txt", "w")
        wav_log.write("{}:\n".format(wav_file))
        transform.set_log(wav_log)
    try:
        features = list(make_dataset.get_wav_features(wav_file, transform, sample_rate, window_size, shift))
    finally:
        # fix: close the log even if featurization raises (original leaked the handle
        # on error, and only closed it when dump was True and everything succeeded)
        if wav_log:
            wav_log.close()
    # reshape features to record the fact that the window_size is a kind of batch size
    # for this type of model
    features = [np.reshape(f, (window_size, 1, 1, int(len(f) / window_size))) for f in features]
    labels = [label] * len(features)
    output_file = os.path.splitext(os.path.basename(wav_file))[0] + ".npz"
    # remember these settings in the dataset
    parameters = (sample_rate, input_size, output_size, window_size, shift)
    np.savez(output_file, features=features, labels=labels, parameters=parameters)
    if dump:
        with open(wav_file + ".features.txt", "w") as f:
            f.write("{}:\n".format(wav_file))
            for a in features:
                f.write("{}\n".format(", ".join(str(x) for x in a.ravel())))
def load_featurizer_model(self, featurizer_path):
    """ Load the compiled ELL featurizer at the given path so that subsequent
    audio input can be transformed into features. """
    if featurizer_path:
        self.featurizer = featurizer.AudioTransform(featurizer_path, 40)
        self.setup_spectrogram_image()
        message = "Feature input size: {}, output size: {}".format(
            self.featurizer.input_size, self.featurizer.output_size)
        self.show_output(message)
    self.init_data()
def load_featurizer_model(self, featurizer_model):
    """ Load the compiled ELL featurizer at the given path so that subsequent
    audio input can be transformed into features, and keep the UI entry field
    in sync with the loaded model path. """
    if featurizer_model:
        self.featurizer = featurizer.AudioTransform(featurizer_model, 40)
        self.setup_spectrogram_image()
        message = "Feature input size: {}, output size: {}".format(
            self.featurizer.input_size, self.featurizer.output_size)
        self.show_output(message)
        # reflect the model path back into the text entry widget if it differs
        if self.features_entry.get() != featurizer_model:
            self.features_entry.delete(0, END)
            self.features_entry.insert(0, featurizer_model)
    self.init_data()
def test_keyword_spotter(featurizer_model, classifier_model, categories, wav_files, threshold, sample_rate,
                         output_speaker=False, auto_scale=False, reset=False):
    """ Run the keyword spotter (featurizer + classifier) over every .wav file in the
    wav_files directory, or over live microphone input when wav_files is not given.
    Returns the list of prediction results, one per processed input. """
    predictor = classifier.AudioClassifier(classifier_model, categories, threshold, SMOOTHING)
    transform = featurizer.AudioTransform(featurizer_model, predictor.input_size)
    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")
    the_speaker = speaker.Speaker() if output_speaker else None
    results = []
    if wav_files:
        if not os.path.isdir(wav_files):
            raise Exception("--wav_files {} dir not found".format(wav_files))
        for filename in sorted(os.listdir(wav_files)):
            ext = os.path.splitext(filename)[1]
            if ext != ".wav":
                print("Skipping non-wav file: ", filename)
                continue
            reader = wav_reader.WavReader(sample_rate, CHANNELS, auto_scale)
            path = os.path.join(wav_files, filename)
            print("opening ", path)
            reader.open(path, transform.input_size, the_speaker)
            results.append(get_prediction(reader, transform, predictor, categories))
            if reset:
                # clear classifier state between files so predictions don't bleed over
                predictor.reset()
    else:
        reader = microphone.Microphone(True, True)
        reader.open(transform.input_size, sample_rate, CHANNELS)
        print("Please type 'x' and enter to terminate this app...")
        results.append(get_prediction(reader, transform, predictor, categories))
    return results
def make_dataset(list_file, outdir, categories_path, featurizer_path, sample_rate, window_size, shift,
                 auto_scale=True, noise_path=None, max_noise_ratio=0.1, noise_selection=0.1, use_cache=False):
    """ Create a dataset given the input list file, a featurizer, the desired .wav sample rate,
    classifier window_size and window shift amount.  The dataset is saved to the same file name
    with .npz extension.  This will do nothing if dataset is already created, unless
    use_cache=False. """
    dataset_name = os.path.basename(list_file)
    dataset_path = os.path.splitext(dataset_name)[0] + ".npz"
    dataset_path = os.path.join(outdir, dataset_path)
    if use_cache and os.path.isfile(dataset_path):
        # dataset already built and caller asked us to reuse it
        return
    transform = featurizer.AudioTransform(featurizer_path, 0)
    entry_map = parse_list_file(list_file)
    if not categories_path:
        categories_path = "categories.txt"
    if not os.path.isfile(categories_path):
        raise Exception("{} file not found".format(categories_path))
    # fix: the original left the categories file handle open; use a context manager
    with open(categories_path, "r") as fp:
        categories = [x.strip() for x in fp.readlines()]
    mixer = None
    if noise_path:
        # mix in background noise from every .wav found in noise_path
        noise_files = [
            os.path.join(noise_path, f) for f in os.listdir(noise_path)
            if os.path.splitext(f)[1] == ".wav"
        ]
        mixer = noise_mixer.AudioNoiseMixer(noise_files, max_noise_ratio, noise_selection)
    dataset = _get_dataset(entry_map, categories, transform, sample_rate, window_size, shift, auto_scale, mixer)
    if len(dataset.features) == 0:
        print("No features found in list file")
    print("Saving: {}".format(dataset_path))
    dataset.save(dataset_path)
def test_keyword_spotter(featurizer_model, classifier_model, categories, wav_file, threshold, sample_rate,
                         output_speaker=False):
    """ Run the keyword spotter (featurizer + classifier) over the given wav file, or over
    live microphone input when wav_file is None.  Prints each detection and returns a tuple
    (prediction, probability, label, average_time) for the highest-probability detection.
    Raises if no prediction was ever made. """
    predictor = classifier.AudioClassifier(classifier_model, categories, threshold, SMOOTHING)
    transform = featurizer.AudioTransform(featurizer_model, predictor.input_size)
    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")
    # set up inputs and outputs
    if wav_file:
        the_speaker = None
        if output_speaker:
            the_speaker = speaker.Speaker()
        reader = wav_reader.WavReader(sample_rate, CHANNELS)
        reader.open(wav_file, transform.input_size, the_speaker)
    else:
        reader = microphone.Microphone(True)
        reader.open(transform.input_size, sample_rate, CHANNELS)
        print("Please type 'x' and enter to terminate this app...")
    transform.open(reader)
    results = None
    try:
        while True:
            feature_data = transform.read()
            if feature_data is None:
                break
            prediction, probability, label = predictor.predict(feature_data)
            if probability is not None:
                # keep the highest-probability detection seen so far
                if not results or results[1] < probability:
                    results = (prediction, probability, label)
                percent = int(100 * probability)
                print("<<< DETECTED ({}) {}% '{}' >>>".format(prediction, percent, label))
    except KeyboardInterrupt:
        pass
    finally:
        # fix: the original only closed the transform on the normal path, leaking the
        # audio pipeline if an unexpected exception escaped the read loop
        transform.close()
    average_time = predictor.avg_time() + transform.avg_time()
    print("Average processing time: {}".format(average_time))
    if results is None:
        raise Exception("test_keyword_spotter failed to find any predictions!")
    return tuple(list(results) + [average_time])
def make_dataset(list_file, featurizer_path, sample_rate, window_size, shift):
    """ Build a dataset from the .wav files named in list_file, featurizing each one
    with the given featurizer at the given sample rate, window size and shift.
    The result is saved next to the current directory as <list_file_name>.npz. """
    dataset_name = os.path.basename(list_file)
    dataset_path = os.path.splitext(dataset_name)[0] + ".npz"
    transform = featurizer.AudioTransform(featurizer_path, 0)
    entry_map = parse_list_file(list_file)
    dataset = _get_dataset(entry_map, transform, sample_rate, window_size, shift)
    if not len(dataset.features):
        print("No features found in list file")
    print("Saving: {}".format(dataset_path))
    dataset.save(dataset_path)
def multiprocess_data(file, e, featurizer_path, sample_rate, window_size, shift, auto_scale, noise_path,
                      max_noise_ratio, noise_selection, label):
    """ Worker function for multiprocessing: featurize one wav file (file) from entry
    directory (e) using its own featurizer instance, optionally mixing in noise.
    Returns a (data_rows, label_rows) pair, each empty or containing one entry. """
    # each worker needs its own featurizer instance
    transform = featurizer.AudioTransform(featurizer_path, 0)
    mixer = None
    if noise_path:
        noise_files = [
            os.path.join(noise_path, n) for n in os.listdir(noise_path)
            if os.path.splitext(n)[1] == ".wav"
        ]
        mixer = noise_mixer.AudioNoiseMixer(noise_files, max_noise_ratio, noise_selection)
    full_path = os.path.join(e, file)
    file_features = list(
        get_wav_features(full_path, transform, sample_rate, window_size, shift, auto_scale, mixer))
    data_rows = []
    label_rows = []
    if file_features:
        data_rows.append(file_features)
        label_rows.append([label for _ in file_features])
    return data_rows, label_rows
def RunTest(self, featurizer_model, classifier_model, list_file, dataset, categories, sample_rate, ignore_label):
    """ Evaluate the classifier either over the .wav files named in list_file (featurized
    on the fly) or over a pre-featurized dataset (path to a .npz file, or a Dataset object).
    Prints pass/fail statistics and returns the pass rate.

    Raises:
        Exception -- if the featurizer and classifier mix .ell and compiled models,
                     or if neither list_file nor dataset is provided.
    """
    predictor = classifier.AudioClassifier(classifier_model, categories, [ignore_label], THRESHOLD, SMOOTHING)
    transform = featurizer.AudioTransform(featurizer_model, predictor.input_size)
    print("Evaluation with transform input size {}, output size {}".format(
        transform.input_size, transform.output_size))
    print("Evaluation with classifier input size {}, output size {}".format(
        predictor.input_size, predictor.output_size))
    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")
    if list_file:
        with open(list_file, "r") as fp:
            testlist = [e.strip() for e in fp.readlines()]
        wav_dir = os.path.dirname(list_file)
        start = time.time()
        for name in testlist:
            # entries look like: bed/28497c5b_nohash_0.wav -- the folder is the label
            expected = name.split('/')[0]
            wav_file = os.path.join(wav_dir, "audio", name)
            # open the wav file.
            reader = wav_reader.WavReader(sample_rate)
            reader.open(wav_file, transform.input_size, None)
            transform.open(reader)
            prediction = self.get_prediction(transform, predictor)
            self.process_prediction(prediction, expected)
    elif dataset:
        if type(dataset) is str:
            # dataset is a path to a .npz file produced by make_dataset.py
            ds = np.load(dataset)
            features = ds['features']
            labels = ds['labels']
        else:
            features = dataset.features
            labels = dataset.label_names
        index = 0
        start = time.time()
        for f in features:
            expected = labels[index]
            reader = FeatureReader(f, predictor.input_size)
            prediction = self.get_prediction(reader, predictor)
            self.process_prediction(prediction, expected)
            index += 1
    else:
        # fix: the original fell through here and crashed with an unbound 'start'
        # at 'end - start' below; fail fast with a clear message instead
        raise Exception("Missing list_file and dataset arguments")
    end = time.time()
    seconds = end - start
    print("Test completed in {:.2f} seconds".format(seconds))
    print("{} passed, {} failed, pass rate of {:.2f} %".format(
        self.passed, self.failed, self.rate * 100))
    return self.rate
# command line interface: wire up the featurizer + classifier against a wav file or the microphone
parser = argparse.ArgumentParser("test the classifier and featurizer against mic or wav file input")
parser.add_argument("--wav_file", default=None, help="optional path to wav file to test")
parser.add_argument("--featurizer", "-f", required=True,
                    help="specify path to featurizer model (*.ell or compiled_folder/model_name)")
parser.add_argument("--classifier", "-c", required=True,
                    help="specify path to classifier model (*.ell or compiled_folder/model_name)")
parser.add_argument("--categories", "-cat", required=True, help="specify path to categories file")
parser.add_argument("--sample_rate", "-s", type=int, default=SAMPLE_RATE,
                    help="Audio sample rate expected by classifier")
parser.add_argument("--threshold", "-t", type=float, default=THRESHOLD,
                    help="Classifier threshold (default 0.6)")
parser.add_argument("--speaker", action='store_true', help="Output audio to the speaker.")
args = parser.parse_args()

predictor = classifier.AudioClassifier(args.classifier, args.categories, args.threshold, SMOOTHING)
transform = featurizer.AudioTransform(args.featurizer, predictor.input_size)
if transform.using_map != predictor.using_map:
    raise Exception("cannot mix .ell and compiled models")

# set up inputs and outputs
if args.wav_file:
    output_speaker = speaker.Speaker() if args.speaker else None
    reader = wav_reader.WavReader(args.sample_rate, CHANNELS)
    reader.open(args.wav_file, transform.input_size, output_speaker)
else:
    reader = microphone.Microphone(True)
    reader.open(transform.input_size, args.sample_rate, CHANNELS)
    print("Please type 'x' and enter to terminate this app...")
def run_test(self, featurizer_model, classifier_model, list_file, max_tests, dataset, categories, sample_rate,
             auto_scale, output_file, algorithm="max", window_size=0):
    """ Run the test using the given input models (featurizer and classifier) which may or may not be
    compiled.  The test set is defined by a list_file or a dataset.  The list file lists .wav files
    which we will featurize using the given featurizer.  The dataset contains pre-featurized data
    as created by make_dataset.py.  The categories define the names of the keywords detected by
    the classifier and the sample_rate defines the audio sample rate in Hertz -- all input audio
    is resampled at this rate before featurization. """
    predictor = classifier.AudioClassifier(classifier_model, categories, THRESHOLD, SMOOTHING)
    # default the featurizer window to the classifier's expected input size
    if window_size == 0:
        window_size = predictor.input_size
    transform = featurizer.AudioTransform(featurizer_model, window_size)
    if not self.silent:
        self.logger.info(
            "Evaluation with transform input size {}, output size {}".
            format(transform.input_size, transform.output_size))
        self.logger.info(
            "Evaluation with classifier input size {}, output size {}".
            format(predictor.input_size, predictor.output_size))
    if transform.using_map != predictor.using_map:
        raise Exception("cannot mix .ell and compiled models")
    results = []
    if list_file:
        # test set is a list of wav files; featurize each one on the fly
        with open(list_file, "r") as fp:
            testlist = [e.strip() for e in fp.readlines()]
        wav_dir = os.path.dirname(list_file)
        if max_tests:
            # sample a random subset to bound test time
            testlist = np.random.choice(testlist, max_tests, replace=False)
        start = time.time()
        for name in testlist:
            # e.g. bed/28497c5b_nohash_0.wav -- the folder name is the expected label
            expected = name.split('/')[0]
            wav_file = os.path.join(wav_dir, name)
            # open the wav file.
            reader = wav_reader.WavReader(sample_rate, 1, auto_scale)
            reader.open(wav_file, transform.input_size, None)
            transform.open(reader)
            prediction, confidence, _, elapsed = self.get_prediction(
                name, transform, predictor, algorithm)
            self.process_prediction(name, prediction, expected, confidence)
            results += [prediction]
            # track the fastest single prediction across the whole run
            if self.best_time is None or elapsed < self.best_time:
                self.best_time = elapsed
    elif dataset:
        # test set is pre-featurized data: either a path to a .npz file or a Dataset object
        if type(dataset) is str:
            ds = np.load(dataset)
            features = ds['features']
            labels = ds['labels']
        else:
            features = dataset.features
            labels = dataset.label_names
        index = 0
        start = time.time()
        for f in features:
            expected = labels[index]
            reader = FeatureReader(f, predictor.input_size)
            name = "row " + str(index)
            # NOTE(review): unlike the list_file branch, this call omits 'algorithm'
            # and predictions are not appended to 'results' -- confirm both are intentional
            prediction, confidence, _, elapsed = self.get_prediction(
                name, reader, predictor)
            self.process_prediction(name, prediction, expected, confidence)
            if self.best_time is None or elapsed < self.best_time:
                self.best_time = elapsed
            index += 1
    else:
        raise Exception("Missing list_file and dataset arguments")
    end = time.time()
    seconds = end - start
    self.logger.info("Saving '{}'".format(output_file))
    with open(output_file, "w") as f:
        json.dump(results, f)
    self.logger.info("Test completed in {:.2f} seconds".format(seconds))
    self.logger.info("{} passed, {} failed, pass rate of {:.2f} %".format(
        self.passed, self.failed, self.rate * 100))
    self.logger.info("Best prediction time was {} seconds".format(
        self.best_time))
    return self.rate, self.best_time
def _get_dataset(entry_map, categories, featurizer_path, window_size, shift, noise_path, multicore=False,
                 max_noise_ratio=0.1, noise_selection=0.1):
    """ Featurize every wav file referenced by entry_map (a map of entry-directory -> file list,
    where the directory base name is the label) and return a Dataset of the resulting feature
    rows and label names.  Sample rate and auto-scaling come from the featurizer's metadata.
    When multicore is True the files of each entry are featurized in a multiprocessing pool. """
    data_rows = []
    label_rows = []
    transform = featurizer.AudioTransform(featurizer_path, 0)
    # the featurizer records the sample rate it was trained for in its metadata
    sample_rate = transform.get_metadata("sample_rate")
    if not sample_rate:
        raise Exception("Featurizer is missing 'sample_rate' metadata")
    sample_rate = int(sample_rate)
    auto_scale = transform.get_metadata("auto_scale")
    if auto_scale:
        # NOTE(review): if get_metadata returns a string, bool("False") is True, so any
        # non-empty value enables auto-scaling -- confirm the metadata's value type
        auto_scale = bool(auto_scale)
        if auto_scale:
            print(
                "Featurizer requires auto-scaling of audio input to float range [-1, 1]"
            )
    else:
        auto_scale = False
    mixer = None
    if noise_path:
        # mix background noise from every .wav found in noise_path
        noise_files = [
            os.path.join(noise_path, f) for f in os.listdir(noise_path)
            if os.path.splitext(f)[1] == ".wav"
        ]
        mixer = noise_mixer.AudioNoiseMixer(noise_files, max_noise_ratio, noise_selection)
    for e in entry_map:
        # the entry directory's base name is the class label
        label = os.path.basename(e)
        if label not in categories:
            raise Exception(
                "label {} not found in categories file".format(label))
        print("Transforming {} files from {} ... ".format(
            len(entry_map[e]), label), end='', flush=True)
        total = 0
        if multicore:
            # fan the files of this entry out across all CPU cores; each worker builds
            # its own featurizer (see multiprocess_data)
            p = multiprocessing.Pool(multiprocessing.cpu_count())
            temp_partial = partial(multiprocess_data, e=e, featurizer_path=featurizer_path,
                                   sample_rate=sample_rate, window_size=window_size, shift=shift,
                                   auto_scale=auto_scale, noise_path=noise_path,
                                   max_noise_ratio=max_noise_ratio, noise_selection=noise_selection,
                                   label=label)
            temp_file = p.map(temp_partial, entry_map[e])
            for temp_list in temp_file:
                temp1, temp2 = temp_list
                for x in temp1:
                    data_rows.append(x)
                for y in temp2:
                    label_rows.append(y)
                # NOTE(review): label_rows accumulates across all entries, so this count
                # differs from the per-file count in the single-core branch -- verify the
                # reported "found N rows" number is what was intended here
                total += len(label_rows)
            print(" found {} rows".format(total))
            p.close()
            p.join()
        else:
            for file in entry_map[e]:
                full_path = os.path.join(e, file)
                file_features = list(
                    get_wav_features(full_path, transform, sample_rate, window_size, shift,
                                     auto_scale, mixer))
                if len(file_features) > 0:
                    labels = [label for r in file_features]
                    data_rows.append(file_features)
                    label_rows.append(labels)
                    total += len(file_features)
            print(" found {} rows".format(total))
    features = np.concatenate(data_rows, axis=0)
    label_names = np.concatenate(label_rows, axis=0)
    # remember these settings in the dataset
    parameters = (sample_rate, transform.input_size, transform.output_size, window_size, shift)
    return Dataset(features, label_names, categories, parameters)