def get_model_output_10_classes(filename):
    """Run the frozen graph stored at ``filename`` over the 10-class audio set.

    Loads the serialized ``GraphDef`` from ``filename``, feeds every ``*.wav``
    file under ``Constants.AUDIO_DATA_FOLDER/*wav*/`` whose numeric basename is
    >= 1000 through the graph, post-processes the collected activations
    (sequence-length fixing, PCA, flattening) and writes them plus their labels
    to ``Constants.DATA_FOLDER/audio10classes.csv``.

    Labels are derived as ``int(basename) - 1000`` — presumably mapping file
    numbers 1000..1009 onto classes 0..9 (TODO confirm against the dataset
    layout).

    Args:
        filename: path to a frozen TensorFlow ``GraphDef`` protobuf.
    """
    # Deserialize the frozen graph.
    with tf.gfile.GFile(filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        # name='' keeps the original node names (no 'import/' prefix).
        tf.import_graph_def(graph_def, name='')
    with tf.Session(graph=graph) as sess:
        # NOTE(review): 'Minimum_3' is assumed to be the desired feature/output
        # op of this particular frozen graph — verify if the model is re-exported.
        output_op = graph.get_operation_by_name('Minimum_3').outputs[0]
        xs = []
        ys = []
        filename_list = []
        # Use a distinct loop variable: the original code rebound the
        # `filename` parameter (the model path) inside this loop.
        wav_paths = glob.glob(
            os.path.join(Constants.AUDIO_DATA_FOLDER, '*wav*', '*.wav'))
        for idx, wav_path in enumerate(wav_paths):
            # Last two path components: <speaker-ish dir>/<numeric name>.wav
            name = wav_path.split('/')[-2:]
            # The glob pattern guarantees the '.wav' suffix, so slicing it off
            # is exact. (The original used str.strip('.wav'), which strips a
            # character *set* and only worked because basenames are numeric.)
            basename = name[-1][:-len('.wav')]
            if int(basename) < 1000:
                continue
            # Shift file numbers >= 1000 down to class ids starting at 0.
            y = str(int(basename) - 1000)
            name = '/'.join(name)
            name = name.replace('.wav', '')
            filename_list.append(name)
            if idx % 50 == 0:
                # Lightweight progress indicator.
                print(name)
            fs, audio = wav.read(wav_path)
            x = audioToInputVector(audio, fs, N_FEATURES, N_CONTEXT)
            out = sess.run(output_op, {'input_node:0': [x],
                                       'input_lengths:0': [len(x)]})
            xs.append(out)
            ys.append(y)
        # Normalize sequence lengths, reduce dimensionality, flatten per sample.
        xs = fix_seq_length(xs, length=20)
        xs = apply_pca(xs, n_components=25)
        xs = np.array([np.ravel(x) for x in xs])
        to_csv(xs, ys,
               os.path.join(Constants.DATA_FOLDER, 'audio10classes.csv'),
               filename_list=filename_list)
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    r"""Turn a WAV file into the model's MFCC input representation.

    Reads the audio at ``audio_filename`` and computes ``numcep`` MFCC
    features per 0.01s step (0.025s window), padding each step with
    ``numcontext`` frames of left and right context. Returns the resulting
    numpy array.
    """
    sample_rate, samples = wav.read(audio_filename)
    return audioToInputVector(samples, sample_rate, numcep, numcontext)
def infer_audio(input_file_path):
    """Run the frozen DeepSpeech-style graph on one WAV file and return logits.

    Computes MFCC features (26 cepstra, 9 context frames) for the audio at
    ``input_file_path`` and evaluates the graph's ``prefix/logits`` tensor on
    them.

    NOTE(review): ``model_path`` is a module-level global (not visible here) —
    confirm it is initialized before this is called, e.g. by
    ``initialize_globals()``.

    Args:
        input_file_path: path to a WAV file readable by ``wav.read``.

    Returns:
        The numpy array produced by evaluating ``prefix/logits:0``.
    """
    initialize_globals()
    n_input = 26    # number of MFCC features per frame
    n_context = 9   # context frames appended on each side
    graph = load_graph(model_path)
    # Resolve the graph's input and output tensors.
    inp = graph.get_tensor_by_name('prefix/input_node:0')
    inp_len = graph.get_tensor_by_name('prefix/input_lengths:0')
    logits = graph.get_tensor_by_name('prefix/logits:0')
    # (The original also fetched 'prefix/output_node:0' but never used it;
    # that dead lookup has been removed.)
    with tf.Session(graph=graph) as session:
        fs, audio = wav.read(input_file_path)
        mfcc = audioToInputVector(audio, fs, n_input, n_context)
        output = session.run(logits, feed_dict={
            inp: [mfcc],
            inp_len: [len(mfcc)],
        })
    return output