def process_single_file(row):
    # row = index, Series
    _, file = row
    features = audiofile_to_input_vector(file.wav_filename, N_FEATURES, N_CONTEXT)
    transcript = text_to_char_array(file.transcript, alphabet)
    return features, len(features), transcript, len(transcript)
def find_transcripts(self, wav_file_path, visual_feature_json_path=None):
    '''
    Args:
        wav_file_path: path to the wav file to transcribe.
        visual_feature_json_path: visual features for video-based speech
            recognition. Required when the exported model is of AVSR type.
    '''
    if self.use_visual_features:
        assert visual_feature_json_path is not None
        source = np.array([(get_audio_visual_feature_vector(
            wav_file_path, visual_feature_json_path,
            NUM_MFCC_COEFF + NUM_VISUAL, N_CONTEXT))])
    else:
        source = np.array([(audiofile_to_input_vector(
            wav_file_path, NUM_MFCC_COEFF, N_CONTEXT))])
    source_len = np.array([(len(source[-1]))])
    feed_dict = {self.input: source, self.input_len: source_len}
    # session.run() returns shape (1, 1, X), where X is the number of
    # characters in the transcript.
    decoded = self.session.run(self.output, feed_dict)[0][0]
    transcript = ndarray_to_text(decoded)
    if self.use_spell_check:
        transcript = correction(transcript)
    return transcript
def _populate_batch_queue(self, session, coord):
    '''
    Queue thread routine.
    '''
    file_count = len(self._data_set.files)
    index = -1
    while not coord.should_stop():
        index = self._data_set.next_index(index) % file_count
        wav_file, transcript = self._data_set.files[index]
        source = audiofile_to_input_vector(wav_file,
                                           self._model_feeder.numcep,
                                           self._model_feeder.numcontext)
        source_len = len(source)
        # TODO: change this so the targets come back in diphone form.
        target = text_to_char_array(transcript, self._alphabet)
        target_len = len(target)
        if source_len < target_len:
            raise ValueError(
                'Error: Audio file {} is too short for transcription.'.format(wav_file))
        try:
            session.run(self._enqueue_op,
                        feed_dict={
                            self._model_feeder.ph_x: source,
                            self._model_feeder.ph_x_length: source_len,
                            self._model_feeder.ph_y: target,
                            self._model_feeder.ph_y_length: target_len
                        })
        except tf.errors.CancelledError:
            return
def _populate_batch_queue(self, session, coord):
    '''
    Queue thread routine.
    '''
    file_count = len(self._data_set.files)
    index = -1
    while not coord.should_stop():
        index = self._data_set.next_index(index) % file_count
        wav_file, transcript = self._data_set.files[index]
        source = audiofile_to_input_vector(wav_file,
                                           self._model_feeder.numcep,
                                           self._model_feeder.numcontext)
        source_len = len(source)
        target = text_to_char_array(transcript)
        target_len = len(target)
        try:
            session.run(self._enqueue_op,
                        feed_dict={
                            self._model_feeder.ph_x: source,
                            self._model_feeder.ph_x_length: source_len,
                            self._model_feeder.ph_y: target,
                            self._model_feeder.ph_y_length: target_len
                        })
        except tf.errors.CancelledError:
            return
def _populate_batch_queue(self):
    with self._graph.as_default():
        while True:
            n_steps = 0
            sources = []
            targets = []
            for index, (txt_file, wav_file) in enumerate(self._files_circular_list):
                if index >= self._batch_size:
                    break
                next_source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
                if n_steps < next_source.shape[0]:
                    n_steps = next_source.shape[0]
                sources.append(next_source)
                with open(txt_file) as open_txt_file:
                    targets.append(open_txt_file.read())
            target = texts_to_sparse_tensor(targets)
            for index, next_source in enumerate(sources):
                npad = ((0, (n_steps - next_source.shape[0])), (0, 0))
                sources[index] = np.pad(next_source, pad_width=npad, mode='constant')
            source = np.array(sources)
            self._batch_queue.put((source, target))
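# A minimal, self-contained sketch (toy shapes assumed, not taken from the example
# above) of the np.pad call used there: each (time, features) matrix gets zero rows
# appended along axis 0 so every item in the batch spans n_steps time steps.
import numpy as np

next_source = np.ones((4, 3), dtype=np.float32)       # 4 time steps, 3 features
n_steps = 6                                            # longest item in the batch
npad = ((0, n_steps - next_source.shape[0]), (0, 0))   # pad only at the end of axis 0
padded = np.pad(next_source, pad_width=npad, mode='constant')
print(padded.shape)   # (6, 3); rows 4 and 5 are all zeros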
def do_single_file_inference(input_file_path):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = create_inference_graph(batch_size=1, n_steps=-1)

        # REVIEW josephz: Hack: print all layers here.
        for i, l in enumerate(layers):
            print("layer '{}': '{}'".format(i, l))

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables()
                   if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        # over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)
        session.run(outputs['initialize_state'])

        features = audiofile_to_input_vector(input_file_path, Config.n_input, Config.n_context)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        logits = session.run(outputs['outputs'], feed_dict={
            inputs['input']: [features],
            inputs['input_lengths']: [num_strides],
        })
        logits = np.squeeze(logits)

        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                        FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                        Config.alphabet)
        decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
                                          scorer=scorer)

        # Print highest probability result
        print(decoded[0][1])
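# The overlapping-window trick above relies on np.lib.stride_tricks.as_strided.
# A minimal sketch with toy sizes (n_input=3 and n_context=2 are assumptions, not
# the values used by the example): each output row is a zero-copy view of
# 2*n_context + 1 consecutive feature frames.
import numpy as np

n_input, n_context = 3, 2
features = np.arange(30, dtype=np.float32).reshape(10, n_input)  # 10 frames

num_strides = len(features) - 2 * n_context
window_size = 2 * n_context + 1  # past + present + future frames
windows = np.lib.stride_tricks.as_strided(
    features,
    (num_strides, window_size, n_input),
    (features.strides[0], features.strides[0], features.strides[1]),
    writeable=False)

print(windows.shape)      # (6, 5, 3)
print(windows[0, :, 0])   # frames 0..4 of the first feature column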
def process_single_file(row, numcep, numcontext, alphabet):
    # row = index, Series
    _, file = row
    features = audiofile_to_input_vector(file.wav_filename, numcep, numcontext)
    features_len = len(features) - 2 * numcontext
    transcript = text_to_char_array(file.transcript, alphabet)
    if features_len < len(transcript):
        raise ValueError('Error: Audio file {} is too short for transcription.'.format(file.wav_filename))
    return features, features_len, transcript, len(transcript)
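# Quick check of the length bookkeeping in the example above (values assumed):
# only frames with a full past/future context are usable, so with 100 feature
# frames and numcontext = 9 the usable length is 100 - 2*9 = 82, and the
# transcript must not be longer than that.
num_frames, numcontext = 100, 9
features_len = num_frames - 2 * numcontext
assert features_len == 82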
def main(_):
    if not FLAGS.server:
        print('please specify server host:port')
        return
    if not FLAGS.file:
        print('please specify an audio file')
        return
    audio_waves = audiofile_to_input_vector(FLAGS.file, FLAGS.n_input, FLAGS.n_context)
    audio = np.array([audio_waves])
    do_inference(FLAGS.server, audio)
def _compute_source_target(self):
    txt_file = self._txt_files[0]
    wav_file = path.splitext(txt_file)[0] + ".wav"
    audio_waves = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
    with open(txt_file) as open_txt_file:
        original = ' '.join(open_txt_file.read().strip().lower().split(' ')[2:]).replace('.', '')
    target = text_to_char_array(original)
    return audio_waves, len(audio_waves), target, len(target)
def _populate_batch_queue(self, session):
    for txt_file, wav_file in self._files_circular_list:
        source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
        source_len = len(source)
        with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
            target = unicodedata.normalize("NFKD", open_txt_file.read()).encode("ascii", "ignore")
        target = text_to_char_array(target)
        target_len = len(target)
        session.run(self._enqueue_op,
                    feed_dict={
                        self._x: source,
                        self._x_length: source_len,
                        self._y: target,
                        self._y_length: target_len})
def _populate_batch_queue(self, session):
    for wav_file, transcript in self._indices():
        source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
        source_len = len(source)
        target = text_to_char_array(transcript)
        target_len = len(target)
        try:
            session.run(self._enqueue_op,
                        feed_dict={
                            self._x: source,
                            self._x_length: source_len,
                            self._y: target,
                            self._y_length: target_len})
        except tf.errors.CancelledError:
            return
def pipeline(data):
    data = data.head(2)  # overfit on just 2 files
    # print train['transcript']
    inputs_encoder = []
    inputs_decoder = []
    outputs_decoder = []
    decoder_length = []
    sequence_length = []
    for ind, row in data.iterrows():
        inputs_encoder.append(audiofile_to_input_vector(row['wav_filename'], 26, 0))
        inputs_decoder.append(np.append([1], text_to_char_array(row['transcript'])))
        outputs_decoder.append(np.append(text_to_char_array(row['transcript']), [1]))
        sequence_length.append(audiofile_to_input_vector(row['wav_filename'], 26, 0).shape[0])
        decoder_length.append(len(row['transcript']) + 1)
    xt_decoder_input, xlen_decoder_input = helpers2.batch(inputs_decoder)
    xt_encoder, xlen_encoder = helpers.batch(inputs_encoder)
    xt_decoder_output, xlen_decoder_output = helpers2.batch(outputs_decoder)
    sequence_length = np.asarray(sequence_length, dtype=np.int32)
    decoder_length = np.asarray(decoder_length, dtype=np.int32)
    # print inputs_encoder[1].shape
    # print inputs_decoder[1].shape
    # print xt_encoder.shape
    # print xt_decoder_input.dtype
    # print xt_decoder_output.shape
    # fd = {encoder_inputs_embedded: xt_encoder, seq_len_tensor: sequence_length,
    #       decoder_lengths: decoder_length, decoder_inputs: xt_decoder_input,
    #       decoder_targets: xt_decoder_output}
    return ({"A": xt_encoder,
             "B": xt_decoder_input,
             "C": sequence_length,
             "D": decoder_length},
            xt_decoder_output)
def do_single_file_inference(input_file_path):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(batch_size=1, n_steps=-1)

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables()
                   if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        # TODO: This restores the most recent checkpoint, but if we use validation to counteract
        # over-fitting, we may want to restore an earlier checkpoint.
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)
        session.run(outputs['initialize_state'])

        features = audiofile_to_input_vector(input_file_path, Config.n_input, Config.n_context)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        logits = session.run(outputs['outputs'], feed_dict={
            inputs['input']: [features],
            inputs['input_lengths']: [num_strides],
        })
        logits = np.squeeze(logits)

        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                        FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                        Config.alphabet)
        decoded = ctc_beam_search_decoder(logits, Config.alphabet, FLAGS.beam_width,
                                          scorer=scorer)

        # Print highest probability result
        print(decoded[0][1])
def _get_files_mfcc(wav_filenames):
    # print('Processing MFCC...')
    mfccs = []
    lens = []
    for audio_fname in wav_filenames:
        this_mfcc = audiofile_to_input_vector(audio_fname, n_input, n_context)
        if len(this_mfcc) != feature_len:
            needlen = feature_len - len(this_mfcc)
            a = [[0 for x in range(feature_dim)] for y in range(needlen)]
            this_mfcc = np.concatenate((this_mfcc, np.array(a)))
            # print(this_mfcc.shape)
        this_mfcc = np.reshape(this_mfcc, (feature_len, n_input, 1))
        mfccs.append(this_mfcc)
        lens.append(len(this_mfcc))
    a_mfccs = np.array(mfccs)  # shape: (batch, time_step_len, feature_len)
    a_lens = np.array(lens)    # shape: (batch, 1), value == time_step_len
    # print('MFCCs shape', a_mfccs.shape, a_lens.shape)
    return a_mfccs, a_lens
def do_inference(hostport, audio_file, server):
    audio_waves = audiofile_to_input_vector(audio_file, FLAGS.n_input, FLAGS.n_context)
    audio = np.array([audio_waves])
    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'deepspeech'
    request.inputs['input'].CopyFrom(tf.contrib.util.make_tensor_proto(audio))
    event = threading.Event()
    result_future = stub.Predict.future(request, 5.0)  # 5 second timeout
    result_future.add_done_callback(_create_rpc_callback(event, server))
    if not event.is_set():
        event.wait()
def make_checkpoint(model_path, audio_path, save_path):
    graph_def = GraphDef()
    loaded = graph_def.ParseFromString(open(model_path, 'rb').read())

    with tf.Graph().as_default() as graph:
        new_input = tf.placeholder(tf.float32, [None, None, None], name='new_input')

        # Load the saved .pb into the current graph to let us grab
        # access to the weights.
        logits, = tf.import_graph_def(
            graph_def,
            input_map={'input_node:0': new_input},
            return_elements=['logits:0'],
            name='newname',
            op_dict=None,
            producer_op_list=None
        )

        # Now let's dump these weights into a new copy of the network.
        with tf.Session(graph=graph) as sess:
            # Sample sentence, to make sure we've done it right
            mfcc = audiofile_to_input_vector(audio_path, 26, 9)

            # Okay, so this is ugly again.
            # We just want it to not crash.
            tf.app.flags.FLAGS.alphabet_config_path = \
                os.path.join(os.path.dirname(__file__), 'DeepSpeech/data/alphabet.txt')
            DeepSpeech.initialize_globals()
            logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0] * 10)

            # Here's where all the work happens. Copy the variables
            # over from the .pb to the session object.
            for var in tf.global_variables():
                sess.run(var.assign(sess.run('newname/' + var.name)))

            # Test to make sure we did it right.
            res = (sess.run(logits, {new_input: [mfcc],
                                     'newname/input_lengths:0': [len(mfcc)]}).flatten())
            res2 = (sess.run(logits2, {new_input: [mfcc]})).flatten()

            print('This value should be small', np.sum(np.abs(res - res2)))

            # And finally save the constructed session.
            saver = tf.train.Saver()
            saver.save(sess, save_path)
def _populate_batch_queue(self, session):
    for txt_file, wav_file in self._indices():
        source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
        source_len = len(source)
        with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
            # We need to do the encode-decode dance here because encode
            # returns a bytes() object on Python 3, and text_to_char_array
            # expects a string.
            target = unicodedata.normalize("NFKD", open_txt_file.read()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        target = text_to_char_array(target)
        target_len = len(target)
        try:
            session.run(self._enqueue_op,
                        feed_dict={
                            self._x: source,
                            self._x_length: source_len,
                            self._y: target,
                            self._y_length: target_len})
        except tf.errors.CancelledError:
            return
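# Minimal sketch of the "encode-decode dance" described in the comment above
# (example string assumed): NFKD normalization decomposes accented characters,
# and the ascii encode/decode drops anything non-ASCII, yielding a plain str
# on both Python 2 and 3.
import unicodedata

text = u"café déjà vu"
clean = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii", "ignore")
print(clean)   # cafe deja vu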
def _populate_batch_queue(self, session):
    for txt_file, wav_file in self._files_circular_list:
        if self._coord.should_stop():
            return
        source = audiofile_to_input_vector(wav_file, self._numcep, self._numcontext)
        source_len = len(source)
        with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
            target = unicodedata.normalize("NFKD", open_txt_file.read())
        target = text_to_char_array(target)
        target_len = len(target)
        try:
            session.run(self._enqueue_op,
                        feed_dict={
                            self._x: source,
                            self._x_length: source_len,
                            self._y: target,
                            self._y_length: target_len
                        })
        except tf.errors.CancelledError:
            return
def _populate_batch_queue(self, session, coord):
    '''
    Queue thread routine.
    '''
    file_count = len(self._data_set.files)
    index = -1
    while not coord.should_stop():
        index = self._data_set.next_index(index) % file_count
        wav_file, transcript = self._data_set.files[index]
        source = audiofile_to_input_vector(wav_file,
                                           self._model_feeder.numcep,
                                           self._model_feeder.numcontext)
        source_len = len(source)
        target = text_to_char_array(transcript, self._alphabet)
        target_len = len(target)
        if source_len < target_len:
            raise ValueError('Error: Audio file {} is too short for transcription.'.format(wav_file))
        try:
            session.run(self._enqueue_op,
                        feed_dict={
                            self._model_feeder.ph_x: source,
                            self._model_feeder.ph_x_length: source_len,
                            self._model_feeder.ph_y: target,
                            self._model_feeder.ph_y_length: target_len
                        })
        except tf.errors.CancelledError:
            return
with tf.Graph().as_default() as graph:
    new_input = tf.placeholder(tf.float32, [None, None, None], name="new_input")

    # Load the saved .pb into the current graph to let us grab
    # access to the weights.
    logits, = tf.import_graph_def(graph_def,
                                  input_map={"input_node:0": new_input},
                                  return_elements=['logits:0'],
                                  name="newname",
                                  op_dict=None,
                                  producer_op_list=None)

    # Now let's dump these weights into a new copy of the network.
    with tf.Session(graph=graph) as sess:
        # Sample sentence, to make sure we've done it right
        mfcc = audiofile_to_input_vector(wav_file, 26, 9)

        # Okay, so this is ugly again.
        # We just want it to not crash.
        tf.app.flags.FLAGS.alphabet_config_path = alphabet_file
        DeepSpeech.initialize_globals()
        logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0] * 10)

        # Here's where all the work happens. Copy the variables
        # over from the .pb to the session object.
        for var in tf.global_variables():
            sess.run(var.assign(sess.run('newname/' + var.name)))

        # Test to make sure we did it right.
        res = (sess.run(logits, {
            new_input: [mfcc],
# Open tf.Session.
with tf.Session(graph=graph) as sess:
    # Extract graph node names.
    tf.import_graph_def(graph_def, name='')
    graph_nodes = [n for n in graph_def.node]
    names = []
    for i, t in enumerate(graph_nodes):
        names.append(t.name)
        print("graph_node: '{:03d}' -- '{}'".format(i, t.name))

    # Prepare audio input data.
    input_file_path = '/home/josephz/GoogleDrive/University/UW/2018-19/CSE481I/singing-style-transfer' \
                      '/src/data/aligned/one_last_time/one_last_time_original_30s.wav'
    features = audiofile_to_input_vector(input_file_path, Config.n_input, Config.n_context)
    num_strides = len(features) - (Config.n_context * 2)

    # Create a view into the array with overlapping strides of size
    # numcontext (past) + 1 (present) + numcontext (future)
    window_size = 2 * Config.n_context + 1
    features = np.lib.stride_tricks.as_strided(
        features,
        (num_strides, window_size, Config.n_input),
        (features.strides[0], features.strides[0], features.strides[1]),
        writeable=False)

    # Prepare graph nodes for inference.
    # Prepare input nodes.
    # initialize_state = graph.get_tensor_by_name('initialize_state:0')
    input_node = graph.get_tensor_by_name('input_node:0')
                             name="new_input")

# Load the saved .pb into the current graph to let us grab
# access to the weights.
logits, = tf.import_graph_def(
    graph_def,
    input_map={"input_node:0": new_input},
    return_elements=['logits:0'],
    name="newname",
    op_dict=None,
    producer_op_list=None
)

# Now let's dump these weights into a new copy of the network.
with tf.Session(graph=graph) as sess:
    # Sample sentence, to make sure we've done it right
    mfcc = audiofile_to_input_vector("sample.wav", 26, 9)

    # Okay, so this is ugly again.
    # We just want it to not crash.
    tf.app.flags.FLAGS.alphabet_config_path = "DeepSpeech/data/alphabet.txt"
    # Make it stop complaining
    tf.app.flags.FLAGS.decoder_library_path = "."
    DeepSpeech.initialize_globals()
    logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0] * 10)

    # Here's where all the work happens. Copy the variables
    # over from the .pb to the session object.
    for var in tf.global_variables():
        sess.run(var.assign(sess.run('newname/' + var.name)))

    # Test to make sure we did it right.
def _maybe_convert_set(source_dir, target_dir, mode, datasets):
    rows = []
    remove_alphabets = set('۱١٢۳٣٤٥٦٧۷۸٨٩۹٠۰0123456789٪éàçèáâïóöúﺠپچﭽ')
    for dataset in datasets:
        for subdir, dirs, files in os.walk(source_dir + '/' + dataset + '/' + mode):
            # for audio_filename in sorted(glob.iglob(corpus_dir + "/" + '/**/*.' + ext, recursive=True)):
            for file in files:
                if file.endswith('.txt'):
                    filepath = path.abspath(subdir + '/' + file).split('.')[:-1][0]
                    if path.exists(filepath + '.txt') and path.exists(filepath + '.wav'):
                        with open(filepath + '.txt', 'r') as readfile:
                            for transcript in readfile.readlines():
                                features_len = audiofile_to_input_vector(filepath + '.wav', numcep=26, numcontext=9,
                                                                         compute_len=True, model='deepspeech_2')
                                if (features_len > 100 and len(transcript) > 2):
                                    if ('english' in language):
                                        if ('tedlium' in dataset):
                                            if (len(transcript) >= 7 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                            # if (features_len <= (len(transcript) * transcript_features_ratio)):
                                            #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                        elif ('tidigits' in dataset):
                                            if (len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                            # if (features_len <= len(transcript)):
                                            #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                        elif ('voxforge' in dataset):
                                            if (len(transcript) >= 2 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                            # if (features_len <= len(transcript)):
                                            #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                        elif ('vctk' in dataset):
                                            if (len(transcript) >= 2 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                            # if (features_len <= (len(transcript) * transcript_features_ratio)):
                                            #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                        elif ('common_voice' in dataset):
                                            if (len(transcript) >= 8 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                            # if (features_len <= (len(transcript) * transcript_features_ratio)):
                                            #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                        elif (len(transcript) >= 5 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                            rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                            # if (features_len <= len(transcript)):
                                            #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                    else:
                                        transcript = transcript.replace('آ', 'آ')
                                        transcript = transcript.replace('ﻻ', 'لا')
                                        transcript = transcript.replace('ﻵ', 'لآ')
                                        transcript = transcript.replace('ﻷ', 'لأ')
                                        transcript = transcript.replace('ﻹ', 'لإ')
                                        transcript = transcript.replace('ﺇ', 'إ')
                                        transcript = transcript.replace('ک', 'ك')
                                        transcript = transcript.replace('ی', 'ى')
                                        transcript = transcript.replace('', ' ')
                                        transcript = transcript.replace('', ' ')
                                        # remove diacritics
                                        transcript = transcript.replace('ً', '')
                                        transcript = transcript.replace('ٍ', '')
                                        transcript = transcript.replace('ٌ', '')
                                        transcript = transcript.replace('ْ', '')
                                        # normalization
                                        transcript = transcript.replace('َ', '')
                                        transcript = transcript.replace('ِ', '')
                                        transcript = transcript.replace('ُ', '')
                                        transcript = transcript.replace('ّ', '')
                                        transcript = transcript.replace('ؤ', 'ؤ')
                                        transcript = transcript.replace('ئ', 'ىٔ')
                                        transcript = transcript.replace('أ', 'أ')
                                        if not any((c in remove_alphabets) for c in transcript):
                                            if ('ksu' in dataset):
                                                if (len(transcript) >= 2 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                    rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                                # if (features_len <= (len(transcript) * transcript_features_ratio)):
                                                #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))
                                            else:
                                                if (len(transcript) >= 2 and len(transcript) <= transcript_len and features_len > (len(transcript) * transcript_features_ratio)):
                                                    rows.append((filepath + '.wav', path.getsize(filepath + '.wav'), transcript))
                                                # if (features_len <= (len(transcript) * transcript_features_ratio)):
                                                #     print('Error: Audio file {} is too short for transcription.'.format(filepath + '.wav') + " -- " + str(features_len) + " < " + str(len(transcript)))

    # # if path.exists(target_dir + '/dataset.csv'):
    # samples = []
    # with open(source_csv) as source_csv_file:
    #     reader = csv.DictReader(source_csv_file)
    #     for row in reader:
    #         samples.append((row['filename'], row['text']))
    #
    # # Mutable counters for the concurrent embedded routine
    # counter = {'all': 0, 'too_short': 0, 'too_long': 0}
    # lock = RLock()
    # num_samples = len(samples)
    # rows = []
    #
    # def one_sample(sample):
    #     mp3_filename = path.join(*(sample[0].split('/')))
    #     mp3_filename = path.join(extracted_dir, mp3_filename)
    #     # Storing wav files next to the mp3 ones - just with a different suffix
    #     wav_filename = path.splitext(mp3_filename)[0] + ".wav"
    #     _maybe_convert_wav(mp3_filename, wav_filename)
    #     frames = int(subprocess.check_output(['soxi', '-s', wav_filename], stderr=subprocess.STDOUT))
    #     file_size = path.getsize(wav_filename)
    #     with lock:
    #         if int(frames/SAMPLE_RATE*1000/10/2) < len(str(sample[1])):
    #             # Excluding samples that are too short to fit the transcript
    #             counter['too_short'] += 1
    #         elif frames/SAMPLE_RATE > MAX_SECS:
    #             # Excluding very long samples to keep a reasonable batch-size
    #             counter['too_long'] += 1
    #         else:
    #             # This one is good - keep it for the target CSV
    #             rows.append((wav_filename, file_size, sample[1]))
    #         counter['all'] += 1
    #
    # print('Importing mp3 files...')
    # pool = Pool(cpu_count())
    # bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    # for i, _ in enumerate(pool.imap_unordered(one_sample, samples), start=1):
    #     bar.update(i)
    # bar.update(num_samples)
    # pool.close()
    # pool.join()
    #
    # print('Writing "%s"...' % target_csv)
    # if ('english' in language):
    #     dict_ = {}

    rows.sort(key=lambda item: int(item[1]))
    with open(target_dir + '/' + mode + '.csv', 'w') as target_csv_file:
        writer = csv.DictWriter(target_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        # bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript in rows:
            include = True
            transcript = transcript.lower()
            transcript = apos_re.sub('', transcript)  # remove quotes
            transcript = transcript.replace('-', ' ')
            # transcript = ''.join(['-'.join(c for c in s if c not in punctuationList) for s in transcript])
            transcript = transcript.replace(' ', ' ').replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').replace('_', ' ').lower().strip()
            for c in transcript:
                if (c in punctuationList):
                    include = False
                    break
            if (include):
                writer.writerow({'wav_filename': filename, 'wav_filesize': file_size, 'transcript': transcript})
def do_single_file_inference(checkpoint_dir, input_file_path, layer_wanted, softmax_wanted,
                             save_filename, save_folder, stride_size_s, win_size_s,
                             fea_format, csv_format):
    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, _ = create_inference_graph(
            batch_size=1, n_steps=-1,
            layer_wanted=layer_wanted,
            softmax_applied=softmax_wanted)

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables()
                   if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)
        session.run(outputs['initialize_state'])

        # Transformation of the audio file
        features = audiofile_to_input_vector(input_file_path, Config.n_input, Config.n_context)
        # print(features.shape)
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * Config.n_context + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        # This is not the logits but the output of the requested layer
        logits = session.run(outputs['outputs'], feed_dict={
            inputs['input']: [features],
            inputs['input_lengths']: [num_strides],
        })
        logits = np.squeeze(logits)

        if fea_format:
            write_fea_file(logits, save_folder, save_filename,
                           stride_size_s=stride_size_s, win_len_s=win_size_s)
        if csv_format:
            np.savetxt(save_folder + '/' + save_filename + '.csv', logits, delimiter=',')
def main(_):
    start = stopwatch()
    initialize_globals()

    if len(FLAGS.one_shot_infer):
        # Load the frozen graph as in train(...) or as in
        # https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc
        with tf.gfile.FastGFile("../../models/output_graph.pb", 'rb') as fin:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(fin.read())
        with tf.Graph().as_default() as pretrained_model:
            tf.import_graph_def(graph_def, name="pretrained_")

        """
        for op in pretrained_model.get_operations():
            print(op.name)
        """
        # print("------------***-------------")

        # https://stackoverflow.com/questions/36883949/in-tensorflow-get-the-names-of-all-the-tensors-in-a-graph?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
        lstTensors = [op.values() for op in pretrained_model.get_operations()]
        input_node = lstTensors[0]
        input_lengths = lstTensors[1]
        output_node = lstTensors[-1]

        """
        print("input node name: ")
        print(input_node[0].name)
        print("input node shape: ")
        # V IMP: shape of the input node is [x, y, z] where x = batch_size.
        # For one-shot infer, batch_size = 1.
        print(input_node[0].shape)
        print("input lengths name: ")
        print(input_lengths[0].name)
        print("input lengths shape: ")
        # V IMP: shape of the input_lengths node is [x, y] where x = batch_size.
        # For one-shot infer, batch_size = 1.
        print(input_lengths[0].shape)
        print("output node name: ")
        print(output_node[0].name)
        print("output node shape: ")
        print(output_node[0].shape)
        """

        # do_single_file_inference(FLAGS.one_shot_infer)
        # print("n_input = " + repr(n_input))
        # print("n_context = " + repr(n_context))
        mfcc = audiofile_to_input_vector(FLAGS.one_shot_infer, n_input, n_context)
        # print(mfcc.shape)

        # output_node = pretrained_model.get_tensor_by_name(pretrained_model.get_operations()[-1].name)
        batch_size = 1
        with tf.Session(graph=pretrained_model) as sess:
            output = sess.run(
                output_node,
                feed_dict={
                    input_node: [mfcc.reshape((batch_size, mfcc.shape[0], mfcc.shape[1]))],
                    input_lengths: [np.array(len(mfcc)).reshape((batch_size,))]
                })
        # print(output)
        text = ndarray_to_text(output[0][0][0], alphabet)
        print("\n\nResult:")
        print(text)
    else:
        print("Correct usage: python3 _this.py --one_shot_infer <<path-of-input-wav-file>>")

    delta = stopwatch(start)
    print("Net execution time including loading of the graph = " + format_duration(delta))
    result = np.asarray([SPACE_INDEX if xt == SPACE_TOKEN else ord(xt) - FIRST_INDEX
                         for xt in result])
    return result


train = pd.read_csv('./real_batch/clean-test_dev-combined.csv')
train = train.head(3)  # overfit on just a few files
print(train.shape)

inputs_encoder = []
inputs_decoder = []
outputs_decoder = []

for ind, row in train.iterrows():
    inputs_encoder.append(audiofile_to_input_vector(row['wav_filename'], 26, 0))

for ind, row in train.iterrows():
    inputs_decoder.append(np.append([0], text_to_char_array(row['transcript'])))

for ind, row in train.iterrows():
    outputs_decoder.append(np.append(text_to_char_array(row['transcript']), [0]))

xt_decoder_input, xlen_decoder_input = helpers2.batch(inputs_decoder)
xt_encoder, xlen_encoder = helpers.batch(inputs_encoder)
xt_decoder_output, xlen_decoder_output = helpers2.batch(outputs_decoder)
new_input = tf.placeholder(tf.float32, [None, None, None], name="new_input")

# Load the saved .pb into the current graph to let us grab
# access to the weights.
logits, = tf.import_graph_def(graph_def,
                              input_map={"input_node:0": new_input},
                              return_elements=['logits:0'],
                              name="newname",
                              op_dict=None,
                              producer_op_list=None)

# Now let's dump these weights into a new copy of the network.
with tf.Session(graph=graph) as sess:
    # Sample sentence, to make sure we've done it right
    # TODO: I've substituted this file
    mfcc = audiofile_to_input_vector("LDC93S1.wav", 26, 9)

    # Okay, so this is ugly again.
    # We just want it to not crash.
    tf.app.flags.FLAGS.alphabet_config_path = "DeepSpeech/data/alphabet.txt"
    DeepSpeech.initialize_globals()
    logits2 = DeepSpeech.BiRNN(new_input, [len(mfcc)], [0] * 10)

    # Here's where all the work happens. Copy the variables
    # over from the .pb to the session object.
    for var in tf.global_variables():
        sess.run(var.assign(sess.run('newname/' + var.name)))

    # Test to make sure we did it right.
    res = (sess.run(logits, {
        new_input: [mfcc],