def initialize_globals():
    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    global n_input
    n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    global n_context
    n_context = 9  # TODO: Determine the optimal value using a validation data set

    if len(FLAGS.one_shot_infer) > 0:
        FLAGS.train = False
        FLAGS.test = False
        FLAGS.export_dir = ''
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error('Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    if not os.path.exists(os.path.abspath(FLAGS.decoder_library_path)):
        print('ERROR: The decoder library file does not exist. Make sure you have '
              'downloaded or built the native client binaries and pass the '
              'appropriate path to the binaries in the --decoder_library_path parameter.')

    global custom_op_module
    custom_op_module = tf.load_op_library(FLAGS.decoder_library_path)
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=Config.n_input,
        numcontext=Config.n_context,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    from DeepSpeech import create_inference_graph
    graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

    samples = evaluate(test_data, graph, alphabet)

    if FLAGS.test_output_file:
        # Save decoded tuples as JSON, converting NumPy floats to Python floats
        json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
def get_alphabet(language):
    if language in ALPHABETS:
        return ALPHABETS[language]
    alphabet_path = getattr(CLI_ARGS, language + '_alphabet')
    alphabet = Alphabet(alphabet_path) if alphabet_path else None
    ALPHABETS[language] = alphabet
    return alphabet
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    print("{} like a character based model.".format(
        "Looks" if vocab_looks_char_based else "Doesn't look"))

    if force_utf8 != None:  # pylint: disable=singleton-comparison
        use_utf8 = force_utf8.value
        print("Forcing UTF-8 mode = {}".format(use_utf8))
    else:
        use_utf8 = vocab_looks_char_based

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        print("Error loading alphabet: {}".format(err))
        sys.exit(1)

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    scorer.load_lm(lm_path)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
def initialize_globals():
    c = AttrDict()

    # ps and worker hosts required for p2p cluster setup
    FLAGS.ps_hosts = list(filter(len, FLAGS.ps_hosts.split(',')))
    FLAGS.worker_hosts = list(filter(len, FLAGS.worker_hosts.split(',')))

    # Create a cluster from the parameter server and worker hosts.
    c.cluster = tf.train.ClusterSpec({
        'ps': FLAGS.ps_hosts,
        'worker': FLAGS.worker_hosts
    })

    # The absolute number of computing nodes - regardless of cluster or single mode
    num_workers = max(1, len(FLAGS.worker_hosts))

    # If replica numbers are negative, we multiply their absolute values with the number of workers
    if FLAGS.replicas < 0:
        FLAGS.replicas = num_workers * -FLAGS.replicas
    if FLAGS.replicas_to_agg < 0:
        FLAGS.replicas_to_agg = num_workers * -FLAGS.replicas_to_agg

    # The device path base for this node
    c.worker_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task_index)

    # This node's CPU device
    c.cpu_device = c.worker_device + '/cpu:0'

    # This node's available GPU devices
    c.available_devices = [c.worker_device + gpu for gpu in get_available_gpus()]

    # If there is no GPU available, we fall back to CPU based operation
    if 0 == len(c.available_devices):
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if len(FLAGS.checkpoint_dir) == 0:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.benchmark_steps > 0:
        FLAGS.checkpoint_dir = None

    # Set default summary dir
    if len(FLAGS.summary_dir) == 0:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_placement,
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden
    c.n_hidden_1 = c.n_hidden
    c.n_hidden_2 = c.n_hidden
    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Queues that are used to gracefully stop parameter servers.
    # Each queue stands for one ps. A finishing worker sends a token to each queue before joining/quitting.
    # Each ps will dequeue as many tokens as there are workers before joining/quitting.
    # This ensures parameter servers won't quit, if still required by at least one worker and
    # also won't wait forever (like with a standard `server.join()`).
    done_queues = []
    for i, ps in enumerate(FLAGS.ps_hosts):
        # Queues are hosted by their respective owners
        with tf.device('/job:ps/task:%d' % i):
            done_queues.append(
                tf.FIFOQueue(1, tf.int32, shared_name=('queue%i' % i)))

    # Placeholder to pass in the worker's index as token
    c.token_placeholder = tf.placeholder(tf.int32)

    # Enqueue operations for each parameter server
    c.done_enqueues = [queue.enqueue(c.token_placeholder) for queue in done_queues]

    # Dequeue operations for each parameter server
    c.done_dequeues = [queue.dequeue() for queue in done_queues]

    if len(FLAGS.one_shot_infer) > 0:
        FLAGS.train = False
        FLAGS.test = False
        FLAGS.export_dir = ''
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error('Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    # Determine, if we are the chief worker
    c.is_chief = len(FLAGS.worker_hosts) == 0 or (FLAGS.task_index == 0 and FLAGS.job_name == 'worker')

    ConfigSingleton._config = c
# Number of MFCC features
global n_input
n_input = 40  # MFCC, may need to add deltas

# The number of frames in the context
global n_context
n_context = 0

global feature_len
feature_len = 100  # all inputs are 1 s wav files: 1000 ms / 10 ms = 100 frames

global feature_dim
feature_dim = n_input * (n_context * 2 + 1)

global alphabet
alphabet = Alphabet('./alphabet.txt')
print('alphabet.size() ', alphabet.size())
print(alphabet._label_to_str)

# The number of characters in the target language plus one
global n_character
n_character = alphabet.size() + 1  # +1 for CTC blank label

global max_labellen
max_labellen = 6

global n_hidden
n_hidden = 128

trfile = 'data/speechcmd_train.csv'
cvfile = 'data/speechcmd_dev.csv'
testfile = 'data/speechcmd_test.csv'
    if counter['too_short'] > 0:
        print('Skipped %d samples that were too short to match the transcript.' %
              counter['too_short'])
    if counter['too_long'] > 0:
        print('Skipped %d samples that were longer than %d seconds.' %
              (counter['too_long'], MAX_SECS))
    print('Final amount of imported audio: %s.' %
          secs_to_hours(counter['total_time'] / SAMPLE_RATE))


def handle_args():
    parser = argparse.ArgumentParser(description='Importer for African Accented French dataset. More information on http://www.openslr.org/57/.')
    parser.add_argument(dest='target_dir')
    parser.add_argument('--filter_alphabet',
                        help='Exclude samples with characters not in provided alphabet')
    parser.add_argument('--normalize', action='store_true',
                        help='Converts diacritic characters to their base ones')
    return parser.parse_args()


if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None

    def label_filter(label):
        if CLI_ARGS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                label = None
        return label

    _download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)
def run_inference():
    """Load frozen graph, run inference and display most likely predicted characters"""
    parser = argparse.ArgumentParser(
        description='Run Deepspeech inference to obtain char probabilities')
    parser.add_argument('--input-file', type=str,
                        help='Path to the wav file',
                        action="store", dest="input_file_path")
    parser.add_argument('--alphabet-file', type=str,
                        help='Path to the alphabet.txt file',
                        action="store", dest="alphabet_file_path")
    parser.add_argument('--model-file', type=str,
                        help='Path to the tf model file',
                        action="store", dest="model_file_path")
    parser.add_argument('--predicted-character-count', type=int,
                        help='Number of most likely characters to be displayed',
                        action="store", dest="predicted_character_count", default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))

    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1

    # Load frozen graph from file and parse it
    with tf.io.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.compat.v1.GraphDef()
        graph_def.ParseFromString(f.read())
        # print(graph_def.node)

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="prefix")

        # currently hardcoded values used during inference
        # NOTE: assumed values (not present in the snippet); they must match the
        # geometry of the exported model - adjust if your model differs
        n_steps = 16
        n_cell_dim = 2048

        with tf.compat.v1.Session(graph=graph) as session:
            features, features_len = audiofile_to_features(args.input_file_path)

            previous_state_c = np.zeros([1, n_cell_dim])
            previous_state_h = np.zeros([1, n_cell_dim])

            # Add batch dimension
            features = tf.expand_dims(features, 0)
            features_len = tf.expand_dims(features_len, 0)

            # Evaluate
            features = create_overlapping_windows(features).eval(session=session)
            features_len = features_len.eval(session=session)

            # we are only interested in the logits, not CTC decoding
            inputs = {
                'input': graph.get_tensor_by_name('prefix/input_node:0'),
                'previous_state_c': graph.get_tensor_by_name('prefix/previous_state_c:0'),
                'previous_state_h': graph.get_tensor_by_name('prefix/previous_state_h:0'),
                'input_lengths': graph.get_tensor_by_name('prefix/input_lengths:0')
            }
            outputs = {
                'outputs': graph.get_tensor_by_name('prefix/raw_logits:0'),
                'new_state_c': graph.get_tensor_by_name('prefix/new_state_c:0'),
                'new_state_h': graph.get_tensor_by_name('prefix/new_state_h:0'),
            }

            logits = np.empty([0, 1, alphabet.size() + 1])

            # the frozen model only accepts input split into 16-step chunks;
            # if the inference was run from a checkpoint instead (as in single
            # inference in the deepspeech script), this loop wouldn't be needed
            for i in range(0, features_len[0], n_steps):
                chunk = features[:, i:i + n_steps, :, :]
                chunk_length = chunk.shape[1]
                # pad with zeros if not enough steps (len(features) % FLAGS.n_steps != 0)
                if chunk_length < n_steps:
                    chunk = np.pad(chunk,
                                   ((0, 0), (0, n_steps - chunk_length), (0, 0), (0, 0)),
                                   mode='constant',
                                   constant_values=0)

                # need to update the states with each loop iteration
                logits_step, previous_state_c, previous_state_h = session.run(
                    [outputs['outputs'], outputs['new_state_c'], outputs['new_state_h']],
                    feed_dict={
                        inputs['input']: chunk,
                        inputs['input_lengths']: [chunk_length],
                        inputs['previous_state_c']: previous_state_c,
                        inputs['previous_state_h']: previous_state_h,
                    })

                logits = np.concatenate((logits, logits_step))

            logits = np.squeeze(logits)

            row_output = []
            for j in range(args.predicted_character_count):
                row_output.append([])

            # now sort logits and turn them into characters + probabilities
            for i in range(0, len(logits)):
                softmax_output = softmax(logits[i])
                indexes_sorted = softmax_output.argsort(
                )[args.predicted_character_count * -1:][::-1]

                most_likely_chars = ''
                chars_probability = ''
                for j in range(args.predicted_character_count):
                    char_index = indexes_sorted[j]
                    if char_index < alphabet.size():
                        text = alphabet._string_from_label(char_index)
                        most_likely_chars += text + ' '
                        row_output[j].append(text)
                        chars_probability += ' (' + str(softmax_output[char_index]) + ')'
                    else:
                        most_likely_chars += '- '
                        row_output[j].append('-')
                        chars_probability += ' (' + str(softmax_output[char_index]) + ')'

                print(most_likely_chars, " ", chars_probability)

            with open(args.input_file_path + "_acoustic.txt", "w") as out:
                for j in range(len(row_output)):
                    out.write(', '.join(row_output[j]) + "\n")
                    print(row_output[j])
def initialize_globals():
    c = AttrDict()

    # CPU device
    c.cpu_device = '/cpu:0'

    # Available GPU devices
    c.available_devices = get_available_gpus()

    # If there is no GPU available, we fall back to CPU based operation
    if not c.available_devices:
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if not FLAGS.checkpoint_dir:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.load not in ['last', 'best', 'init', 'auto']:
        FLAGS.load = 'auto'

    # Set default summary dir
    if not FLAGS.summary_dir:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden
    c.n_hidden_1 = c.n_hidden
    c.n_hidden_2 = c.n_hidden
    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Size of audio window in samples
    c.audio_window_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_len / 1000)

    # Stride for feature computations in samples
    c.audio_step_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_step / 1000)

    if FLAGS.one_shot_infer:
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error('Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    ConfigSingleton._config = c  # pylint: disable=protected-access
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    alphabet)

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=N_FEATURES,
        numcontext=N_CONTEXT,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            # keep the fetched loss in a separate variable so the `loss` op
            # is not rebound and can be run again on the next batch
            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

        ground_truths = []
        predictions = []
        distances = []

        print('Decoding predictions...')
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # Get number of accessible CPU cores for this process
        num_processes = len(os.sched_getaffinity(0))

        # Second pass, decode logits and compute WER and edit distance metrics
        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values.astype(np.int32)

            decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                    num_processes=num_processes, scorer=scorer)

            batch_ground_truths = [alphabet.decode(l) for l in batch['transcript']]
            batch_predictions = [d[0][1] for d in decoded]

            ground_truths.extend(batch_ground_truths)
            predictions.extend(batch_predictions)
            # compare each decoded transcript against its own ground truth
            distances.extend(levenshtein(a, b) for a, b in zip(batch_ground_truths, batch_predictions))

        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
        mean_edit_distance = np.mean(distances)
        mean_loss = np.mean(losses)

        # Take only the first report_count items
        report_samples = itertools.islice(samples, FLAGS.report_count)

        print('Test - WER: %f, loss: %f, mean edit distance: %f' %
              (wer, mean_loss, mean_edit_distance))
        print('-' * 80)
        for sample in report_samples:
            print('WER: %f, loss: %f, edit distance: %f' %
                  (sample.wer, sample.loss, sample.distance))
            print(' - src: "%s"' % sample.src)
            print(' - res: "%s"' % sample.res)
            print('-' * 80)

        if FLAGS.test_output_file:
            json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
import os
import random

from util.audio import audiofile_to_input_vector
from util.text import text_to_char_array, Alphabet
import numpy as np

SAMPLE_RATE = 16000

training_percent = 0.9
validation_percent = 0.1
# test_percent = 0.05

numcontext = 9
numcep = 26

alphabet = Alphabet(os.path.abspath('/home/guest/Desktop/DeepSpeech/data/alphabet.txt'))

excluded_train_wavs = [
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon5/060799_a/adb_0467/speech/scr0467/05/04670504/r4670396/u0396196.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon5/220799/adb_0467/speech/scr0467/05/04670505/r4670441/u0441079.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon5/280799/adb_0467/speech/scr0467/05/04670505/r4670451/u0451201.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/160799/adb_0467/speech/scr0467/07/04670706/r4670598/u0598036.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/160799/adb_0467/speech/scr0467/07/04670706/r4670598/u0598037.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/160799/adb_0467/speech/scr0467/07/04670706/r4670598/u0598102.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672173.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672174.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672175.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672176.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672177.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672178.wav',
    '/home/guest/Desktop/Dataset16/TrainSet/sve.16khz.0467-2/0467_sv_train_2/Stasjon7/100899/adb_0467/speech/scr0467/07/04670707/r4670672/u0672179.wav',
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(FLAGS.test_files.split(','),
                           FLAGS.test_batch_size,
                           alphabet=alphabet,
                           numcep=N_FEATURES,
                           numcontext=N_CONTEXT,
                           hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
                               by="features_len", ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs = create_inference_graph(
            batch_size=FLAGS.test_batch_size, n_steps=N_STEPS)

        seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])
        decode_logits_ph = tf.placeholder(
            tf.float32, [None, FLAGS.test_batch_size, alphabet.size() + 1])
        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None])
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])

        decoded, _ = decode_with_lm(decode_logits_ph, seq_lengths_ph,
                                    merge_repeated=False,
                                    beam_width=FLAGS.beam_width)

        sparse_labels = tf.cast(
            ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size),
            tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=decode_logits_ph,
                              sequence_length=seq_lengths_ph)

        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), sparse_labels)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []

        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            batch_features = pad_to_dense(batch['features'].values)
            batch_features_len = batch['features_len'].values
            full_step_len = np.full_like(batch_features_len, N_STEPS)

            logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1])
            for i in range(0, batch_features.shape[1], N_STEPS):
                chunk_features = batch_features[:, i:i + N_STEPS, :, :]
                chunk_features_len = np.minimum(batch_features_len, full_step_len)

                # pad with zeros if the chunk does not have enough steps
                steps_in_chunk = chunk_features.shape[1]
                if steps_in_chunk < FLAGS.n_steps:
                    chunk_features = np.pad(
                        chunk_features,
                        ((0, 0), (0, FLAGS.n_steps - steps_in_chunk), (0, 0), (0, 0)),
                        mode='constant',
                        constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: chunk_features,
                    inputs['input_lengths']: chunk_features_len,
                })

                logits = np.concatenate((logits, output))

                # we have processed N_STEPS so subtract from remaining steps
                batch_features_len -= N_STEPS
                # clip to zero
                batch_features_len = np.maximum(
                    batch_features_len, np.zeros_like(batch_features_len))

            logitses.append(logits)

        ground_truths = []
        predictions = []
        distances = []
        losses = []

        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for logits, batch in bar(
                zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            decoded_, loss_, distance_, sparse_labels_ = session.run(
                [decoded, loss, distance, sparse_labels],
                feed_dict={
                    decode_logits_ph: logits,
                    seq_lengths_ph: seq_lengths,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            ground_truths.extend(
                sparse_tensor_value_to_texts(sparse_labels_, alphabet))
            predictions.extend(
                sparse_tensor_value_to_texts(decoded_[0], alphabet))
            distances.extend(distance_)
            losses.extend(loss_)

        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
        mean_edit_distance = np.mean(distances)
        mean_loss = np.mean(losses)

        # Filter out all items with WER=0 and take only the first report_count items
        report_samples = itertools.islice((s for s in samples if s.wer > 0),
                                          FLAGS.report_count)

        print('Test - WER: %f, loss: %f, mean edit distance: %f' %
              (wer, mean_loss, mean_edit_distance))
        print('-' * 80)
        for sample in report_samples:
            print('WER: %f, loss: %f, edit distance: %f' %
                  (sample.wer, sample.loss, sample.distance))
            print(' - src: "%s"' % sample.src)
            print(' - res: "%s"' % sample.res)
            print('-' * 80)

    if FLAGS.test_output_file:
        json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
def initialize_globals():
    c = AttrDict()

    # CPU device
    c.cpu_device = '/cpu:0'

    # Available GPU devices
    c.available_devices = get_available_gpus()

    # If there is no GPU available, we fall back to CPU based operation
    if 0 == len(c.available_devices):
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if len(FLAGS.checkpoint_dir) == 0:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.load not in ['last', 'best', 'init', 'auto']:
        FLAGS.load = 'auto'

    # Set default summary dir
    if len(FLAGS.summary_dir) == 0:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_placement,
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden
    c.n_hidden_1 = c.n_hidden
    c.n_hidden_2 = c.n_hidden
    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    if len(FLAGS.one_shot_infer) > 0:
        FLAGS.train = False
        FLAGS.test = False
        FLAGS.export_dir = ''
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error('Path specified in --one_shot_infer is not a valid file.')
            exit(1)

    ConfigSingleton._config = c
def initialize_globals():
    c = AttrDict()

    # The absolute number of computing nodes - regardless of cluster or single mode
    num_workers = 1

    # The device path base for this node
    c.worker_device = '/job:%s/task:%d' % ('localhost', 0)

    # This node's CPU device
    c.cpu_device = c.worker_device + '/cpu:0'

    # This node's available GPU devices
    c.available_devices = [c.worker_device + gpu for gpu in get_available_gpus()]

    # If there is no GPU available, we fall back to CPU based operation
    if 0 == len(c.available_devices):
        c.available_devices = [c.cpu_device]

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if len(FLAGS.checkpoint_dir) == 0:
        FLAGS.checkpoint_dir = xdg.save_data_path(os.path.join('deepspeech', 'checkpoints'))

    # Set default summary dir
    if len(FLAGS.summary_dir) == 0:
        FLAGS.summary_dir = xdg.save_data_path(os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=FLAGS.log_placement,
                                      inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
                                      intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads)

    c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden
    c.n_hidden_1 = c.n_hidden
    c.n_hidden_2 = c.n_hidden
    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Determine, if we are the chief worker
    c.is_chief = True

    ConfigSingleton._config = c
    write_csvs(extracted)
    cleanup(archive)


def handle_args():
    parser = argparse.ArgumentParser(
        description='Import German Distant Speech (TUDA)')
    parser.add_argument('base_dir', help='Directory containing all data')
    parser.add_argument('--max_duration', type=int, default=10000,
                        help='Maximum sample duration in milliseconds')
    parser.add_argument('--normalize', action='store_true',
                        help='Converts diacritic characters to their base ones')
    parser.add_argument('--alphabet',
                        help='Exclude samples with characters not in provided alphabet file')
    parser.add_argument('--keep_archive', type=bool, default=True,
                        help='If downloaded archives should be kept')
    return parser.parse_args()


if __name__ == "__main__":
    CLI_ARGS = handle_args()
    ALPHABET = Alphabet(CLI_ARGS.alphabet) if CLI_ARGS.alphabet else None
    download_and_prepare()
def initialize_globals():
    c = AttrDict()

    # Set default dropout rates
    if FLAGS.dropout_rate2 < 0:
        FLAGS.dropout_rate2 = FLAGS.dropout_rate
    if FLAGS.dropout_rate3 < 0:
        FLAGS.dropout_rate3 = FLAGS.dropout_rate
    if FLAGS.dropout_rate6 < 0:
        FLAGS.dropout_rate6 = FLAGS.dropout_rate

    # Set default checkpoint dir
    if not FLAGS.checkpoint_dir:
        FLAGS.checkpoint_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'checkpoints'))

    if FLAGS.load not in ['last', 'best', 'init', 'auto', 'transfer']:
        FLAGS.load = 'auto'

    # Set default summary dir
    if not FLAGS.summary_dir:
        FLAGS.summary_dir = xdg.save_data_path(
            os.path.join('deepspeech', 'summaries'))

    # Standard session configuration that'll be used for all new sessions.
    c.session_config = tfv1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_placement,
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        gpu_options=tfv1.GPUOptions(allow_growth=FLAGS.use_allow_growth))

    # CPU device
    c.cpu_device = '/cpu:0'

    # Available GPU devices
    c.available_devices = get_available_gpus(c.session_config)

    # If there is no GPU available, we fall back to CPU based operation
    if not c.available_devices:
        c.available_devices = [c.cpu_device]

    if FLAGS.utf8:
        c.alphabet = UTF8Alphabet()
    else:
        c.alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # Geometric Constants
    # ===================

    # For an explanation of the meaning of the geometric constants, please refer to
    # doc/Geometry.md

    # Number of MFCC features
    c.n_input = 26  # TODO: Determine this programmatically from the sample rate

    # The number of frames in the context
    c.n_context = 9  # TODO: Determine the optimal value using a validation data set

    # Number of units in hidden layers
    c.n_hidden = FLAGS.n_hidden
    c.n_hidden_1 = c.n_hidden
    c.n_hidden_2 = c.n_hidden
    c.n_hidden_5 = c.n_hidden

    # LSTM cell state dimension
    c.n_cell_dim = c.n_hidden

    # The number of units in the third layer, which feeds in to the LSTM
    c.n_hidden_3 = c.n_cell_dim

    # Units in the sixth layer = number of characters in the target language plus one
    c.n_hidden_6 = c.alphabet.size() + 1  # +1 for CTC blank label

    # Size of audio window in samples
    if (FLAGS.feature_win_len * FLAGS.audio_sample_rate) % 1000 != 0:
        log_error(
            '--feature_win_len value ({}) in milliseconds ({}) multiplied '
            'by --audio_sample_rate value ({}) must be an integer value. Adjust '
            'your --feature_win_len value or resample your audio accordingly.'
            ''.format(FLAGS.feature_win_len, FLAGS.feature_win_len / 1000,
                      FLAGS.audio_sample_rate))
        sys.exit(1)

    c.audio_window_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_len / 1000)

    # Stride for feature computations in samples
    if (FLAGS.feature_win_step * FLAGS.audio_sample_rate) % 1000 != 0:
        log_error(
            '--feature_win_step value ({}) in milliseconds ({}) multiplied '
            'by --audio_sample_rate value ({}) must be an integer value. Adjust '
            'your --feature_win_step value or resample your audio accordingly.'
            ''.format(FLAGS.feature_win_step, FLAGS.feature_win_step / 1000,
                      FLAGS.audio_sample_rate))
        sys.exit(1)

    c.audio_step_samples = FLAGS.audio_sample_rate * (FLAGS.feature_win_step / 1000)

    if FLAGS.one_shot_infer:
        if not os.path.exists(FLAGS.one_shot_infer):
            log_error('Path specified in --one_shot_infer is not a valid file.')
            sys.exit(1)

    ConfigSingleton._config = c  # pylint: disable=protected-access
    parser.add_argument(
        '--audio_dir',
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
    parser.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    parser.add_argument(
        '--normalize', action='store_true',
        help='Converts diacritic characters to their base ones')
    params = parser.parse_args()

    audio_dir = params.audio_dir if params.audio_dir else os.path.join(params.tsv_dir, 'clips')
    alphabet = Alphabet(params.filter_alphabet) if params.filter_alphabet else None

    def label_filter(label):
        if params.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if alphabet and label:
            try:
                [alphabet.label_from_string(c) for c in label]
            except KeyError:
                label = None
        return label

    _preprocess_data(params.tsv_dir, audio_dir, label_filter)
    PARSER.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    PARSER.add_argument(
        '--normalize', action='store_true',
        help='Converts diacritic characters to their base ones')
    PARSER.add_argument('--space_after_every_character', action='store_true',
                        help='To help transcript join by white space')

    PARAMS = PARSER.parse_args()

    AUDIO_DIR = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(PARAMS.tsv_dir, 'clips')
    ALPHABET = Alphabet(PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None

    def label_filter_fun(label):
        if PARAMS.normalize:
            label = unicodedata.normalize("NFKD", label.strip()) \
                .encode("ascii", "ignore") \
                .decode("ascii", "ignore")
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                label = None
        return label

    _preprocess_data(PARAMS.tsv_dir, AUDIO_DIR, label_filter_fun,
                     PARAMS.space_after_every_character)
def run_inference():
    """Load frozen graph, run inference and display most likely predicted characters"""
    parser = argparse.ArgumentParser(
        description='Run Deepspeech inference to obtain char probabilities')
    parser.add_argument('--input-file', type=str,
                        help='Path to the wav file',
                        action="store", dest="input_file_path")
    parser.add_argument('--alphabet-file', type=str,
                        help='Path to the alphabet.txt file',
                        action="store", dest="alphabet_file_path")
    parser.add_argument('--model-file', type=str,
                        help='Path to the tf model file',
                        action="store", dest="model_file_path")
    parser.add_argument('--predicted-character-count', type=int,
                        help='Number of most likely characters to be displayed',
                        action="store", dest="predicted_character_count", default=5)
    args = parser.parse_args()

    alphabet = Alphabet(os.path.abspath(args.alphabet_file_path))

    if args.predicted_character_count >= alphabet.size():
        args.predicted_character_count = alphabet.size() - 1

    # Load frozen graph from file and parse it
    with tf.gfile.GFile(args.model_file_path, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="prefix")

        # currently hardcoded values used during inference
        n_input = 26
        n_context = 9
        n_steps = 16

        with tf.Session(graph=graph) as session:
            session.run('prefix/initialize_state')

            features = util.audio.audiofile_to_input_vector(args.input_file_path, n_input, n_context)

            num_strides = len(features) - (n_context * 2)
            window_size = 2 * n_context + 1
            features = np.lib.stride_tricks.as_strided(
                features,
                (num_strides, window_size, n_input),
                (features.strides[0], features.strides[0], features.strides[1]),
                writeable=False)

            # we are only interested in the logits, not CTC decoding
            inputs = {'input': graph.get_tensor_by_name('prefix/input_node:0'),
                      'input_lengths': graph.get_tensor_by_name('prefix/input_lengths:0')}
            outputs = {'outputs': graph.get_tensor_by_name('prefix/logits:0')}

            logits = np.empty([0, 1, alphabet.size() + 1])

            for i in range(0, len(features), n_steps):
                chunk = features[i:i + n_steps]

                # pad with zeros if not enough steps (len(features) % FLAGS.n_steps != 0)
                if len(chunk) < n_steps:
                    chunk = np.pad(chunk,
                                   ((0, n_steps - len(chunk)), (0, 0), (0, 0)),
                                   mode='constant',
                                   constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: [chunk],
                    inputs['input_lengths']: [len(chunk)],
                })

                logits = np.concatenate((logits, output))

            for i in range(0, len(logits)):
                softmax_output = softmax(logits[i][0])
                indexes_sorted = softmax_output.argsort()[args.predicted_character_count * -1:][::-1]

                most_likely_chars = ''
                chars_probability = ''
                for j in range(args.predicted_character_count):
                    char_index = indexes_sorted[j]
                    if char_index < alphabet.size():
                        text = alphabet.string_from_label(char_index)
                        most_likely_chars += text + ' '
                        chars_probability += ' (' + str(softmax_output[char_index]) + ')'
                    else:
                        most_likely_chars += '- '
                        chars_probability += ' (' + str(softmax_output[char_index]) + ')'

                print(most_likely_chars, " ", chars_probability)