def transcribe_file(audio_path, tlog_path):
    """Transcribe one audio file and write the result as a JSON log.

    The audio is segmented by ``split_audio_file``, every batch of segments
    is run through the acoustic model and decoded with the CTC beam-search
    decoder, and the decoded segments are written to ``tlog_path`` ordered
    by their start value.

    Args:
        audio_path: Audio file to transcribe; handed to ``AudioFile``.
        tlog_path: Path of the JSON file to write. Each entry is a dict with
            the keys ``start``, ``end`` and ``transcript``.
    """
    from DeepSpeech import create_model, try_loading  # pylint: disable=cyclic-import,import-outside-toplevel
    initialize_globals()
    lm_scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.scorer_path, Config.alphabet)

    # Number of decoder worker processes; fall back to one when the platform
    # cannot report a CPU count.
    try:
        worker_count = cpu_count()
    except NotImplementedError:
        worker_count = 1

    with AudioFile(audio_path, as_path=True) as wav_path:
        segments = split_audio_file(
            wav_path,
            batch_size=FLAGS.batch_size,
            aggressiveness=FLAGS.vad_aggressiveness,
            outlier_duration_ms=FLAGS.outlier_duration_ms,
            outlier_batch_size=FLAGS.outlier_batch_size)
        iterator = tf.data.Iterator.from_structure(
            segments.output_types,
            segments.output_shapes,
            output_classes=segments.output_classes)
        batch_time_start, batch_time_end, batch_x, batch_x_len = iterator.get_next()

        # One (disabled) dropout rate per layer.
        logits, _ = create_model(batch_x=batch_x,
                                 seq_length=batch_x_len,
                                 dropout=[None] * 6)
        # Swap the first two axes and apply softmax before decoding.
        probabilities = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))
        tf.train.get_or_create_global_step()
        saver = tf.train.Saver()
        fetches = [batch_time_start, batch_time_end, probabilities, batch_x_len]

        with tf.Session(config=Config.session_config) as session:
            # Prefer the best-validation checkpoint, then the most recent one.
            restored = (
                try_loading(session, saver, 'best_dev_checkpoint', 'best validation', log_success=False)
                or try_loading(session, saver, 'checkpoint', 'most recent', log_success=False))
            if not restored:
                fail('Checkpoint directory ({}) does not contain a valid checkpoint state.'
                     .format(FLAGS.checkpoint_dir))

            session.run(iterator.make_initializer(segments))

            collected = []
            while True:
                try:
                    starts, ends, batch_probs, batch_lengths = session.run(fetches)
                except tf.errors.OutOfRangeError:
                    # The dataset is exhausted.
                    break
                decoded = ctc_beam_search_decoder_batch(batch_probs,
                                                        batch_lengths,
                                                        Config.alphabet,
                                                        FLAGS.beam_width,
                                                        num_processes=worker_count,
                                                        scorer=lm_scorer)
                # Keep only the text of the first candidate of every segment.
                texts = [candidates[0][1] for candidates in decoded]
                collected.extend(zip(starts, ends, texts))

            collected.sort(key=lambda entry: entry[0])

            tlog_entries = []
            for start, end, transcript in collected:
                tlog_entries.append({'start': int(start),
                                     'end': int(end),
                                     'transcript': transcript})

            with open(tlog_path, 'w') as tlog_file:
                json.dump(tlog_entries, tlog_file, default=float)
def evaluate(test_csvs, create_model, try_loading):
    """Run the test phase on every test CSV and collect the report samples.

    The model graph is built once on a re-initializable iterator. After
    restoring the best-validation checkpoint (falling back to the most
    recent one), each dataset is evaluated in two passes: acoustic-model
    inference first, CTC beam-search decoding second.

    Args:
        test_csvs: Replaced by ``FLAGS.test_files.split(',')`` before use, so
            the value passed in has no effect.
        create_model: Callable building the model graph; called with
            ``batch_x``, ``seq_length`` and ``dropout`` and returning a pair
            whose first element is the logits tensor.
        try_loading: Callable ``(session, saver, checkpoint_name, caption)``
            returning a truthy value when a checkpoint was restored.

    Returns:
        A list holding the report samples of all datasets, in dataset order.
    """
    lm_scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                       FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                       Config.alphabet)

    test_csvs = FLAGS.test_files.split(',')
    datasets = []
    for csv_path in test_csvs:
        datasets.append(create_dataset([csv_path], batch_size=FLAGS.test_batch_size))

    reference = datasets[0]
    iterator = tf.data.Iterator.from_structure(reference.output_types,
                                               reference.output_shapes,
                                               output_classes=reference.output_classes)
    test_init_ops = [iterator.make_initializer(ds) for ds in datasets]

    (batch_x, batch_x_len), batch_y = iterator.get_next()

    # One rate per layer
    logits, _ = create_model(batch_x=batch_x,
                             seq_length=batch_x_len,
                             dropout=[None] * 6)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))

    loss = tf.nn.ctc_loss(labels=batch_y,
                          inputs=logits,
                          sequence_length=batch_x_len)

    tf.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        worker_count = cpu_count()
    except NotImplementedError:
        worker_count = 1

    # Create a saver using variables from the above newly created graph
    saver = tf.train.Saver()

    with tf.Session(config=Config.session_config) as session:
        # Restore variables from training checkpoint: best validation first,
        # most recent as fallback.
        restored = (try_loading(session, saver, 'best_dev_checkpoint', 'best validation')
                    or try_loading(session, saver, 'checkpoint', 'most recent'))
        if not restored:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        def fetch_batches():
            """Yield one ``session.run`` result per batch until the dataset ends."""
            while True:
                try:
                    yield session.run([transposed, loss, batch_x_len, batch_y])
                except tf.errors.OutOfRangeError:
                    return

        def run_test(init_op, dataset):
            """Evaluate one dataset, print its report and return its samples."""
            batch_outputs = []
            batch_lengths = []
            losses = []
            ground_truths = []

            bar = create_progressbar(prefix='Computing acoustic model predictions | ',
                                     widgets=['Steps: ', progressbar.Counter(), ' | ', progressbar.Timer()]).start()
            log_progress('Computing acoustic model predictions...')

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # First pass, compute losses and transposed logits for decoding
            step_count = 0
            for step_count, fetched in enumerate(fetch_batches(), start=1):
                probs, batch_loss, lengths, labels = fetched
                bar.update(step_count)
                batch_outputs.append(probs)
                losses.extend(batch_loss)
                batch_lengths.append(lengths)
                ground_truths.extend(sparse_tensor_value_to_texts(labels, Config.alphabet))

            bar.finish()

            predictions = []

            bar = create_progressbar(max_value=step_count,
                                     prefix='Decoding predictions | ').start()
            log_progress('Decoding predictions...')

            # Second pass, decode logits and compute WER and edit distance metrics
            for probs, lengths in bar(zip(batch_outputs, batch_lengths)):
                decoded = ctc_beam_search_decoder_batch(probs, lengths,
                                                        Config.alphabet, FLAGS.beam_width,
                                                        num_processes=worker_count,
                                                        scorer=lm_scorer)
                for candidates in decoded:
                    predictions.append(candidates[0][1])

            distances = []
            for truth, guess in zip(ground_truths, predictions):
                distances.append(levenshtein(truth, guess))

            wer, cer, samples = calculate_report(ground_truths, predictions, distances, losses)
            mean_loss = np.mean(losses)

            print('Test on %s - WER: %f, CER: %f, loss: %f' %
                  (dataset, wer, cer, mean_loss))
            print('-' * 80)
            # Take only the first report_count items
            for sample in itertools.islice(samples, FLAGS.report_count):
                print('WER: %f, CER: %f, loss: %f' %
                      (sample.wer, sample.distance, sample.loss))
                print(' - src: "%s"' % sample.src)
                print(' - res: "%s"' % sample.res)
                print('-' * 80)

            return samples

        all_samples = []
        for csv_path, init_op in zip(test_csvs, test_init_ops):
            print('Testing model on {}'.format(csv_path))
            all_samples.extend(run_test(init_op, dataset=csv_path))
        return all_samples
def evaluate(test_csvs, create_model, try_loading):
    """Evaluate the model on every test CSV and return all report samples.

    The graph is built once on a re-initializable iterator that also yields
    the wav filename of every sample. Each batch is decoded right after
    inference, so a dataset is processed in a single pass.

    Args:
        test_csvs: Replaced by ``FLAGS.test_files.split(',')`` before use, so
            the value passed in has no effect.
        create_model: Callable building the model graph; called with
            ``batch_x``, ``batch_size``, ``seq_length`` and ``dropout`` and
            returning a pair whose first element is the logits tensor.
        try_loading: Callable ``(session, saver, checkpoint_name, caption)``
            returning a truthy value when a checkpoint was restored.

    Returns:
        A list holding the report samples of all datasets, in dataset order.
    """
    # The external scorer is optional: without a binary path decoding runs
    # with ``scorer=None``.
    lm_scorer = None
    if FLAGS.lm_binary_path:
        lm_scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                           FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                           Config.alphabet)

    test_csvs = FLAGS.test_files.split(',')
    datasets = []
    for csv_path in test_csvs:
        datasets.append(create_dataset([csv_path],
                                       batch_size=FLAGS.test_batch_size,
                                       train_phase=False))

    reference = datasets[0]
    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(reference),
        tfv1.data.get_output_shapes(reference),
        output_classes=tfv1.data.get_output_classes(reference))
    test_init_ops = [iterator.make_initializer(ds) for ds in datasets]

    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()

    # One rate per layer
    logits, _ = create_model(batch_x=batch_x,
                             batch_size=FLAGS.test_batch_size,
                             seq_length=batch_x_len,
                             dropout=[None] * 6)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2]))

    loss = tfv1.nn.ctc_loss(labels=batch_y,
                            inputs=logits,
                            sequence_length=batch_x_len)

    tfv1.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        worker_count = cpu_count()
    except NotImplementedError:
        worker_count = 1

    # Create a saver using variables from the above newly created graph
    saver = tfv1.train.Saver()

    fetches = [batch_wav_filename, transposed, loss, batch_x_len, batch_y]

    with tfv1.Session(config=Config.session_config) as session:
        # Restore variables from training checkpoint: best validation first,
        # most recent as fallback.
        restored = (try_loading(session, saver, 'best_dev_checkpoint', 'best validation')
                    or try_loading(session, saver, 'checkpoint', 'most recent'))
        if not restored:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        def run_test(init_op, dataset):
            """Evaluate one dataset, print its report and return its samples."""
            wav_filenames, ground_truths, predictions, losses = [], [], [], []

            bar = create_progressbar(prefix='Test epoch | ',
                                     widgets=[
                                         'Steps: ',
                                         progressbar.Counter(), ' | ',
                                         progressbar.Timer()
                                     ]).start()
            log_progress('Test epoch...')

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            steps_done = 0
            while True:
                try:
                    fetched = session.run(fetches)
                except tf.errors.OutOfRangeError:
                    break
                names, batch_probs, batch_loss, batch_lengths, labels = fetched

                decoded = ctc_beam_search_decoder_batch(
                    batch_probs,
                    batch_lengths,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=worker_count,
                    scorer=lm_scorer,
                    cutoff_prob=FLAGS.cutoff_prob,
                    cutoff_top_n=FLAGS.cutoff_top_n)
                # Keep only the text of the first candidate of every sample.
                for candidates in decoded:
                    predictions.append(candidates[0][1])
                ground_truths.extend(
                    sparse_tensor_value_to_texts(labels, Config.alphabet))
                for raw_name in names:
                    wav_filenames.append(raw_name.decode('UTF-8'))
                losses.extend(batch_loss)

                steps_done += 1
                bar.update(steps_done)

            bar.finish()

            wer, cer, samples = calculate_report(wav_filenames, ground_truths,
                                                 predictions, losses)
            mean_loss = np.mean(losses)

            print('Test on %s - WER: %f, CER: %f, loss: %f' %
                  (dataset, wer, cer, mean_loss))
            print('-' * 80)
            # Take only the first report_count items
            for sample in itertools.islice(samples, FLAGS.report_count):
                print('WER: %f, CER: %f, loss: %f' %
                      (sample.wer, sample.cer, sample.loss))
                print(' - wav: file://%s' % sample.wav_filename)
                print(' - src: "%s"' % sample.src)
                print(' - res: "%s"' % sample.res)
                print('-' * 80)

            return samples

        all_samples = []
        for csv_path, init_op in zip(test_csvs, test_init_ops):
            print('Testing model on {}'.format(csv_path))
            all_samples.extend(run_test(init_op, dataset=csv_path))
        return all_samples
def activations_common_voice_pertubed_sets(input_dir,
                                           output_dir,
                                           test_only=False,
                                           prune_percentage=0,
                                           scores_file=None,
                                           random=False,
                                           verbose=True,
                                           randomly_initialized=False,
                                           skip_lstm=False):
    '''Obtains activations for wavs in input_dir and saves them to output_dir.

    For every item of every perturbed set listed in
    ``data/pertubed_input_sets_balanced.json`` the intermediate layer
    activations of the inference graph are computed and written to
    ``<base_path>/<set_id>/<file_name>.npy``. Items whose ``.npy`` file
    already exists are skipped.

    Args:
        input_dir: Directory containing the ``.wav`` files.
        output_dir: Root directory for the results; activations go to
            ``<output_dir>/activations`` or, when pruning, to
            ``<output_dir>/activations/pruned-<percentage>[-random]``.
        test_only: When true, sets whose id appears in
            ``./results/set_ids_used.json`` are skipped.
        prune_percentage: When truthy, weights are masked before the
            activations are computed; passed on to ``prune_matrices``.
        scores_file: File loaded with ``np.load`` to obtain the per-layer
            scores; only read when pruning.
        random: Passed on to ``prune_matrices`` and reflected in the output
            directory name.
        verbose: Print progress information about the pruning step.
        randomly_initialized: When true, no checkpoint is loaded and all
            variables are initialized with the global initializer instead.
        skip_lstm: Passed on to ``prune_matrices``; when true the LSTM kernel
            is left untouched by the masking loop.

    Returns:
        ``True`` once all sets have been processed. The process exits with
        status 1 when no checkpoint could be loaded.
    '''
    inputs, _outputs, layers = create_inference_graph(batch_size=1, n_steps=-1)

    intermediate_layer_names = [
        'layer_1', 'layer_2', 'layer_3', 'rnn_output', 'layer_4', 'layer_5'
    ]
    intermediate_layers = [
        l for n, l in layers.items() if n in intermediate_layer_names
    ]

    # Use context managers so the JSON files are closed deterministically.
    with open('data/pertubed_input_sets_balanced.json') as sets_file:
        pertubed_sets = json.load(sets_file)

    skip_sets = []
    if test_only:
        with open('./results/set_ids_used.json') as used_file:
            skip_sets = json.load(used_file)

    if not prune_percentage:
        base_path = '{}/activations'.format(output_dir)
    else:
        base_path = '{}/activations/pruned-{}'.format(output_dir,
                                                      prune_percentage * 100)
        if random:
            base_path += '-random'

    with tfv1.Session(config=Config.session_config) as session:
        if not randomly_initialized:
            # Create a saver using variables from the above newly created graph
            saver = tfv1.train.Saver()

            # Restore variables from training checkpoint
            loaded = False
            if not loaded and FLAGS.load in ['auto', 'last']:
                loaded = try_loading(session,
                                     saver,
                                     'checkpoint',
                                     'most recent',
                                     load_step=False)
            if not loaded and FLAGS.load in ['auto', 'best']:
                loaded = try_loading(session,
                                     saver,
                                     'best_dev_checkpoint',
                                     'best validation',
                                     load_step=False)
            if not loaded:
                print('Could not load checkpoint from {}'.format(
                    FLAGS.checkpoint_dir))
                sys.exit(1)
        else:
            initializer = tfv1.global_variables_initializer()
            session.run(initializer)

        ###### PRUNING PART ######
        # Whether pruning happens depends on prune_percentage only; verbose
        # merely controls the log output.
        if not prune_percentage:
            if verbose:
                print('No pruning done.')
        else:
            if verbose:
                print('-' * 80)
                print('pruning with {}%...'.format(prune_percentage))
            scores_per_layer = np.load(scores_file)
            layer_masks = prune_matrices(scores_per_layer,
                                         prune_percentage=prune_percentage,
                                         random=random,
                                         verbose=verbose,
                                         skip_lstm=skip_lstm)

            n_layers_to_prune = len(layer_masks)
            lstm_layer_name = 'cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel:0'
            # Index of the next mask to apply; advances only when a variable
            # was actually masked.
            i = 0
            for v in tf.trainable_variables():
                if 'weights' not in v.name and v.name != lstm_layer_name:
                    continue

                if i >= n_layers_to_prune:
                    break

                # make mask into the shape of the weights
                if v.name == lstm_layer_name:
                    if skip_lstm:
                        continue
                    # Shape of LSTM weights: [(2*neurons), (4*neurons)]
                    cell_template = np.ones((2, 4))
                    mask = np.repeat(layer_masks[i], v.shape[0] // 2, axis=0)
                    mask = mask.reshape(
                        [layer_masks[i].shape[0], v.shape[0] // 2])
                    mask = np.swapaxes(mask, 0, 1)
                    mask = np.kron(mask, cell_template)
                else:
                    mask = np.repeat(layer_masks[i], v.shape[0], axis=0)
                    mask = mask.reshape([layer_masks[i].shape[0], v.shape[0]])
                    mask = np.swapaxes(mask, 0, 1)

                # apply mask to weights
                session.run(v.assign(tf.multiply(v, mask)))
                i += 1
        ###### END PRUNING PART ######

        # Default states for LSTM cell
        previous_state_c = np.zeros([1, Config.n_cell_dim])
        previous_state_h = np.zeros([1, Config.n_cell_dim])

        sets_to_process = [
            item_set for item_set in pertubed_sets
            if str(item_set['set_id']) not in skip_sets
        ]
        print('{} sets found'.format(len(sets_to_process)))

        for item_set in sets_to_process:
            print('Processing set {}, {} items...'.format(
                item_set['set_id'], item_set['set_length']))
            set_dir = '{}/{}'.format(base_path, item_set['set_id'])

            # Only process files that are not yet available in results directory
            create_dir_if_not_exists(set_dir)
            files_done = {
                f[:-4] for f in os.listdir(set_dir) if f.endswith('.npy')
            }

            for item in item_set['set_items']:
                # Drop the last four characters (the extension) of the path.
                file_name = item['path'][:-4]
                print(file_name)
                if file_name in files_done:
                    print('Skipped.')
                    continue
                print('current file: {}'.format(file_name))

                input_file_path = '{}/{}.wav'.format(input_dir, file_name)

                # Prepare features
                # NOTE(review): these ops are added to the graph anew for
                # every file, so the graph keeps growing; consider building
                # the feature pipeline once with placeholders.
                features, features_len = audiofile_to_features(input_file_path)
                features = tf.expand_dims(features, 0)
                features_len = tf.expand_dims(features_len, 0)
                features = create_overlapping_windows(features).eval(
                    session=session)
                features_len = features_len.eval(session=session)

                feed_dict = {
                    inputs['input']: features,
                    inputs['input_lengths']: features_len,
                    inputs['previous_state_c']: previous_state_c,
                    inputs['previous_state_h']: previous_state_h,
                }
                intermediate_activations = session.run(intermediate_layers,
                                                       feed_dict=feed_dict)

                # Save activations of actual input
                save_to_path_activations = '{}/{}/{}.npy'.format(
                    base_path, item_set['set_id'], file_name)
                write_numpy_to_file(save_to_path_activations,
                                    np.array(intermediate_activations))
                print('Activations for {} are saved to: {}'.format(
                    file_name, save_to_path_activations))

    return True
def evaluate_with_pruning(test_csvs,
                          prune_percentage,
                          random,
                          scores_file,
                          result_file,
                          verbose=True,
                          skip_lstm=False):
    '''Evaluate a (optionally pruned) DeepSpeech model on the given test CSVs.

    Code originally comes from the DeepSpeech repository
    (./DeepSpeech/evaluate.py). The code is adapted for evaluation on pruned
    versions of the DeepSpeech model.

    Args:
        test_csvs: Comma-separated string of test CSV paths.
        prune_percentage: When truthy, weights are masked before evaluation;
            passed on to ``prune_matrices``.
        random: Passed on to ``prune_matrices``; also selects the label
            ('random' or 'score-based') written to the result file.
        scores_file: File loaded with ``np.load`` to obtain the per-layer
            scores; only read when pruning.
        result_file: When truthy, a summary of every dataset is appended to
            this file via ``write_to_file``.
        verbose: Print progress and result information.
        skip_lstm: Passed on to ``prune_matrices``; when true the LSTM kernel
            is left untouched by the masking loop.

    Returns:
        A flat list containing ``wer, cer, mean_loss`` for every test CSV, in
        order (three consecutive entries per dataset). The process exits with
        status 1 when no checkpoint could be loaded.
    '''
    tfv1.reset_default_graph()

    if FLAGS.lm_binary_path:
        scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                        FLAGS.lm_trie_path, Config.alphabet)
    else:
        scorer = None

    test_csvs = test_csvs.split(',')
    test_sets = [
        create_dataset([csv],
                       batch_size=FLAGS.test_batch_size,
                       train_phase=False) for csv in test_csvs
    ]
    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(test_sets[0]),
        tfv1.data.get_output_shapes(test_sets[0]),
        output_classes=tfv1.data.get_output_classes(test_sets[0]))
    test_init_ops = [
        iterator.make_initializer(test_set) for test_set in test_sets
    ]

    batch_wav_filename, (batch_x, batch_x_len), batch_y = iterator.get_next()

    # One rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             batch_size=FLAGS.test_batch_size,
                             seq_length=batch_x_len,
                             dropout=no_dropout)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(a=logits, perm=[1, 0, 2]))

    loss = tfv1.nn.ctc_loss(labels=batch_y,
                            inputs=logits,
                            sequence_length=batch_x_len)

    tfv1.train.get_or_create_global_step()

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:
        num_processes = 1

    # Create a saver using variables from the above newly created graph
    saver = tfv1.train.Saver()

    with tfv1.Session(config=Config.session_config) as session:
        # Restore variables from training checkpoint
        loaded = False
        if not loaded and FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session,
                                 saver,
                                 'checkpoint',
                                 'most recent',
                                 load_step=False)
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session,
                                 saver,
                                 'best_dev_checkpoint',
                                 'best validation',
                                 load_step=False)
        if not loaded:
            print('Could not load checkpoint from {}'.format(
                FLAGS.checkpoint_dir))
            sys.exit(1)

        ###### PRUNING PART ######
        # Whether pruning happens depends on prune_percentage only; verbose
        # merely controls the log output.
        if not prune_percentage:
            if verbose:
                print('No pruning done.')
        else:
            if verbose:
                print('-' * 80)
                print('pruning with {}%...'.format(prune_percentage))
            scores_per_layer = np.load(scores_file)
            layer_masks = prune_matrices(scores_per_layer,
                                         prune_percentage=prune_percentage,
                                         random=random,
                                         verbose=verbose,
                                         skip_lstm=skip_lstm)

            n_layers_to_prune = len(layer_masks)
            lstm_layer_name = 'cudnn_lstm/rnn/multi_rnn_cell/cell_0/cudnn_compatible_lstm_cell/kernel:0'
            # Index of the next mask to apply; advances only when a variable
            # was actually masked.
            i = 0
            for v in tf.trainable_variables():
                if 'weights' not in v.name and v.name != lstm_layer_name:
                    continue

                if i >= n_layers_to_prune:
                    break

                # make mask into the shape of the weights
                if v.name == lstm_layer_name:
                    if skip_lstm:
                        continue
                    # Shape of LSTM weights: [(2*neurons), (4*neurons)]
                    cell_template = np.ones((2, 4))
                    mask = np.repeat(layer_masks[i], v.shape[0] // 2, axis=0)
                    mask = mask.reshape(
                        [layer_masks[i].shape[0], v.shape[0] // 2])
                    mask = np.swapaxes(mask, 0, 1)
                    mask = np.kron(mask, cell_template)
                else:
                    mask = np.repeat(layer_masks[i], v.shape[0], axis=0)
                    mask = mask.reshape([layer_masks[i].shape[0], v.shape[0]])
                    mask = np.swapaxes(mask, 0, 1)

                # apply mask to weights
                session.run(v.assign(tf.multiply(v, mask)))
                i += 1
        ###### END PRUNING PART ######

        def run_test(init_op, dataset):
            '''Evaluate one dataset and return ``(wer, cer, mean_loss)``.'''
            wav_filenames = []
            losses = []
            predictions = []
            ground_truths = []

            bar = create_progressbar(prefix='Test epoch | ',
                                     widgets=[
                                         'Steps: ',
                                         progressbar.Counter(), ' | ',
                                         progressbar.Timer()
                                     ]).start()
            log_progress('Test epoch...')

            step_count = 0

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Run inference and decode batch by batch until the dataset ends
            while True:
                try:
                    batch_wav_filenames, batch_logits, batch_loss, batch_lengths, batch_transcripts = \
                        session.run([batch_wav_filename, transposed, loss, batch_x_len, batch_y])
                except tf.errors.OutOfRangeError:
                    break

                decoded = ctc_beam_search_decoder_batch(
                    batch_logits,
                    batch_lengths,
                    Config.alphabet,
                    FLAGS.beam_width,
                    num_processes=num_processes,
                    scorer=scorer,
                    cutoff_prob=FLAGS.cutoff_prob,
                    cutoff_top_n=FLAGS.cutoff_top_n)
                predictions.extend(d[0][1] for d in decoded)
                ground_truths.extend(
                    sparse_tensor_value_to_texts(batch_transcripts,
                                                 Config.alphabet))
                wav_filenames.extend(
                    wav_filename.decode('UTF-8')
                    for wav_filename in batch_wav_filenames)
                losses.extend(batch_loss)

                step_count += 1
                bar.update(step_count)

            bar.finish()

            # The per-sample report entries are not used here.
            wer, cer, _ = calculate_report(wav_filenames, ground_truths,
                                           predictions, losses)
            mean_loss = np.mean(losses)

            if verbose:
                print('Test on %s - WER: %f, CER: %f, loss: %f' %
                      (dataset, wer, cer, mean_loss))
                print('-' * 80)

            if result_file:
                pruning_type = 'score-based' if not random else 'random'
                result_string = (
                    'Results for evaluating model with pruning percentage of {}% and {} pruning:\n'
                    'Test on {} - WER: {}, CER: {}, loss: {}\n').format(
                        prune_percentage * 100, pruning_type, dataset, wer,
                        cer, mean_loss)
                write_to_file(result_file, result_string, 'a+')

            return wer, cer, mean_loss

        results = []
        for csv, init_op in zip(test_csvs, test_init_ops):
            if verbose:
                print('Testing model on {}'.format(csv))
            # extend() flattens the (wer, cer, mean_loss) tuple into the list.
            results.extend(run_test(init_op, dataset=csv))
        return results
def evaluate(test_csvs, create_model, try_loading):
    """Evaluate the model on the given test CSVs and return the report samples.

    All CSVs are combined into one dataset that is consumed through a
    one-shot iterator. Evaluation runs in two passes: acoustic-model
    inference over all batches first, CTC beam-search decoding second.

    Args:
        test_csvs: Test CSV paths handed to ``create_dataset``.
        create_model: Callable building the model graph; called with
            ``batch_x``, ``seq_length`` and ``dropout`` and returning a pair
            whose first element is the logits tensor.
        try_loading: Callable ``(session, saver, checkpoint_name, caption)``
            returning a truthy value when a checkpoint was restored.

    Returns:
        The samples produced by ``calculate_report``. The process exits with
        status 1 when no checkpoint could be loaded.
    """
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta, FLAGS.lm_binary_path,
                    FLAGS.lm_trie_path, Config.alphabet)

    test_set = create_dataset(test_csvs,
                              batch_size=FLAGS.test_batch_size,
                              cache_path=FLAGS.test_cached_features_path)
    it = test_set.make_one_shot_iterator()

    (batch_x, batch_x_len), batch_y = it.get_next()

    # One rate per layer
    no_dropout = [None] * 6
    logits, _ = create_model(batch_x=batch_x,
                             seq_length=batch_x_len,
                             dropout=no_dropout)

    # Transpose to batch major and apply softmax for decoder
    transposed = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))

    loss = tf.nn.ctc_loss(labels=batch_y,
                          inputs=logits,
                          sequence_length=batch_x_len)

    # Called for its effect on the graph; the returned tensor is not needed.
    tf.train.get_or_create_global_step()

    with tf.Session(config=Config.session_config) as session:
        # Create a saver using variables from the above newly created graph
        saver = tf.train.Saver()

        # Restore variables from training checkpoint
        loaded = try_loading(session, saver, 'best_dev_checkpoint', 'best validation')
        if not loaded:
            loaded = try_loading(session, saver, 'checkpoint', 'most recent')
        if not loaded:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        logitses = []
        losses = []
        seq_lengths = []
        ground_truths = []

        print('Computing acoustic model predictions...')
        bar = progressbar.ProgressBar(widgets=[
            'Steps: ',
            progressbar.Counter(), ' | ',
            progressbar.Timer()
        ])

        step_count = 0

        # First pass, compute losses and transposed logits for decoding
        while True:
            try:
                batch_logits, batch_loss, lengths, transcripts = session.run(
                    [transposed, loss, batch_x_len, batch_y])
            except tf.errors.OutOfRangeError:
                break

            step_count += 1
            bar.update(step_count)

            logitses.append(batch_logits)
            losses.extend(batch_loss)
            seq_lengths.append(lengths)
            ground_truths.extend(
                sparse_tensor_value_to_texts(transcripts, Config.alphabet))

        bar.finish()

        predictions = []

        # Get number of accessible CPU cores for this process; only the
        # "cannot be determined" case falls back to a single process.
        try:
            num_processes = cpu_count()
        except NotImplementedError:
            num_processes = 1

        print('Decoding predictions...')
        # NOTE(review): ``widget=`` (singular) is kept from the original;
        # confirm whether ``widgets=[...]`` was intended.
        bar = progressbar.ProgressBar(max_value=step_count,
                                      widget=progressbar.AdaptiveETA)

        # Second pass, decode logits and compute WER and edit distance metrics
        for batch_logits, seq_length in bar(zip(logitses, seq_lengths)):
            decoded = ctc_beam_search_decoder_batch(batch_logits,
                                                    seq_length,
                                                    Config.alphabet,
                                                    FLAGS.beam_width,
                                                    num_processes=num_processes,
                                                    scorer=scorer)
            predictions.extend(d[0][1] for d in decoded)

        distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

        wer, cer, samples = calculate_report(ground_truths, predictions,
                                             distances, losses)
        mean_loss = np.mean(losses)

        # Take only the first report_count items
        report_samples = itertools.islice(samples, FLAGS.report_count)

        print('Test - WER: %f, CER: %f, loss: %f' % (wer, cer, mean_loss))
        print('-' * 80)
        for sample in report_samples:
            print('WER: %f, CER: %f, loss: %f' %
                  (sample.wer, sample.distance, sample.loss))
            print(' - src: "%s"' % sample.src)
            print(' - res: "%s"' % sample.res)
            print('-' * 80)

        return samples