def next_batch(self):
    '''Draw the next batch from the combined switchable queue.'''
    source, source_lengths, target, target_lengths = self._queue.dequeue_many(self._model_feeder.ph_batch_size)
    sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._model_feeder.ph_batch_size)
    return source, source_lengths, sparse_labels
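# Illustrative only (not from the original source): a minimal sketch of how the
# tuple returned by next_batch() above plugs into tf.nn.ctc_loss. `feeder` and
# `build_acoustic_model` are hypothetical names; the `tensorflow as tf` import
# used throughout this file is assumed.
def _example_ctc_training_step(feeder, build_acoustic_model, learning_rate=1e-4):
    source, source_lengths, sparse_labels = feeder.next_batch()
    # acoustic model produces time-major logits: [max_time, batch, classes]
    logits = build_acoustic_model(source, source_lengths)
    mean_loss = tf.reduce_mean(tf.nn.ctc_loss(labels=sparse_labels,
                                              inputs=logits,
                                              sequence_length=source_lengths))
    return tf.train.AdamOptimizer(learning_rate).minimize(mean_loss)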
def setup_graph(self, input_audio_batch, target_phrase):
    batch_size = input_audio_batch.shape[0]
    # number of output logit frames: one per 320-sample hop of the audio
    weird = (input_audio_batch.shape[1] - 1) // 320
    logits_arg2 = np.tile(weird, batch_size)
    dense_arg1 = np.array(np.tile(target_phrase, (batch_size, 1)), dtype=np.int32)
    dense_arg2 = np.array(np.tile(target_phrase.shape[0], batch_size), dtype=np.int32)

    pass_in = np.clip(input_audio_batch, -2**15, 2**15 - 1)
    seq_len = np.tile(weird, batch_size).astype(np.int32)

    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        inputs = tf.placeholder(tf.float32, shape=pass_in.shape, name='a')
        len_batch = tf.placeholder(tf.float32, name='b')
        arg2_logits = tf.placeholder(tf.int32, shape=logits_arg2.shape, name='c')
        arg1_dense = tf.placeholder(tf.float32, shape=dense_arg1.shape, name='d')
        arg2_dense = tf.placeholder(tf.int32, shape=dense_arg2.shape, name='e')
        len_seq = tf.placeholder(tf.int32, shape=seq_len.shape, name='f')

        logits = get_logits(inputs, arg2_logits)
        target = ctc_label_dense_to_sparse(arg1_dense, arg2_dense, len_batch)
        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=len_seq)
        decoded, _ = tf.nn.ctc_greedy_decoder(logits, arg2_logits, merge_repeated=True)

        sess = tf.Session()
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(sess, "models/session_dump")

        func1 = lambda a, b, c, d, e, f: sess.run(ctcloss, feed_dict={
            inputs: a, len_batch: b, arg2_logits: c,
            arg1_dense: d, arg2_dense: e, len_seq: f})
        func2 = lambda a, b, c, d, e, f: sess.run([ctcloss, decoded], feed_dict={
            inputs: a, len_batch: b, arg2_logits: c,
            arg1_dense: d, arg2_dense: e, len_seq: f})
        return (func1, func2)
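# Illustrative only: one way the pair returned by setup_graph() could be used
# to score a candidate batch against the target phrase. The argument order
# mirrors the placeholders above; treating `b` as the batch size is an
# assumption, and all names here are hypothetical.
#
#   get_loss, get_loss_and_decode = model.setup_graph(audio_batch, phrase)
#   ctc_per_example = get_loss(pass_in, batch_size, logits_arg2,
#                              dense_arg1, dense_arg2, seq_len)
#   ctc_per_example, decoded = get_loss_and_decode(pass_in, batch_size, logits_arg2,
#                                                  dense_arg1, dense_arg2, seq_len)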
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    alphabet)

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=N_FEATURES,
        numcontext=N_CONTEXT,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*N_CONTEXT+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            # Bind the fetched loss values to a fresh name so the `loss`
            # tensor defined above is not clobbered on the first iteration.
            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

        ground_truths = []
        predictions = []
        distances = []

        print('Decoding predictions...')
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # Get number of accessible CPU cores for this process
        num_processes = len(os.sched_getaffinity(0))

        # Second pass, decode logits and compute WER and edit distance metrics
        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values.astype(np.int32)

            decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                    num_processes=num_processes, scorer=scorer)

            ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
            predictions.extend(d[0][1] for d in decoded)

        # Compare decoded predictions against the decoded ground truth texts,
        # not against the padded label matrix left over from the first pass.
        distances.extend(levenshtein(a, b) for a, b in zip(ground_truths, predictions))

        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
        mean_edit_distance = np.mean(distances)
        mean_loss = np.mean(losses)

        # Take only the first report_count items
        report_samples = itertools.islice(samples, FLAGS.report_count)

        print('Test - WER: %f, loss: %f, mean edit distance: %f' %
              (wer, mean_loss, mean_edit_distance))
        print('-' * 80)
        for sample in report_samples:
            print('WER: %f, loss: %f, edit distance: %f' %
                  (sample.wer, sample.loss, sample.distance))
            print(' - src: "%s"' % sample.src)
            print(' - res: "%s"' % sample.res)
            print('-' * 80)

        if FLAGS.test_output_file:
            json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
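# `pad_to_dense` is used above but not defined in this extract. A minimal
# sketch of what such a helper could look like (an assumption, not the
# project's actual implementation): zero-pad a sequence of variable-length
# numpy arrays along the first axis into one dense batch array.
def pad_to_dense_sketch(jagged):
    maxlen = max(len(row) for row in jagged)
    first = np.asarray(jagged[0])
    padded = np.zeros((len(jagged), maxlen) + first.shape[1:], dtype=first.dtype)
    for i, row in enumerate(jagged):
        padded[i, :len(row)] = row
    return padded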
def next_batch(self):
    source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(
        self._batch_size)
    sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._batch_size)
    return source, source_lengths, sparse_labels
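# `ctc_label_dense_to_sparse` is called throughout these snippets. A compact
# sketch of the conversion it performs (an assumption about its behavior, not
# the project's exact implementation; the real helper also takes a batch-size
# argument): keep the first `label_lengths[i]` entries of each row and emit
# them as the tf.SparseTensor that tf.nn.ctc_loss expects.
def ctc_label_dense_to_sparse_sketch(labels, label_lengths):
    # mask[i, j] is True for positions holding real labels, not padding
    mask = tf.sequence_mask(label_lengths, tf.shape(labels)[1])
    indices = tf.where(mask)                # [nnz, 2] (batch, time) coordinates
    values = tf.boolean_mask(labels, mask)  # the surviving label ids
    return tf.SparseTensor(indices, values, tf.cast(tf.shape(labels), tf.int64))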
def evaluate(test_data, inference_graph, alphabet):
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    Config.alphabet)

    def create_windows(features):
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*Config.n_context+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = inference_graph

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        if FLAGS.checkpoint_dir is not None:
            checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if not checkpoint:
                log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
                exit(1)

            checkpoint_path = checkpoint.model_checkpoint_path
            saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

        ground_truths = []
        predictions = []

        print('Decoding predictions...')
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # Get number of accessible CPU cores for this process
        try:
            num_processes = cpu_count()
        except:
            num_processes = 1

        # Second pass, decode logits and compute WER and edit distance metrics
        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values.astype(np.int32)

            decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                    num_processes=num_processes, scorer=scorer)

            ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
            predictions.extend(d[0][1] for d in decoded)

        distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
        mean_edit_distance = np.mean(distances)
        mean_loss = np.mean(losses)

        # Take only the first report_count items
        report_samples = itertools.islice(samples, FLAGS.report_count)

        print('Test - WER: %f, CER: %f, loss: %f' %
              (wer, mean_edit_distance, mean_loss))
        print('-' * 80)
        for sample in report_samples:
            print('WER: %f, CER: %f, loss: %f' %
                  (sample.wer, sample.distance, sample.loss))
            print(' - src: "%s"' % sample.src)
            print(' - res: "%s"' % sample.res)
            print('-' * 80)

        return samples
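# Illustrative only: how evaluate() might be driven, assuming the same
# preprocess/create_inference_graph helpers used elsewhere in this extract
# (hypothetical wiring, not taken from the original source).
#
#   test_data = preprocess(FLAGS.test_files.split(','), FLAGS.test_batch_size,
#                          alphabet=Config.alphabet, numcep=Config.n_input,
#                          numcontext=Config.n_context).sort_values(
#                              by="features_len", ascending=False)
#   graph = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)
#   samples = evaluate(test_data, graph, Config.alphabet)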
def __init__(self, sess, loss_fn, phrase_length, max_audio_len,
             learning_rate=10, num_iterations=1000, batch_size=1):
    """
    Set up the attack procedure.

    Here we create the TF graph that we're going to use to
    actually generate the adversarial examples.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len

    # Create all the variables necessary
    # they are prefixed with qq_ just so that we know which
    # ones are ours so when we restore the session we don't
    # clobber them.
    self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                     name='qq_delta')
    self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                   name='qq_mask')
    self.cwmask = cwmask = tf.Variable(np.zeros((batch_size, phrase_length), dtype=np.float32),
                                       name='qq_cwmask')
    self.original = original = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                           name='qq_original')
    self.lengths = lengths = tf.Variable(np.zeros(batch_size, dtype=np.int32),
                                         name='qq_lengths')
    self.importance = tf.Variable(np.zeros((batch_size, phrase_length), dtype=np.float32),
                                  name='qq_importance')
    self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length), dtype=np.int32),
                                     name='qq_phrase')
    self.target_phrase_lengths = tf.Variable(np.zeros((batch_size), dtype=np.int32),
                                             name='qq_phrase_lengths')
    self.rescale = tf.Variable(np.zeros((batch_size, 1), dtype=np.float32),
                               name='qq_rescale')

    # Initially we bound the l_infty norm by 2000, increase this
    # constant if it's not big enough of a distortion for your dataset.
    self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

    # We set the new input to the model to be the above delta
    # plus a mask, which allows us to enforce that certain
    # values remain constant 0 for length padding sequences.
    self.new_input = new_input = self.apply_delta * mask + original

    # We add a tiny bit of noise to help make sure that we can
    # clip our values to 16-bit integers and not break things.
    noise = tf.random_normal(new_input.shape, stddev=2)
    pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

    # Feed this final value to get the logits.
    self.logits = logits = get_logits(pass_in, lengths)

    # And finally restore the graph to make the classifier
    # actually do something interesting.
    saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, "models/session_dump")

    # Choose the loss function we want -- either CTC or CW
    self.loss_fn = loss_fn
    if loss_fn == "CTC":
        target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=lengths)
        loss = tf.nn.relu(ctcloss)
        self.expanded_loss = tf.constant(0)
    elif loss_fn == "CW":
        raise NotImplementedError(
            "The current version of this project does not include the CW loss function implementation.")
    else:
        raise ValueError("Unknown loss function '%s'" % loss_fn)

    # Set up the Adam optimizer to perform gradient descent for us
    var_start = tf.global_variables()
    self.train = tf.train.AdamOptimizer(learning_rate).minimize(loss, var_list=[delta])
    self.loss = loss
    self.ctcloss = ctcloss

    var_end = tf.global_variables()
    new_vars = [x for x in var_end if x.name not in [y.name for y in var_start]]
    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder from the logits, to see how we're doing
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits, lengths,
                                                    merge_repeated=False,
                                                    beam_width=1000)
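# Illustrative only (a hypothetical driver, not part of the original source):
# after constructing the attack object, the qq_* variables are assigned from
# the data being attacked and the optimizer is run repeatedly. The 320-sample
# hop for the length computation mirrors the frame size used elsewhere here.
#
#   attack = Attack(sess, 'CTC', phrase_length, max_audio_len, batch_size=batch_size)
#   sess.run([attack.original.assign(audios),
#             attack.lengths.assign((audio_lengths - 1) // 320),
#             attack.mask.assign(masks),
#             attack.target_phrase.assign(phrases),
#             attack.target_phrase_lengths.assign(phrase_lengths),
#             attack.rescale.assign(np.ones((batch_size, 1), dtype=np.float32))])
#   for _ in range(attack.num_iterations):
#       _, ctc_per_example = sess.run([attack.train, attack.ctcloss])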
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(FLAGS.test_files.split(','),
                           FLAGS.test_batch_size,
                           alphabet=alphabet,
                           numcep=N_FEATURES,
                           numcontext=N_CONTEXT,
                           hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs = create_inference_graph(
            batch_size=FLAGS.test_batch_size, n_steps=N_STEPS)

        seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])
        decode_logits_ph = tf.placeholder(
            tf.float32, [None, FLAGS.test_batch_size, alphabet.size() + 1])
        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None])
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])

        decoded, _ = decode_with_lm(decode_logits_ph,
                                    seq_lengths_ph,
                                    merge_repeated=False,
                                    beam_width=FLAGS.beam_width)

        sparse_labels = tf.cast(
            ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size),
            tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=decode_logits_ph,
                              sequence_length=seq_lengths_ph)

        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), sparse_labels)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'
                      .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []

        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            batch_features = pad_to_dense(batch['features'].values)
            batch_features_len = batch['features_len'].values
            full_step_len = np.full_like(batch_features_len, N_STEPS)

            logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1])
            for i in range(0, batch_features.shape[1], N_STEPS):
                chunk_features = batch_features[:, i:i + N_STEPS, :, :]
                chunk_features_len = np.minimum(batch_features_len, full_step_len)

                # pad with zeros if the chunk does not have enough steps
                steps_in_chunk = chunk_features.shape[1]
                if steps_in_chunk < FLAGS.n_steps:
                    chunk_features = np.pad(chunk_features,
                                            ((0, 0),
                                             (0, FLAGS.n_steps - steps_in_chunk),
                                             (0, 0),
                                             (0, 0)),
                                            mode='constant',
                                            constant_values=0)

                output = session.run(outputs['outputs'], feed_dict={
                    inputs['input']: chunk_features,
                    inputs['input_lengths']: chunk_features_len,
                })
                logits = np.concatenate((logits, output))

                # we have processed N_STEPS so subtract from remaining steps
                batch_features_len -= N_STEPS
                # clip to zero
                batch_features_len = np.maximum(batch_features_len,
                                                np.zeros_like(batch_features_len))

            logitses.append(logits)

        ground_truths = []
        predictions = []
        distances = []
        losses = []

        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            decoded_, loss_, distance_, sparse_labels_ = session.run(
                [decoded, loss, distance, sparse_labels],
                feed_dict={
                    decode_logits_ph: logits,
                    seq_lengths_ph: seq_lengths,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            ground_truths.extend(sparse_tensor_value_to_texts(sparse_labels_, alphabet))
            predictions.extend(sparse_tensor_value_to_texts(decoded_[0], alphabet))
            distances.extend(distance_)
            losses.extend(loss_)

        wer, samples = calculate_report(ground_truths, predictions, distances, losses)
        mean_edit_distance = np.mean(distances)
        mean_loss = np.mean(losses)

        # Filter out all items with WER=0 and take only the first report_count items
        report_samples = itertools.islice((s for s in samples if s.wer > 0),
                                          FLAGS.report_count)

        print('Test - WER: %f, loss: %f, mean edit distance: %f' %
              (wer, mean_loss, mean_edit_distance))
        print('-' * 80)
        for sample in report_samples:
            print('WER: %f, loss: %f, edit distance: %f' %
                  (sample.wer, sample.loss, sample.distance))
            print(' - src: "%s"' % sample.src)
            print(' - res: "%s"' % sample.res)
            print('-' * 80)

        if FLAGS.test_output_file:
            json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
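# `split_data` is used by both evaluation scripts above but not defined in this
# extract. A minimal sketch consistent with its call sites (an assumption, not
# the project's actual implementation): yield consecutive batch-sized slices of
# the length-sorted DataFrame, dropping any final partial batch.
def split_data_sketch(dataset, batch_size):
    for i in range(0, len(dataset) - batch_size + 1, batch_size):
        yield dataset.iloc[i:i + batch_size]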
class Attack:
    def __init__(self, sess, loss_fn, phrase_length, max_audio_len, psdMaxes,
                 learning_rate=10, num_iterations=5000, window_size=2048,
                 step_per_window=4, batch_size=1, mp3=False, onlyCTC=True,
                 audio=None, psdShape=None):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.
        """
        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.psdMaxes = psdMaxes
        self.window_size = window_size
        self.step_per_window = step_per_window

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        frame_length = int(window_size)
        frame_step = int(window_size // step_per_window)
        fft_length = int(2**np.ceil(np.log2(frame_length)))
        sample_rate = 16000
        freq_res = sample_rate / window_size
        time_res = frame_step / (sample_rate / 1000)
        sigma_time = 96. / time_res
        sigma_freq = 15.625 / freq_res

        self.regularizer = regularizer = tf.Variable(np.zeros((batch_size), dtype=np.float32),
                                                     name='qq_regularizer')
        self.psyTh = psyTh = tf.Variable(np.zeros((batch_size, psdShape[0], psdShape[1]), dtype=np.float32),
                                         name='qq_psyTh')
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len)).astype(np.float32) / 2,
                                         name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                       name='qq_mask')
        self.original = original = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                               name='qq_original')
        self.lengths = lengths = tf.Variable(np.zeros(batch_size, dtype=np.int32),
                                             name='qq_lengths')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length), dtype=np.int32),
                                         name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size), dtype=np.int32),
                                                 name='qq_phrase_lengths')
        self.rescale = tf.Variable(np.zeros((batch_size, 1), dtype=np.float32),
                                   name='qq_rescale')

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        if loss_fn == 'CTC':
            # Initially we bound the l_infty norm by 2000, increase this
            # constant if it's not big enough of a distortion for your dataset.
            self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale
            self.new_input = new_input = self.apply_delta * mask + original
        elif loss_fn == 'CTCPSYCLIP':
            self.apply_delta = apply_delta = self.clipBatch(delta, psyTh, regularizer, psdMaxes,
                                                            max_audio_len, window_size, step_per_window)
            self.new_input = new_input = self.apply_delta * mask + original
        elif loss_fn == 'CTCPSYGRAD':
            self.new_input = new_input = self.delta * mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        if loss_fn == 'CTC':
            noise = tf.random_normal(new_input.shape, stddev=2)
            pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(new_input, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)

            # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
            # The code runs faster at a slight cost of distortion, and also leaves one less
            # parameter that requires tuning.
            if not onlyCTC:
                loss = tf.reduce_mean((self.new_input - self.original)**2, axis=1) / regularizer + ctcLoss
            else:
                loss = ctcLoss
            self.expanded_loss = tf.constant(0)
        elif loss_fn == "CTCPSYCLIP":
            target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)
            loss = ctcLoss
            self.expanded_loss = tf.constant(0)
        elif loss_fn == "CW":
            raise NotImplementedError(
                "The current version of this project does not include the CW loss function implementation.")
        else:
            raise ValueError("Unknown loss function '%s'" % loss_fn)

        self.deltaPSD = deltaPSD = tfPSD(self.new_input - self.original,
                                         window_size, step_per_window, self.psdMaxes)
        psyLoss = tf.reduce_max(deltaPSD - self.psyTh, axis=[1, 2])

        self.loss = loss
        self.psyLoss = tf.transpose(psyLoss)
        self.ctcLoss = ctcLoss
def __init__(self, sess, phrase_length, max_audio_len, psdMaxes,
             learning_rate=10, num_iterations=5000, window_size=256,
             step_per_window=2, batch_size=1, mp3=False,
             delta=None, audio=None, psdShape=None):
    """
    Set up the attack procedure.

    Here we create the TF graph that we're going to use to
    actually generate the adversarial examples.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len
    self.mp3 = mp3
    self.psdMaxes = psdMaxes
    self.window_size = window_size
    self.step_per_window = step_per_window

    # Create all the variables necessary
    # they are prefixed with qq_ just so that we know which
    # ones are ours so when we restore the session we don't
    # clobber them.
    frame_length = int(window_size)
    frame_step = int(window_size // step_per_window)
    fft_length = int(2**np.ceil(np.log2(frame_length)))
    sample_rate = 16000  # datapoints per second
    # sample_rate/2 is the maximal recorded frequency,
    # we have window_size/2+1 frequencies
    freq_res = sample_rate / window_size
    # (sample_rate/1000) = samples per millisecond
    # frame_step/(sample_rate/1000) => milliseconds for one step
    time_res = frame_step / (sample_rate / 1000)

    self.regularizer = regularizer = tf.Variable(np.zeros((batch_size), dtype=np.float32),
                                                 name='qq_regularizer')
    self.psyTh = psyTh = tf.Variable(np.zeros((batch_size, psdShape[0], psdShape[1]), dtype=np.float32),
                                     name='qq_psyTh')

    if delta is None:
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len)).astype(np.float32) / 2,
                                         name='qq_delta')
    else:
        self.delta = delta = tf.Variable((delta - audio).astype(np.float32),
                                         name='qq_delta')

    self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                   name='qq_mask')
    self.original = original = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32),
                                           name='qq_original')
    self.lengths = lengths = tf.Variable(np.zeros(batch_size, dtype=np.int32),
                                         name='qq_lengths')
    self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length), dtype=np.int32),
                                     name='qq_phrase')
    self.target_phrase_lengths = tf.Variable(np.zeros((batch_size), dtype=np.int32),
                                             name='qq_phrase_lengths')

    # We set the new input to the model to be the above delta
    # plus a mask, which allows us to enforce that certain
    # values remain constant 0 for length padding sequences.
    self.apply_delta = apply_delta = self.clipBatch(delta, psyTh, regularizer, psdMaxes,
                                                    max_audio_len, window_size, step_per_window)
    self.new_input = new_input = self.apply_delta * mask + original

    # Feed this final value to get the logits.
    self.logits = logits = get_logits(new_input, lengths)

    # And finally restore the graph to make the classifier
    # actually do something interesting.
    saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, "models/session_dump")

    target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
    ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                             inputs=logits,
                             sequence_length=lengths)
    loss = ctcLoss
    self.expanded_loss = tf.constant(0)

    self.deltaPSD = deltaPSD = tfPSD(self.new_input - self.original,
                                     window_size, step_per_window, self.psdMaxes)

    self.loss = loss
    self.psyLoss = tf.reduce_max(deltaPSD - self.psyTh, axis=[1, 2])
    self.ctcLoss = ctcLoss

    # Set up the Adam optimizer to perform gradient descent for us
    start_vars = set(x.name for x in tf.global_variables())
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
    self.train = optimizer.apply_gradients([(grad, var)])

    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]
    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder from the logits, to see how we're doing
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits, lengths,
                                                    merge_repeated=False,
                                                    beam_width=100)
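# Illustrative only (a hypothetical driver): a typical outer loop for this
# variant alternates Adam steps on qq_delta with monitoring of the two losses;
# once decoding matches the target phrase, the per-example qq_regularizer
# budget can be tightened to push the perturbation under the psychoacoustic
# thresholds stored in qq_psyTh.
#
#   for step in range(attack.num_iterations):
#       _, ctc_vals, psy_vals = sess.run([attack.train, attack.ctcLoss, attack.psyLoss])
#       if step % 100 == 0:
#           decoded_sparse = sess.run(attack.decoded)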
def next_batch(self):
    uttids, source, source_lengths, target, target_lengths = self._queue.dequeue_many(
        self._data_set.batch_size)
    sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._data_set.batch_size)
    return uttids, source, source_lengths, sparse_labels
def __init__(self, sess, loss_fn, phrase_length, max_audio_len,
             learning_rate=10, num_iterations=5000, batch_size=1,
             mp3=False, l2penalty=float('inf'), beam_width=100):
    """
    Set up the attack procedure.

    Here we create the TF graph that we're going to use to
    actually generate the adversarial examples.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len
    self.mp3 = mp3
    self.beam_width = beam_width

    # Create all the variables necessary
    # they are prefixed with qq_ just so that we know which
    # ones are ours so when we restore the session we don't
    # clobber them.
    bs_mal_shape = [batch_size, max_audio_len]
    bs_pl_shape = [batch_size, phrase_length]
    self.delta = delta = tf.get_variable('qq_delta', bs_mal_shape, dtype=np.float32,
                                         initializer=tf.zeros_initializer)
    self.mask = mask = tf.get_variable('qq_mask', bs_mal_shape, dtype=np.float32,
                                       initializer=tf.zeros_initializer)
    self.cwmask = cwmask = tf.get_variable('qq_cwmask', bs_pl_shape, dtype=np.float32,
                                           initializer=tf.zeros_initializer)
    self.original = original = tf.get_variable('qq_original', bs_mal_shape, dtype=np.float32,
                                               initializer=tf.zeros_initializer)
    self.lengths = tf.get_variable('qq_lengths', [batch_size], dtype=np.int32,
                                   initializer=tf.zeros_initializer)
    self.importance = tf.get_variable('qq_importance', bs_pl_shape, dtype=np.float32,
                                      initializer=tf.zeros_initializer)
    self.target_phrase = tf.get_variable('qq_phrase', bs_pl_shape, dtype=np.int32,
                                         initializer=tf.zeros_initializer)
    self.target_phrase_lengths = tf.get_variable('qq_phrase_lengths', [batch_size], dtype=np.int32,
                                                 initializer=tf.zeros_initializer)
    # This needs its own variable name: reusing 'qq_phrase_lengths' here would
    # make tf.get_variable raise a duplicate-variable error.
    self.rescale = tf.get_variable('qq_rescale', [batch_size, 1], dtype=np.float32,
                                   initializer=tf.zeros_initializer)

    # Initially we bound the l_infty norm by 2000, increase this
    # constant if it's not big enough of a distortion for your dataset.
    self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

    # We set the new input to the model to be the above delta
    # plus a mask, which allows us to enforce that certain
    # values remain constant 0 for length padding sequences.
    self.new_input = self.apply_delta * mask + original

    # We add a tiny bit of noise to help make sure that we can
    # clip our values to 16-bit integers and not break things.
    noise = tf.random_normal(self.new_input.shape, stddev=2)
    pass_in = tf.clip_by_value(self.new_input + noise, -2**15, 2**15 - 1)

    # Feed this final value to get the logits.
    self.logits = get_logits(pass_in, self.lengths)

    # And finally restore the graph to make the classifier
    # actually do something interesting.
    saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, "models/session_dump")

    # Choose the loss function we want -- either CTC or CW
    self.loss_fn = loss_fn
    if loss_fn == "CTC":
        target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
        self.ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                      inputs=self.logits,
                                      sequence_length=self.lengths)

        # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
        # The code runs faster at a slight cost of distortion, and also leaves one less
        # parameter that requires tuning.
        if not np.isinf(l2penalty):
            self.loss = tf.reduce_mean((self.new_input - self.original)**2, axis=1) + l2penalty * self.ctcloss
        else:
            self.loss = self.ctcloss
        self.expanded_loss = tf.constant(0)
    elif loss_fn == "CW":
        raise NotImplementedError(
            "The current version of this project does not include the CW loss function implementation.")
    else:
        raise ValueError("Unknown loss function '%s'" % loss_fn)

    # Set up the Adam optimizer to perform gradient descent for us
    start_vars = set(x.name for x in tf.global_variables())
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
    # Apply the sign of the gradient (signed-gradient descent) rather than
    # the raw gradient itself.
    self.train = optimizer.apply_gradients([(tf.sign(grad), var)])

    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]
    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder from the logits, to see how we're doing
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(self.logits, self.lengths,
                                                    merge_repeated=False,
                                                    beam_width=self.beam_width)
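# Illustrative only: reading back the current adversarial example and its
# transcription from the graph above. `toks`, an index-to-character map, is a
# hypothetical name; tf.nn.ctc_beam_search_decoder returns a list of sparse
# tensors, hence the [0] below.
#
#   adv_audio, decoded_sparse = sess.run([attack.new_input, attack.decoded])
#   transcript = ''.join(toks[i] for i in decoded_sparse[0].values)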