def __init__(self, sess, batch, bit_depth=2 ** 15, r_constant=0.95, update_method="geom", lowest_bound=None):
    """
    Initialise the per-example bound state for a hard constraint.

    :param sess: active TF session used to initialise the bounds variable
    :param batch: batch object exposing `size` and `audios["n_samples"]`
    :param bit_depth: maximum absolute sample value (default 2 ** 15)
    :param r_constant: shrink factor applied when bounds are updated;
        must lie strictly in (0, 1)
    :param update_method: bound update schedule, one of "lin", "geom", "log"
    :param lowest_bound: optional positive floor below which bounds are
        never shrunk; stored as a float, or None when unset
    """
    # isinstance instead of `type(x) ==` comparisons: accepts numpy scalar
    # types and subclasses, and is the idiomatic Python type check.
    assert isinstance(r_constant, (float, np.float32))
    assert 0 < r_constant < 1.0

    if lowest_bound is not None:
        assert lowest_bound > 0
        assert isinstance(lowest_bound, (float, int, np.int16, np.int32, np.float32))
        self.lowest_bound = float(lowest_bound)
    else:
        self.lowest_bound = None

    assert update_method in ["lin", "geom", "log"]

    self.__bit_depth = bit_depth
    self.r_constant = r_constant
    self.tf_run = sess.run
    self.update_method = update_method

    # One scalar bound per batch example.
    # NOTE(review): the TF variable name 'qq_masks' looks copy-pasted from
    # the masking graphs -- this variable holds bounds, not masks. Kept
    # as-is so any graph-name lookups elsewhere keep working.
    self.bounds = tf.Variable(
        tf.zeros([batch.size, 1]),
        trainable=False,
        validate_shape=True,
        dtype=tf.float32,
        name='qq_masks'
    )

    # Initial taus are derived from each example's true (unpadded) length.
    self.initial_taus = np_arr(
        lcomp(self._gen_tau(batch.audios["n_samples"])),
        np.float32
    )
    self.tf_run(self.bounds.assign(self.initial_taus))
def create_audio_batch_from_wav_files(batched_file_path_data, dtype="int16", window_size=320):
    """
    Load a batch of wav files and build the padded arrays the attack
    graphs expect.

    :param batched_file_path_data: iterable of tuples where index 1 is a
        wav file path and index 2 is that file's basename
    :param dtype: numpy dtype string forwarded to WavFile.load
    :param window_size: hop size in samples used to convert sample counts
        into feature-frame counts; generalises the previously hard-coded
        320 (default keeps the old behaviour)

    :return: dict with file paths, raw and padded audio, basenames and
        per-example sample / feature lengths
    """
    audio_fps = l_map(lambda x: x[1], batched_file_path_data)
    basenames = l_map(lambda x: x[2], batched_file_path_data)

    audios = lcomp([WavFile.load(f, dtype) for f in audio_fps])

    # Pad every example to the longest, plus any extra required padding.
    maxlen = max(map(len, audios))
    maximum_length = maxlen + utils.Audios.padding(maxlen)

    padded_audio = np_arr(
        lcomp(utils.Audios.gen_padded_audio(audios, maximum_length)),
        np.float32
    )
    actual_lengths = np_arr(
        l_map(lambda x: x.size, audios),
        np.int32
    )

    # N.B. Remember to use round instead of integer division here!
    maximum_feature_lengths = np_arr(
        l_map(
            lambda _: np.round((maximum_length - window_size) / window_size),
            audios
        ),
        np.int32
    )
    actual_feature_lengths = np_arr(
        l_map(
            lambda x: np.round((x.size - window_size) / window_size),
            audios
        ),
        np.int32
    )

    return {
        "file_paths": audio_fps,
        "max_samples": maximum_length,
        "max_feats": maximum_feature_lengths[0],
        "audio": audios,
        "padded_audio": padded_audio,
        "basenames": basenames,
        "n_samples": actual_lengths,
        "ds_feats": maximum_feature_lengths,
        "real_feats": actual_feature_lengths,
    }
def __init__(self, sess, batch, hard_constraint, synthesiser, placeholders=None, bit_depth=2 ** 15):
    """
    Build the synthesiser delta graph: masked, clipped perturbations plus
    the resulting adversarial examples.

    :param sess: active TF session used to initialise the mask variable
    :param batch: batch object exposing `size` and the audios metadata
    :param hard_constraint: object whose `clip` bounds the deltas
    :param synthesiser: object exposing `opt_vars` and `synthesise()`
    :param placeholders: optional pre-built Placeholders; created when None
    :param bit_depth: maximum absolute sample value; parameterises the
        previously hard-coded 2.0 ** 15 clip range (default unchanged, and
        consistent with the batch delta graph's configurable depth)
    """
    batch_size = batch.size
    max_len = batch.audios["max_samples"]
    act_lengths = batch.audios["n_samples"]

    if placeholders is not None:
        self.placeholders = placeholders
    else:
        self.placeholders = Placeholders(batch_size, max_len)

    self.masks = tf.Variable(
        tf.zeros([batch_size, max_len]),
        trainable=False,
        validate_shape=True,
        dtype=tf.float32,
        name='qq_masks'
    )

    self.synthesiser = synthesiser
    self.opt_vars = synthesiser.opt_vars

    # Generate the delta synth parameter objects which we will optimise
    deltas = synthesiser.synthesise()

    # Mask deltas first so we zero value *any part of the signal* that is
    # zero value padded in the original audio
    deltas *= self.masks

    # Restrict delta to valid space before applying constraints
    lower = -float(bit_depth)
    upper = float(bit_depth) - 1

    valid_deltas = tf.clip_by_value(
        deltas,
        clip_value_min=lower,
        clip_value_max=upper
    )

    self.final_deltas = hard_constraint.clip(valid_deltas)

    # clip example to valid range
    self.adversarial_examples = tf.clip_by_value(
        self.final_deltas + self.placeholders.audios,
        clip_value_min=lower,
        clip_value_max=upper
    )

    # initialise static variables
    initial_masks = np_arr(
        lcomp(self._gen_mask(act_lengths, max_len)),
        np.float32
    )
    sess.run(self.masks.assign(initial_masks))
def create_optimiser(self):
    """
    Manage the computation of gradients from the loss and the delta variable.

    Populates self.train (the apply-gradients op), self.variables (the
    optimiser's internal slot variables, keyed 0 for consistency with the
    multi-optimiser variant) and self.gradients (the first grad/var pair).
    """
    grad_var = self.optimizer.compute_gradients(
        self.attack.loss_fn,
        self.attack.delta_graph.opt_vars,
        colocate_gradients_with_ops=True
    )
    # A None gradient means the loss is disconnected from a variable.
    assert None not in lcomp(grad_var, i=0)

    self.train = self.optimizer.apply_gradients(grad_var)
    # N.B. the original redundant `self.variables = {}` pre-assignment was
    # removed -- it was immediately overwritten here.
    self.variables = {0: self.optimizer.variables()}
    self.gradients = grad_var[0]
def __init__(self, sess, batch, bit_depth=2**15):
    """
    Build the batch delta graph: one optimisable perturbation per example,
    zeroed over each example's padded region and clipped to the valid
    sample range.

    :param sess: active TF session used to initialise the mask variable
    :param batch: batch object exposing `size` and the audios metadata
    :param bit_depth: maximum absolute sample value (default 2**15)
    """
    n_examples = batch.size
    n_samples_max = batch.audios["max_samples"]
    real_lengths = batch.audios["n_samples"]

    self.__bit_depth = bit_depth
    self.raw_deltas = None
    self.opt_vars = None

    padding_masks = tf.Variable(
        tf.zeros([n_examples, n_samples_max]),
        trainable=False,
        validate_shape=True,
        dtype=tf.float32,
        name='qq_masks'
    )

    # Batch of delta variables, masked so any zero-padded region of the
    # original audio stays untouched by the optimisation.
    perturbations = self.create_perturbations(n_examples, n_samples_max) * padding_masks

    # Clamp the perturbation into the representable sample range before any
    # further constraints are applied elsewhere.
    self.final_deltas = tf.clip_by_value(
        perturbations,
        clip_value_min=-self.__bit_depth,
        clip_value_max=self.__bit_depth - 1
    )

    # Fill in the static mask values from each example's true length.
    sess.run(
        padding_masks.assign(
            np_arr(lcomp(self._gen_mask(real_lengths, n_samples_max)), np.float32)
        )
    )
def create_optimiser(self):
    """
    Manage the computation of gradients from the loss and the delta variable.

    One optimiser per delta variable: compute each optimiser's gradients,
    group every apply op into a single self.train op, stack the
    per-variable gradients into self.gradients, and record each
    optimiser's slot variables in self.variables keyed by index.
    """
    train_ops = []
    self.variables = {}
    gradients = []

    for idx, opt in enumerate(self.optimizers):
        grad_var = opt.compute_gradients(
            self.attack.loss_fn,
            [self.attack.delta_graph.opt_vars[idx]],
            colocate_gradients_with_ops=True
        )
        # A None gradient means the loss is disconnected from this variable.
        assert None not in lcomp(grad_var, i=0)

        # N.B. a leftover debug `print(idx, training_op)` was removed here.
        train_ops.append(opt.apply_gradients(grad_var))
        gradients.append(grad_var[0][0])
        self.variables[idx] = opt.variables()

    self.train = tf.group(train_ops)
    self.gradients = tf.stack(gradients, axis=0)
def create_tf_ctc_alignment_search_graph(batch, q, use_beam_search_decoder=False):
    """
    Search for per-example CTC alignments that decode to the batch's
    target phrases.

    A trainable [batch, max_feats, n_tokens] logits variable is optimised
    against a CTC loss until every example's decoding matches its target
    phrase AND every per-example loss is below 0.1; the resulting argmax
    alignments are then pushed onto the queue `q`.

    :param batch: batch object exposing `size`, `audios` and `targets` data
    :param q: queue-like object; receives the alignments list on success,
        or the string "dead" (followed by sys.exit(5)) on failure
    :param use_beam_search_decoder: use beam search instead of greedy
        decoding when checking decodings against the targets
    """
    with tf.Session() as sess:

        # Dense target token indices and their true lengths.
        targets = tf.placeholder(
            tf.int32, [batch.size, None], name='qq_alignment_targets')
        target_lengths = tf.placeholder(
            tf.int32, [batch.size], name='qq_alignment_targets_lengths')

        # [batch, max feature frames, vocabulary size]
        shape = [
            batch.size, batch.audios["max_feats"], len(batch.targets["tokens"])
        ]

        # Alignment logits to optimise, plus a non-trainable frame-validity
        # mask that zeroes logits beyond each example's real frame count.
        initial_alignments = tf.Variable(
            tf.zeros(shape),
            dtype=tf.float32,
            trainable=True,
            name='qq_alignment'
        )
        mask = tf.Variable(
            tf.ones(shape),
            dtype=tf.float32,
            trainable=False,
            name='qq_alignment_mask'
        )

        logits_alignments = initial_alignments * mask
        # ctc_loss expects time-major logits: [time, batch, tokens].
        raw_alignments = tf.transpose(logits_alignments, [1, 0, 2])
        softmax_alignments = tf.nn.softmax(logits_alignments, axis=-1)
        target_alignments = tf.argmax(softmax_alignments, axis=2)

        per_logit_lengths = batch.audios["real_feats"]
        maxlen = shape[1]

        def gen_mask(per_logit_len, maxlen):
            # Yields one [maxlen, 29] mask per example: ones for frames
            # that exist, zeros for padding frames.
            # NOTE(review): the hard-coded 29 is presumably the token-set
            # size and should equal shape[2] (len(batch.targets["tokens"]))
            # -- confirm before changing the token set.
            # per actual frame
            for l in per_logit_len:
                # per possible frame
                masks = []
                for f in range(maxlen):
                    if l > f:
                        # if should be optimised
                        mask = np.ones([29])
                    else:
                        # shouldn't be optimised
                        mask = np.zeros([29])
                        #mask[28] = 30.0
                    masks.append(mask)
                yield np.asarray(masks)

        initial_masks = np.asarray(
            [m for m in gen_mask(per_logit_lengths, maxlen)],
            dtype=np.float32
        )

        sess.run(mask.assign(initial_masks))

        seq_lens = batch.audios["real_feats"]

        # Convert dense targets to the sparse form ctc_loss requires.
        ctc_target = tf.keras.backend.ctc_label_dense_to_sparse(
            targets, target_lengths
        )

        loss_fn = tf.nn.ctc_loss(
            labels=ctc_target,
            inputs=raw_alignments,
            sequence_length=seq_lens,
        )

        # Learning rate 1: large steps are fine since only the alignment
        # logits (not a model) are being optimised.
        optimizer = tf.train.AdamOptimizer(1)

        grad_var = optimizer.compute_gradients(loss_fn, initial_alignments)
        # A None gradient means the loss is disconnected from the variable.
        assert None not in lcomp(grad_var, i=0)

        train_alignment = optimizer.apply_gradients(grad_var)
        variables = optimizer.variables()

        def tf_beam_decode(sess, logits, features_lengths, tokens):
            # Beam-search decode time-major logits into strings plus the
            # decodings' log probabilities.
            tf_decode, log_probs = tf.nn.ctc_beam_search_decoder(
                logits, features_lengths, merge_repeated=False, beam_width=500
            )
            dense = tf.sparse.to_dense(tf_decode[0])
            tf_dense = sess.run([dense])
            tf_outputs = [
                ''.join([tokens[int(x)] for x in tf_dense[0][i]])
                for i in range(tf_dense[0].shape[0])
            ]
            # Strip trailing spaces left by sparse-to-dense zero padding
            # mapping to the space token -- TODO confirm token 0 is space.
            tf_outputs = [o.rstrip(" ") for o in tf_outputs]
            probs = sess.run(log_probs)
            probs = [prob[0] for prob in probs]
            return tf_outputs, probs

        def tf_greedy_decode(sess, logits, features_lengths, tokens, merge_repeated=True):
            # Greedy (best path) decode; returns strings and the negative
            # sum of logits for each decoding.
            tf_decode, log_probs = tf.nn.ctc_greedy_decoder(
                logits, features_lengths, merge_repeated=merge_repeated,
            )
            dense = tf.sparse.to_dense(tf_decode[0])
            tf_dense = sess.run([dense])
            tf_outputs = [
                ''.join([tokens[int(x)] for x in tf_dense[0][i]])
                for i in range(tf_dense[0].shape[0])
            ]
            tf_outputs = [o.rstrip(" ") for o in tf_outputs]
            neg_sum_logits = sess.run(log_probs)
            neg_sum_logits = [prob[0] for prob in neg_sum_logits]
            return tf_outputs, neg_sum_logits

        # The alignment variable must be (re)initialised alongside the
        # optimiser's slot variables.
        variables.append(initial_alignments)
        sess.run(tf.variables_initializer(variables))

        still_have_work = True
        max_iters = 1000
        c = 0

        while still_have_work:

            train_ops = [
                loss_fn,
                softmax_alignments,
                logits_alignments,
                mask,
                train_alignment
            ]

            feed = {
                targets: batch.targets["indices"],
                target_lengths: batch.targets["lengths"],
            }

            # One optimisation step; ctc_limit holds per-example losses.
            ctc_limit, softmax, raw, m, _ = sess.run(
                train_ops, feed_dict=feed
            )

            # Check whether every example currently decodes to its target.
            if use_beam_search_decoder is True:
                decodings, probs = tf_beam_decode(
                    sess, raw_alignments, batch.audios["real_feats"], TOKENS
                )
            else:
                decodings, probs = tf_greedy_decode(
                    sess, raw_alignments, batch.audios["real_feats"], TOKENS
                )

            target_phrases = batch.targets["phrases"]

            decoding_check = all(
                [d == t for d, t in zip(decodings, target_phrases)]
            )
            # Also require each per-example CTC loss to be near zero.
            # (The genexp's `c` is scoped to the expression -- it does not
            # clobber the iteration counter.)
            ctc_check = all(c < 0.1 for c in ctc_limit)

            if decoding_check and ctc_check:
                s = "Found an alignment for each example:"
                for d, p, t in zip(decodings, probs, target_phrases):
                    s += "\nTarget: {t} | Decoding: {d} | Probs: {p:.3f}".format(
                        t=t, d=d, p=p,
                    )
                log(s, wrap=True)
                still_have_work = False

            elif c >= max_iters:
                # Give up: signal failure to the consumer and exit hard.
                log("Could not find any CTC optimal alignments for you...")
                q.put("dead")
                sys.exit(5)

            else:
                c += 1

        q.put(sess.run(target_alignments).tolist())