def classify(input, psearch):
    """Transcribe a WAV file with the restored speech model.

    Args:
        input: Path of the WAV file, read with ``wav.read``.
        psearch: ``"greedy"`` selects a beam width of 1; any other value
            selects a beam width of 100.

    Returns:
        The decoded text, built by mapping every decoded index through
        the module-level ``toks`` table.

    The checkpoint location comes from the module-level ``restore_path``.
    """
    beam_width = 1 if psearch == "greedy" else 100

    with tf.Session() as session:
        _rate, samples = wav.read(input)
        sample_count = len(samples)

        audio_ph = tf.placeholder(tf.float32, [1, sample_count])
        frames_ph = tf.placeholder(tf.int32, [1])

        # Build the model's logits, sharing variables with any earlier
        # graph construction in the root scope.
        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            logits = get_logits(audio_ph, frames_ph)

        tf.train.Saver().restore(session, restore_path)

        # Beam search with width 1 is what the caller asks for as "greedy".
        decoded, _ = tf.nn.ctc_beam_search_decoder(
            logits,
            frames_ph,
            merge_repeated=False,
            beam_width=beam_width)

        frame_count = (sample_count - 1) // 320
        feed = {audio_ph: [samples], frames_ph: [frame_count]}
        best_path = session.run(decoded, feed)[0]

        return "".join(toks[index] for index in best_path.values)
Пример #2
0
    def setup_graph(self, input_audio_batch, target_phrase):
        """Build a CTC-loss / greedy-decoding graph and return two runners.

        Placeholder shapes are derived from ``input_audio_batch`` and
        ``target_phrase``. A new ``tf.Session`` is created and restored
        from ``"models/session_dump"``.

        Returns:
            A ``(func1, func2)`` tuple. Both callables take six positional
            values ``(a, b, c, d, e, f)`` that are fed, in that order, to
            the placeholders named 'a' through 'f'. ``func1`` evaluates the
            CTC loss; ``func2`` evaluates ``[ctc loss, greedy decoding]``.
        """
        n_examples = input_audio_batch.shape[0]
        frame_count = (input_audio_batch.shape[1] - 1) // 320

        # The arrays below are used only as shape templates for the
        # placeholders; the real values are supplied by the caller later.
        logits_len_template = np.tile(frame_count, n_examples)
        phrase_template = np.array(
            np.tile(target_phrase, (n_examples, 1)), dtype=np.int32)
        phrase_len_template = np.array(
            np.tile(target_phrase.shape[0], n_examples), dtype=np.int32)
        audio_template = np.clip(input_audio_batch, -2**15, 2**15 - 1)
        seq_len_template = np.tile(frame_count, n_examples).astype(np.int32)

        with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            inputs = tf.placeholder(
                tf.float32, shape=audio_template.shape, name='a')
            len_batch = tf.placeholder(tf.float32, name='b')
            arg2_logits = tf.placeholder(
                tf.int32, shape=logits_len_template.shape, name='c')
            arg1_dense = tf.placeholder(
                tf.float32, shape=phrase_template.shape, name='d')
            arg2_dense = tf.placeholder(
                tf.int32, shape=phrase_len_template.shape, name='e')
            len_seq = tf.placeholder(
                tf.int32, shape=seq_len_template.shape, name='f')

            logits = get_logits(inputs, arg2_logits)
            sparse_target = ctc_label_dense_to_sparse(
                arg1_dense, arg2_dense, len_batch)
            ctcloss = tf.nn.ctc_loss(
                labels=tf.cast(sparse_target, tf.int32),
                inputs=logits,
                sequence_length=len_seq)
            decoded, _ = tf.nn.ctc_greedy_decoder(
                logits, arg2_logits, merge_repeated=True)

            sess = tf.Session()
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, "models/session_dump")

        def _feed(a, b, c, d, e, f):
            # Map the six positional values onto their placeholders.
            return {
                inputs: a,
                len_batch: b,
                arg2_logits: c,
                arg1_dense: d,
                arg2_dense: e,
                len_seq: f,
            }

        def func1(a, b, c, d, e, f):
            return sess.run(ctcloss, feed_dict=_feed(a, b, c, d, e, f))

        def func2(a, b, c, d, e, f):
            return sess.run([ctcloss, decoded],
                            feed_dict=_feed(a, b, c, d, e, f))

        return (func1, func2)
Пример #3
0
def main():
    """Transcribe every audio file named on the command line.

    Each argument after the program name must end in ``.mp3`` or ``.wav``.
    For every file a fresh pair of placeholders and a beam-search decoder
    (beam width 500) are added to the graph; the checkpoint at
    ``"models/session_dump"`` is restored only once, for the first file.
    The file name (see note below) and the decoded text are printed.

    Raises:
        Exception: if a file has an extension other than mp3 / wav.
    """
    with tf.Session() as sess:
        for arg_index in range(1, len(sys.argv)):
            path = sys.argv[arg_index]
            extension = path.split(".")[-1]

            if extension == 'mp3':
                raw = pydub.AudioSegment.from_mp3(path)
                # Interpret the raw bytes as little-endian 16-bit samples in
                # one vectorised call. The previous per-sample struct.unpack
                # comprehension also reused the outer loop index name, which
                # clobbers the file index on interpreters where
                # comprehension variables leak (Python 2).
                audio = np.frombuffer(raw.raw_data, dtype="<i2")
            elif extension == 'wav':
                _, audio = wav.read(path)
            else:
                raise Exception("Unknown file format")

            sample_count = len(audio)
            new_input = tf.placeholder(tf.float32, [1, sample_count])
            lengths = tf.placeholder(tf.int32, [1])

            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                logits = get_logits(new_input, lengths)

            # Weights only need to be loaded once; later iterations reuse
            # the same variables through AUTO_REUSE.
            if arg_index == 1:
                saver = tf.train.Saver()
                saver.restore(sess, "models/session_dump")

            decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                       lengths,
                                                       merge_repeated=False,
                                                       beam_width=500)

            length = (sample_count - 1) // 320
            r = sess.run(decoded, {new_input: [audio], lengths: [length]})

            # NOTE(review): this tests the length of the file NAME, not the
            # number of arguments; it looks like `len(sys.argv) > 2` was
            # intended. Behavior is kept as-is -- confirm before changing.
            if len(path) > 2:
                print(path)
            print("".join([toks[x] for x in r[0].values]))
def main():
    """Transcribe one audio file given via ``--in`` and return the text.

    Command-line options:
        --in: path of the input audio file (mp3 or wav; the extension is
            matched case-insensitively).
        --restore_path: checkpoint path handed to ``Saver.restore``.

    Returns:
        The decoded transcription, which is also printed between
        separator lines.

    Raises:
        Exception: if the file extension is neither mp3 nor wav.
    """
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument(
        '--in',
        type=str,
        dest="input",
        required=True,
        help="Input audio .wav file(s), at 16KHz (separated by spaces)")
    parser.add_argument(
        '--restore_path',
        type=str,
        required=True,
        help="Path to the DeepSpeech checkpoint (ending in model0.4.1)")
    args = parser.parse_args()

    # Drop every argument after the program name (same effect as popping
    # until one element remains), now that argparse has consumed them.
    del sys.argv[1:]

    with tf.Session() as sess:
        # Compare the extension case-insensitively so that "MP3" / "Wav"
        # are accepted just like the previously special-cased "WAV".
        extension = args.input.split(".")[-1].lower()
        if extension == 'mp3':
            raw = pydub.AudioSegment.from_mp3(args.input)
            # Raw bytes are little-endian 16-bit samples.
            audio = np.frombuffer(raw.raw_data, dtype="<i2")
        elif extension == 'wav':
            _, audio = wav.read(args.input)
        else:
            raise Exception("Unknown file format")

        sample_count = len(audio)
        new_input = tf.placeholder(tf.float32, [1, sample_count])
        lengths = tf.placeholder(tf.int32, [1])

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            logits = get_logits(new_input, lengths)

        saver = tf.train.Saver()
        saver.restore(sess, args.restore_path)

        decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                   lengths,
                                                   merge_repeated=False,
                                                   beam_width=500)

        print('logits shape', logits.shape)
        length = (sample_count - 1) // 320
        r = sess.run(decoded, {new_input: [audio], lengths: [length]})

        # Build the transcription once and reuse it for printing and return.
        output_text = "".join([toks[x] for x in r[0].values])

        print("-" * 80)
        print("-" * 80)

        print("Classification:")
        print(output_text)
        print("-" * 80)
        print("-" * 80)

        return output_text
    def __init__(self, sess, phrase_length, max_audio_len, batch_size=1,
                  restore_path=None):
        """
        Build the graph used to score audio against a target phrase.

        Creates the ``qq_``-prefixed state variables, feeds the noisy,
        clipped input through ``get_logits``, restores every non-``qq``
        variable from ``restore_path`` and exposes the CTC loss and a
        beam-search decoding of the logits.

        Args:
            sess: session used to restore the checkpoint.
            phrase_length: second dimension of the phrase variables.
            max_audio_len: second dimension of the audio variables.
            batch_size: first dimension of every variable.
            restore_path: checkpoint path given to ``Saver.restore``.
        """
        self.sess = sess
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len

        def zero_variable(shape, dtype, name):
            # All state variables start as zeros; the 'qq_' name prefix
            # keeps them out of the checkpoint restore below.
            return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

        audio_shape = (batch_size, max_audio_len)
        phrase_shape = (batch_size, phrase_length)

        self.mask = zero_variable(audio_shape, np.float32, 'qq_mask')
        self.cwmask = zero_variable(phrase_shape, np.float32, 'qq_cwmask')
        self.original = zero_variable(audio_shape, np.float32, 'qq_original')
        self.lengths = zero_variable(batch_size, np.int32, 'qq_lengths')
        self.target_phrase = zero_variable(phrase_shape, np.int32,
                                           'qq_phrase')
        self.target_phrase_lengths = zero_variable((batch_size), np.int32,
                                                   'qq_phrase_lengths')

        # The model input is simply the mask added to the original audio.
        self.new_input = self.mask + self.original

        # A little Gaussian noise is added before clipping to the signed
        # 16-bit range.
        jitter = tf.random_normal(self.new_input.shape, stddev=2)
        clipped = tf.clip_by_value(self.new_input + jitter,
                                   -2 ** 15, 2 ** 15 - 1)

        # Run the classifier on the clipped signal.
        self.logits = get_logits(clipped, self.lengths)

        # Restore only the model's own weights (names without 'qq').
        model_variables = [
            variable for variable in tf.global_variables()
            if 'qq' not in variable.name
        ]
        tf.train.Saver(model_variables).restore(sess, restore_path)

        sparse_target = ctc_label_dense_to_sparse(self.target_phrase,
                                                  self.target_phrase_lengths)

        ctc = tf.nn.ctc_loss(labels=tf.cast(sparse_target, tf.int32),
                             inputs=self.logits,
                             sequence_length=self.lengths)

        self.expanded_loss = tf.constant(0)
        self.ctcloss = ctc

        # Decoding of the logits, used to inspect the current prediction.
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(
            self.logits, self.lengths, merge_repeated=False, beam_width=100)
def main():
    """Transcribe each input file and write the text next to it.

    Positional arguments are one or more mp3 / wav paths. For every file
    the default graph is reset, the checkpoint at
    ``deepspeech-0.4.1-checkpoint/model.v0.4.1`` is restored, the audio is
    decoded with beam search (width 500), the result is printed and then
    written to ``<input path without extension>_041_prediction``.

    Raises:
        Exception: if a file has an extension other than mp3 / wav.
    """
    # Local import so this function does not rely on a module-level
    # `import os` being present.
    import os

    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('input_files', type=str,
                        nargs='+',
                        help="Input audio .wav file(s), at 16KHz (separated by spaces)")
    args = parser.parse_args()
    restore_path = 'deepspeech-0.4.1-checkpoint/model.v0.4.1'

    for input_file in args.input_files:
        tf.reset_default_graph()
        with tf.Session() as sess:
            extension = input_file.split(".")[-1]
            if extension == 'mp3':
                raw = pydub.AudioSegment.from_mp3(input_file)
                # Raw bytes are little-endian 16-bit samples.
                audio = np.frombuffer(raw.raw_data, dtype="<i2")
            elif extension == 'wav':
                _, audio = wav.read(input_file)
            else:
                raise Exception("Unknown file format")

            # Strip only the final extension. Splitting on the first '.'
            # truncated paths such as "./dir/a.wav" or "v1.2/a.wav" and
            # wrote the prediction to the wrong location.
            prediction_output_path = (os.path.splitext(input_file)[0]
                                      + '_041_prediction')

            sample_count = len(audio)
            new_input = tf.placeholder(tf.float32, [1, sample_count])
            lengths = tf.placeholder(tf.int32, [1])

            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                logits = get_logits(new_input, lengths)

            saver = tf.train.Saver()
            saver.restore(sess, restore_path)

            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits, lengths, merge_repeated=False, beam_width=500)

            print('logits shape', logits.shape)
            length = (sample_count - 1) // 320
            r = sess.run(decoded, {new_input: [audio],
                                   lengths: [length]})
            prediction = "".join([toks[x] for x in r[0].values])

            print("-"*80)
            print("-"*80)

            print("Classification:")
            print(prediction)
            print("-"*80)
            print("-"*80)

            with open(prediction_output_path, 'w') as f:
                f.write(prediction)
Пример #7
0
def getAudioPrediction(sess, audio):
    """Decode ``audio`` with the speech model and return the text.

    Args:
        sess: TensorFlow session used for initialisation and inference.
        audio: sequence of samples; its length fixes the placeholder width.

    Returns:
        The decoded string, built by mapping the decoded indices through
        the module-level ``toks`` table.

    The first call runs ``init(sess)`` and records that in the module
    global ``modelInitDone`` so later calls skip it.
    """
    global modelInitDone
    sample_count = len(audio)
    new_input = tf.placeholder(tf.float32, [1, sample_count])
    lengths = tf.placeholder(tf.int32, [1])
    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
        logits = get_logits(new_input, lengths)

    if not modelInitDone:
        init(sess)
        modelInitDone = True

    decoded, _ = tf.nn.ctc_beam_search_decoder(
        logits, lengths, merge_repeated=False, beam_width=500)
    length = (sample_count - 1) // 320

    # Only the decoded path is needed. The previous version ran the whole
    # graph a second time to fetch the log-probabilities and then threw
    # the result away.
    r = sess.run(decoded, {new_input: [audio], lengths: [length]})
    return "".join([toks[x] for x in r[0].values])
Пример #8
0
    def __init__(self,
                 sess,
                 loss_fn,
                 phrase_length,
                 max_audio_len,
                 learning_rate=10,
                 num_iterations=5000,
                 batch_size=1,
                 max_offset=320,
                 mp3=False,
                 l2penalty=float('inf'),
                 restore_path=None,
                 adversarial_signal_limit=2000.0):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.

        Args:
            sess: session used to restore the model and initialise the
                optimizer variables.
            loss_fn: ``"CTC"`` is the only implemented choice.
            phrase_length: second dimension of the phrase variables.
            max_audio_len: second dimension of the audio variables.
            learning_rate: stored on the instance; the optimizer itself
                reads ``self.learning_rate_tensor``.
            num_iterations: stored on the instance.
            batch_size: first dimension of every variable.
            max_offset: stored on the instance.
            mp3: stored on the instance.
            l2penalty: when finite, the loss is the mean squared
                distortion plus ``l2penalty * ctcloss``; when infinite
                the loss is the CTC loss alone.
            restore_path: checkpoint path given to ``Saver.restore``.
            adversarial_signal_limit: symmetric clip bound applied to
                ``delta``.

        Raises:
            NotImplementedError: if ``loss_fn`` is ``"CW"``.
            ValueError: if ``loss_fn`` is any other unknown value.
        """

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.max_offset = max_offset
        self.adversarial_signal_limit = adversarial_signal_limit

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                  dtype=np.float32),
                                         name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                dtype=np.float32),
                                       name='qq_mask')
        self.cwmask = tf.Variable(np.zeros((batch_size, phrase_length),
                                           dtype=np.float32),
                                  name='qq_cwmask')
        self.original = original = tf.Variable(np.zeros(
            (batch_size, max_audio_len), dtype=np.float32),
                                               name='qq_original')
        self.lengths = lengths = tf.Variable(np.zeros(batch_size,
                                                      dtype=np.int32),
                                             name='qq_lengths')
        self.importance = tf.Variable(np.zeros((batch_size, phrase_length),
                                               dtype=np.float32),
                                      name='qq_importance')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length),
                                                  dtype=np.int32),
                                         name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size),
                                                          dtype=np.int32),
                                                 name='qq_phrase_lengths')
        # Previously this reused the name 'qq_phrase_lengths' (copy-paste);
        # give the rescale variable its own name. It still carries the
        # 'qq' marker, so it stays excluded from the checkpoint restore.
        self.rescale = tf.Variable(np.zeros((batch_size, 1), dtype=np.float32),
                                   name='qq_rescale')
        self.learning_rate_tensor = tf.Variable(np.ones((1), dtype=np.float32),
                                                name='qq_learning_rate_tensor')

        # Bound the l_infty norm of the perturbation by
        # adversarial_signal_limit; increase it if that is not a big
        # enough distortion for your dataset.
        self.apply_delta = tf.clip_by_value(
            delta, -adversarial_signal_limit,
            adversarial_signal_limit) * self.rescale

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        self.new_input = new_input = self.apply_delta * mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        noise = tf.random_normal(new_input.shape, stddev=2)
        pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(pass_in, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, restore_path)

        # Choose the loss function we want -- either CTC or CW
        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase,
                                               self.target_phrase_lengths)

            ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)

            # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
            # The code runs faster at a slight cost of distortion, and also leaves one less
            # parameter that requires tuning.
            if not np.isinf(l2penalty):
                loss = tf.reduce_mean((self.new_input - self.original)**2,
                                      axis=1) + l2penalty * ctcloss
            else:
                loss = ctcloss
            self.expanded_loss = tf.constant(0)

        elif loss_fn == "CW":
            # NotImplemented is a comparison sentinel, not an exception;
            # calling it raised a confusing TypeError instead.
            raise NotImplementedError(
                "The current version of this project does not include the CW loss function implementation."
            )
        else:
            # A bare `raise` with no active exception only produces an
            # unrelated RuntimeError; report the real problem.
            raise ValueError("Unknown loss function: %r" % (loss_fn,))

        self.loss = loss
        self.ctcloss = ctcloss

        # Set up the Adam optimizer to perform gradient descent for us
        start_vars = set(x.name for x in tf.global_variables())
        tf.summary.scalar('Learning Rate', self.learning_rate_tensor[0])
        optimizer = tf.train.AdamOptimizer(self.learning_rate_tensor[0])
        self.optimizer = optimizer

        # Only the sign of the gradient w.r.t. delta is applied.
        grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
        self.grad_sign = grad_sign = tf.sign(grad)
        self.train = optimizer.apply_gradients([(grad_sign, var)])

        # Initialise just the variables the optimizer created, plus delta.
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        lengths,
                                                        merge_repeated=False,
                                                        beam_width=100)

        self.merged = tf.summary.merge_all()
Пример #9
0
    def __init__(self,
                 sess,
                 loss_fn,
                 phrase_length,
                 max_audio_len,
                 learning_rate=10,
                 num_iterations=1000,
                 batch_size=1):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.

        Args:
            sess: session used to restore the model (from
                ``"models/session_dump"``) and initialise the optimizer
                variables.
            loss_fn: ``"CTC"`` is the only implemented choice.
            phrase_length: second dimension of the phrase variables.
            max_audio_len: second dimension of the audio variables.
            learning_rate: learning rate of the Adam optimizer.
            num_iterations: stored on the instance.
            batch_size: first dimension of every variable.

        Raises:
            NotImplementedError: if ``loss_fn`` is ``"CW"``.
            ValueError: if ``loss_fn`` is any other unknown value.
        """

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                  dtype=np.float32),
                                         name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                dtype=np.float32),
                                       name='qq_mask')
        self.cwmask = tf.Variable(np.zeros((batch_size, phrase_length),
                                           dtype=np.float32),
                                  name='qq_cwmask')
        self.original = original = tf.Variable(np.zeros(
            (batch_size, max_audio_len), dtype=np.float32),
                                               name='qq_original')
        self.lengths = lengths = tf.Variable(np.zeros(batch_size,
                                                      dtype=np.int32),
                                             name='qq_lengths')
        self.importance = tf.Variable(np.zeros((batch_size, phrase_length),
                                               dtype=np.float32),
                                      name='qq_importance')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length),
                                                  dtype=np.int32),
                                         name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size),
                                                          dtype=np.int32),
                                                 name='qq_phrase_lengths')
        # Previously this reused the name 'qq_phrase_lengths' (copy-paste);
        # give the rescale variable its own name. It still carries the
        # 'qq' marker, so it stays excluded from the checkpoint restore.
        self.rescale = tf.Variable(np.zeros((batch_size, 1), dtype=np.float32),
                                   name='qq_rescale')

        # Initially we bound the l_infty norm by 2000, increase this
        # constant if it's not big enough of a distortion for your dataset.
        self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        self.new_input = new_input = self.apply_delta * mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        noise = tf.random_normal(new_input.shape, stddev=2)
        pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(pass_in, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        # Choose the loss function we want -- either CTC or CW
        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase,
                                               self.target_phrase_lengths,
                                               batch_size)

            ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)

            loss = tf.nn.relu(ctcloss)
            self.expanded_loss = tf.constant(0)

        elif loss_fn == "CW":
            # NotImplemented is a comparison sentinel, not an exception;
            # calling it raised a confusing TypeError instead.
            raise NotImplementedError(
                "The current version of this project does not include the CW loss function implementation."
            )
        else:
            # A bare `raise` with no active exception only produces an
            # unrelated RuntimeError; report the real problem.
            raise ValueError("Unknown loss function: %r" % (loss_fn,))

        # Set up the Adam optimizer to perform gradient descent for us.
        # Remember which variable names exist beforehand (as a set, for
        # O(1) membership tests) so the optimizer's own can be found.
        start_names = set(x.name for x in tf.global_variables())
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(
            loss, var_list=[delta])
        self.loss = loss
        self.ctcloss = ctcloss

        new_vars = [
            x for x in tf.global_variables() if x.name not in start_names
        ]
        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        lengths,
                                                        merge_repeated=False,
                                                        beam_width=1000)
    def __init__(self,
                 sess,
                 loss_fn,
                 phrase_length,
                 max_audio_len,
                 learning_rate=10,
                 num_iterations=5000,
                 batch_size=1,
                 mp3=False,
                 l2penalty=float('inf'),
                 restore_path=None,
                 th=None,
                 psd_max_ori=None):
        """
        Set up the attack procedure.
        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.

        Args:
            sess: session used to restore the model and initialise the
                optimizer variables.
            loss_fn: ``"CTC"`` is the only implemented choice.
            phrase_length: second dimension of the phrase variables.
            max_audio_len: second dimension of the audio variables.
            learning_rate: learning rate of the first Adam optimizer.
            num_iterations: stored on the instance.
            batch_size: first dimension of every variable / placeholder.
            mp3: stored on the instance.
            l2penalty: when finite, the loss is the mean squared
                distortion plus ``l2penalty * ctcloss``; when infinite
                the loss is the CTC loss alone.
            restore_path: checkpoint path given to ``Saver.restore``.
            th: not read here; ``self.th`` is a placeholder to be fed.
            psd_max_ori: not read here; ``self.psd_max_ori`` is a
                placeholder to be fed.

        Raises:
            NotImplementedError: if ``loss_fn`` is ``"CW"``.
            ValueError: if ``loss_fn`` is any other unknown value.
        """

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                  dtype=np.float32),
                                         name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                dtype=np.float32),
                                       name='qq_mask')
        self.cwmask = tf.Variable(np.zeros((batch_size, phrase_length),
                                           dtype=np.float32),
                                  name='qq_cwmask')
        self.original = original = tf.Variable(np.zeros(
            (batch_size, max_audio_len), dtype=np.float32),
                                               name='qq_original')

        self.lengths = lengths = tf.Variable(np.zeros(batch_size,
                                                      dtype=np.int32),
                                             name='qq_lengths')
        self.importance = tf.Variable(np.zeros((batch_size, phrase_length),
                                               dtype=np.float32),
                                      name='qq_importance')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length),
                                                  dtype=np.int32),
                                         name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size),
                                                          dtype=np.int32),
                                                 name='qq_phrase_lengths')
        self.alpha = tf.Variable(np.ones(
            (batch_size), dtype=np.float32) * 0.05,
                                 name='qq_alpha')
        # Previously this reused the name 'qq_phrase_lengths' (copy-paste);
        # give the rescale variable its own name. It still carries the
        # 'qq' marker, so it stays excluded from the checkpoint restore.
        self.rescale = tf.Variable(np.zeros((batch_size, 1), dtype=np.float32),
                                   name='qq_rescale')
        self.th = tf.placeholder(tf.float32,
                                 shape=[batch_size, None, None],
                                 name='qq_th')
        self.psd_max_ori = tf.placeholder(tf.float32,
                                          shape=[batch_size],
                                          name='qq_psd')
        self.input_tf = tf.placeholder(tf.float32,
                                       shape=[batch_size, None],
                                       name='qq_input')
        self.tgt_tf = tf.placeholder(tf.string)
        self.sample_rate_tf = tf.placeholder(tf.int32, name='qq_sample_rate')
        self.mask_freq = tf.placeholder(dtype=np.float32,
                                        shape=[batch_size, None, 80])

        # Initially we bound the l_infty norm by 2000, increase this
        # constant if it's not big enough of a distortion for your dataset.
        self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

        # Compute the masking-threshold loss, one scalar per batch element.
        self.loss_th_list = []
        self.transform = Transform(2048)
        for i in range(self.batch_size):
            logits_delta = self.transform((self.apply_delta[i, :]),
                                          (self.psd_max_ori)[i])
            # Only the part of the transformed perturbation that exceeds
            # the threshold contributes (relu), averaged over all entries.
            loss_th = tf.reduce_mean(tf.nn.relu(logits_delta - (self.th)[i]))
            # Add a leading axis so the per-example scalars can be
            # concatenated below. `axis` replaces the deprecated `dim`
            # keyword of tf.expand_dims.
            loss_th = tf.expand_dims(loss_th, axis=0)
            self.loss_th_list.append(loss_th)

        # Join the per-example losses into one vector.
        self.loss_th = tf.concat(self.loss_th_list, axis=0)

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        self.new_input = new_input = self.apply_delta * mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        noise = tf.random_normal(new_input.shape, stddev=2)
        pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(pass_in, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, restore_path)

        # Choose the loss function we want -- either CTC or CW
        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase,
                                               self.target_phrase_lengths)

            ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)

            # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
            # The code runs faster at a slight cost of distortion, and also leaves one less
            # parameter that requires tuning.
            if not np.isinf(l2penalty):
                loss = tf.reduce_mean((self.new_input - self.original)**2,
                                      axis=1) + l2penalty * ctcloss
            else:
                loss = ctcloss
            self.expanded_loss = tf.constant(0)

        elif loss_fn == "CW":
            # NotImplemented is a comparison sentinel, not an exception;
            # calling it raised a confusing TypeError instead.
            raise NotImplementedError(
                "The current version of this project does not include the CW loss function implementation."
            )
        else:
            # A bare `raise` with no active exception only produces an
            # unrelated RuntimeError; report the real problem.
            raise ValueError("Unknown loss function: %r" % (loss_fn,))

        self.loss = loss
        self.ctcloss = ctcloss

        # Set up the Adam optimizers to perform gradient descent for us
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(learning_rate)
        optimizer2 = tf.train.AdamOptimizer(1)

        grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
        grad21, var21 = optimizer2.compute_gradients(self.loss, [delta])[0]
        grad22, var22 = optimizer2.compute_gradients(self.alpha * self.loss_th,
                                                     [delta])[0]
        # `train` applies only the sign of the loss gradient; `train2`
        # groups the raw-gradient updates for the loss and for the
        # alpha-weighted masking-threshold loss.
        self.train = optimizer.apply_gradients([(tf.sign(grad), var)])
        self.train21 = optimizer2.apply_gradients([(grad21, var21)])
        self.train22 = optimizer2.apply_gradients([(grad22, var22)])
        self.train2 = tf.group(self.train21, self.train22)

        end_vars = tf.global_variables()
        # new_vars contains the variables that did not exist in start_vars
        new_vars = [x for x in end_vars if x.name not in start_vars]

        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        lengths,
                                                        merge_repeated=False,
                                                        beam_width=100)
Пример #11
0
class Attack:
    """Builds the TensorFlow graph used to craft adversarial audio.

    The constructor creates the ``qq_``-prefixed variables that hold the
    perturbation and its bookkeeping tensors, wires the perturbed audio
    into ``get_logits``, restores the pretrained weights and, for the
    ``'CTC'`` loss, builds the CTC loss tensors.
    """

    def __init__(self, sess, loss_fn, phrase_length, max_audio_len, psdMaxes,
                 learning_rate=10, num_iterations=5000, window_size=2048,
                 step_per_window=4, batch_size=1, mp3=False,
                 onlyCTC=True, audio=None, psdShape=None):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.

        Args:
            sess: TensorFlow session into which the pretrained weights
                are restored.
            loss_fn: One of ``'CTC'``, ``'CTCPSYCLIP'`` or
                ``'CTCPSYGRAD'``; selects how the perturbation is applied
                to the original audio.
            phrase_length: Width of the ``qq_phrase`` variable.
            max_audio_len: Width of the audio-shaped variables (delta,
                mask, original).
            psdMaxes: Stored on the instance and forwarded to
                ``self.clipBatch`` for ``'CTCPSYCLIP'``.
            learning_rate: Stored on the instance; not used here.
            num_iterations: Stored on the instance; not used here.
            window_size: Stored and forwarded to ``self.clipBatch``.
            step_per_window: Stored and forwarded to ``self.clipBatch``.
            batch_size: Leading dimension of every variable created here.
            mp3: Stored on the instance; not used here.
            onlyCTC: When true the ``'CTC'`` loss is the CTC loss alone;
                otherwise a mean-squared-distortion term divided by
                ``qq_regularizer`` is added.
            audio: Accepted for interface compatibility; not used here.
            psdShape: Two-element sequence giving the trailing dimensions
                of the ``qq_psyTh`` variable. Required despite the
                ``None`` default.

        Raises:
            ValueError: If ``loss_fn`` is not one of the supported names.
            TypeError: If ``psdShape`` is ``None``.
        """
        # Fail fast: an unknown loss_fn used to surface much later as a
        # NameError on `new_input`, after variables had been created.
        if loss_fn not in ('CTC', 'CTCPSYCLIP', 'CTCPSYGRAD'):
            raise ValueError(
                "Unsupported loss_fn %r; expected 'CTC', 'CTCPSYCLIP' "
                "or 'CTCPSYGRAD'." % (loss_fn,))
        if psdShape is None:
            raise TypeError(
                "psdShape must be a (rows, cols) pair, got None.")

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.psdMaxes = psdMaxes
        self.window_size = window_size
        self.step_per_window = step_per_window

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        audio_shape = (batch_size, max_audio_len)

        self.regularizer = regularizer = tf.Variable(
            np.zeros((batch_size), dtype=np.float32), name='qq_regularizer')
        self.psyTh = psyTh = tf.Variable(
            np.zeros((batch_size, psdShape[0], psdShape[1]),
                     dtype=np.float32),
            name='qq_psyTh')

        self.delta = delta = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_delta')
        self.mask = mask = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_mask')
        self.original = original = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_original')
        self.lengths = lengths = tf.Variable(
            np.zeros(batch_size, dtype=np.int32), name='qq_lengths')
        self.target_phrase = tf.Variable(
            np.zeros((batch_size, phrase_length), dtype=np.int32),
            name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(
            np.zeros((batch_size), dtype=np.int32), name='qq_phrase_lengths')
        self.rescale = tf.Variable(
            np.zeros((batch_size, 1), dtype=np.float32), name='qq_rescale')

        # We set the new input to the model to be the perturbation
        # times a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        if loss_fn == 'CTC':
            # Initially we bound the l_infty norm by 2000, increase this
            # constant if it's not big enough of a distortion for your
            # dataset.
            self.apply_delta = (
                tf.clip_by_value(delta, -2000, 2000) * self.rescale)
            self.new_input = new_input = self.apply_delta * mask + original
        elif loss_fn == 'CTCPSYCLIP':
            self.apply_delta = self.clipBatch(
                delta, psyTh, regularizer, psdMaxes, max_audio_len,
                window_size, step_per_window)
            self.new_input = new_input = self.apply_delta * mask + original
        else:  # 'CTCPSYGRAD': the raw delta is applied, no clipping op.
            self.new_input = new_input = self.delta * mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        # NOTE(review): `pass_in` is built but never fed to get_logits
        # below, so the logits see the noise-free `new_input`. Confirm
        # whether get_logits(pass_in, lengths) was intended for 'CTC'.
        if loss_fn == 'CTC':
            noise = tf.random_normal(new_input.shape,
                                     stddev=2)
            pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(new_input, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting. Our own qq_ variables are
        # excluded so the restore does not clobber them.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        self.loss_fn = loss_fn

        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(
                self.target_phrase, self.target_phrase_lengths, batch_size)

            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits, sequence_length=lengths)

            # With onlyCTC the distortion is not penalised at all; the
            # code runs faster at a slight cost of distortion, and it
            # leaves one less parameter that requires tuning.
            if not onlyCTC:
                # qq_regularizer starts at zero, so it must be assigned
                # a non-zero value before this tensor is evaluated.
                loss = tf.reduce_mean(
                    (self.new_input - self.original)**2,
                    axis=1) / regularizer + ctcLoss
            else:
                loss = ctcLoss

            # Keep the loss tensors reachable from the instance.
            self.loss = loss
            self.ctcLoss = ctcLoss
            self.expanded_loss = tf.constant(0)
Пример #12
0
    def __init__(self,
                 sess,
                 loss_fn,
                 phrase_length,
                 maxlen,
                 learn_rate=10,
                 iterations_num=5000,
                 mem_size=1,
                 mp3=False,
                 foreit=float('inf'),
                 restore_path=None):
        """Set up the audio-modification procedure.

        Builds the TF graph that is used to generate the modified audio:
        the ``qq_``-prefixed variables, the perturbed model input, the
        logits, the CTC loss, one Adam training step on the perturbation
        and a beam-search decoder.

        Args:
            sess: TensorFlow session used to restore the pretrained
                weights and to initialise the newly created variables.
            loss_fn: Loss function name. Only ``"CTC"`` is supported.
            phrase_length: Width of the phrase-shaped variables.
            maxlen: Width of the audio-shaped variables.
            learn_rate: Learning rate handed to the Adam optimizer.
            iterations_num: Stored on the instance; not used here.
            mem_size: Leading (batch) dimension of every variable.
            mp3: Stored on the instance; not used here.
            foreit: Weight of the CTC term when a distortion penalty is
                used. The default of infinity disables the penalty, so
                the loss is the CTC loss alone.
            restore_path: Checkpoint path passed to ``Saver.restore``.

        Raises:
            NotImplementedError: If ``loss_fn`` is ``"CW"``.
            ValueError: If ``loss_fn`` is any other unsupported value.
        """
        # Validate up front so an unsupported loss fails before any graph
        # is built or any checkpoint is restored.
        if loss_fn == "CW":
            raise NotImplementedError(
                "Current version does not support implementation CW.")
        if loss_fn != "CTC":
            raise ValueError(
                "Unsupported loss_fn %r; expected 'CTC'." % (loss_fn,))

        self.sess = sess
        self.learn_rate = learn_rate
        self.iterations_num = iterations_num
        self.mem_size = mem_size
        self.phrase_length = phrase_length
        self.maxlen = maxlen
        self.mp3 = mp3

        # Create the necessary variables. They carry the qq prefix so
        # they can be told apart from the model's own variables.
        audio_shape = (mem_size, maxlen)
        phrase_shape = (mem_size, phrase_length)

        self.delta = delta = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_delta')
        self.mask = mask = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_mask')
        self.maskcw = tf.Variable(
            np.zeros(phrase_shape, dtype=np.float32), name='qq_maskcw')
        self.oring = oring = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_oring')
        self.length = length = tf.Variable(
            np.zeros(mem_size, dtype=np.int32), name='qq_length')
        self.importance = tf.Variable(
            np.zeros(phrase_shape, dtype=np.float32), name='qq_importance')
        self.target_phrase = tf.Variable(
            np.zeros(phrase_shape, dtype=np.int32), name='qq_phrase')
        self.target_phrase_length = tf.Variable(
            np.zeros((mem_size), dtype=np.int32), name='qq_phrase_length')
        # Previously this reused the name 'qq_phrase_length' (copy-paste);
        # it now has its own name.
        self.rescale = tf.Variable(
            np.zeros((mem_size, 1), dtype=np.float32), name='qq_rescale')

        # Initially bound the l_infty norm by 2000; increase this constant
        # if it is not a big enough distortion for the dataset.
        self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

        # The new model input is the delta times a mask plus the original
        # audio; the mask lets certain values stay constant 0 for
        # length-padding sequences.
        self.new_input = new_input = self.apply_delta * mask + oring

        # Add a little noise to make sure the values can be clipped to
        # 16-bit integers.
        noise = tf.random_normal(new_input.shape, stddev=2)
        pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed the final value to obtain the logits.
        self.logits = logits = get_logits(pass_in, length)

        # Restore the graph so the classifier is usable. Our own qq
        # variables are excluded from the restore.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, restore_path)

        self.loss_fn = loss_fn

        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_length)

        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=length)

        # Slight hack: an infinite l2 penalty means the l2 distortion is
        # not penalised. The code runs faster at a small cost in
        # distortion and leaves one less parameter to tune.
        if not np.isinf(foreit):
            loss = tf.reduce_mean((self.new_input - self.oring)**2,
                                  axis=1) + foreit * ctcloss
        else:
            loss = ctcloss
        self.expanded_loss = tf.constant(0)

        self.loss = loss
        self.ctcloss = ctcloss

        # Set up the Adam optimizer to perform the gradient descent.
        # Variable names are snapshotted first so that the optimizer's
        # new slot variables can be found and initialised afterwards.
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(learn_rate)

        grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
        # Only the sign of the gradient is applied to delta.
        self.train = optimizer.apply_gradients([(tf.sign(grad), var)])

        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder over the logits, used to see how well we are doing.
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        length,
                                                        merge_repeated=False,
                                                        beam_width=100)
Пример #13
0
    def __init__(self,
                 sess,
                 phrase_length,
                 max_audio_len,
                 psdMaxes,
                 learning_rate=10,
                 num_iterations=5000,
                 window_size=256,
                 step_per_window=2,
                 batch_size=1,
                 mp3=False,
                 delta=None,
                 audio=None,
                 psdShape=None):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.

        Args:
            sess: TensorFlow session used to restore the pretrained
                weights and to initialise the newly created variables.
            phrase_length: Width of the ``qq_phrase`` variable.
            max_audio_len: Width of the audio-shaped variables.
            psdMaxes: Stored on the instance and forwarded to
                ``self.clipBatch`` and ``tfPSD``.
            learning_rate: Learning rate handed to the Adam optimizer.
            num_iterations: Stored on the instance; not used here.
            window_size: Forwarded to ``self.clipBatch`` and ``tfPSD``.
            step_per_window: Forwarded to ``self.clipBatch`` and
                ``tfPSD``.
            batch_size: Leading dimension of every variable created here.
            mp3: Stored on the instance; not used here.
            delta: Optional starting audio; when given, ``qq_delta`` is
                initialised to ``delta - audio`` instead of zeros.
            audio: Audio subtracted from ``delta``; required whenever
                ``delta`` is given.
            psdShape: Two-element sequence giving the trailing dimensions
                of the ``qq_psyTh`` variable. Required despite the
                ``None`` default.

        Raises:
            TypeError: If ``psdShape`` is ``None``, or if ``delta`` is
                given without ``audio``.
        """
        # Fail early with a readable message. Both cases previously
        # raised an opaque TypeError deeper in the constructor.
        if psdShape is None:
            raise TypeError(
                "psdShape must be a (rows, cols) pair, got None.")
        if delta is not None and audio is None:
            raise TypeError(
                "audio must be provided when delta is given.")

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.psdMaxes = psdMaxes
        self.window_size = window_size
        self.step_per_window = step_per_window

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        audio_shape = (batch_size, max_audio_len)

        self.regularizer = regularizer = tf.Variable(
            np.zeros((batch_size), dtype=np.float32), name='qq_regularizer')
        self.psyTh = psyTh = tf.Variable(
            np.zeros((batch_size, psdShape[0], psdShape[1]),
                     dtype=np.float32),
            name='qq_psyTh')

        if delta is None:
            # Start from an all-zero perturbation.
            initial_delta = np.zeros(audio_shape, dtype=np.float32)
        else:
            # Start from the difference between the supplied audio pair.
            initial_delta = (delta - audio).astype(np.float32)
        self.delta = delta = tf.Variable(initial_delta, name='qq_delta')

        self.mask = mask = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_mask')
        self.original = original = tf.Variable(
            np.zeros(audio_shape, dtype=np.float32), name='qq_original')
        self.lengths = lengths = tf.Variable(
            np.zeros(batch_size, dtype=np.int32), name='qq_lengths')
        self.target_phrase = tf.Variable(
            np.zeros((batch_size, phrase_length), dtype=np.int32),
            name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(
            np.zeros((batch_size), dtype=np.int32), name='qq_phrase_lengths')

        self.apply_delta = self.clipBatch(
            delta, psyTh, regularizer, psdMaxes, max_audio_len, window_size,
            step_per_window)

        # We set the new input to the model to be the above delta
        # times a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        self.new_input = new_input = self.apply_delta * mask + original

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(new_input, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting. Our own qq_ variables are
        # excluded so the restore does not clobber them.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_lengths,
                                           batch_size)

        ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=lengths)
        self.expanded_loss = tf.constant(0)

        self.deltaPSD = deltaPSD = tfPSD(self.new_input - self.original,
                                         window_size, step_per_window,
                                         self.psdMaxes)
        # The optimised loss is the CTC loss alone; psyLoss is exposed
        # on the instance but is not part of the optimised objective.
        self.loss = ctcLoss
        self.psyLoss = tf.reduce_max(deltaPSD - self.psyTh, axis=[1, 2])
        self.ctcLoss = ctcLoss

        # Set up the Adam optimizer to perform gradient descent for us.
        # Variable names are snapshotted first so that the optimizer's
        # new slot variables can be found and initialised afterwards.
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(learning_rate)

        grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
        self.train = optimizer.apply_gradients([(grad, var)])

        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        lengths,
                                                        merge_repeated=False,
                                                        beam_width=100)