Example #1
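The listing strips the shared imports. A plausible header for the snippets below, with module paths assumed from the OpenAI baselines and cleverhans projects this code builds on:

import numpy as np
import tensorflow as tf

import baselines.common.tf_util as U                 # assumed path
from baselines.common.mpi_adam import MpiAdam        # assumed path
from cleverhans.attacks import FastGradientMethod, BasicIterativeMethod, CarliniWagnerL2  # assumed paths
from cleverhans.model import CallableModelWrapper    # assumed path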
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # No TF-side update here: MpiAdam itself applies the step below.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)],
                             updates=[])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
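For reference, a minimal single-process sketch of the step MpiAdam applies to the flat gradient g; the real class also MPI-averages g across workers before this step, and all names here are assumptions:

class FlatAdam:
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon
        self.m = np.zeros(size, 'float32')  # first-moment estimate
        self.v = np.zeros(size, 'float32')  # second-moment estimate
        self.t = 0                          # step counter

    def step(self, g, stepsize):
        # Returns the flat parameter increment for one bias-corrected Adam step.
        self.t += 1
        a = stepsize * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * g
        self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
        return -a * self.m / (np.sqrt(self.v) + self.epsilon)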
Example #2
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdclass()(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = U.eval(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    entval = calcent(Mval).mean()  #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdclass()(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
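A hypothetical driver for this check, modeled on baselines' distribution tests; the pd types, import path, and parameter vectors are assumptions:

def test_probtypes():
    from baselines.common.distributions import CategoricalPdType, DiagGaussianPdType  # assumed path
    np.random.seed(0)
    with U.single_threaded_session():
        # Diagonal Gaussian: pdparam concatenates means and log-stds.
        pdparam_gauss = np.array([-0.2, 0.3, 0.4, -0.5, 0.1, -0.5, 0.1, 0.8])
        validate_probtype(DiagGaussianPdType(pdparam_gauss.size // 2), pdparam_gauss)
        # Categorical: pdparam holds unnormalized logits.
        pdparam_cat = np.array([-0.2, 0.3, 0.5])
        validate_probtype(CategoricalPdType(pdparam_cat.size), pdparam_cat)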
Example #3
def build_adv(make_obs_tf, q_func, num_actions, epsilon, noisy):
    with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE):
        obs_tf_in = U.ensure_tf_input(make_obs_tf("observation"))
        stochastic_ph_adv = tf.placeholder(tf.bool, (), name="stochastic_adv")
        update_eps_ph_adv = tf.placeholder(tf.float32, (),
                                           name="update_eps_adv")
        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        update_eps_expr_adv = eps.assign(
            tf.cond(update_eps_ph_adv >= 0, lambda: update_eps_ph_adv,
                    lambda: eps))
        print("==========================================")

        #def wrapper(x):
        #    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
        adversary = FastGradientMethod(q_func(obs_tf_in.get(),
                                              num_actions,
                                              scope="q_func",
                                              reuse=True,
                                              concat_softmax=True,
                                              noisy=noisy),
                                       sess=U.get_session())
        adv_observations = adversary.generate(
            obs_tf_in.get(), eps=epsilon, clip_min=0, clip_max=1.0) * 255.0
        craft_adv_obs = U.function(
            inputs=[obs_tf_in, stochastic_ph_adv, update_eps_ph_adv],
            outputs=adv_observations,
            givens={
                update_eps_ph_adv: -1.0,
                stochastic_ph_adv: True
            },
            updates=[update_eps_expr_adv])
        return craft_adv_obs
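A sketch of how the returned crafting function might be used while running a trained agent; the make_obs_tf lambda, U.Uint8Input, and the one-frame batching are assumptions:

def make_perturber(env, q_func):
    # Build the crafting function once; call the result on each frame.
    craft_adv_obs = build_adv(
        lambda name: U.Uint8Input(env.observation_space.shape, name=name),
        q_func, env.action_space.n, epsilon=1.0 / 255.0, noisy=False)
    return lambda obs: craft_adv_obs(np.array(obs)[None])[0]  # perturbed frame in 0..255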
Example #4
def build_adv(make_obs_tf, q_func, num_actions, epsilon, noisy, attack=None):
    with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE):
        obs_tf_in = U.ensure_tf_input(make_obs_tf("observation"))
        stochastic_ph_adv = tf.placeholder(tf.bool, (), name="stochastic_adv")
        update_eps_ph_adv = tf.placeholder(tf.float32, (),
                                           name="update_eps_adv")
        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))
        update_eps_expr_adv = eps.assign(
            tf.cond(update_eps_ph_adv >= 0, lambda: update_eps_ph_adv,
                    lambda: eps))
        print("==========================================")

        #def wrapper(x):
        #    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
        if attack == 'fgsm':
            adversary = FastGradientMethod(q_func(obs_tf_in.get(),
                                                  num_actions,
                                                  scope="q_func",
                                                  reuse=True,
                                                  concat_softmax=True,
                                                  noisy=noisy),
                                           sess=U.get_session())
            adv_observations = adversary.generate(
                obs_tf_in.get(), eps=epsilon, clip_min=0, clip_max=1.0) * 255.0
            print("----")
            print(adv_observations.shape)

        else:
            adversary = CarliniWagnerL2(q_func(obs_tf_in.get(),
                                               num_actions,
                                               scope="q_func",
                                               reuse=True,
                                               concat_softmax=True,
                                               noisy=noisy),
                                        sess=U.get_session())
            cw_params = {
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'clip_min': 0,
                'clip_max': 1.0
            }
            adv_observations = adversary.generate(obs_tf_in.get(), **
                                                  cw_params) * 255.0
        # saveScreenPNG(b'test_image.png')

        craft_adv_obs = U.function(
            inputs=[obs_tf_in, stochastic_ph_adv, update_eps_ph_adv],
            outputs=adv_observations,
            givens={
                update_eps_ph_adv: -1.0,
                stochastic_ph_adv: True
            },
            updates=[update_eps_expr_adv])
        return craft_adv_obs
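For orientation, the FGSM branch's adversary.generate boils down to one signed-gradient step, while the Carlini-Wagner branch runs an iterative optimization. A simplified sketch of the FGSM core (not cleverhans' exact code; loss stands in for the model's loss on x):

def fgsm_sketch(x, loss, epsilon, clip_min=0.0, clip_max=1.0):
    # Nudge every input component by +/- epsilon in the loss-increasing direction.
    grad = tf.gradients(loss, x)[0]
    return tf.clip_by_value(x + epsilon * tf.sign(grad), clip_min, clip_max)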
Example #5
def build_act(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func", noisy=noisy)
        q_values = q_values.get_logits(observations_ph.get())
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
                         
        return act
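A hypothetical call site for the returned act function; the one-frame batch and keyword feeding by placeholder name (matching the `function` tests later in this listing) are assumptions:

def choose_action(act, obs, eps):
    # Batch of one frame; passing update_eps=-1 would leave the stored epsilon unchanged.
    return act(np.array(obs)[None], update_eps=eps)[0]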
Example #6
def test_function():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder(tf.int32, (), name="y")
    z = 3 * x + 2 * y
    lin = function([x, y], z, givens={y: 0})

    with single_threaded_session():
        initialize()

        assert lin(2) == 6
        assert lin(x=3) == 9
        assert lin(2, 2) == 10
        assert lin(x=2, y=3) == 12
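The test above leans on `function`'s givens mechanism; a minimal sketch of the wrapper it relies on (a simplification of the assumed baselines.common.tf_util.function, without updates or kwargs handling):

def function_sketch(inputs, outputs, givens=None):
    givens = dict(givens or {})
    def call(*args):
        feed = dict(givens)             # defaults first ...
        feed.update(zip(inputs, args))  # ... positional args override them
        return tf.get_default_session().run(outputs, feed_dict=feed)
    return call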
Example #7
def test_multikwargs():
    tf.reset_default_graph()
    x = tf.placeholder(tf.int32, (), name="x")
    with tf.variable_scope("other"):
        x2 = tf.placeholder(tf.int32, (), name="x")
    z = 3 * x + 2 * x2

    lin = function([x, x2], z, givens={x2: 0})
    with single_threaded_session():
        initialize()
        assert lin(2) == 6
        assert lin(2, 2) == 10
        expt_caught = False
        try:
            lin(x=2)
        except AssertionError:
            expt_caught = True
        assert expt_caught
Example #8
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.get_variable(dtype=tf.float64,
                                    shape=shape,
                                    initializer=tf.constant_initializer(0.0),
                                    name="runningsum",
                                    trainable=False)
        self._sumsq = tf.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.to_float(self._sum / self._count)
        self.std = tf.sqrt(
            tf.maximum(
                tf.to_float(self._sumsq / self._count) - tf.square(self.mean),
                1e-2))

        newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
        newsumsq = tf.placeholder(shape=self.shape,
                                  dtype=tf.float64,
                                  name='var')
        newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.assign_add(self._sum, newsum),
                tf.assign_add(self._sumsq, newsumsq),
                tf.assign_add(self._count, newcount)
            ])
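The constructor above only builds the accumulators. A minimal single-process `update` that would feed incfiltparams is sketched below; the real baselines class MPI-allreduces the three statistics across workers first, so this simplification is an assumption:

    def update(self, x):
        # x: batch with shape (N,) + self.shape.
        x = x.astype('float64')
        self.incfiltparams(x.sum(axis=0),                      # add batch sum
                           np.square(x).sum(axis=0),           # add batch sum of squares
                           np.array(len(x), dtype='float64'))  # add batch count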
Example #9
        iteration_time_est = RunningAvg(0.999)
        obs = env.reset()
        # Record the mean of the \sigma
        sigma_name_list = []
        sigma_list = []
        for param in tf.trainable_variables():
            # only record the \sigma in the action network
            if 'sigma' in param.name \
                    and 'deepq/q_func/action_value' in param.name:
                summary_name = \
                    param.name.replace(
                        'deepq/q_func/action_value/', '').replace(
                            '/', '.').split(':')[0]
                sigma_name_list.append(summary_name)
                sigma_list.append(tf.reduce_mean(tf.abs(param)))
        f_mean_sigma = U.function(inputs=[], outputs=sigma_list)
        # Statistics
        writer = tf.summary.FileWriter(savedir, sess.graph)
        im_stats = statistics(
            scalar_keys=['action', 'im_reward', 'td_errors', 'huber_loss'] +
            sigma_name_list)
        ep_stats = statistics(scalar_keys=['ep_reward', 'ep_length'])
        # Main training loop
        ep_length = 0
        while True:
            num_iters += 1
            ep_length += 1

            # V: Perturb observation if we are past the init stage
            # and at a designated attack step
            # if craft_adv != None and (num_iters >= args.attack_init)
            # and ((num_iters - args.attack_init) % args.attack_freq == 0) :
Example #10
def build_act_enjoy(make_obs_ph, q_func, num_actions, noisy=False, scope="deepq", reuse=None, attack=None, model_path=''):
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func", noisy=noisy)
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
                         
        # Load model before attacks graph construction so that TF won't
        # complain can't load parameters for attack
        try:
            U.load_state(model_path)
        except Exception:
            pass

        if attack is not None:
            
            if attack == 'fgsm':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True, noisy=noisy)
                adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'iterative':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True)
                adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(observations_ph.get(), eps=1.0/255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'cwl2':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True)
                adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
                cw_params = {'binary_search_steps': 1,
                             'max_iterations': 100,
                             'learning_rate': 0.1,
                             'initial_const': 10,
                             'clip_min': 0,
                             'clip_max': 1.0}
                adv_observations = adversary.generate(observations_ph.get(), **cw_params) * 255.0

            craft_adv_obs = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                            outputs=adv_observations,
                            givens={update_eps_ph: -1.0, stochastic_ph: True},
                            updates=[update_eps_expr])

        if attack is None:
            craft_adv_obs = None
            return act
        else:
            return act, craft_adv_obs
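Note the two return shapes. Hypothetical call sites (make_obs_ph, n, and the model path are assumptions):

# act = build_act_enjoy(make_obs_ph, q_func, n, model_path='./model')   # attack=None: act only
# act, craft_adv_obs = build_act_enjoy(make_obs_ph, q_func, n,
#                                      attack='fgsm', model_path='./model')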
Example #11
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, noisy=False, scope="deepq", reuse=None, attack=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if True, use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, noisy=noisy, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", noisy=noisy, reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func", noisy=noisy)
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", noisy=noisy, reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, axis=1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, errors],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)
        ################## Vahid's Work ###################
        #U.load_state(model_path)

        if attack is not None:
            if attack == 'fgsm':
                def wrapper(x):
                    return q_func(x, num_actions, scope="target_q_func", reuse=True, concat_softmax=True, noisy=noisy)
                adversary = FastGradientMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(obs_tp1_input.get(), eps=1.0/255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'iterative':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True, concat_softmax=True)
                adversary = BasicIterativeMethod(CallableModelWrapper(wrapper, 'probs'), sess=U.get_session())
                adv_observations = adversary.generate(obs_tp1_input.get(), eps=1.0/255.0,
                                                      clip_min=0, clip_max=1.0) * 255.0
            elif attack == 'cwl2':
                def wrapper(x):
                    return q_func(x, num_actions, scope="q_func", reuse=True)
                adversary = CarliniWagnerL2(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
                cw_params = {'binary_search_steps': 1,
                             'max_iterations': 100,
                             'learning_rate': 0.1,
                             'initial_const': 10,
                             'clip_min': 0,
                             'clip_max': 1.0}
                adv_observations = adversary.generate(obs_tp1_input.get(), **cw_params) * 255.0

            craft_adv_obs = U.function(inputs=[obs_tp1_input],
                            outputs=adv_observations,                       
                            updates=[update_target_expr])

        if attack is None:
            craft_adv_obs = None

        return act_f, train, update_target, {'q_values': q_values}, craft_adv_obs
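A sketch of one training iteration with the returned callables; the replay-buffer API, batch names, and target_update_freq are assumptions:

def train_step(train, update_target, replay_buffer, batch_size, num_iters, target_update_freq):
    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
    weights = np.ones_like(rewards)  # uniform importance weights
    td_errors, losses = train(obses_t, actions, rewards, obses_tp1, dones, weights)
    if num_iters % target_update_freq == 0:
        update_target()  # sync the target network with the online network
    return td_errors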
Example #12
        iteration_time_est = RunningAvg(0.999)
        obs = env.reset()
        # Record the mean of the \sigma
        sigma_name_list = []
        sigma_list = []
        for param in tf.trainable_variables():
            # only record the \sigma in the action network
            if 'sigma' in param.name \
                    and 'deepq/q_func/action_value' in param.name:
                summary_name = \
                    param.name.replace(
                        'deepq/q_func/action_value/', '').replace(
                            '/', '.').split(':')[0]
                sigma_name_list.append(summary_name)
                sigma_list.append(tf.reduce_mean(tf.abs(param)))
        f_mean_sigma = U.function(inputs=[], outputs=sigma_list)
        # Statistics
        writer = tf.summary.FileWriter(savedir, sess.graph)
        im_stats = statistics(scalar_keys=['action', 'im_reward', 'td_errors',
                                           'huber_loss'] + sigma_name_list)
        ep_stats = statistics(scalar_keys=['ep_reward', 'ep_length'])
        # Main training loop
        ep_length = 0
        while True:
            num_iters += 1
            ep_length += 1

            # V: Perturb observation if we are past the init stage
            # and at a designated attack step
            # if craft_adv != None and (num_iters >= args.attack_init)
            # and ((num_iters - args.attack_init) % args.attack_freq == 0) :
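Completing the commented condition as runnable code; the args fields and the craft_adv call convention are assumptions carried over from the comment:

            if craft_adv is not None and num_iters >= args.attack_init \
                    and (num_iters - args.attack_init) % args.attack_freq == 0:
                obs = craft_adv(np.array(obs)[None])[0]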