Example No. 1
def test_MpiAdam():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for i in range(10):
        print(i, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)],
                             updates=[update_op])
    adam = MpiAdam(var_list)

    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
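For orientation, a minimal sketch of what a U.flatgrad-style helper computes, assuming it simply concatenates the per-variable gradients into the single flat vector that MpiAdam.update() consumes:

import tensorflow.compat.v1 as tf

def flatgrad_sketch(loss, var_list):
    # Concatenate all gradients into one 1-D tensor; unconnected variables
    # contribute zeros so the flat vector always matches the variable layout.
    grads = tf.gradients(loss, var_list)
    grads = [g if g is not None else tf.zeros_like(v)
             for g, v in zip(grads, var_list)]
    return tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)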
Example No. 2
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean() #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    entval = calcent(Mval).mean() #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
    print('ok on', probtype, pdparam)
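The 3-sigma Monte Carlo check above can be illustrated stand-alone with a unit Gaussian in plain NumPy, where the analytic entropy 0.5*log(2*pi*e) plays the role of pd.entropy() (an assumed toy case, not part of the probtype API):

import numpy as np

N = 100000
x = np.random.randn(N)
logp = -0.5 * x ** 2 - 0.5 * np.log(2 * np.pi)  # log-density of N(0, 1)
ent_mc = -logp.mean()                           # Monte Carlo entropy estimate
ent_true = 0.5 * np.log(2 * np.pi * np.e)       # analytic differential entropy
stderr = logp.std() / np.sqrt(N)
assert np.abs(ent_mc - ent_true) < 3 * stderr   # within 3 sigmas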
Example No. 3
def build_act_greedy(make_obs_ph,
                     q_func,
                     num_actions,
                     scope="deepq",
                     reuse=True,
                     eps=0.0):
    """Creates the act function for a simple fixed epsilon greedy
       Added by HJ
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values[:, :num_actions], axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        _act = U.function(inputs=[observations_ph, stochastic_ph],
                          outputs=output_actions)

        def act(ob, stochastic=True):
            return _act(ob, stochastic)

        return act
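The epsilon-greedy branch assembled in the graph above reduces to the following NumPy sketch (illustrative only; q_values and eps are assumed inputs):

import numpy as np

def epsilon_greedy_sketch(q_values, eps, rng=np.random):
    # Greedy action per row, replaced by a uniform random action with probability eps.
    batch_size, num_actions = q_values.shape
    greedy = q_values.argmax(axis=1)
    random = rng.randint(num_actions, size=batch_size)
    explore = rng.uniform(size=batch_size) < eps
    return np.where(explore, random, greedy)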
Example No. 4
def build_act(make_obs_ph, q_func, num_actions, scope="setdeepq", reuse=None):
    """Creates the act function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.compat.v1.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.compat.v1.placeholder(tf.float32, (), name="update_eps")

        eps = tf.compat.v1.get_variable("eps", (), initializer=tf.compat.v1.constant_initializer(0))
        # Clipped Double q
        q1_values = q_func(observations_ph.get(), num_actions, scope="q1_func", reuse=reuse)
        q2_values = q_func(observations_ph.get(), num_actions, scope="q2_func", reuse=reuse)
        # Sum over q1 and q2 and find the action with argmax
        deterministic_actions = tf.argmax(input=q1_values+q2_values, axis=1)

        batch_size = tf.shape(input=observations_ph.get())[0]
        random_actions = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random.uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.compat.v1.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(pred=stochastic_ph, true_fn=lambda: stochastic_actions, false_fn=lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(pred=update_eps_ph >= 0, true_fn=lambda: update_eps_ph, false_fn=lambda: eps))
        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps)
        return act
Example No. 5
def test_function():
    with tf.Graph().as_default():
        x = tf.compat.v1.placeholder(tf.int32, (), name="x")
        y = tf.compat.v1.placeholder(tf.int32, (), name="y")
        z = 3 * x + 2 * y
        lin = function([x, y], z, givens={y: 0})

        with single_threaded_session():
            initialize()

            assert lin(2) == 6
            assert lin(2, 2) == 10
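The givens={y: 0} behaviour exercised here is roughly what tf.placeholder_with_default provides; a stand-alone sketch (not the actual U.function implementation):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.Graph().as_default(), tf.Session() as sess:
    x = tf.placeholder(tf.int32, (), name="x")
    y = tf.placeholder_with_default(tf.constant(0, tf.int32), (), name="y")
    z = 3 * x + 2 * y
    assert sess.run(z, {x: 2}) == 6         # default y = 0 is used
    assert sess.run(z, {x: 2, y: 2}) == 10  # the default can be overridden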
Example No. 6
def test_multikwargs():
    with tf.Graph().as_default():
        x = tf.compat.v1.placeholder(tf.int32, (), name="x")
        with tf.compat.v1.variable_scope("other"):
            x2 = tf.compat.v1.placeholder(tf.int32, (), name="x")
        z = 3 * x + 2 * x2

        lin = function([x, x2], z, givens={x2: 0})
        with single_threaded_session():
            initialize()
            assert lin(2) == 6
            assert lin(2, 2) == 10
Example No. 7
def build_act_bayesian(make_obs_ph, q_func, num_actions, scope="deepadfq", reuse=None):
    """Creates the act function for Bayesian sampling
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        q_values = q_func(observations_ph.get(), num_actions*2, scope="q_func") # mean and -log(sd)
        q_means = q_values[:,:num_actions]
        q_sds = tf.math.exp(-q_values[:,num_actions:])
        samples = tf.random.normal((),mean=q_means,stddev=q_sds)
        output_actions = tf.argmax(input=samples, axis=1)

        _act = U.function(inputs=[observations_ph],
                         outputs=output_actions
                         )
        def act(ob, stochastic=True, update_eps=-1):
            return _act(ob)
        return act
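In NumPy terms, this Bayesian (Thompson-style) action draw amounts to sampling one value per action from N(mean, sd) and acting greedily on the draw; a small sketch with assumed inputs:

import numpy as np

def bayesian_action_sketch(q_means, q_sds, rng=np.random):
    # One sample per action from N(mean, sd); act greedily on the sampled values.
    samples = rng.normal(q_means, q_sds)
    return samples.argmax(axis=1)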
Example No. 8
def build_act_greedy(make_obs_ph,
                     q_func,
                     num_actions,
                     scope="setdeepq",
                     reuse=True,
                     eps=0.0):
    """Creates the act function for a simple fixed epsilon greedy
       Added by HJ
    """
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.compat.v1.placeholder(tf.bool, (),
                                                 name="stochastic")
        # Clipped Double q
        q1_values = q_func.forward(observations_ph.get(),
                                   num_actions,
                                   scope="q1_func",
                                   reuse=reuse)
        q2_values = q_func.forward(observations_ph.get(),
                                   num_actions,
                                   scope="q2_func",
                                   reuse=reuse)
        # Sum over q1 and q2 and find the action with argmax
        deterministic_actions = tf.argmax(input=q1_values + q2_values, axis=1)

        batch_size = tf.shape(input=observations_ph.get())[0]
        random_actions = tf.random.uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random.uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.compat.v1.where(chose_random, random_actions,
                                                deterministic_actions)

        output_actions = tf.cond(pred=stochastic_ph,
                                 true_fn=lambda: stochastic_actions,
                                 false_fn=lambda: deterministic_actions)
        _act = U.function(inputs=[observations_ph, stochastic_ph],
                          outputs=output_actions)

        def act(ob, stochastic=True):
            return _act(ob, stochastic)

        return act
Example No. 9
    def __init__(self, epsilon=1e-2, shape=()):

        self._sum = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.compat.v1.constant_initializer(0.0),
            name="runningsum",
            trainable=False)
        self._sumsq = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.compat.v1.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.compat.v1.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.cast(self._sum / self._count, dtype=tf.float32)
        self.std = tf.sqrt(
            tf.maximum(
                tf.cast(self._sumsq / self._count, dtype=tf.float32) -
                tf.square(self.mean), 1e-2))

        newsum = tf.compat.v1.placeholder(shape=self.shape,
                                          dtype=tf.float64,
                                          name='sum')
        newsumsq = tf.compat.v1.placeholder(shape=self.shape,
                                            dtype=tf.float64,
                                            name='var')
        newcount = tf.compat.v1.placeholder(shape=[],
                                            dtype=tf.float64,
                                            name='count')
        self.incfiltparams = U.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.compat.v1.assign_add(self._sum, newsum),
                tf.compat.v1.assign_add(self._sumsq, newsumsq),
                tf.compat.v1.assign_add(self._count, newcount)
            ])
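For intuition, a NumPy-only sketch of the running statistics these variables track (mean = sum / count, std = sqrt(max(E[x^2] - mean^2, 1e-2))); the MPI reduction that normally feeds incfiltparams is omitted here:

import numpy as np

class RunningMeanStdSketch:
    def __init__(self, epsilon=1e-2, shape=()):
        self.sum = np.zeros(shape, dtype=np.float64)
        self.sumsq = np.full(shape, epsilon, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x has shape (batch,) + shape
        self.sum = self.sum + x.sum(axis=0)
        self.sumsq = self.sumsq + np.square(x).sum(axis=0)
        self.count += x.shape[0]

    @property
    def mean(self):
        return self.sum / self.count

    @property
    def std(self):
        return np.sqrt(np.maximum(self.sumsq / self.count - np.square(self.mean), 1e-2))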
Example No. 10
def build_train(make_obs_ph, model, num_actions, optimizer_f, grad_norm_clipping=None, gamma=1.0,
    double_q=False, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None, test_eps=0.05, 
    learning_rate = 0.001, learning_rate_decay_factor=0.99, learning_rate_growth_factor=1.001):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    model: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_f: float -> tf.train.Optimizer
        constructor that takes a learning rate and returns the optimizer to use
        for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(make_obs_ph, model, num_actions, scope=scope, reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph, model, num_actions, scope=scope, reuse=reuse)

    act_greedy = build_act_greedy(make_obs_ph, model, num_actions, scope=scope, reuse=True, eps=test_eps)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # Learning rate adjustment
        lr = tf.Variable(float(learning_rate), trainable=False, dtype = tf.float32)
        learning_rate_decay_op = lr.assign(tf.clip_by_value(lr*learning_rate_decay_factor, 1e-5, 1e-3))
        learning_rate_growth_op = lr.assign(tf.clip_by_value(lr*learning_rate_growth_factor, 1e-5, 1e-3))
        optimizer = optimizer_f(learning_rate = lr)

        # q network evaluation
        atom_t = model(obs_t_input.get(), num_outputs, scope="atom_func", reuse=True)  # reuse parameters from act
        atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/atom_func")
        atom_p_t = tf.nn.softmax(atom_t)

        # target q network evaluation
        atom_tp1 = model(obs_tp1_input.get(), num_outputs, scope="target_atom_func")
        target_atom_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_atom_func")
        atom_p_tp1 = tf.nn.softmax(atom_tp1)

        # Distributional (C51-style) projection of the target onto the fixed atom
        # support. num_atoms, V_min, V_max and del_z are assumed to be module-level
        # constants, and the sketch below treats the network output as a single
        # distribution over num_atoms atoms. TF tensors do not support item
        # assignment, so the mass for the lower/upper neighbouring atoms is
        # accumulated with one_hot instead of indexed writes.
        m = tf.zeros_like(atom_p_tp1)
        for j in range(num_atoms):
            Tz_j = tf.clip_by_value(rew_t_ph + gamma * (V_min + j * del_z), V_min, V_max)
            b_j = (Tz_j - V_min) / del_z
            l = tf.cast(tf.math.floor(b_j), tf.int32)
            u = tf.cast(tf.math.ceil(b_j), tf.int32)
            p_j = atom_p_tp1[:, j]
            m += tf.one_hot(l, num_atoms) * (p_j * (tf.cast(u, tf.float32) - b_j))[:, None]
            m += tf.one_hot(u, num_atoms) * (p_j * (b_j - tf.cast(l, tf.float32)))[:, None]

        # Cross-entropy term between the projected target distribution m and the
        # online distribution atom_p_t (kept in the form of the original expression).
        cem_loss = tf.reduce_sum(tf.math.multiply(m, tf.log(atom_p_t)))
        # NOTE: the remainder of this function uses a standard (non-distributional)
        # head; q_t, q_tp1, q_func_vars and target_q_func_vars are assumed to be
        # defined elsewhere in the original module.
        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, lr],
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function(inputs=[obs_t_input], outputs=q_t)

        return act_f, act_greedy, q_values, train, update_target, learning_rate_decay_op, learning_rate_growth_op, {'q_values': q_values}
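Setting the distributional bookkeeping aside, the Q-learning target and TD error assembled above reduce to this NumPy sketch (q_t, q_tp1, actions, rewards and dones are assumed batch arrays):

import numpy as np

def td_error_sketch(q_t, q_tp1, actions, rewards, dones, gamma=1.0):
    # One-step TD error with the (1 - done) mask used above.
    q_t_selected = q_t[np.arange(len(actions)), actions]
    q_tp1_best = q_tp1.max(axis=1)
    target = rewards + gamma * (1.0 - dones) * q_tp1_best
    return q_t_selected - target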
Example No. 11
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
    """Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
        update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
        update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale")
        reset_ph = tf.placeholder(tf.bool, (), name="reset")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False)
        param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False)

        # Unmodified Q.
        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

        # Perturbable Q used for the actual rollout.
        q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")
        # We have to wrap this code into a function due to the way tf.cond() works. See
        # https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
        # a more detailed discussion.
        def perturb_vars(original_scope, perturbed_scope):
            all_vars = scope_vars(absolute_scope_name(original_scope))
            all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
            assert len(all_vars) == len(all_perturbed_vars)
            perturb_ops = []
            for var, perturbed_var in zip(all_vars, all_perturbed_vars):
                if param_noise_filter_func(perturbed_var):
                    # Perturb this variable.
                    op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale))
                else:
                    # Do not perturb, just assign.
                    op = tf.assign(perturbed_var, var)
                perturb_ops.append(op)
            assert len(perturb_ops) == len(all_vars)
            return tf.group(*perturb_ops)

        # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
        # of the network and measures the effect of that perturbation in action space. If the perturbation
        # is too big, reduce scale of perturbation, otherwise increase.
        q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
        perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
        kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
        mean_kl = tf.reduce_mean(kl)
        def update_scale():
            with tf.control_dependencies([perturb_for_adaption]):
                update_scale_expr = tf.cond(mean_kl < param_noise_threshold,
                    lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                    lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                )
            return update_scale_expr

        # Functionality to update the threshold for parameter space noise.
        update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0,
            lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold))

        # Put everything together.
        deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
        chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        updates = [
            update_eps_expr,
            tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])),
            tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
            update_param_noise_threshold_expr,
        ]
        _act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
                         updates=updates)
        def act(ob, reset, update_param_noise_threshold, update_param_noise_scale, stochastic=True, update_eps=-1):
            return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
        return act
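The adaptive noise-scale rule wrapped in update_scale() above boils down to a simple multiplicative update; a sketch with assumed scalar inputs:

def update_noise_scale_sketch(scale, mean_kl, threshold, factor=1.01):
    # Grow the perturbation scale while the induced policy change (KL) stays
    # below the threshold; shrink it otherwise.
    return scale * factor if mean_kl < threshold else scale / factor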
Example No. 12
def build_train(make_obs_ph, q_func, num_actions, optimizer_f,
    grad_norm_clipping=None, gamma=1.0, scope="setdeepq", reuse=None, 
    test_eps=0.05, lr_init = 0.001, lr_period_steps=50000, tau=0.05):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_f: float -> tf.train.Optimizer
        constructor that takes a learning rate and returns the optimizer to use
        for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    test_eps : float
        epsilon value for the epsilon-greedy policy used in evaluation
    lr_init : float
        initial learning rate of the cosine schedule
    lr_period_steps : int
        period, in steps, of the cosine learning rate schedule
    tau : float
        parameter for the soft target network update. tau <= 1.0 and 1.0 for
        the hard update.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    # Build action graphs
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    act_greedy = build_act_greedy(make_obs_ph, q_func, num_actions, scope=scope, reuse=True, eps=test_eps)

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.compat.v1.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.compat.v1.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.compat.v1.placeholder(tf.float32, [None], name="weight")
        iteration = tf.compat.v1.placeholder(tf.float32, name="iteration")

        # Cosine learning rate adjustment
        lr = tf.Variable(float(lr_init), trainable=False, dtype = tf.float32, name='lr')
        lr = tf.clip_by_value(0.0005*tf.math.cos(math.pi*iteration/lr_period_steps)+0.000501, 1e-6, 1e-3)
        optimizer = optimizer_f(learning_rate = lr)

        # q network evaluation
        q1_t = q_func(obs_t_input.get(), num_actions, scope="q1_func", reuse=True)  # reuse q1 parameters from act
        q1_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q1_func")
        q2_t = q_func(obs_t_input.get(), num_actions, scope="q2_func", reuse=True)  # reuse q2 parameters from act
        q2_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q2_func")

        # target q network evaluation
        q1_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q1_func", reuse=False)
        target_q1_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q1_func")
        q2_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q2_func", reuse=False)
        target_q2_func_vars = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q2_func")

        # q scores for actions which we know were selected in the given state.
        q1_t_selected = tf.reduce_sum(input_tensor=q1_t * tf.one_hot(act_t_ph, num_actions), axis=1)
        q2_t_selected = tf.reduce_sum(input_tensor=q2_t * tf.one_hot(act_t_ph, num_actions), axis=1)

        # Actions selected with current q funcs at state t+1.
        q1_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q1_func", reuse=True)
        q2_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q2_func", reuse=True)
        tp1_best_action_using_online_net = tf.argmax(input=q1_tp1_using_online_net+q2_tp1_using_online_net, axis=1)
        # Using action at t+1 find target value associated with the action
        q1_tp1_selected = tf.reduce_sum(input_tensor=q1_tp1 * tf.one_hot(tp1_best_action_using_online_net, num_actions), axis=1)
        q2_tp1_selected = tf.reduce_sum(input_tensor=q2_tp1 * tf.one_hot(tp1_best_action_using_online_net, num_actions), axis=1)
        # Min of the target q values to be used in the Bellman equation
        q_tp1_best = tf.minimum(q1_tp1_selected, q2_tp1_selected)

        # Done mask
        # q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_tp1_selected_target = rew_t_ph + gamma * q_tp1_best

        # compute the error (potentially clipped)
        td_error1 = q1_t_selected - tf.stop_gradient(q_tp1_selected_target)
        td_error2 = q2_t_selected - tf.stop_gradient(q_tp1_selected_target)
        errors1 = U.huber_loss(td_error1)
        errors2 = U.huber_loss(td_error2)
        errors = errors1 + errors2
        weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph * errors)

        #Print total number of params
        total_parameters = 0
        for variable in tf.compat.v1.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            # print("var params", variable_parameters)
            total_parameters += variable_parameters
        print("===============================================================")
        print("Total number of trainable params:", total_parameters)
        print("===============================================================")

        # Log for tensorboard
        tf.summary.scalar('q1_values', tf.math.reduce_mean(q1_t))
        tf.summary.scalar('q2_values', tf.math.reduce_mean(q2_t))
        tf.summary.scalar('td_1', tf.math.reduce_mean(td_error1))
        tf.summary.scalar('td_2', tf.math.reduce_mean(td_error2))
        tf.summary.scalar('weighted_loss', weighted_error)
        tf.summary.scalar('lr_schedule', lr)
        tf.summary.scalar('td_MSE_1', tf.math.reduce_mean(tf.math.square(td_error1)))
        tf.summary.scalar('td_MSE_2', tf.math.reduce_mean(tf.math.square(td_error2)))

        # combine variable scopes
        q_func_vars = q1_func_vars+q2_func_vars
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called every step to copy Q network to target Q network
        # target network is updated with polyak averaging
        update_target_expr1 = []
        for var, var_target in zip(sorted(q1_func_vars, key=lambda v: v.name),
                                   sorted(target_q1_func_vars, key=lambda v: v.name)):
            update_target_expr1.append(var_target.assign(tau*var + (1-tau)*var_target))
        update_target_expr1 = tf.group(*update_target_expr1)

        update_target_expr2 = []
        for var, var_target in zip(sorted(q2_func_vars, key=lambda v: v.name),
                                   sorted(target_q2_func_vars, key=lambda v: v.name)):
            update_target_expr2.append(var_target.assign(tau*var + (1-tau)*var_target))
        update_target_expr2 = tf.group(*update_target_expr2)

        merged_summary = tf.compat.v1.summary.merge_all(scope=tf.compat.v1.get_variable_scope().name)
        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph,
                iteration
            ],
            outputs=[td_error1, td_error2, tf.reduce_mean(input_tensor=errors), merged_summary],
            updates=[optimize_expr, lr]
        )
        update_target = U.function([], [], updates=[update_target_expr1, update_target_expr2])

        q_values = U.function(inputs=[obs_t_input], outputs=[q1_t, q2_t])

        return act_f, act_greedy, q_values, train, update_target, {'q_values': q_values}
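The cosine learning-rate schedule wired into the graph above can be previewed outside TensorFlow; the constants mirror the clip range and offset used there:

import math

def cosine_lr_sketch(iteration, period_steps=50000):
    # Cosine schedule between roughly 1e-6 and 1e-3, clipped as in the graph.
    lr = 0.0005 * math.cos(math.pi * iteration / period_steps) + 0.000501
    return min(max(lr, 1e-6), 1e-3)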
Example No. 13
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer_f,
                grad_norm_clipping=None,
                gamma=0.9,
                scope="deepadfq",
                reuse=None,
                varTH=1e-05,
                test_eps=0.05,
                act_policy='egreedy',
                learning_rate=0.001,
                learning_rate_decay_factor=0.99,
                learning_rate_growth_factor=1.001):
    """Creates the train function:
    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_f: float -> tf.train.Optimizer
        constructor that takes a learning rate and returns the optimizer to use
        for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    varTH : float
        variance threshold
    test_eps : float
        epsilon value for epsilon-greedy method in evaluation
    act_policy : str
        either 'egreedy' or 'bayesian' for action policy
        
    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if act_policy == 'egreedy':
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)
    elif act_policy == 'bayesian':
        act_f = build_act_bayesian(make_obs_ph,
                                   q_func,
                                   num_actions,
                                   scope=scope,
                                   reuse=reuse)
    else:
        raise ValueError(
            "Please choose either egreedy or bayesian for action policy.")
    act_test = build_act_greedy(make_obs_ph,
                                q_func,
                                num_actions,
                                scope=scope,
                                reuse=True,
                                eps=test_eps)
    #act_test = build_act_bayesian(make_obs_ph, q_func, num_actions, scope=scope, reuse=True)

    sdTH = np.sqrt(varTH, dtype=np.float32)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # Learning rate adjustment
        lr = tf.Variable(float(learning_rate),
                         trainable=False,
                         dtype=tf.float32)
        learning_rate_decay_op = lr.assign(
            tf.clip_by_value(lr * learning_rate_decay_factor, 1e-5, 1e-3))
        learning_rate_growth_op = lr.assign(
            tf.clip_by_value(lr * learning_rate_growth_factor, 1e-5, 1e-3))
        optimizer = optimizer_f(learning_rate=lr)

        target_means = tf.placeholder(tf.float32, [None], name="target_means")
        target_sd = tf.placeholder(tf.float32, [None], name="target_sd")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions * 2,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(),
                       num_actions * 2,
                       scope="target_q_func")

        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        mean_values = q_t[:, :num_actions]
        rho_values = q_t[:, num_actions:]

        mean_selected = tf.reduce_sum(
            mean_values * tf.one_hot(act_t_ph, num_actions, dtype=tf.float32),
            1)
        rho_selected = tf.reduce_sum(
            rho_values * tf.one_hot(act_t_ph, num_actions, dtype=tf.float32),
            1)

        sd_selected = tf.exp(-rho_selected)

        mean_error = mean_selected - tf.stop_gradient(target_means)
        #sd_error = sd_selected - tf.stop_gradient(target_sd)
        sd_error = tf.log(sd_selected) - tf.log(tf.stop_gradient(target_sd))
        huber_loss = U.huber_loss(mean_error) + U.huber_loss(sd_error)
        weighted_loss = tf.reduce_mean(huber_loss * importance_weights_ph)

        #kl_loss = tf.contrib.distributions.kl_divergence(
        #    tf.distributions.Normal(loc=target_means, scale=target_sd),
        #    tf.distributions.Normal(loc=mean_selected, scale=sd_selected),
        #    name='kl_loss')
        #weighted_loss = tf.reduce_mean(kl_loss * importance_weights_ph)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_loss,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_loss,
                                               var_list=q_func_vars)

        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                target_means,
                target_sd,
                importance_weights_ph,
            ],
            outputs=[tf.reduce_mean(huber_loss), mean_error, sd_error, lr],
            updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_target_vals = U.function(inputs=[obs_tp1_input], outputs=[q_tp1])

        return act_f, act_test, q_target_vals, train, update_target, learning_rate_decay_op, learning_rate_growth_op
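As a reminder of the output layout assumed above, the network's 2*num_actions outputs split into means and rho = -log(sd); a NumPy sketch:

import numpy as np

def split_mean_sd_sketch(q_out, num_actions):
    # First half: means. Second half: rho = -log(sd), so sd = exp(-rho).
    means = q_out[:, :num_actions]
    sds = np.exp(-q_out[:, num_actions:])
    return means, sds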
Example No. 14
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer_f,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=False,
                scope="setdeepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None,
                test_eps=0.05,
                lr_init=0.001,
                lr_decay_factor=0.99,
                lr_growth_factor=1.001,
                tau=0.001):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_f: float -> tf.train.Optimizer
        constructor that takes a learning rate and returns the optimizer to use
        for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.
    lr_init : float
        initial learning rate
    lr_decay_factor : float
        learning rate decay factor. It should be equal to or smaller than 1.0.
    lr_growth_factor : float
        learning rate growth factor. It should be equal to or larger than 1.0.
    tau : float
        parameter for the soft target network update. tau <= 1.0 and 1.0 for
        the hard update.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    # Build action graphs
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse)

    act_greedy = build_act_greedy(make_obs_ph,
                                  q_func,
                                  num_actions,
                                  scope=scope,
                                  reuse=True,
                                  eps=test_eps)

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.compat.v1.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.compat.v1.placeholder(tf.float32, [None],
                                                name="done")
        importance_weights_ph = tf.compat.v1.placeholder(tf.float32, [None],
                                                         name="weight")

        # Learning rate adjustment
        lr = tf.Variable(float(lr_init), trainable=False, dtype=tf.float32)
        lr_decay_op = lr.assign(
            tf.clip_by_value(lr * lr_decay_factor, 1e-5, 1e-2))
        lr_growth_op = lr.assign(
            tf.clip_by_value(lr * lr_growth_factor, 1e-5, 1e-2))
        optimizer = optimizer_f(learning_rate=lr)

        # q network evaluation
        q_t = q_func.forward(obs_t_input.get(),
                             num_actions,
                             scope="q_func",
                             reuse=True)  # reuse parameters from act
        q_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/q_func")

        # target q network evaluation
        q_tp1 = q_func.forward(obs_tp1_input.get(),
                               num_actions,
                               scope="target_q_func",
                               reuse=False)
        target_q_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(input_tensor=q_t *
                                     tf.one_hot(act_t_ph, num_actions),
                                     axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func.forward(obs_tp1_input.get(),
                                                    num_actions,
                                                    scope="q_func",
                                                    reuse=True)
            q_tp1_best_using_online_net = tf.argmax(
                input=q_tp1_using_online_net, axis=1)
            q_tp1_best = tf.reduce_sum(
                input_tensor=q_tp1 *
                tf.one_hot(q_tp1_best_using_online_net, num_actions),
                axis=1)
        else:
            q_tp1_best = tf.reduce_max(input_tensor=q_tp1, axis=1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph *
                                        errors)

        # Log for tensorboard
        tf.summary.scalar('q_values', tf.math.reduce_mean(q_t))
        tf.summary.scalar('td_MSE',
                          tf.math.reduce_mean(tf.math.square(td_error)))
        tf.summary.scalar('weighted_loss', weighted_error)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(tau * var + (1 - tau) * var_target))
        update_target_expr = tf.group(*update_target_expr)

        merged_summary = tf.compat.v1.summary.merge_all(
            scope=tf.compat.v1.get_variable_scope().name)
        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=[
                               td_error,
                               tf.reduce_mean(input_tensor=errors),
                               merged_summary
                           ],
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function(inputs=[obs_t_input], outputs=q_t)

        return act_f, act_greedy, q_values, train, update_target, lr_decay_op, lr_growth_op, {
            'q_values': q_values
        }
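The soft (polyak) target update applied in update_target above corresponds to the following sketch, with online and target parameters given as matching lists of arrays:

def polyak_update_sketch(online_params, target_params, tau=0.001):
    # target <- tau * online + (1 - tau) * target, applied parameter-wise.
    return [tau * w + (1.0 - tau) * wt
            for w, wt in zip(online_params, target_params)]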