Example #1
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M: Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = -logliks.mean()  #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    entval = calcent(Mval).mean()  #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean()  #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = -entval - logliks.mean()  #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N)  #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
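
The two assertions above rely on the identities ent[p] = -E_p[log p(x)] and kldiv[p, q] = -ent[p] - E_p[log q(x)]. Below is a minimal NumPy-only sketch of the same Monte-Carlo check for a univariate Gaussian; it is independent of the probtype machinery above, and the distribution parameters and the loose 0.05 tolerance are illustrative.

import numpy as np

def gaussian_logp(x, mean, std):
    # log density of a univariate Gaussian N(mean, std^2)
    return -0.5 * np.log(2.0 * np.pi) - np.log(std) - 0.5 * np.square((x - mean) / std)

rng = np.random.RandomState(0)
n = 100000
mean_p, std_p = 0.3, 1.2  # distribution p
mean_q, std_q = 0.1, 1.0  # distribution q

x = rng.normal(mean_p, std_p, size=n)                 # samples from p
ent_mc = -gaussian_logp(x, mean_p, std_p).mean()      # Monte-Carlo entropy
ent_exact = 0.5 * np.log(2.0 * np.pi * np.e * std_p ** 2)
assert abs(ent_mc - ent_exact) < 0.05

kl_mc = -ent_exact - gaussian_logp(x, mean_q, std_q).mean()  # kl[p, q] = -ent[p] - E_p[log q]
kl_exact = np.log(std_q / std_p) + (std_p ** 2 + (mean_p - mean_q) ** 2) / (2.0 * std_q ** 2) - 0.5
assert abs(kl_mc - kl_exact) < 0.05
print('entropy', ent_mc, ent_exact, 'kl', kl_mc, kl_exact)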
Example #2
def test_mpi_adam():
    """
    tests the MpiAdam object's functionality
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a_var = tf.Variable(np.random.randn(3).astype('float32'))
    b_var = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var))

    learning_rate = 1e-2
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    do_update = tf_utils.function([], loss, updates=[update_op])

    tf.get_default_session().run(tf.global_variables_initializer())
    for step in range(10):
        print(step, do_update())

    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a_var, b_var]
    lossandgrad = tf_utils.function(
        [], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op])
    adam = MpiAdam(var_list)

    for step in range(10):
        loss, grad = lossandgrad()
        adam.update(grad, learning_rate)
        print(step, loss)
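
MpiAdam keeps its own Adam state for a single flattened gradient vector and applies the update itself (after an MPI all-reduce of the gradients, omitted here). Below is a minimal NumPy sketch of that flat update rule under standard Adam assumptions; FlatAdam is an illustrative name, not the real class.

import numpy as np

class FlatAdam:
    """Adam update applied to a single flat parameter vector (no MPI averaging)."""
    def __init__(self, size, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.m = np.zeros(size)   # first-moment estimate
        self.v = np.zeros(size)   # second-moment estimate
        self.t = 0
        self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon

    def step(self, theta, grad, learning_rate):
        self.t += 1
        # bias-corrected step size, as in the Adam paper
        alpha = learning_rate * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        self.m = self.beta1 * self.m + (1 - self.beta1) * grad
        self.v = self.beta2 * self.v + (1 - self.beta2) * np.square(grad)
        return theta - alpha * self.m / (np.sqrt(self.v) + self.epsilon)

# usage: minimize sum(theta^2) from a random start
theta = np.random.RandomState(0).randn(5)
optimizer = FlatAdam(theta.size)
for step in range(200):
    theta = optimizer.step(theta, 2 * theta, learning_rate=1e-1)  # gradient of sum(theta^2) is 2*theta
print(theta)  # parameters have been driven close to zero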
Example #3
    def __init__(self, env, hidden_size, entcoeff=0.001, scope="adversary"):
        """
        reward regression from observations and transitions

        :param env: (Gym Environment)
        :param hidden_size: ([int]) the hidden dimension for the MLP
        :param entcoeff: (float) the entropy loss weight
        :param scope: (str) tensorflow variable scope
        """
        self.scope = scope
        self.observation_shape = env.observation_space.shape
        self.actions_shape = env.action_space.shape
        self.input_shape = tuple([
            o + a for o, a in zip(self.observation_shape, self.actions_shape)
        ])
        self.num_actions = env.action_space.shape[0]
        self.hidden_size = hidden_size
        self.build_ph()
        # Build graph
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph,
                                            reuse=False)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph,
                                         reuse=True)
        # Build accuracy
        generator_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
        expert_acc = tf.reduce_mean(
            tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)
        var_list = self.get_trainable_variables()
        self.lossandgrad = tf_util.function([
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph
        ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
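
For reference, the regression loss spelled out in the comment above and the policy reward can be reproduced with plain NumPy; this is only a sketch of the formulas, not the class itself, and it uses TensorFlow's documented numerically stable form of the sigmoid cross-entropy.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_cross_entropy_with_logits(logits, labels):
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)),
    # written in the numerically stable form max(x, 0) - x * z + log(1 + exp(-|x|))
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))

logits = np.array([-2.0, 0.0, 3.0])
# generator (policy) samples are labelled 0, expert samples are labelled 1
generator_loss = sigmoid_cross_entropy_with_logits(logits, np.zeros_like(logits)).mean()
expert_loss = sigmoid_cross_entropy_with_logits(logits, np.ones_like(logits)).mean()
# GAIL-style reward: large when the discriminator believes the sample is expert-like
reward = -np.log(1.0 - sigmoid(logits) + 1e-8)
print(generator_loss, expert_loss, reward)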
Example #4
def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function:

    :param make_obs_ph: (function (str): TensorFlow Tensor) a function that take a name and creates a placeholder of
        input with that name
    :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor)
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    :param num_actions: (int) number of actions.
    :param scope: (str or VariableScope) optional scope for variable_scope.
    :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given.
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) act function to select an action given an
        observation. See the top of the file for details.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = make_obs_ph("observation")
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (),
                              initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        deterministic_actions = tf.argmax(q_values, axis=1)

        batch_size = tf.shape(observations_ph.get())[0]
        random_actions = tf.random_uniform(tf.stack([batch_size]),
                                           minval=0,
                                           maxval=num_actions,
                                           dtype=tf.int64)
        chose_random = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions,
                                      deterministic_actions)

        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                                 lambda: deterministic_actions)
        update_eps_expr = eps.assign(
            tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
        _act = tf_utils.function(
            inputs=[observations_ph, stochastic_ph, update_eps_ph],
            outputs=output_actions,
            givens={
                update_eps_ph: -1.0,
                stochastic_ph: True
            },
            updates=[update_eps_expr])

        def act(obs, stochastic=True, update_eps=-1):
            return _act(obs, stochastic, update_eps)

        return act
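
The graph above is epsilon-greedy action selection: with probability eps a uniformly random action is chosen, otherwise the argmax of the Q-values. Below is a NumPy sketch of the same decision rule, outside TensorFlow and purely illustrative.

import numpy as np

def epsilon_greedy(q_values, eps, stochastic=True, rng=np.random):
    """q_values has shape (batch_size, num_actions); returns one action index per row."""
    batch_size, num_actions = q_values.shape
    deterministic_actions = q_values.argmax(axis=1)
    if not stochastic:
        return deterministic_actions
    random_actions = rng.randint(0, num_actions, size=batch_size)
    chose_random = rng.uniform(size=batch_size) < eps
    return np.where(chose_random, random_actions, deterministic_actions)

q = np.array([[0.1, 0.9, 0.3],
              [0.5, 0.2, 0.4]])
print(epsilon_greedy(q, eps=0.1))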
Example #5
def build_act(q_func,
              ob_space,
              ac_space,
              stochastic_ph,
              update_eps_ph,
              sess,
              obs_phs=None):
    """
    Creates the act function:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder
    :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder
    :param sess: (TensorFlow session) The current TensorFlow session
    :param obs_phs: ((TensorFlow Tensor, TensorFlow Tensor)) a tuple containing an override for the observation
        placeholder and the processed observation placeholder
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor))
        act function to select an action given an observation (See the top of the file for details),
        a tuple containing the observation placeholder and the processed observation placeholder respectively.
    """
    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
    policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs)
    obs_phs = (policy.obs_ph, policy.processed_obs)
    #mask = tf.one_hot(mask_idx, depth=541, dtype=tf.float32)
    mask = tf.placeholder(dtype=tf.float32)
    feasible_q_values = tf.math.multiply(policy.q_values, mask)
    deterministic_actions = tf.argmax(feasible_q_values, axis=1)

    batch_size = tf.shape(policy.obs_ph)[0]
    n_actions = ac_space.nvec if isinstance(ac_space,
                                            MultiDiscrete) else ac_space.n
    random_actions = tf.random_uniform(tf.stack([batch_size]),
                                       minval=0,
                                       maxval=n_actions,
                                       dtype=tf.int64)
    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
    stochastic_actions = tf.where(chose_random, random_actions,
                                  deterministic_actions)

    output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)
    update_eps_expr = eps.assign(
        tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
    _act = tf_util.function(
        inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, mask],
        outputs=output_actions,
        givens={
            update_eps_ph: -1.0,
            stochastic_ph: True
        },
        updates=[update_eps_expr])

    def act(obs, stochastic=False, update_eps=-1, mask=None):
        return _act(obs, stochastic, update_eps, mask)

    return act, obs_phs
Example #6
def validate_probtype(probtype, pdparam):
    """
    validate probability distribution types

    :param probtype: (ProbabilityDistributionType) the type to validate
    :param pdparam: ([float]) the flat probabilities to test
    """
    number_samples = 100000
    # Check to see if mean negative log likelihood == differential entropy
    mval = np.repeat(pdparam[None, :], number_samples, axis=0)
    mval_ph = probtype.param_placeholder([number_samples])
    action_mask_ph = probtype.param_placeholder([number_samples])
    action_mask_ph = tf.placeholder_with_default(
        tf.zeros_like(action_mask_ph), shape=np.shape(action_mask_ph))
    xval_ph = probtype.sample_placeholder([number_samples])
    proba_distribution = probtype.proba_distribution_from_flat(mval_ph)
    calcloglik = tf_util.function([xval_ph, mval_ph],
                                  proba_distribution.logp(xval_ph))
    calcent = tf_util.function([mval_ph], proba_distribution.entropy())
    xval = tf.get_default_session().run(proba_distribution.sample(),
                                        feed_dict={mval_ph: mval})
    logliks = calcloglik(xval, mval)
    entval_ll = -logliks.mean()
    entval_ll_stderr = logliks.std() / np.sqrt(number_samples)
    entval = calcent(mval).mean()
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    mval2_ph = probtype.param_placeholder([number_samples])
    action_mask_ph2 = probtype.param_placeholder([number_samples])
    action_mask_ph2 = tf.placeholder_with_default(
        tf.zeros_like(action_mask_ph2), shape=np.shape(action_mask_ph2))
    pd2 = probtype.proba_distribution_from_flat(mval2_ph)
    tmp = pdparam + np.random.randn(pdparam.size) * 0.1
    mval2 = np.repeat(tmp[None, :], number_samples, axis=0)
    calckl = tf_util.function([mval_ph, mval2_ph], proba_distribution.kl(pd2))
    klval = calckl(mval, mval2).mean()
    logliks = calcloglik(xval, mval2)
    klval_ll = -entval - logliks.mean()
    klval_ll_stderr = logliks.std() / np.sqrt(number_samples)
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
Example #7
    def __init__(self, epsilon=1e-2, shape=()):
        """
        calculates the running mean and std of a data stream
        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm

        :param epsilon: (float) helps with arithmetic issues
        :param shape: (tuple) the shape of the data stream's output
        """
        self._sum = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.compat.v1.constant_initializer(0.0),
            name="runningsum",
            trainable=False)
        self._sumsq = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=shape,
            initializer=tf.compat.v1.constant_initializer(epsilon),
            name="runningsumsq",
            trainable=False)
        self._count = tf.compat.v1.get_variable(
            dtype=tf.float64,
            shape=(),
            initializer=tf.compat.v1.constant_initializer(epsilon),
            name="count",
            trainable=False)
        self.shape = shape

        self.mean = tf.cast(self._sum / self._count, tf.float32)
        self.std = tf.sqrt(
            tf.maximum(
                tf.cast(self._sumsq / self._count, tf.float32) -
                tf.square(self.mean), 1e-2))

        newsum = tf.compat.v1.placeholder(shape=self.shape,
                                          dtype=tf.float64,
                                          name='sum')
        newsumsq = tf.compat.v1.placeholder(shape=self.shape,
                                            dtype=tf.float64,
                                            name='var')
        newcount = tf.compat.v1.placeholder(shape=[],
                                            dtype=tf.float64,
                                            name='count')
        self.incfiltparams = tf_util.function(
            [newsum, newsumsq, newcount], [],
            updates=[
                tf.compat.v1.assign_add(self._sum, newsum),
                tf.compat.v1.assign_add(self._sumsq, newsumsq),
                tf.compat.v1.assign_add(self._count, newcount)
            ])
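
The class tracks a running sum, sum of squares and count, and derives mean and std from them; the epsilon initialisation keeps the statistics defined before any data arrives. Below is a NumPy-only sketch of the same bookkeeping (an illustration, not the stable-baselines class), including the 1e-2 variance floor used above.

import numpy as np

class RunningMeanStdNumpy:
    """Plain NumPy version of the running sum / sum-of-squares / count bookkeeping."""
    def __init__(self, epsilon=1e-2, shape=()):
        self._sum = np.zeros(shape, dtype=np.float64)
        self._sumsq = np.full(shape, epsilon, dtype=np.float64)
        self._count = epsilon

    def update(self, batch):
        # same increments as the incfiltparams update above
        self._sum += batch.sum(axis=0)
        self._sumsq += np.square(batch).sum(axis=0)
        self._count += batch.shape[0]

    @property
    def mean(self):
        return self._sum / self._count

    @property
    def std(self):
        return np.sqrt(np.maximum(self._sumsq / self._count - np.square(self.mean), 1e-2))

rms = RunningMeanStdNumpy(shape=(3,))
rms.update(np.random.RandomState(0).randn(1000, 3) * 2.0 + 5.0)
print(rms.mean, rms.std)  # roughly [5, 5, 5] and [2, 2, 2]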
Example #8
def test_function():
    """
    test the function function in tf_util
    """
    with tf.Graph().as_default():
        x_ph = tf.placeholder(tf.int32, (), name="x")
        y_ph = tf.placeholder(tf.int32, (), name="y")
        z_ph = 3 * x_ph + 2 * y_ph
        linear_fn = function([x_ph, y_ph], z_ph, givens={y_ph: 0})

        with single_threaded_session():
            initialize()

            assert linear_fn(2) == 6
            assert linear_fn(2, 2) == 10
Example #9
def test_multikwargs():
    """
    test the function function in tf_util
    """
    with tf.Graph().as_default():
        x_ph = tf.placeholder(tf.int32, (), name="x")
        with tf.variable_scope("other"):
            x2_ph = tf.placeholder(tf.int32, (), name="x")
        z_ph = 3 * x_ph + 2 * x2_ph

        linear_fn = function([x_ph, x2_ph], z_ph, givens={x2_ph: 0})
        with single_threaded_session():
            initialize()
            assert linear_fn(2) == 6
            assert linear_fn(2, 2) == 10
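
Both tests exercise the givens argument: a placeholder listed in givens receives a default value that is used whenever the caller does not supply that input. Below is a rough pure-Python analogue of that behaviour, a sketch of the semantics only and not the real tf_util implementation.

def make_function(input_names, compute, givens=None):
    """input_names: ordered parameter names; compute: callable taking a dict of values."""
    givens = dict(givens or {})

    def wrapped(*args):
        values = dict(givens)                    # start from the defaults
        values.update(zip(input_names, args))    # positional arguments override them
        return compute(values)

    return wrapped

linear_fn = make_function(["x", "y"], lambda v: 3 * v["x"] + 2 * v["y"], givens={"y": 0})
assert linear_fn(2) == 6
assert linear_fn(2, 2) == 10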
Example #10
def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess):
    """
    Creates the act function:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder
    :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder
    :param sess: (TensorFlow session) The current TensorFlow session
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor))
        act function to select an action given an observation (See the top of the file for details),
        a tuple containing the observation placeholder and the processed observation placeholder respectively.
    """
    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

    policy = q_func(sess, ob_space, ac_space, 1, 1, None)

    obs_phs = (policy.obs_ph, policy.processed_obs)
    deterministic_actions = tf.argmax(tf.add(policy.q_values, policy.action_mask_ph), axis=1)
    batch_size = tf.shape(policy.obs_ph)[0]
    random_actions = tf.distributions.Categorical(probs=policy.action_mask_probs_ph, dtype=tf.int64).sample()
    chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
    stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

    output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
    update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
    _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, policy.action_mask_ph,
                                    policy.action_mask_probs_ph],
                            outputs=output_actions,
                            givens={update_eps_ph: -1.0, stochastic_ph: True},
                            updates=[update_eps_expr])

    def act(obs, stochastic=True, update_eps=-1, action_mask=None):
        if action_mask is not None:
            return _act(obs, stochastic, update_eps, policy.prepare_action_mask(action_mask), action_mask)
        else:
            return _act(obs, stochastic, update_eps)

    return act, obs_phs
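
This variant masks actions additively (an invalid action has a large negative value added to its Q-value before the argmax) and samples exploratory actions only from the valid ones via a categorical distribution. Below is a NumPy sketch of those two pieces; the helper names and the -1e8 mask value are illustrative and not taken from the policy class.

import numpy as np

def masked_greedy(q_values, valid_mask):
    # valid_mask: 1.0 for allowed actions, 0.0 for forbidden ones
    additive_mask = np.where(valid_mask > 0, 0.0, -1e8)
    return np.argmax(q_values + additive_mask, axis=1)

def masked_random(valid_mask, rng=np.random):
    # sample uniformly among the valid actions of each row
    probs = valid_mask / valid_mask.sum(axis=1, keepdims=True)
    return np.array([rng.choice(len(row), p=row) for row in probs])

q = np.array([[0.1, 0.9, 0.3]])
mask = np.array([[1.0, 0.0, 1.0]])   # action 1 is not allowed
print(masked_greedy(q, mask), masked_random(mask))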
Example #11
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(self.observation_space, self.action_space,
                                                             self.hidden_size_adversary,
                                                             entcoeff=self.adversary_entcoeff)

                # Penalty related variable
                with tf.variable_scope('penalty'):
                    cur_cost_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # episodic cost

                    param_init = np.log(max(np.exp(self.penalty_init) - 1, 1e-8))
                    penalty_param = tf.get_variable('penalty_param',
                                                    initializer=float(param_init),
                                                    trainable=True,
                                                    dtype=tf.float32)
                penalty = tf.nn.softplus(penalty_param)
                penalty_loss = tf.reduce_mean(-penalty_param * (cur_cost_ph - self.cost_lim))

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1,
                                             None, reuse=False, **self.policy_kwargs)
                
                # # Network for safety value function
                # with tf.variable_scope("vc", reuse=False):
                #     self.cost_value = MLPValue(self.sess, self.observation_space, self.n_envs, 1, None)
                
                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
                    catarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target cost advantage function
                    cret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical cost

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
                    vcerr = tf.reduce_mean(tf.square(self.policy_pi.vcf_flat - cret))
                    
                    # advantage * pnew / pold
                    ratio = tf.exp(self.policy_pi.proba_distribution.logp(action) -
                                   old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)
                    # Surrogate for cost function
                    surrcost = tf.reduce_mean(ratio * catarg)

                    optimgain = surrgain + entbonus
                    # Include surr_cost in pi_objective
                    optimgain -= penalty * surrcost
                    optimgain /= (1 + penalty)
                    # # Loss function for pi is negative of pi_objective
                    # optimgain = -optimgain # Should we??
                    
                    losses = [optimgain, meankl, entbonus, surrgain, meanent, surrcost]
                    self.loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy", "surrcost"]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [v for v in all_var_list if "/vf" not in v.name and "/q/" not in v.name and "/vcf" not in v.name] # policy parameters
                    vf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vcf" not in v.name] # value parameters
                    vcf_var_list = [v for v in all_var_list if "/pi" not in v.name and "/logstd" not in v.name and "/vf" not in v.name] # cost value parameters

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list, sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(tf.reshape(flat_tangent[start: start + var_size], shape))
                        start += var_size
                    gvp = tf.add_n([tf.reduce_sum(grad * tangent)
                                    for (grad, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
                    # Fisher vector products
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('penalty_loss', penalty_loss)
                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('constraint_cost_function_loss', surrcost)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar('loss', optimgain + meankl + entbonus + surrgain + meanent + surrcost + penalty_loss)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg], losses)
                    self.compute_fvp = tf_util.function([flat_tangent, observation, old_policy.obs_ph, action, atarg, catarg],
                                                        fvp)  # Why are all inputs needed? Probably just for implementation convenience
                    # self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret],
                    #                                               tf_util.flatgrad(vferr, vf_var_list)) # Why need old_policy.obs_ph? Doesn't seem to be used
                    # self.compute_vcflossandgrad = tf_util.function([observation, old_policy.obs_ph, cret],
                    #                                               tf_util.flatgrad(vcerr, vcf_var_list))
                    self.compute_vflossandgrad = tf_util.function([observation, old_policy.obs_ph, ret, cret],
                                                                  [tf_util.flatgrad(vferr, vf_var_list), tf_util.flatgrad(vcerr, vcf_var_list)])
                    self.compute_lagrangiangrad = tf_util.function([cur_cost_ph],
                                                                   tf_util.flatgrad(penalty_loss, [penalty_param]))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(colorize("done in {:.3f} seconds".format((time.time() - start_time)),
                                           color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(self.reward_giver.get_trainable_variables(), sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()
                    # optimizer for constraint costs value function
                    self.vcadam = MpiAdam(vcf_var_list, sess=self.sess)
                    self.vcadam.sync()
                    # optimizer for the Lagrangian multiplier of safe RL
                    self.penaltyadam = MpiAdam([penalty_param], sess=self.sess)
                    self.penaltyadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards', tf.reduce_mean(ret))
                    tf.summary.scalar('discounted_costs', tf.reduce_mean(cret))
                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('cost_advantage', tf.reduce_mean(catarg))
                    tf.summary.scalar('kl_clip_range', tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('discounted_costs', cret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('penalty_learning_rate', self.penalty_lr)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('cost_advantage', catarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars("model") + tf_util.get_trainable_vars("oldpi")
                if self.using_gail:
                    self.params.extend(self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, catarg, ret, cret, cur_cost_ph],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
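
The penalty block implements a simple Lagrangian relaxation of the cost constraint: penalty = softplus(penalty_param) scales the surrogate cost inside the policy objective, while penalty_loss = -penalty_param * (episodic_cost - cost_lim) pushes the multiplier up when the observed cost exceeds the limit and down otherwise. Below is a NumPy sketch of that multiplier update in isolation, with an illustrative step size and cost sequence.

import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

cost_lim = 25.0
penalty_lr = 5e-2
penalty_param = np.log(np.exp(1.0) - 1.0)   # softplus(penalty_param) starts at 1.0

for episodic_cost in [40.0, 35.0, 30.0, 20.0, 15.0]:
    # penalty_loss = -penalty_param * (episodic_cost - cost_lim)
    grad = -(episodic_cost - cost_lim)       # d penalty_loss / d penalty_param
    penalty_param -= penalty_lr * grad       # gradient descent on penalty_loss
    print(episodic_cost, softplus(penalty_param))
# the multiplier grows while the cost exceeds cost_lim and shrinks once it falls below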
Example #12
def learn(env,
          policy,
          value_fn,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount value
    :param lam: (float) the GAE (Generalized Advantage Estimation) factor, trading off bias and variance in the advantage estimate
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) if render env
    :param callback: (function) called every step, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence between updates, used to adapt the step size
    """
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async_eigen_decomp=1,
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(
            queue_runner.create_threads(tf.get_default_session(),
                                        coord=coord,
                                        start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular(
            "EpLenMean", np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback is not None:
            # Only stop training if return value is False, not when it is None. This is for backwards
            # compatibility with callbacks that have no return statement.
            if callback(locals(), globals()) is False:
                break
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
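
The advantage block above is Generalized Advantage Estimation: one-step TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) are discounted with gamma * lam. Below is a self-contained NumPy sketch of the discount helper and the advantage computation; the function mirrors what common.discount is assumed to do, and the numbers are illustrative.

import numpy as np

def discount(x, gamma):
    """y[t] = x[t] + gamma * x[t + 1] + gamma^2 * x[t + 2] + ..."""
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

gamma, lam = 0.99, 0.97
rewards = np.array([1.0, 0.0, 1.0, 1.0])
vpred = np.array([0.5, 0.4, 0.6, 0.7])
vpred_bootstrap = np.append(vpred, vpred[-1])   # append 0.0 instead if the episode terminated

value_targets = discount(rewards, gamma)
deltas = rewards + gamma * vpred_bootstrap[1:] - vpred_bootstrap[:-1]
advantages = discount(deltas, gamma * lam)      # GAE
print(value_targets, advantages)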
Example #13
    def __init__(self, ob_dim, ac_dim):
        """
        Create a gaussian MLP policy

        :param ob_dim: (int) Observation dimension
        :param ac_dim: (int) action dimension
        """
        # Here we'll construct a bunch of expressions, which will be used in two places:
        # (1) When sampling actions
        # (2) When computing loss functions, for the policy update
        # Variables specific to (1) have the word "sampled" in them,
        # whereas variables specific to (2) have the word "old" in them
        ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2],
                               name="ob")  # batch of observations
        oldac_na = tf.placeholder(
            tf.float32, shape=[None, ac_dim],
            name="ac")  # batch of previous actions
        # batch of previous action distributions
        oldac_dist = tf.placeholder(tf.float32,
                                    shape=[None, ac_dim * 2],
                                    name="oldac_dist")
        adv_n = tf.placeholder(tf.float32, shape=[None],
                               name="adv")  # advantage function estimate
        wd_dict = {}
        layer_1 = tf.nn.tanh(
            dense(ob_no,
                  64,
                  "h1",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0.0,
                  weight_loss_dict=wd_dict))
        layer_2 = tf.nn.tanh(
            dense(layer_1,
                  64,
                  "h2",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0.0,
                  weight_loss_dict=wd_dict))
        mean_na = dense(layer_2,
                        ac_dim,
                        "mean",
                        weight_init=tf_util.normc_initializer(0.1),
                        bias_init=0.0,
                        weight_loss_dict=wd_dict)  # Mean control output
        self.wd_dict = wd_dict
        # Variance on outputs
        self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim],
                                                     tf.float32,
                                                     tf.zeros_initializer())
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        std_1a = tf.exp(logstd_1a)
        std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
        ac_dist = tf.concat([
            tf.reshape(mean_na, [-1, ac_dim]),
            tf.reshape(std_na, [-1, ac_dim])
        ], 1)
        # This is the sampled action we'll perform.
        sampled_ac_na = tf.random_normal(tf.shape(
            ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]
        logprobsampled_n = -tf.reduce_sum(tf.log(
            ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(
                2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
                    tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) /
                    (tf.square(ac_dist[:, ac_dim:])),
                    axis=1)  # Logprob of sampled action
        logprob_n = -tf.reduce_sum(
            tf.log(ac_dist[:, ac_dim:]), axis=1
        ) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
            tf.square(ac_dist[:, :ac_dim] - oldac_na) /
            (tf.square(ac_dist[:, ac_dim:])),
            axis=1
        )  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
        kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
        # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n))
        # Approximation of KL divergence between old policy used to generate actions,
        # and new policy used to compute logprob_n
        surr = -tf.reduce_mean(
            adv_n * logprob_n
        )  # Loss function that we'll differentiate to get the policy gradient
        surr_sampled = -tf.reduce_mean(logprob_n)  # Sampled loss of the policy
        # Generate a new action and its logprob
        self._act = tf_util.function(
            [ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])
        # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl)
        #  Compute (approximate) KL divergence between old policy and new policy
        self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss)
        # Input and output variables needed for computing loss
        self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
        tf_util.initialize()  # Initialize uninitialized TF variables
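
Both logprob expressions above are the log density of a diagonal Gaussian, -sum(log sigma) - 0.5 * k * log(2 * pi) - 0.5 * sum(((x - mu) / sigma)^2). Below is a small NumPy check of that formula against an explicit per-dimension computation, illustrative only.

import numpy as np

def diag_gaussian_logp(x, mean, std):
    k = x.shape[-1]
    return (-np.sum(np.log(std), axis=-1)
            - 0.5 * k * np.log(2.0 * np.pi)
            - 0.5 * np.sum(np.square((x - mean) / std), axis=-1))

rng = np.random.RandomState(0)
mean = rng.randn(4)
std = np.exp(0.1 * rng.randn(4))
x = mean + std * rng.randn(4)

# summing the per-dimension log densities gives the same result
per_dim = -np.log(std) - 0.5 * np.log(2.0 * np.pi) - 0.5 * np.square((x - mean) / std)
assert np.allclose(diag_gaussian_logp(x, mean, std), per_dim.sum())
print(diag_gaussian_logp(x, mean, std))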
Example #14
def build_train(q_func,
                ob_space,
                ac_space,
                sess,
                num_actions,
                num_action_streams,
                batch_size,
                learning_rate=5e-4,
                aggregator='reduceLocalMean',
                optimizer_name="Adam",
                grad_norm_clipping=None,
                gamma=0.99,
                double_q=True,
                scope="bdq",
                reuse=None,
                losses_version=2,
                independent=False,
                dueling=True,
                target_version="mean",
                loss_type="L2",
                full_tensorboard_log=False):
    """Creates the act function:

    Parameters
    ----------
    ob_space: gym.Space
        the observation space of the environment
    ac_space: gym.Space
        the action space of the environment
    sess: tf.Session
        the current TensorFlow session
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        total number of sub-actions to be represented at the output  
    num_action_streams: int
        specifies the number of action branches in action value (or advantage) function representation
    batch_size: int
        size of the sampled mini-batch from the replay buffer 
    reuse: bool
        whether or not to reuse the graph variables
    optimizer_name: str
        name of the optimizer to use for deep Q-learning (only "Adam" is supported)
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q-Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled. BDQ uses it. 
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    losses_version: int
        specifies the version number for merging of losses across the branches
        version 2 is the best-performing loss used for BDQ.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """

    assert independent and losses_version == 4 or not independent, 'independent needs to be used along with loss v4'
    assert independent and target_version == "indep" or not independent, 'independent needs to be used along with independent TD targets'

    with tf.variable_scope("input", reuse=reuse):
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

    with tf.variable_scope(scope, reuse=reuse):
        act_f, obs_phs = build_act(q_func, ob_space, ac_space, sess,
                                   num_actions, num_action_streams,
                                   stochastic_ph, update_eps_ph)

        # Q-network evaluation
        with tf.variable_scope(
                "step_model",
                reuse=True,
                custom_getter=tf_util.outer_scope_getter("step_model")):
            step_model = q_func(sess,
                                ob_space,
                                ac_space,
                                1,
                                1,
                                None,
                                num_actions,
                                reuse=True,
                                obs_phs=obs_phs)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/model")

        # Target Q-network evaluation
        with tf.variable_scope("target_q_func", reuse=False):
            target_policy = q_func(sess,
                                   ob_space,
                                   ac_space,
                                   1,
                                   1,
                                   None,
                                   num_actions,
                                   reuse=False)
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # compute estimate of best possible value starting from state at t + 1
        double_obs_ph = target_policy.obs_ph
        if double_q:
            with tf.variable_scope(
                    "q_func",
                    reuse=True,
                    custom_getter=tf_util.outer_scope_getter("q_func")):
                selection_q_tp1 = q_func(sess,
                                         ob_space,
                                         ac_space,
                                         1,
                                         1,
                                         None,
                                         num_actions,
                                         reuse=True)
                double_obs_ph = selection_q_tp1.obs_ph
        else:
            selection_q_tp1 = target_policy

        num_actions_pad = num_actions // num_action_streams

    with tf.variable_scope("loss", reuse=reuse):
        act_t_ph = tf.placeholder(tf.int32, [None, num_action_streams],
                                  name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        q_values = []
        for dim in range(num_action_streams):
            selected_a = tf.squeeze(
                tf.slice(act_t_ph, [0, dim], [batch_size, 1]))  # TODO better?
            q_values.append(
                tf.reduce_sum(tf.one_hot(selected_a, num_actions_pad) *
                              step_model.q_values[dim],
                              axis=1))

        if target_version == "indep":
            target_q_values = []
            for dim in range(num_action_streams):
                selected_a = tf.argmax(selection_q_tp1.q_values[dim], axis=1)
                selected_q = tf.reduce_sum(
                    tf.one_hot(selected_a, num_actions_pad) *
                    target_policy.q_values[dim],
                    axis=1)
                masked_selected_q = (1.0 - done_mask_ph) * selected_q
                target_q = rew_t_ph + gamma * masked_selected_q
                target_q_values.append(target_q)
        elif target_version == "max":
            for dim in range(num_action_streams):
                selected_a = tf.argmax(selection_q_tp1.q_values[dim], axis=1)
                selected_q = tf.reduce_sum(
                    tf.one_hot(selected_a, num_actions_pad) *
                    target_policy.q_values[dim],
                    axis=1)
                masked_selected_q = (1.0 - done_mask_ph) * selected_q
                if dim == 0:
                    max_next_q_values = masked_selected_q
                else:
                    max_next_q_values = tf.maximum(max_next_q_values,
                                                   masked_selected_q)
            target_q_values = [rew_t_ph + gamma * max_next_q_values
                               ] * num_action_streams  # TODO better?
        elif target_version == "mean":
            for dim in range(num_action_streams):
                selected_a = tf.argmax(selection_q_tp1.q_values[dim], axis=1)
                selected_q = tf.reduce_sum(
                    tf.one_hot(selected_a, num_actions_pad) *
                    target_policy.q_values[dim],
                    axis=1)
                masked_selected_q = (1.0 - done_mask_ph) * selected_q
                if dim == 0:
                    mean_next_q_values = masked_selected_q
                else:
                    mean_next_q_values += masked_selected_q
            mean_next_q_values /= num_action_streams
            target_q_values = [rew_t_ph + gamma * mean_next_q_values
                               ] * num_action_streams  # TODO better?
        else:
            assert False, 'unsupported target version ' + str(target_version)

        if optimizer_name == "Adam":
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            assert False, 'unsupported optimizer ' + str(optimizer_name)

        if loss_type == "L2":
            loss_function = tf.square
        elif loss_type == "Huber":
            loss_function = tf_util.huber_loss
        else:
            assert False, 'unsupported loss type ' + str(loss_type)

        if losses_version == 1:
            mean_q_value = sum(q_values) / num_action_streams
            mean_target_q_value = sum(target_q_values) / num_action_streams
            td_error = mean_q_value - tf.stop_gradient(mean_target_q_value)
            loss = loss_function(td_error)
            weighted_mean_loss = tf.reduce_mean(importance_weights_ph * loss)
            optimize_expr = minimize_and_clip(
                optimizer,
                weighted_mean_loss,
                var_list=q_func_vars,
                total_n_streams=(num_action_streams + (1 if dueling else 0)),
                clip_val=grad_norm_clipping)
            optimize_expr = [optimize_expr]
            tf.summary.scalar("loss", weighted_mean_loss)

        else:
            stream_losses = []
            for dim in range(num_action_streams):
                dim_td_error = q_values[dim] - tf.stop_gradient(
                    target_q_values[dim])
                dim_loss = loss_function(dim_td_error)
                # Scaling of learning based on importance sampling weights is optional, either way works
                stream_losses.append(
                    tf.reduce_mean(dim_loss *
                                   importance_weights_ph))  # with scaling
                #stream_losses.append(tf.reduce_mean(dim_loss)) # without scaling
                if dim == 0:
                    td_error = tf.abs(dim_td_error)
                else:
                    td_error += tf.abs(dim_td_error)
            #td_error /= num_action_streams

            mean_loss = sum(stream_losses) / num_action_streams
            optimize_expr = minimize_and_clip(
                optimizer,
                mean_loss,
                var_list=q_func_vars,
                total_n_streams=(num_action_streams + (1 if dueling else 0)),
                clip_val=grad_norm_clipping)
            optimize_expr = [optimize_expr]
            tf.summary.scalar("loss", mean_loss)

        ## FIXME tf summary scalars are wrong. They are not matching the original code.
        tf.summary.scalar("td_error", tf.reduce_mean(td_error))

        if full_tensorboard_log:
            tf.summary.histogram("td_error", td_error)

        # Target Q-network parameters are periodically updated with the Q-network's
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

    with tf.variable_scope("input_info", reuse=False):
        tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph))
        tf.summary.scalar('importance_weights',
                          tf.reduce_mean(importance_weights_ph))

        if full_tensorboard_log:
            tf.summary.histogram('rewards', rew_t_ph)
            tf.summary.histogram('importance_weights', importance_weights_ph)
            if tf_util.is_image(obs_phs[0]):
                tf.summary.image('observation', obs_phs[0])
            elif len(obs_phs[0].shape) == 1:
                tf.summary.histogram('observation', obs_phs[0])

        # optimize_expr = optimizer.apply_gradients(gradients)

        summary = tf.summary.merge_all()

        train = tf_util.function(
            inputs=[
                obs_phs[0],
                act_t_ph,
                rew_t_ph,
                target_policy.obs_ph,  #obs_tp1_input,
                double_obs_ph,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[summary, td_error],
            updates=[optimize_expr])
        update_target = tf_util.function([], [], updates=[update_target_expr])

        # q_values = tf_util.function([obs_phs[0]], step_model)

        return act_f, train, update_target, step_model  #{'q_values': q_values}
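
For the "mean" target version above, every action branch receives the same TD target: the reward plus the discounted average of the branches' double-DQN bootstrap values, zeroed on terminal transitions. Below is a NumPy sketch of that aggregation for one batch of transitions; the array names are illustrative.

import numpy as np

def mean_branch_targets(rewards, dones, online_q_tp1, target_q_tp1, gamma=0.99):
    """online_q_tp1 / target_q_tp1: one (batch, num_sub_actions) array per action branch."""
    num_branches = len(online_q_tp1)
    mean_next_q = np.zeros_like(rewards)
    for dim in range(num_branches):
        selected = online_q_tp1[dim].argmax(axis=1)                        # double DQN: select with the online net
        selected_q = target_q_tp1[dim][np.arange(len(rewards)), selected]  # evaluate with the target net
        mean_next_q += (1.0 - dones) * selected_q                          # zero the bootstrap on terminal steps
    mean_next_q /= num_branches
    return [rewards + gamma * mean_next_q] * num_branches                  # same target for every branch

rewards = np.array([1.0, 0.0])
dones = np.array([0.0, 1.0])
online = [np.array([[0.2, 0.8], [0.5, 0.1]])] * 2
target = [np.array([[0.3, 0.7], [0.4, 0.2]])] * 2
print(mean_branch_targets(rewards, dones, online, target))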
Example #15
    def __init__(self,
                 observation_space,
                 action_space,
                 hidden_size,
                 entcoeff=0.001,
                 scope="adversary",
                 normalize=True):
        """
        Reward regression from observations and transitions

        :param observation_space: (gym.spaces)
        :param action_space: (gym.spaces)
        :param hidden_size: ([int]) the hidden dimension for the MLP
        :param entcoeff: (float) the entropy loss weight
        :param scope: (str) tensorflow variable scope
        :param normalize: (bool) Whether to normalize the reward or not
        """
        # TODO: support images properly (using a CNN)
        self.scope = scope
        self.observation_shape = observation_space.shape
        self.actions_shape = action_space.shape

        if isinstance(action_space, gym.spaces.Box):
            # Continuous action space
            self.discrete_actions = False
            self.n_actions = action_space.shape[0]
        elif isinstance(action_space, gym.spaces.Discrete):
            self.n_actions = action_space.n
            self.discrete_actions = True
        else:
            raise ValueError(
                'Action space not supported: {}'.format(action_space))

        self.hidden_size = hidden_size
        self.normalize = normalize
        self.obs_rms = None

        # Placeholders
        self.generator_obs_ph = tf.compat.v1.placeholder(
            observation_space.dtype, (None, ) + self.observation_shape,
            name="observations_ph")
        self.generator_acs_ph = tf.compat.v1.placeholder(
            action_space.dtype, (None, ) + self.actions_shape,
            name="actions_ph")
        self.expert_obs_ph = tf.compat.v1.placeholder(
            observation_space.dtype, (None, ) + self.observation_shape,
            name="expert_observations_ph")
        self.expert_acs_ph = tf.compat.v1.placeholder(
            action_space.dtype, (None, ) + self.actions_shape,
            name="expert_actions_ph")
        # Build graph
        generator_logits = self.build_graph(self.generator_obs_ph,
                                            self.generator_acs_ph,
                                            reuse=False)
        expert_logits = self.build_graph(self.expert_obs_ph,
                                         self.expert_acs_ph,
                                         reuse=True)
        # Build accuracy
        generator_acc = tf.reduce_mean(input_tensor=tf.cast(
            tf.nn.sigmoid(generator_logits) < 0.5, tf.float32))
        expert_acc = tf.reduce_mean(input_tensor=tf.cast(
            tf.nn.sigmoid(expert_logits) > 0.5, tf.float32))
        # Build regression loss
        # let x = logits, z = targets.
        # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits))
        generator_loss = tf.reduce_mean(input_tensor=generator_loss)
        expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits))
        expert_loss = tf.reduce_mean(input_tensor=expert_loss)
        # Build entropy loss
        logits = tf.concat([generator_logits, expert_logits], 0)
        entropy = tf.reduce_mean(input_tensor=logit_bernoulli_entropy(logits))
        entropy_loss = -entcoeff * entropy
        # Loss + Accuracy terms
        self.losses = [
            generator_loss, expert_loss, entropy, entropy_loss, generator_acc,
            expert_acc
        ]
        self.loss_name = [
            "generator_loss", "expert_loss", "entropy", "entropy_loss",
            "generator_acc", "expert_acc"
        ]
        self.total_loss = generator_loss + expert_loss + entropy_loss
        # Build Reward for policy
        self.reward_op = -tf.math.log(1 - tf.nn.sigmoid(generator_logits) +
                                      1e-8)
        var_list = self.get_trainable_variables()
        self.lossandgrad = tf_util.function([
            self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph,
            self.expert_acs_ph
        ], self.losses + [tf_util.flatgrad(self.total_loss, var_list)])
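
# A minimal NumPy sketch of the discriminator reward (`reward_op`) and the
# Bernoulli-entropy bonus used above; the library's logit_bernoulli_entropy may be
# written in a different algebraic form, this only mirrors the formulas assembled here.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def discriminator_reward(generator_logits):
    # Matches reward_op: the reward grows as the discriminator scores the sample as expert-like.
    return -np.log(1.0 - sigmoid(generator_logits) + 1e-8)

def bernoulli_entropy_from_logits(logits):
    # Entropy of a Bernoulli distribution with parameter sigmoid(logits).
    p = sigmoid(logits)
    return -(p * np.log(p + 1e-8) + (1.0 - p) * np.log(1.0 - p + 1e-8))

logits = np.array([-2.0, 0.0, 3.0])
print(discriminator_reward(logits))           # increases with the logit
print(bernoulli_entropy_from_logits(logits))  # maximal at logit 0
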
    def setup_model(self):
        # prevent import loops

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            print("number of workers:", self.nworkers)
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)
                self._setup_learn(self.seed)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)
                # Network for phi
                with tf.variable_scope("phi", reuse=False):
                    self.policy_phi = self.policy(self.sess,
                                                  self.observation_space,
                                                  self.action_space,
                                                  self.n_envs,
                                                  1,
                                                  None,
                                                  reuse=False,
                                                  **self.policy_kwargs)
                # Network for phi old
                with tf.variable_scope("oldphi", reuse=False):
                    self.policy_phi_old = self.policy(self.sess,
                                                      self.observation_space,
                                                      self.action_space,
                                                      self.n_envs,
                                                      1,
                                                      None,
                                                      reuse=False,
                                                      **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    #kloldnew = self.policy_pi.proba_distribution.kl(old_policy.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_flat - ret))
                    vf_phi_err = tf.reduce_mean(
                        tf.square(self.policy_phi.value_flat - ret))
                    vf_phi_old_err = tf.reduce_mean(
                        tf.square(self.policy_phi_old.value_flat))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]
                    all_var_oldpi_list = tf_util.get_trainable_vars("oldpi")
                    var_oldpi_list = [
                        v for v in all_var_oldpi_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]

                    all_var_phi_list = tf_util.get_trainable_vars("phi")
                    vf_phi_var_list = [
                        v for v in all_var_phi_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    all_var_phi_old_list = tf_util.get_trainable_vars("oldphi")
                    vf_phi_old_var_list = [
                        v for v in all_var_phi_old_list if "/pi" not in v.name
                        and "/logstd" not in v.name and "/q" not in v.name
                    ]
                    #print("vars", vf_var_list)
                    self.policy_vars = all_var_list
                    self.oldpolicy_vars = all_var_oldpi_list
                    print("all var list", all_var_list)
                    print("phi vars", vf_phi_var_list)
                    print("phi old vars", vf_phi_old_var_list)

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leibler', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))
                    self.compute_vf_phi_lossandgrad = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret],
                        tf_util.flatgrad(vf_phi_err, vf_phi_var_list))
                    self.compute_vf_loss = tf_util.function(
                        [observation, old_policy.obs_ph, ret], vferr)
                    self.compute_vf_phi_loss = tf_util.function(
                        [observation, self.policy_phi.obs_ph, ret], vf_phi_err)
                    #self.compute_vf_phi_old_loss = tf_util.function([self.policy_phi_old.obs_ph], vf_phi_old_err)
                    #self.phi_old_obs = np.array([-0.012815  , -0.02076313,  0.07524705,  0.09407324,  0.0901745 , -0.09339058,  0.03544853, -0.03297224])
                    #self.phi_old_obs = self.phi_old_obs.reshape((1, 8))

                    update_phi_old_expr = []
                    for var, var_target in zip(
                            sorted(vf_phi_var_list, key=lambda v: v.name),
                            sorted(vf_phi_old_var_list, key=lambda v: v.name)):
                        update_phi_old_expr.append(var_target.assign(var))
                    update_phi_old_expr = tf.group(*update_phi_old_expr)

                    self.update_phi_old = tf_util.function(
                        [], [], updates=[update_phi_old_expr])

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    @contextmanager
                    def temp_seed(seed):
                        state = np.random.get_state()
                        np.random.seed(seed)
                        try:
                            yield
                        finally:
                            np.random.set_state(state)

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    self.vf_phi_adam = MpiAdam(vf_phi_var_list, sess=self.sess)
                    self.vfadam.sync()
                    self.vf_phi_adam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                self.timed = timed
                self.allmean = allmean
                self.temp_seed = temp_seed

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = tf_util.get_trainable_vars(
                    "model") + tf_util.get_trainable_vars("oldpi")

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
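
# A minimal NumPy sketch of how a Fisher-vector product such as `compute_fvp` above
# is typically consumed: conjugate gradient solves F x = g without ever forming F.
# The explicit SPD matrix below is a hypothetical stand-in for the flat_tangent-based
# product built in the graph.
import numpy as np

def conjugate_gradient(fvp, g, iters=10, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()            # residual g - F x with x = 0
    p = g.copy()
    r_dot = r.dot(r)
    for _ in range(iters):
        z = fvp(p)
        alpha = r_dot / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x

fisher = np.array([[3.0, 1.0], [1.0, 2.0]])   # stand-in Fisher matrix
grad = np.array([1.0, -1.0])                  # stand-in policy gradient
step = conjugate_gradient(lambda v: fisher.dot(v), grad)
print(step, fisher.dot(step))                 # fisher @ step ~= grad
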
Example #17
def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess, action_filter=None, OnExpo=False, threshold=None): # NKAM
    """
    Creates the act function:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder
    :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder
    :param sess: (TensorFlow session) The current TensorFlow session
    :param action_filter: (function (TensorFlow Tensor): TensorFlow Tensor) filters actions according to a criterion
    :param OnExpo: (bool) if True, random exploration samples only from the actions allowed by the filter
    :param threshold: (float) if set, actions whose Q-value exceeds threshold * (max legal Q-value) are also treated as legal
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor))
        act function to select an action given an observation (see the top of the file for details),
        and a tuple containing the observation placeholder and the processed observation placeholder, respectively.
    """
    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

    policy = q_func(sess, ob_space, ac_space, 1, 1, None)
    obs_phs = (policy.obs_ph, policy.processed_obs)

    batch_size = tf.shape(policy.obs_ph)[0]
    n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n

    if action_filter is not None:
        actions_mask = tf.py_func(func=action_filter, inp=[tf.constant(n_actions)], Tout=[tf.bool for _ in range(0, n_actions)])
        total_possible_actions = tf.reduce_sum(tf.cast(actions_mask, dtype=tf.int64))
        actions_mask = tf.reshape(tf.tile(actions_mask, [batch_size]), [batch_size, tf.shape(actions_mask)[0]])
        if threshold is not None:
            legal_q_values = tf.boolean_mask(policy.q_values, actions_mask)
            max_legal_q_value = tf.math.reduce_max(legal_q_values)
            max_legal_q_value = tf.math.scalar_mul(threshold, max_legal_q_value)
            threshold_tensor = tf.tile(tf.reshape(max_legal_q_value, (1,)), tf.constant(n_actions, dtype=tf.int64, shape=(1,)))
            threshold_tensor = tf.reshape(tf.tile(threshold_tensor, [batch_size]), [batch_size, tf.shape(threshold_tensor)[0]])
            threshold_mask = tf.math.greater(policy.q_values, threshold_tensor)
            actions_mask = tf.math.logical_or(actions_mask, threshold_mask)
        # TODO: how to avoid getting stuck???
        # if actions_mask == "empty":
        #     actions_mask = None
    else:
        actions_mask = tf.constant([True for _ in range(0, n_actions)], dtype=tf.bool)
        total_possible_actions = n_actions
        actions_mask = tf.reshape(tf.tile(actions_mask, [batch_size]), [batch_size, tf.shape(actions_mask)[0]])

    q_values_mask = tf.constant([-float('inf') for _ in range(0, n_actions)])
    q_values_mask = tf.reshape(tf.tile(q_values_mask, [batch_size]), [batch_size, tf.shape(q_values_mask)[0]])
    masked_q_values = tf.where(actions_mask, policy.q_values, q_values_mask)

    deterministic_actions = tf.argmax(masked_q_values, axis=1)

    possible_actions = tf.boolean_mask(tf.constant([action for action in range(0, n_actions)], dtype=tf.int64), actions_mask[0])
    masked_random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=total_possible_actions, dtype=tf.int64)
    if not OnExpo:
        random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64)
    else:
        random_actions = tf.gather(possible_actions, masked_random_actions)
        # random_actions = tf.gather(possible_actions, masked_random_actions, batch_dims=0)

    chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
    stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

    output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
    update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
    _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph],
                            outputs=output_actions,
                            givens={update_eps_ph: -1.0, stochastic_ph: True},
                            updates=[update_eps_expr])

    def act(obs, stochastic=True, update_eps=-1):
        return _act(obs, stochastic, update_eps)

    return act, obs_phs
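
# A minimal NumPy sketch of the masked epsilon-greedy selection built above: illegal
# actions receive -inf Q-values before the argmax, and with probability eps a random
# action is drawn from the legal set (the OnExpo branch). Batch tiling and the
# threshold relaxation are omitted.
import numpy as np

def masked_epsilon_greedy(q_values, legal_mask, eps, rng=np.random):
    masked_q = np.where(legal_mask, q_values, -np.inf)
    if rng.random() < eps:
        return int(rng.choice(np.flatnonzero(legal_mask)))
    return int(np.argmax(masked_q))

q = np.array([0.2, 1.5, -0.3, 0.9])
mask = np.array([True, False, True, True])   # action 1 filtered out
print(masked_epsilon_greedy(q, mask, eps=0.1))
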
Example #18
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):

        # observation_space and action_space
        # ob_space = env.observation_space
        # ac_space = env.action_space
        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)
        # returns ::
        # obs :: placeholder
        # pdtype :: object holding the distribution info of the action_space

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # DeepMind-style observation normalization
        obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)

        # ===========================[Value function prediction Model]====================================================
        # dense() returns tf.nn.bias_add(tf.matmul(input_tensor, weight), bias).
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=tf_util.normc_initializer(1.0))[:, 0]
        # dense(last_out, 1) returns shape [None, 1], so [:, 0] is applied to get shape [None].

        # ===========================[Policy function Model]==============================================================
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))

        # If the action space of the chosen gym environment is a Box,
        # the concatenation of the action distribution's mean and log-std
        # is used as the action output, as follows.
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         tf_util.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            tf_util.normc_initializer(0.01))

        # The action output of this model is stored in the proba_distribution below.
        self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)
        self.state_in = []
        self.state_out = []

        # Since this class inherits from ppo1/mlp_policy,
        # the act function defined there is overridden here.
        # Once _act is defined,
        # act() can be used to obtain the action / value.
        self.stochastic_ph = tf.placeholder(dtype=tf.bool,
                                            shape=(),
                                            name="stochastic")
        action = tf_util.switch(self.stochastic_ph,
                                self.proba_distribution.sample(),
                                self.proba_distribution.mode())
        self.action = action
        self._act = tf_util.function([self.stochastic_ph, obs],
                                     [action, self.vpred])
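
# A minimal NumPy sketch of what the dense() helper used above computes, per the
# comment (bias_add(matmul(x, W), b), followed here by tanh); the real helper also
# creates the TF variables under the given scope name, which is omitted.
import numpy as np

def dense_forward(x, weight, bias):
    return x.dot(weight) + bias

x = np.random.randn(5, 8)            # batch of 5 observations with 8 features
w = np.random.randn(8, 64) * 0.1     # hid_size = 64
b = np.zeros(64)
hidden = np.tanh(dense_forward(x, w, b))
print(hidden.shape)                  # (5, 64)
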
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):

        assert isinstance(ob_space, gym.spaces.Box)

        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)

        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # normalization
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                kernel_initializer=U.normc_initializer(1.0),
                                name="vffc%i" % (i + 1)))

        self.vpred = dense3D2(last_out,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        # input to policy
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                name="polfc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name="polfinal",
                kernel_initializer=U.normc_initializer(0.01))

        # select option
        self.op_pi = tf.nn.softmax(
            tf.layers.dense(tf.stop_gradient(last_out),
                            num_options,
                            name="OPfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
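
# A minimal NumPy sketch of how the option head above is typically driven: sample an
# option from the softmax in op_pi, act with that option's intra-option policy, and
# resample the option whenever the termination head fires. The function below is
# illustrative, not the library API.
import numpy as np

def step_option(op_probs, term_prob, current_option, rng=np.random):
    resample = current_option is None or rng.random() < term_prob
    if resample:
        current_option = int(rng.choice(len(op_probs), p=op_probs))
    return current_option, resample

option, resampled = step_option(np.array([0.7, 0.3]), term_prob=0.2, current_option=0)
print(option, resampled)
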
Example #20
def learn(env,
          policy_func,
          dataset,
          optim_batch_size=128,
          max_iters=1e4,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          task_name=None,
          verbose=False):
    """
    Learn a behavior clone policy, and return the save location

    :param env: (Gym Environment) the environment
    :param policy_func: (function (str, Gym Space, Gym Space): TensorFlow Tensor) creates the policy
    :param dataset: (Dset or MujocoDset) the dataset manager
    :param optim_batch_size: (int) the batch size
    :param max_iters: (int) the maximum number of iterations
    :param adam_epsilon: (float) the epsilon value for the adam optimizer
    :param optim_stepsize: (float) the optimizer stepsize
    :param ckpt_dir: (str) the save directory, can be None for temporary directory
    :param task_name: (str) the save name, can be None for saving directly to the directory name
    :param verbose: (bool) whether to log the training and validation losses
    :return: (str) the save location for the TensorFlow model
    """

    val_per_iter = int(max_iters / 10)
    ob_space = env.observation_space
    ac_space = env.action_space
    policy = policy_func("pi", ob_space,
                         ac_space)  # Construct network for new policy
    # placeholder
    obs_ph = policy.obs_ph
    action_ph = policy.pdtype.sample_placeholder([None])
    stochastic_ph = policy.stochastic_ph
    loss = tf.reduce_mean(tf.square(action_ph - policy.ac))
    var_list = policy.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = tf_util.function([obs_ph, action_ph, stochastic_ph],
                                   [loss] + [tf_util.flatgrad(loss, var_list)])

    tf_util.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        train_loss, grad = lossandgrad(ob_expert, ac_expert, True)
        adam.update(grad, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = os.path.join(ckpt_dir, task_name)
    tf_util.save_state(savedir_fname, var_list=policy.get_variables())
    return savedir_fname
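
# A minimal NumPy sketch of the behavior-cloning objective minimised above: the mean
# squared error between the expert action and the policy's action for the same
# observation, optimised here with plain gradient steps instead of MpiAdam. The
# linear policy is a hypothetical stand-in for the MLP policy.
import numpy as np

def bc_loss_and_grad(weights, obs, expert_actions):
    pred = obs.dot(weights)
    err = pred - expert_actions
    loss = np.mean(err ** 2)
    grad = 2.0 * obs.T.dot(err) / obs.shape[0]
    return loss, grad

rng = np.random.RandomState(0)
obs = rng.randn(128, 4)
expert_actions = obs.dot(np.array([[0.5], [-1.0], [0.2], [0.0]]))
weights = np.zeros((4, 1))
for _ in range(200):
    loss, grad = bc_loss_and_grad(weights, obs, expert_actions)
    weights -= 1e-2 * grad
print(round(float(loss), 6))   # shrinks toward 0 as the clone fits the expert
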
Example #21
    def setup_model(self):
        with SetVerbosity(self.verbose):

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.compat.v1.variable_scope("oldpi", reuse=False):
                    old_pi = self.policy(self.sess,
                                         self.observation_space,
                                         self.action_space,
                                         self.n_envs,
                                         1,
                                         None,
                                         reuse=False,
                                         **self.policy_kwargs)

                with tf.compat.v1.variable_scope("loss", reuse=False):
                    # Target advantage function (if applicable)
                    atarg = tf.compat.v1.placeholder(dtype=tf.float32,
                                                     shape=[None])

                    # Empirical return
                    ret = tf.compat.v1.placeholder(dtype=tf.float32,
                                                   shape=[None])

                    # learning rate multiplier, updated with schedule
                    lrmult = tf.compat.v1.placeholder(name='lrmult',
                                                      dtype=tf.float32,
                                                      shape=[])

                    # Annealed clipping parameter epsilon
                    clip_param = self.clip_param * lrmult

                    obs_ph = self.policy_pi.obs_ph
                    action_ph = self.policy_pi.pdtype.sample_placeholder(
                        [None])

                    kloldnew = old_pi.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(input_tensor=kloldnew)
                    meanent = tf.reduce_mean(input_tensor=ent)
                    pol_entpen = (-self.entcoeff) * meanent

                    # pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action_ph) -
                        old_pi.proba_distribution.logp(action_ph))

                    # surrogate from conservative policy iteration
                    surr1 = ratio * atarg
                    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                                             1.0 + clip_param) * atarg

                    # PPO's pessimistic surrogate (L^CLIP)
                    pol_surr = -tf.reduce_mean(
                        input_tensor=tf.minimum(surr1, surr2))
                    vf_loss = tf.reduce_mean(
                        input_tensor=tf.square(self.policy_pi.value_flat -
                                               ret))
                    total_loss = pol_surr + pol_entpen + vf_loss
                    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
                    self.loss_names = [
                        "pol_surr", "pol_entpen", "vf_loss", "kl", "ent"
                    ]

                    tf.compat.v1.summary.scalar('entropy_loss', pol_entpen)
                    tf.compat.v1.summary.scalar('policy_gradient_loss',
                                                pol_surr)
                    tf.compat.v1.summary.scalar('value_function_loss', vf_loss)
                    tf.compat.v1.summary.scalar('approximate_kullback-leibler',
                                                meankl)
                    tf.compat.v1.summary.scalar('clip_factor', clip_param)
                    tf.compat.v1.summary.scalar('loss', total_loss)

                    self.params = tf_util.get_trainable_vars("model")

                    self.assign_old_eq_new = tf_util.function(
                        [], [],
                        updates=[
                            tf.compat.v1.assign(oldv, newv)
                            for (oldv, newv) in zipsame(
                                tf_util.get_globals_vars("oldpi"),
                                tf_util.get_globals_vars("model"))
                        ])

                with tf.compat.v1.variable_scope("Adam_mpi", reuse=False):
                    self.adam = MpiAdam(self.params,
                                        epsilon=self.adam_epsilon,
                                        sess=self.sess)

                with tf.compat.v1.variable_scope("input_info", reuse=False):
                    tf.compat.v1.summary.scalar(
                        'discounted_rewards', tf.reduce_mean(input_tensor=ret))
                    tf.compat.v1.summary.scalar(
                        'learning_rate',
                        tf.reduce_mean(input_tensor=self.optim_stepsize))
                    tf.compat.v1.summary.scalar(
                        'advantage', tf.reduce_mean(input_tensor=atarg))
                    tf.compat.v1.summary.scalar(
                        'clip_range',
                        tf.reduce_mean(input_tensor=self.clip_param))

                    if self.full_tensorboard_log:
                        tf.compat.v1.summary.histogram('discounted_rewards',
                                                       ret)
                        tf.compat.v1.summary.histogram('learning_rate',
                                                       self.optim_stepsize)
                        tf.compat.v1.summary.histogram('advantage', atarg)
                        tf.compat.v1.summary.histogram('clip_range',
                                                       self.clip_param)
                        if tf_util.is_image(self.observation_space):
                            tf.compat.v1.summary.image('observation', obs_ph)
                        else:
                            tf.compat.v1.summary.histogram(
                                'observation', obs_ph)

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                tf_util.initialize(sess=self.sess)

                self.summary = tf.compat.v1.summary.merge_all()

                self.lossandgrad = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    [self.summary,
                     tf_util.flatgrad(total_loss, self.params)] + losses)
                self.compute_losses = tf_util.function(
                    [obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult],
                    losses)
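
# A minimal NumPy sketch of the clipped surrogate loss assembled above (pol_surr):
# the per-sample minimum of the plain and clipped ratio terms, negated so the
# optimizer can minimise it.
import numpy as np

def ppo_clip_loss(ratio, advantage, clip_param):
    surr1 = ratio * advantage
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage
    return -np.mean(np.minimum(surr1, surr2))

ratio = np.array([0.8, 1.0, 1.4])        # pi_new / pi_old per sample
advantage = np.array([1.0, -0.5, 2.0])
print(ppo_clip_loss(ratio, advantage, clip_param=0.2))
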
Example #22
def build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess,
                               param_noise_filter_func=None):
    """
    Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder
    :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder
    :param sess: (TensorFlow session) The current TensorFlow session
    :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a
        variable should be perturbed. Only applicable if param_noise is True. If set to None,
        default_param_noise_filter is used.
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor))
        act function to select an action given an observation (see the top of the file for details),
        and a tuple containing the observation placeholder and the processed observation placeholder, respectively.
    """
    if param_noise_filter_func is None:
        param_noise_filter_func = default_param_noise_filter

    update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
    update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale")
    reset_ph = tf.placeholder(tf.bool, (), name="reset")

    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
    param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01),
                                        trainable=False)
    param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05),
                                            trainable=False)

    # Unmodified Q.
    policy = q_func(sess, ob_space, ac_space, 1, 1, None)
    obs_phs = (policy.obs_ph, policy.processed_obs)

    # Perturbable Q used for the actual rollout.
    with tf.variable_scope("perturbed_model", reuse=False):
        perturbable_policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs)

    def perturb_vars(original_scope, perturbed_scope):
        """
        We have to wrap this code into a function due to the way tf.cond() works.

        See https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for a more detailed
        discussion.

        :param original_scope: (str or VariableScope) the original scope.
        :param perturbed_scope: (str or VariableScope) the perturbed scope.
        :return: (TensorFlow Operation)
        """
        all_vars = scope_vars(absolute_scope_name(original_scope))
        all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
        assert len(all_vars) == len(all_perturbed_vars)
        perturb_ops = []
        for var, perturbed_var in zip(all_vars, all_perturbed_vars):
            if param_noise_filter_func(perturbed_var):
                # Perturb this variable.
                operation = tf.assign(perturbed_var,
                                      var + tf.random_normal(shape=tf.shape(var), mean=0.,
                                                             stddev=param_noise_scale))
            else:
                # Do not perturb, just assign.
                operation = tf.assign(perturbed_var, var)
            perturb_ops.append(operation)
        assert len(perturb_ops) == len(all_vars)
        return tf.group(*perturb_ops)

    # Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
    # of the network and measures the effect of that perturbation in action space. If the perturbation
    # is too big, reduce scale of perturbation, otherwise increase.
    with tf.variable_scope("adaptive_model", reuse=False):
        adaptive_policy = q_func(sess, ob_space, ac_space, 1, 1, None, obs_phs=obs_phs)
    perturb_for_adaption = perturb_vars(original_scope="model", perturbed_scope="adaptive_model/model")
    kl_loss = tf.reduce_sum(
        tf.nn.softmax(policy.q_values) *
        (tf.log(tf.nn.softmax(policy.q_values)) - tf.log(tf.nn.softmax(adaptive_policy.q_values))),
        axis=-1)
    mean_kl = tf.reduce_mean(kl_loss)

    def update_scale():
        """
        update the scale expression

        :return: (TensorFlow Tensor) the updated scale expression
        """
        with tf.control_dependencies([perturb_for_adaption]):
            update_scale_expr = tf.cond(mean_kl < param_noise_threshold,
                                        lambda: param_noise_scale.assign(param_noise_scale * 1.01),
                                        lambda: param_noise_scale.assign(param_noise_scale / 1.01),
                                        )
        return update_scale_expr

    # Functionality to update the threshold for parameter space noise.
    update_param_noise_thres_expr = param_noise_threshold.assign(
        tf.cond(update_param_noise_threshold_ph >= 0, lambda: update_param_noise_threshold_ph,
                lambda: param_noise_threshold))

    # Put everything together.
    perturbed_deterministic_actions = tf.argmax(perturbable_policy.q_values, axis=1)
    deterministic_actions = tf.argmax(policy.q_values, axis=1)
    batch_size = tf.shape(policy.obs_ph)[0]
    n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n
    random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64)
    chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
    perturbed_stochastic_actions = tf.where(chose_random, random_actions, perturbed_deterministic_actions)
    stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

    perturbed_output_actions = tf.cond(stochastic_ph, lambda: perturbed_stochastic_actions,
                                       lambda: deterministic_actions)
    output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
    update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
    updates = [
        update_eps_expr,
        tf.cond(reset_ph, lambda: perturb_vars(original_scope="model", perturbed_scope="perturbed_model/model"),
                lambda: tf.group(*[])),
        tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
        update_param_noise_thres_expr,
    ]

    _act = tf_util.function(inputs=[policy.obs_ph, stochastic_ph, update_eps_ph],
                            outputs=output_actions,
                            givens={update_eps_ph: -1.0, stochastic_ph: True},
                            updates=[update_eps_expr])

    _perturbed_act = tf_util.function(
        inputs=[policy.obs_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph,
                update_param_noise_scale_ph],
        outputs=perturbed_output_actions,
        givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False,
                update_param_noise_scale_ph: False},
        updates=updates)

    def act(obs, reset=None, update_param_noise_threshold=None, update_param_noise_scale=None, stochastic=True,
            update_eps=-1):
        """
        get the action from the current observation

        :param obs: (Any) Observation that can be feed into the output of make_obs_ph
        :param reset: (bool) reset the perturbed policy by sampling a new perturbation
        :param update_param_noise_threshold: (float) the desired threshold for the difference between
            non-perturbed and perturbed policy
        :param update_param_noise_scale: (bool) whether or not to update the scale of the noise for the next time
            it is re-perturbed
        :param stochastic: (bool) if set to False all the actions are always deterministic (default True)
        :param update_eps: (float) update epsilon to a new value; if negative, no update happens
            (default: no update)
        :return: (TensorFlow Tensor) tensor of dtype tf.int64 and shape (BATCH_SIZE,) with an action to be
            performed for every element of the batch.
        """
        if reset is None or update_param_noise_threshold is None or update_param_noise_scale is None:
            return _act(obs, stochastic, update_eps)
        else:
            return _perturbed_act(obs, stochastic, update_eps, reset, update_param_noise_threshold,
                                  update_param_noise_scale)

    return act, obs_phs
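
# A minimal sketch of the adaptive scaling rule wired into update_scale() above: if
# the KL between the unperturbed and perturbed policies stays below the threshold,
# the noise scale grows; otherwise it shrinks.
def adapt_param_noise_scale(scale, mean_kl, threshold, factor=1.01):
    return scale * factor if mean_kl < threshold else scale / factor

scale = 0.01
for kl in [0.01, 0.02, 0.08, 0.12]:      # illustrative KL measurements
    scale = adapt_param_noise_scale(scale, kl, threshold=0.05)
print(round(scale, 6))
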
Example #23
def build_act_mod(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph,
                  sess):
    """
    Original from build_graph.py from stable_baselines.deepq. This
    modified version returns the full sorted action array instead of
    the single best action.

    Creates the act function:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param stochastic_ph: (TensorFlow Tensor) the stochastic placeholder
    :param update_eps_ph: (TensorFlow Tensor) the update_eps placeholder
    :param sess: (TensorFlow session) The current TensorFlow session
    :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor)
        act function to select an action given an observation (see the top of the file for details),
        A tuple containing the observation placeholder and the processed observation placeholder respectively.
    """
    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

    policy = q_func(sess, ob_space, ac_space, 1, 1, None)
    obs_phs = (policy.obs_ph, policy.processed_obs)
    ####################################################################
    # MODIFICATION:
    # Get all sorted q_values instead of just the best one. Have to
    # cast to int64, since it comes back as int32 which is incompatible
    # with the random_actions, which is int64.
    deterministic_actions = tf.cast(
        tf.argsort(policy.q_values, axis=1, direction='DESCENDING'), tf.int64)
    ####################################################################
    # ORIGINAL:
    # Note that the original comes out as int64 since that's the default
    # from argmax.
    # deterministic_actions = tf.argmax(policy.q_values, axis=1)

    batch_size = tf.shape(policy.obs_ph)[0]
    n_actions = ac_space.nvec if isinstance(ac_space,
                                            MultiDiscrete) else ac_space.n
    ####################################################################
    # MODIFICATION:
    # TODO: It feels wrong not to use "batch_size" as was done in the
    #   original, but this seems to be working.
    random_actions = tf.random_uniform((1, n_actions),
                                       minval=0,
                                       maxval=n_actions,
                                       dtype=tf.int64)
    ####################################################################
    # ORIGINAL:
    # random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=n_actions, dtype=tf.int64)
    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
    stochastic_actions = tf.where(chose_random, random_actions,
                                  deterministic_actions)

    output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions,
                             lambda: deterministic_actions)
    update_eps_expr = eps.assign(
        tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
    _act = tf_util.function(
        inputs=[policy.obs_ph, stochastic_ph, update_eps_ph],
        outputs=output_actions,
        givens={
            update_eps_ph: -1.0,
            stochastic_ph: True
        },
        updates=[update_eps_expr])

    def act(obs, stochastic=True, update_eps=-1):
        return _act(obs, stochastic, update_eps)

    return act, obs_phs
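
# A minimal NumPy sketch of the modification above: instead of the single argmax
# action, the act function yields every action index sorted by descending Q-value,
# so a caller can fall back to the next-best action.
import numpy as np

def sorted_actions(q_values):
    return np.argsort(q_values, axis=1)[:, ::-1]

q = np.array([[0.1, 0.9, 0.4]])
print(sorted_actions(q))   # [[1 2 0]] -- best action first
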
Example #24
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """
    Creates the train function:

    :param make_obs_ph: (function (str): TensorFlow Tensor) a function that takes a name and creates a placeholder of
        input with that name
    :param q_func: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor)
        the model that takes the following inputs:
            - observation_in: (Any) the output of observation placeholder
            - num_actions: int  number of actions
            - scope: (str)
            - reuse: (bool)

            should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions)
            with values of every action.
    :param num_actions: (int) number of actions
    :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective.
    :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed.
    :param gamma: (float) discount rate.
    :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a
        good idea to keep it enabled.
    :param scope: (str or VariableScope) optional scope for variable_scope.
    :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given.
    :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a
        variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter
        is used by default.

    :return: (tuple)

        act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select an action given an
            observation. See the top of the file for details.
        train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float)
            optimize the error in Bellman's equation. See the top of the file for details.
        update_target: (function) copy the parameters from optimized Q function to the target Q function.
            See the top of the file for details.
        debug: ({str: function}) a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = tf_utils.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = tf_utils.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                                  outputs=td_error,
                                  updates=[optimize_expr])
        update_target = tf_utils.function([], [], updates=[update_target_expr])

        q_values = tf_utils.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
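
The comment block above spells out the Double DQN target: the online network picks the best action at t + 1, the target network scores it, terminal transitions are masked out, and the discounted result is added to the reward. A minimal NumPy sketch of the same arithmetic, with all array names purely illustrative:

import numpy as np

def double_q_target(rew, done, q_tp1_online, q_tp1_target, gamma=0.99):
    """Bellman target used above: online net selects, target net evaluates."""
    best_act = np.argmax(q_tp1_online, axis=1)                     # action choice from online net
    q_tp1_best = q_tp1_target[np.arange(len(best_act)), best_act]  # value from target net
    return rew + gamma * (1.0 - done) * q_tp1_best                 # mask out terminal states

# toy batch of 2 transitions
rew = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
q_tp1_online = np.array([[0.1, 0.5], [0.3, 0.2]])
q_tp1_target = np.array([[0.0, 0.4], [0.6, 0.1]])
print(double_q_target(rew, done, q_tp1_online, q_tp1_target))     # [1.396, 0.0]
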
Example #25
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=2,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # env._seed(seed)

    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app

    dirname = '{}_{}opts_saves/'.format(gamename, num_options)

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_main.py']
        for i in range(len(files)):
            src = os.path.expanduser('~/baselines/baselines/ppo1/') + files[i]
            dest = os.path.expanduser('~/baselines/baselines/ppo1/') + dirname
            shutil.copy2(src, dest)

    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)
    op_loss = -tf.reduce_sum(log_pi[0][option[0]] * atarg + entropy * 0.1)

    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option, term_adv],
                             losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)
    U.initialize()
    adam.sync()
    saver = tf.train.Saver(max_to_keep=10000)

    results = []
    if saves:
        results = open(
            gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)

        out += '\n'

        results.write(out)

        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:
        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))

        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))
        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 5 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        min_batch = 160
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            # This part is only necessary when we use options.
            # We perform these checks so that no collected trajectories are discarded.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)

            optim_batchsize = optim_batchsize or ob.shape[0]
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult,
                                                    [opt], tadv)
                    termg = termloss(batch["ob"], [opt], tadv)
                    adam.update(termg[0], 5e-7 * cur_lrmult)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
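
surr1/surr2 and the minimum taken above implement PPO's clipped (pessimistic) surrogate L^CLIP. A minimal NumPy sketch of that objective on a toy batch; the function and variable names are illustrative, not from the library:

import numpy as np

def ppo_clip_objective(logp_new, logp_old, adv, clip_param=0.2):
    """Pessimistic (clipped) surrogate: mean of min(r*A, clip(r)*A)."""
    ratio = np.exp(logp_new - logp_old)                              # pnew / pold
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return np.mean(np.minimum(surr1, surr2))                         # maximized; negate for a loss

logp_new = np.array([-0.9, -1.2, -0.4])
logp_old = np.array([-1.0, -1.0, -1.0])
adv = np.array([1.0, -0.5, 2.0])
print(ppo_clip_objective(logp_new, logp_old, adv))
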
Example #26
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):

        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=tf_util.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         tf_util.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            tf_util.normc_initializer(0.01))

        self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)
        self.state_in = []
        self.state_out = []

        # change for BC
        self.stochastic_ph = tf.placeholder(dtype=tf.bool,
                                            shape=(),
                                            name="stochastic")
        action = tf_util.switch(self.stochastic_ph,
                                self.proba_distribution.sample(),
                                self.proba_distribution.mode())
        self.action = action
        self._act = tf_util.function([self.stochastic_ph, obs],
                                     [action, self.vpred])
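
With gaussian_fixed_var enabled, the policy head above emits a state-dependent mean and a single state-independent logstd row, broadcast over the batch and concatenated into one flat parameter vector for the Gaussian distribution. A small NumPy sketch of that concatenation, with illustrative shapes:

import numpy as np

batch, ac_dim = 4, 2
mean = np.random.randn(batch, ac_dim)           # state-dependent, from the final dense layer
logstd = np.zeros((1, ac_dim))                  # single trainable row, shared by all states
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)  # broadcast logstd to the batch
print(pdparam.shape)                            # (4, 4): first half mean, second half logstd
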
Example #27
0
    def setup_model(self):
        # prevent import loops
        from stable_baselines.gail.adversary import TransitionClassifier

        with SetVerbosity(self.verbose):

            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the TRPO model must be " \
                                                               "an instance of common.policies.ActorCriticPolicy."

            self.nworkers = MPI.COMM_WORLD.Get_size()
            self.rank = MPI.COMM_WORLD.Get_rank()
            np.set_printoptions(precision=3)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.single_threaded_session(graph=self.graph)

                if self.using_gail:
                    self.reward_giver = TransitionClassifier(
                        self.observation_space,
                        self.action_space,
                        self.hidden_size_adversary,
                        entcoeff=self.adversary_entcoeff)

                # Construct network for new policy
                self.policy_pi = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                # Network for old policy
                with tf.variable_scope("oldpi", reuse=False):
                    old_policy = self.policy(self.sess,
                                             self.observation_space,
                                             self.action_space,
                                             self.n_envs,
                                             1,
                                             None,
                                             reuse=False,
                                             **self.policy_kwargs)

                with tf.variable_scope("loss", reuse=False):
                    atarg = tf.placeholder(dtype=tf.float32, shape=[
                        None
                    ])  # Target advantage function (if applicable)
                    ret = tf.placeholder(dtype=tf.float32,
                                         shape=[None])  # Empirical return

                    observation = self.policy_pi.obs_ph
                    action = self.policy_pi.pdtype.sample_placeholder([None])

                    kloldnew = old_policy.proba_distribution.kl(
                        self.policy_pi.proba_distribution)
                    ent = self.policy_pi.proba_distribution.entropy()
                    meankl = tf.reduce_mean(kloldnew)
                    meanent = tf.reduce_mean(ent)
                    entbonus = self.entcoeff * meanent

                    vferr = tf.reduce_mean(
                        tf.square(self.policy_pi.value_fn[:, 0] - ret))

                    # advantage * pnew / pold
                    ratio = tf.exp(
                        self.policy_pi.proba_distribution.logp(action) -
                        old_policy.proba_distribution.logp(action))
                    surrgain = tf.reduce_mean(ratio * atarg)

                    optimgain = surrgain + entbonus
                    losses = [optimgain, meankl, entbonus, surrgain, meanent]
                    self.loss_names = [
                        "optimgain", "meankl", "entloss", "surrgain", "entropy"
                    ]

                    dist = meankl

                    all_var_list = tf_util.get_trainable_vars("model")
                    var_list = [
                        v for v in all_var_list
                        if "/vf" not in v.name and "/q/" not in v.name
                    ]
                    vf_var_list = [
                        v for v in all_var_list
                        if "/pi" not in v.name and "/logstd" not in v.name
                    ]

                    self.get_flat = tf_util.GetFlat(var_list, sess=self.sess)
                    self.set_from_flat = tf_util.SetFromFlat(var_list,
                                                             sess=self.sess)

                    klgrads = tf.gradients(dist, var_list)
                    flat_tangent = tf.placeholder(dtype=tf.float32,
                                                  shape=[None],
                                                  name="flat_tan")
                    shapes = [var.get_shape().as_list() for var in var_list]
                    start = 0
                    tangents = []
                    for shape in shapes:
                        var_size = tf_util.intprod(shape)
                        tangents.append(
                            tf.reshape(flat_tangent[start:start + var_size],
                                       shape))
                        start += var_size
                    gvp = tf.add_n([
                        tf.reduce_sum(grad * tangent)
                        for (grad, tangent) in zipsame(klgrads, tangents)
                    ])  # pylint: disable=E1111
                    fvp = tf_util.flatgrad(gvp, var_list)

                    tf.summary.scalar('entropy_loss', meanent)
                    tf.summary.scalar('policy_gradient_loss', optimgain)
                    tf.summary.scalar('value_function_loss', surrgain)
                    tf.summary.scalar('approximate_kullback-leiber', meankl)
                    tf.summary.scalar(
                        'loss',
                        optimgain + meankl + entbonus + surrgain + meanent)

                    self.assign_old_eq_new = \
                        tf_util.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in
                                                          zipsame(tf_util.get_globals_vars("oldpi"),
                                                                  tf_util.get_globals_vars("model"))])
                    self.compute_losses = tf_util.function(
                        [observation, old_policy.obs_ph, action, atarg],
                        losses)
                    self.compute_fvp = tf_util.function([
                        flat_tangent, observation, old_policy.obs_ph, action,
                        atarg
                    ], fvp)
                    self.compute_vflossandgrad = tf_util.function(
                        [observation, old_policy.obs_ph, ret],
                        tf_util.flatgrad(vferr, vf_var_list))

                    @contextmanager
                    def timed(msg):
                        if self.rank == 0 and self.verbose >= 1:
                            print(colorize(msg, color='magenta'))
                            start_time = time.time()
                            yield
                            print(
                                colorize("done in {:.3f} seconds".format(
                                    (time.time() - start_time)),
                                         color='magenta'))
                        else:
                            yield

                    def allmean(arr):
                        assert isinstance(arr, np.ndarray)
                        out = np.empty_like(arr)
                        MPI.COMM_WORLD.Allreduce(arr, out, op=MPI.SUM)
                        out /= self.nworkers
                        return out

                    tf_util.initialize(sess=self.sess)

                    th_init = self.get_flat()
                    MPI.COMM_WORLD.Bcast(th_init, root=0)
                    self.set_from_flat(th_init)

                with tf.variable_scope("Adam_mpi", reuse=False):
                    self.vfadam = MpiAdam(vf_var_list, sess=self.sess)
                    if self.using_gail:
                        self.d_adam = MpiAdam(
                            self.reward_giver.get_trainable_variables(),
                            sess=self.sess)
                        self.d_adam.sync()
                    self.vfadam.sync()

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('discounted_rewards',
                                      tf.reduce_mean(ret))
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.vf_stepsize))
                    tf.summary.scalar('advantage', tf.reduce_mean(atarg))
                    tf.summary.scalar('kl_clip_range',
                                      tf.reduce_mean(self.max_kl))

                    if self.full_tensorboard_log:
                        tf.summary.histogram('discounted_rewards', ret)
                        tf.summary.histogram('learning_rate', self.vf_stepsize)
                        tf.summary.histogram('advantage', atarg)
                        tf.summary.histogram('kl_clip_range', self.max_kl)
                        if tf_util.is_image(self.observation_space):
                            tf.summary.image('observation', observation)
                        else:
                            tf.summary.histogram('observation', observation)

                self.timed = timed
                self.allmean = allmean

                self.step = self.policy_pi.step
                self.proba_step = self.policy_pi.proba_step
                self.initial_state = self.policy_pi.initial_state

                self.params = find_trainable_variables("model")
                if self.using_gail:
                    self.params.extend(
                        self.reward_giver.get_trainable_variables())

                self.summary = tf.summary.merge_all()

                self.compute_lossandgrad = \
                    tf_util.function([observation, old_policy.obs_ph, action, atarg, ret],
                                     [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses)
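
compute_fvp above evaluates the Fisher-vector product of the mean KL; TRPO then solves F x = g with conjugate gradient to obtain the natural-gradient search direction. The solver itself is not shown in this snippet, so here is a minimal, self-contained NumPy conjugate-gradient sketch written against an arbitrary matrix-vector product callable (an illustration, not the library's own routine):

import numpy as np

def conjugate_gradient(fvp, b, iters=10, residual_tol=1e-10):
    """Approximately solve F x = b given only the product x -> F x."""
    x = np.zeros_like(b)
    r = b.copy()                 # residual b - F x (x starts at 0)
    p = b.copy()                 # search direction
    rdotr = r.dot(r)
    for _ in range(iters):
        z = fvp(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        new_rdotr = r.dot(r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

# toy check with an explicit SPD matrix standing in for the Fisher
F = np.array([[2.0, 0.3], [0.3, 1.0]])
b = np.array([1.0, -1.0])
x = conjugate_gradient(lambda v: F.dot(v), b)
print(np.allclose(F.dot(x), b))  # True
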
Example #28
0
    def __init__(self, ob_dim, ac_dim, verbose=1):
        """
        Create an MLP policy for a value function

        :param ob_dim: (int) Observation dimension
        :param ac_dim: (int) action dimension
        :param verbose: (int) verbosity level
        """
        obs_ph = tf.placeholder(tf.float32,
                                shape=[None, ob_dim * 2 + ac_dim * 2 + 2
                                       ])  # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        layer_1 = tf.nn.elu(
            dense(obs_ph,
                  64,
                  "h1",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        layer_2 = tf.nn.elu(
            dense(layer_1,
                  64,
                  "h2",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        vpred_n = dense(layer_2,
                        1,
                        "hfinal",
                        weight_init=tf_util.normc_initializer(1.0),
                        bias_init=0,
                        weight_loss_dict=wd_dict)[:, 0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = tf.reduce_mean(
            tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))

        self._predict = tf_util.function([obs_ph], vpred_n)

        optim = kfac.KfacOptimizer(learning_rate=0.001,
                                   cold_lr=0.001 * (1 - 0.9),
                                   momentum=0.9,
                                   clip_kl=0.3,
                                   epsilon=0.1,
                                   stats_decay=0.95,
                                   async=1,
                                   kfac_update=2,
                                   cold_iter=50,
                                   weight_decay_dict=wd_dict,
                                   max_grad_norm=None,
                                   verbose=verbose)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss,
                                                  loss_sampled,
                                                  var_list=vf_var_list)
        self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
        tf_util.initialize()  # Initialize uninitialized TF variables
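
loss_sampled above regresses the value head toward its own prediction plus unit Gaussian noise, with the noisy target frozen by stop_gradient; the K-FAC optimizer uses gradients of this sampled loss, rather than of the true regression loss, to estimate curvature under a unit-variance Gaussian output model. A tiny NumPy sketch contrasting the two losses (names illustrative; the weight-decay term is omitted):

import numpy as np

rng = np.random.default_rng(0)
vpred = rng.normal(size=8)                                  # value predictions for a batch
vtarg = rng.normal(size=8)                                  # empirical returns

train_loss = np.mean(np.square(vpred - vtarg))              # what minimize() actually reduces
sampled_target = vpred + rng.normal(size=8)                 # treated as a constant (stop_gradient)
sampled_loss = np.mean(np.square(vpred - sampled_target))   # only used for curvature estimation
print(train_loss, sampled_loss)
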
Example #29
0
def build_train(q_func, ob_space, ac_space, optimizer, sess, grad_norm_clipping=None, gamma=1.0, double_q=True,
                scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None):
    """
    Creates the train function:

    :param q_func: (DQNPolicy) the policy
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param optimizer: (tf.train.Optimizer) optimizer to use for the Q-learning objective.
    :param sess: (TensorFlow session) The current TensorFlow session
    :param grad_norm_clipping: (float) clip gradient norms to this value. If None no clipping is performed.
    :param gamma: (float) discount rate.
    :param double_q: (bool) if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a
        good idea to keep it enabled.
    :param scope: (str or VariableScope) optional scope for variable_scope.
    :param reuse: (bool) whether or not the variables should be reused. To be able to reuse the scope must be given.
    :param param_noise: (bool) whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    :param param_noise_filter_func: (function (TensorFlow Tensor): bool) function that decides whether or not a
        variable should be perturbed. Only applicable if param_noise is True. If set to None,
        default_param_noise_filter is used.

    :return: (tuple)

        act: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor) function to select an action given an
            observation. See the top of the file for details.
        train: (function (Any, numpy float, numpy float, Any, numpy bool, numpy float): numpy float)
            optimize the error in Bellman's equation. See the top of the file for details.
        update_target: (function) copy the parameters from optimized Q function to the target Q function.
            See the top of the file for details.
        step_model: (DQNPolicy) Policy for evaluation
    """
    n_actions = ac_space.nvec if isinstance(ac_space, MultiDiscrete) else ac_space.n
    with tf.variable_scope("input", reuse=reuse):
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

    with tf.variable_scope(scope, reuse=reuse):
        if param_noise:
            act_f, obs_phs = build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess,
                                                        param_noise_filter_func=param_noise_filter_func)
        else:
            act_f, obs_phs = build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess)

        # q network evaluation
        with tf.variable_scope("step_model", reuse=True, custom_getter=tf_util.outer_scope_getter("step_model")):
            step_model = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True, obs_phs=obs_phs)
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/model")
        # target q network evaluation

        with tf.variable_scope("target_q_func", reuse=False):
            target_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=False)
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=tf.get_variable_scope().name + "/target_q_func")

        # compute estimate of best possible value starting from state at t + 1
        double_q_values = None
        double_obs_ph = target_policy.obs_ph
        if double_q:
            with tf.variable_scope("double_q", reuse=True, custom_getter=tf_util.outer_scope_getter("double_q")):
                double_policy = q_func(sess, ob_space, ac_space, 1, 1, None, reuse=True)
                double_q_values = double_policy.q_values
                double_obs_ph = double_policy.obs_ph

    with tf.variable_scope("loss", reuse=reuse):
        # set up placeholders
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(step_model.q_values * tf.one_hot(act_t_ph, n_actions), axis=1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_best_using_online_net = tf.argmax(double_q_values, axis=1)
            q_tp1_best = tf.reduce_sum(target_policy.q_values * tf.one_hot(q_tp1_best_using_online_net, n_actions), axis=1)
        else:
            q_tp1_best = tf.reduce_max(target_policy.q_values, axis=1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = tf_util.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        tf.summary.scalar("td_error", tf.reduce_mean(td_error))
        tf.summary.histogram("td_error", td_error)
        tf.summary.scalar("loss", weighted_error)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # compute optimization op (potentially with gradient clipping)
        gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars)
        if grad_norm_clipping is not None:
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)

    with tf.variable_scope("input_info", reuse=False):
        # Edit: Add action
        tf.summary.scalar('actions', tf.reduce_mean(act_t_ph))
        tf.summary.histogram('actions', act_t_ph)
        tf.summary.scalar('rewards', tf.reduce_mean(rew_t_ph))
        tf.summary.histogram('rewards', rew_t_ph)
        tf.summary.scalar('importance_weights', tf.reduce_mean(importance_weights_ph))
        tf.summary.histogram('importance_weights', importance_weights_ph)
        # Valid image: RGB, RGBD, GrayScale
        is_image = len(obs_phs[0].shape) == 3 and obs_phs[0].shape[-1] in [1, 3, 4]
        if is_image:
            tf.summary.image('observation', obs_phs[0])
        elif len(obs_phs[0].shape) == 1:
            tf.summary.histogram('observation', obs_phs[0])

    optimize_expr = optimizer.apply_gradients(gradients)

    summary = tf.summary.merge_all()

    # Create callable functions
    train = tf_util.function(
        inputs=[
            obs_phs[0],
            act_t_ph,
            rew_t_ph,
            target_policy.obs_ph,
            double_obs_ph,
            done_mask_ph,
            importance_weights_ph
        ],
        outputs=[summary, td_error],
        updates=[optimize_expr]
    )
    update_target = tf_util.function([], [], updates=[update_target_expr])

    return act_f, train, update_target, step_model
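
The loss above passes the TD error through a Huber loss and weights each sample by the importance weights coming from prioritized replay. A NumPy sketch of that reduction, assuming the conventional delta of 1.0 for the Huber transition point:

import numpy as np

def huber(x, delta=1.0):
    """Quadratic near zero, linear in the tails."""
    return np.where(np.abs(x) <= delta,
                    0.5 * np.square(x),
                    delta * (np.abs(x) - 0.5 * delta))

td_error = np.array([0.2, -3.0, 1.5])          # q_t_selected - stop_gradient(target)
weights = np.array([1.0, 0.5, 0.8])            # importance weights from prioritized replay
weighted_error = np.mean(weights * huber(td_error))
print(weighted_error)
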
Example #30
0
def build_act(q_func, ob_space, ac_space, sess, num_actions,
              num_action_streams, stochastic_ph, update_eps_ph):
    """Creates the act function:

    Parameters
    ----------
    q_func: (DQNPolicy) the policy constructor; it is called as
        q_func(sess, ob_space, ac_space, n_env, n_steps, n_batch, num_actions) and the
        resulting policy must expose obs_ph, processed_obs and per-stream q_values.
    ob_space: (Gym Space)
        the observation space of the environment
    ac_space: (Gym Space)
        the action space of the environment
    sess: (TensorFlow session)
        the current TensorFlow session
    num_actions: int
        total number of sub-actions to be represented at the output
    num_action_streams: int
        specifies the number of action branches in the action value (or advantage) function representation
    stochastic_ph: tf.placeholder
        boolean flag that switches between epsilon-greedy and purely greedy action selection
    update_eps_ph: tf.placeholder
        new value for epsilon; a negative value leaves epsilon unchanged

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    obs_phs: (tf.placeholder, tf.Tensor)
        the observation placeholder and the processed observation tensor
    """
    eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

    policy = q_func(sess, ob_space, ac_space, 1, 1, None, num_actions)
    obs_phs = (policy.obs_ph, policy.processed_obs)

    assert (num_action_streams >=
            1), "number of action branches is not acceptable, has to be >=1"

    # TODO better: enable non-uniform number of sub-actions per joint
    num_actions_pad = num_actions // num_action_streams  # number of sub-actions per action dimension

    output_actions = []
    for dim in range(num_action_streams):
        q_values_batch = policy.q_values[dim][
            0]  # TODO better: does not allow evaluating actions over a whole batch
        deterministic_action = tf.argmax(q_values_batch)
        random_action = tf.random_uniform([],
                                          minval=0,
                                          maxval=num_actions //
                                          num_action_streams,
                                          dtype=tf.int64)
        chose_random = tf.random_uniform(
            [], minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_action = tf.cond(chose_random, lambda: random_action,
                                    lambda: deterministic_action)
        output_action = tf.cond(stochastic_ph, lambda: stochastic_action,
                                lambda: deterministic_action)
        output_actions.append(output_action)

    update_eps_expr = eps.assign(
        tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

    _act = tf_util.function(
        inputs=[policy.obs_ph, stochastic_ph, update_eps_ph],
        outputs=output_actions,
        givens={
            update_eps_ph: -1.0,
            stochastic_ph: True
        },
        updates=[update_eps_expr])

    def act(obs, stochastic=True, update_eps=-1):
        return _act(obs, stochastic, update_eps)

    return act, obs_phs
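
Each iteration of the loop above builds an independent epsilon-greedy head for one action stream: with probability eps it draws a uniformly random sub-action, otherwise it takes the argmax of that stream's Q-values. A NumPy sketch of the same selection rule, with illustrative names:

import numpy as np

def branching_eps_greedy(q_values_per_stream, eps, rng=np.random):
    """Pick one sub-action per stream, exploring each stream independently."""
    actions = []
    for q_values in q_values_per_stream:            # one vector of sub-action values per stream
        if rng.uniform() < eps:
            actions.append(rng.randint(len(q_values)))
        else:
            actions.append(int(np.argmax(q_values)))
    return actions

q_values_per_stream = [np.array([0.1, 0.7, 0.2]), np.array([0.4, 0.3, 0.9])]
print(branching_eps_greedy(q_values_per_stream, eps=0.1))
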