    def __init__(self, opt, job):
        self.opt = opt
        with tf.Graph().as_default():
            tf.set_random_seed(opt.seed)
            np.random.seed(opt.seed)

            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim)

            # Main outputs from computation graph
            with tf.variable_scope('main'):
                self.q, _ = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim)

            # Set up summary Ops
            self.test_ops, self.test_vars = self.build_summaries()

            self.sess = tf.Session(
                config=tf.ConfigProto(
                    device_count={'GPU': 0},
                    intra_op_parallelism_threads=1,
                    inter_op_parallelism_threads=1))

            self.sess.run(tf.global_variables_initializer())

            if job == "test":
                self.writer = tf.summary.FileWriter(
                    opt.summary_dir + "/" + str(datetime.datetime.now()) + "-" + opt.env_name + "-" + opt.exp_name +
                    "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph)

            variables_all = tf.contrib.framework.get_variables_to_restore()
            variables_bn = [v for v in variables_all if 'moving_mean' in v.name or 'moving_variance' in v.name]

            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                self.q, self.sess, input_variables=variables_bn)
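
Below is a minimal, self-contained sketch (not part of the example above) of how the TensorFlowVariables helper is typically used to pull and push network weights between Ray workers. It assumes TensorFlow 1.x and a Ray version that still ships ray.experimental.tf_utils; the tiny dense layer is only for illustration.

import tensorflow as tf
import ray.experimental.tf_utils

with tf.Graph().as_default():
    x = tf.placeholder(tf.float32, [None, 4])
    q = tf.layers.dense(x, 2, name='q')
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    variables = ray.experimental.tf_utils.TensorFlowVariables(q, sess)
    weights = variables.get_weights()   # dict: variable name -> numpy array
    variables.set_weights(weights)      # push weights back, e.g. from a learner actor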
Example #2
    def __init__(self, opt, job):
        self.opt = opt
        with tf.Graph().as_default():
            tf.set_random_seed(opt.seed)
            np.random.seed(opt.seed)

            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
                core.placeholders(opt.obs_dim, opt.act_dim, opt.obs_dim, None, None)

            # Main outputs from computation graph
            with tf.variable_scope('main'):
                self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = \
                    actor_critic(self.x_ph, self.x2_ph, self.a_ph, action_space=opt.ac_kwargs["action_space"])

            # Set up summary Ops
            self.test_ops, self.test_vars = self.build_summaries()

            self.sess = tf.Session(
                config=tf.ConfigProto(device_count={'GPU': 0},
                                      intra_op_parallelism_threads=1,
                                      inter_op_parallelism_threads=1))

            self.sess.run(tf.global_variables_initializer())

            if job == "main":
                self.writer = tf.summary.FileWriter(
                    opt.summary_dir + "/" + str(datetime.datetime.now()) +
                    "-" + opt.env_name + "-workers_num:" +
                    str(opt.num_workers) + "%" + str(opt.a_l_ratio),
                    self.sess.graph)

            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                self.pi, self.sess)
Example #3
    def __init__(self, observation_space, action_space):
        obs_dim = observation_space.shape
        act_dim = action_space.shape

        # Share information about action space with policy architecture
        ac_kwargs = dict()
        ac_kwargs['action_space'] = action_space
        #ac_kwargs['output_activation'] = tf.tanh

        # Inputs to computation graph
        self.x_ph, self.a_ph = core.placeholders_from_spaces(observation_space, action_space)
        self.adv_ph, self.ret_ph, self.logp_old_ph = core.placeholders(None, None, None)

        # Main outputs from computation graph
        self.pi, self.logp, self.logp_pi, self.v = core.mlp_actor_critic(
            self.x_ph, self.a_ph, output_activation=tf.tanh, **ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        self.all_phs = [self.x_ph, self.a_ph, self.adv_ph, self.ret_ph, self.logp_old_ph]

        # Every step, get: action, value, and logprob
        self.get_action_ops = [self.pi, self.v, self.logp_pi]

        # Experience buffer
        steps_per_epoch = 1000
        self.local_steps_per_epoch = steps_per_epoch
        gamma = 0.99
        lam = 0.97
        self.buf = PPOBuffer(obs_dim, act_dim, self.local_steps_per_epoch, gamma, lam)

        # Count variables
        var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
        print(var_counts)

        # PPO objectives
        clip_ratio = 0.2
        ratio = tf.exp(self.logp - self.logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(self.adv_ph > 0, (1 + clip_ratio) * self.adv_ph, (1 - clip_ratio) * self.adv_ph)
        self.pi_loss = -tf.reduce_mean(tf.minimum(ratio * self.adv_ph, min_adv))
        self.v_loss = tf.reduce_mean((self.ret_ph - self.v) ** 2)

        # Info (useful to watch during learning)
        self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)  # a sample estimate for KL-divergence, easy to compute
        self.approx_ent = tf.reduce_mean(-self.logp)  # a sample estimate for entropy, also easy to compute
        self.clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
        self.clipfrac = tf.reduce_mean(tf.cast(self.clipped, tf.float32))
        pi_lr = 3e-4
        vf_lr = 1e-3
        pi_optimizer = tf.train.AdadeltaOptimizer(learning_rate=pi_lr)
        vf_optimizer = tf.train.AdadeltaOptimizer(learning_rate=vf_lr)
        self.train_pi = pi_optimizer.minimize(self.pi_loss)
        self.train_v = vf_optimizer.minimize(self.v_loss)
        self.train_pi_iters = 80
        self.train_v_iters = 80
        self.target_kl = 0.01

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
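
A hedged sketch of the update step these attributes (train_pi_iters, train_v_iters, target_kl, approx_kl) are set up for; it is not part of the original class, and the 1.5 * target_kl early-stopping factor is the usual spinup heuristic, assumed here rather than taken from this code.

    def update(self, rollout_data):
        # rollout_data is a hypothetical list aligned with self.all_phs:
        # [obs, actions, advantages, returns, old_logps]
        inputs = dict(zip(self.all_phs, rollout_data))
        for i in range(self.train_pi_iters):
            _, kl = self.sess.run([self.train_pi, self.approx_kl], feed_dict=inputs)
            if kl > 1.5 * self.target_kl:   # assumed early-stopping rule
                break
        for _ in range(self.train_v_iters):
            self.sess.run(self.train_v, feed_dict=inputs)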
Example #4
    def __init__(self):
        self.sess = tf.Session()
        self.memory = replay_buffer(max_length=1e5)
        self.tau = 0.995
        self.gamma = 0.99
        self.state_size = 33
        self.output_size = 4
        self.action_limit = 1.0
        self.hidden = [400, 300]
        self.batch_size = 100
        self.pi_lr = 1e-4
        self.q_lr = 1e-4
        self.noise = OU_noise(self.output_size, 1)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.mlp_actor_critic(
                self.x_ph,
                self.a_ph,
                self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, self.q_pi_targ = cr.mlp_actor_critic(
                self.x2_ph, self.a_ph, self.hidden,
                activation=tf.nn.relu, output_activation=tf.tanh,
                output_size=self.output_size, action_limit=self.action_limit)

        self.target = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.q_pi_targ)
        self.pi_loss = -tf.reduce_mean(self.q_pi)
        self.v_loss = tf.reduce_mean((self.q - self.target)**2) * 0.5
        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.v_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(
            self.pi_loss, var_list=cr.get_vars('main/pi'))
        self.v_train = self.v_optimizer.minimize(
            self.v_loss, var_list=cr.get_vars('main/q'))

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
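
A hedged sketch of the training step the ops above are built for; it is not part of the original class, and the replay-buffer interface shown (sample() returning a transition tuple) is an assumption rather than the actual replay_buffer API.

    def train_step(self):
        s, a, r, s2, d = self.memory.sample(self.batch_size)   # hypothetical signature
        feed = {self.x_ph: s, self.a_ph: a, self.r_ph: r,
                self.x2_ph: s2, self.d_ph: d}
        self.sess.run(self.v_train, feed_dict=feed)    # critic toward the Bellman target
        self.sess.run(self.pi_train, feed_dict=feed)   # actor via -mean(Q(s, pi(s)))
        self.sess.run(self.target_update)              # Polyak-average the target network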
Example #5
    def __init__(self, opt, job):
        self.opt = opt
        with tf.Graph().as_default():
            tf.set_random_seed(opt.seed)
            np.random.seed(opt.seed)

            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph = core.placeholders(
                opt.obs_shape, opt.act_shape, opt.obs_shape)

            # ------
            if opt.alpha == 'auto':
                log_alpha = tf.get_variable('log_alpha',
                                            dtype=tf.float32,
                                            initializer=0.0)
                alpha_v = tf.exp(log_alpha)
            else:
                alpha_v = opt.alpha
            # ------

            # Main outputs from computation graph
            with tf.variable_scope('main'):
                self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu \
                    = actor_critic(self.x_ph, self.x2_ph, self.a_ph, alpha_v,
                                   hidden_sizes=opt.hidden_size,
                                   action_space=opt.act_space,
                                   phase=False, use_bn=opt.use_bn, coefficent_regularizer=opt.c_regularizer,
                                   model=opt.model)

            # Set up summary Ops
            self.test_ops, self.test_vars = self.build_summaries()

            self.sess = tf.Session(config=tf.ConfigProto(
                # device_count={'GPU': 0},
                intra_op_parallelism_threads=1,
                inter_op_parallelism_threads=1))

            self.sess.run(tf.global_variables_initializer())

            if job == "main":
                self.writer = tf.summary.FileWriter(
                    opt.summary_dir + "/" + str(datetime.datetime.now()) +
                    "-" + opt.env_name + "-" + opt.exp_name + "-workers_num:" +
                    str(opt.num_workers) + "%" + str(opt.a_l_ratio),
                    self.sess.graph)

            variables_all = tf.contrib.framework.get_variables_to_restore()
            variables_bn = [
                v for v in variables_all
                if 'moving_mean' in v.name or 'moving_variance' in v.name
            ]

            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                self.pi, self.sess, input_variables=variables_bn)
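
For reference, a hedged sketch of how the 'auto' temperature created above is normally trained in SAC; this worker class only builds the graph, so the loss below is an assumption based on the standard SAC formulation (target entropy of -|A| and an arbitrarily chosen Adam learning rate), not code from this project.

# Not part of the original class; names (opt, log_alpha, logp_pi) follow the snippet above.
target_entropy = -float(np.prod(opt.act_shape))
alpha_loss = -tf.reduce_mean(log_alpha * tf.stop_gradient(logp_pi + target_entropy))
train_alpha_op = tf.train.AdamOptimizer(1e-4).minimize(alpha_loss, var_list=[log_alpha])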
Example #6
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.gamma = env_set['gamma']
        self.lamda = 0.97
        self.hidden = env_set['hidden']
        self.pi_lr = 0.00025
        self.v_lr = 0.00025
        self.ppo_eps = 0.2
        self.epoch = 10

        self.x_ph, self.a_ph, self.adv_ph, self.target_ph, self.logp_old_ph, self.old_value = \
            cr.placeholders(self.state_size, self.output_size, None, None, None, None)

        self.pi, self.logp, self.logp_pi, self.v = cr.ppo_mlp_actor_critic(
            x=self.x_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=None,
            output_size=self.output_size)

        self.all_phs = [
            self.x_ph, self.a_ph, self.adv_ph, self.target_ph,
            self.logp_old_ph, self.old_value
        ]
        self.get_action_ops = [self.pi, self.v, self.logp_pi]

        self.ratio = tf.exp(self.logp - self.logp_old_ph)

        self.min_adv = tf.where(self.adv_ph > 0,
                                (1.0 + self.ppo_eps) * self.adv_ph,
                                (1.0 - self.ppo_eps) * self.adv_ph)
        self.pi_loss = -tf.reduce_mean(
            tf.minimum(self.ratio * self.adv_ph, self.min_adv))

        self.clipped_value_loss = self.old_value + tf.clip_by_value(
            self.v - self.old_value, -self.ppo_eps, self.ppo_eps)
        self.v_loss1 = (self.target_ph - self.clipped_value_loss)**2
        self.v_loss2 = (self.target_ph - self.v)**2
        self.v_loss = 0.5 * tf.reduce_mean(
            tf.maximum(self.v_loss1, self.v_loss2))

        self.train_pi = tf.train.AdamOptimizer(self.pi_lr).minimize(
            self.pi_loss)
        self.train_v = tf.train.AdamOptimizer(self.v_lr).minimize(self.v_loss)

        self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)
        self.approx_ent = tf.reduce_mean(-self.logp)

        self.sess.run(tf.global_variables_initializer())
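
A hedged usage sketch, not part of the class: feeding one rollout into the ops above. Here `agent` is an instance of the class and `rollout` is a hypothetical list aligned with agent.all_phs.

# rollout = [obs, actions, advantages, value targets, old logps, old values]
feed = dict(zip(agent.all_phs, rollout))
for _ in range(agent.epoch):
    agent.sess.run([agent.train_pi, agent.train_v], feed_dict=feed)
kl, ent = agent.sess.run([agent.approx_kl, agent.approx_ent], feed_dict=feed)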
Example #7
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.gamma = env_set['gamma']
        self.lamda = 0.95
        self.hidden = env_set['hidden']
        self.pi_lr = env_set['pi_lr']  #0.00025
        self.v_lr = env_set['q_lr']  #0.00025
        self.ppo_eps = 0.2
        self.train_pi_iter = 10
        self.train_v_iter = 10
        self.target_kl = 0.1
        self.step_per_epoch = 2048

        self.x_ph, self.a_ph, self.adv_ph, self.target_ph, self.logp_old_ph = \
            cr.placeholders(self.state_size, self.output_size, None, None, None)

        self.pi, self.logp, self.logp_pi, self.v, self.std = cr.ppo_mlp_actor_critic(
            x=self.x_ph,
            a=self.a_ph,
            hidden=self.hidden,
            activation=tf.nn.relu,
            output_activation=None,
            output_size=self.output_size)

        self.all_phs = [
            self.x_ph, self.a_ph, self.adv_ph, self.target_ph, self.logp_old_ph
        ]
        self.get_action_ops = [self.pi, self.v, self.logp_pi]

        self.ratio = tf.exp(self.logp - self.logp_old_ph)
        self.min_adv = tf.where(self.adv_ph > 0,
                                (1.0 + self.ppo_eps) * self.adv_ph,
                                (1.0 - self.ppo_eps) * self.adv_ph)
        self.pi_loss = -tf.reduce_mean(
            tf.minimum(self.ratio * self.adv_ph, self.min_adv))
        self.v_loss = tf.reduce_mean((self.target_ph - self.v)**2)

        self.train_pi = tf.train.AdamOptimizer(self.pi_lr).minimize(
            self.pi_loss)
        self.train_v = tf.train.AdamOptimizer(self.v_lr).minimize(self.v_loss)

        self.approx_kl = tf.reduce_mean(self.logp_old_ph - self.logp)
        self.approx_ent = tf.reduce_mean(-self.logp)

        self.sess.run(tf.global_variables_initializer())
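
The class stores gamma and lamda but the advantage computation lives elsewhere; below is a small, self-contained GAE(lambda) sketch of what that buffer code typically does. It is an illustration of the standard estimator, not this project's actual implementation.

import numpy as np

def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
    # values has one entry per step; last_value bootstraps the final state
    values = np.append(values, last_value)
    deltas = rewards + gamma * values[1:] - values[:-1]
    adv = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv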
Example #8
File: ppo.py Project: BatBate/Meta-RL
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gru_units=256,
        trials_per_epoch=100, episodes_per_trial=2, n = 100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        trials_per_epoch (int): Number of trials of interaction collected per
            epoch; each trial consists of ``episodes_per_trial`` episodes.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph')
    rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40])
    max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph')
    seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,))

    # Because we pad zeros at the end of every sequence of length less than max length, we need to mask these zeros out
    # when computing loss
    seq_len_mask_ph = tf.placeholder(dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len))

    # rescaled_image_in_ph is a placeholder because we want to be able to feed values into this node manually
    rescaled_image_in_ph = tf.placeholder(dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph')
    a_ph = core.placeholders_from_spaces(env.action_space)[0]
    conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph,
                        num_outputs=16, kernel_size=[5, 5], stride=2)
    image_out = slim.flatten(
        slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1,
                    num_outputs=16, kernel_size=[5, 5], stride=2))

    rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None)
    rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph')
    # Main outputs from computation graph

    action_encoder_matrix = np.load(r'encoder.npy')
    pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic(
            image_out, a_ph, rew_ph, rnn_state_ph, gru_units,
            max_seq_len_ph, action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, rnn_state, logits]

    # Experience buffer
    buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len
    buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim, buffer_size, trials_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph)

    # Need to mask out the padded zeros when computing loss
    sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial*max_ep_len)
    # Convert the bool mask to a float tensor of 1s and 0s
    sequence_mask = tf.where(sequence_mask,
                             np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)),
                             np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)))

    # Need to reshape because ratio is a 1-D vector (a concatenation of all
    # sequences): reshape to apply the mask, then reshape back
    pi_loss_vec = tf.multiply(sequence_mask, tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask)))
    pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio)))
    v_loss_vec = tf.multiply(sequence_mask, tf.reshape((ret_ph - v)**2, tf.shape(sequence_mask)))
    v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v)))


    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    train = MpiAdamOptimizer(learning_rate=1e-4).minimize(pi_loss + 0.01 * v_loss - 0.001 * approx_ent)


    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph}, outputs={'pi': pi, 'v': v})



    def update():
        print(f'Start updating at {datetime.now()}')
        inputs = {k:v for k,v in zip(all_phs, buf.get())}

        inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32)
        inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len)
        inputs[seq_len_ph] = buf.seq_len_buf
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        buf.reset()

        
        # Training
        print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}')


        for i in range(train_pi_iters):
            _, kl, pi_loss_i, v_loss_i, ent = sess.run([train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs)
            print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}')


        logger.store(StopIter=i)


        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
                [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, 
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
        print(f'Updating finished at {datetime.now()}')


    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0

    def recenter_rgb(image, min=0.0, max=255.0):
        '''Re-center an RGB image from [min, max] to [-1, 1].'''
        mid = (min + max) / 2.0
        return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image)

    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            # TODO: tweak settings to match the paper

            # TODO: find a way to generate mazes
            last_a = np.array(0)
            last_r = np.array(r)
            last_rnn_state = np.zeros((1, gru_units), np.float32)

            step_counter = 0
            for episode in range(episodes_per_trial):
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))

                action_dict = defaultdict(int)

                # dirty hard coding to make it print in order
                action_dict[0] = 0
                action_dict[1] = 0
                action_dict[2] = 0

                for step in range(max_ep_len):
                    a, v_t, logp_t, rnn_state_t, logits_t = sess.run(
                        get_action_ops, feed_dict={
                            rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                            a_ph: last_a.reshape(-1,),
                            rew_ph: last_r.reshape(-1, 1),
                            rnn_state_ph: last_rnn_state,
                            # v_rnn_state_ph: last_v_rnn_state,
                            max_seq_len_ph: 1,
                            seq_len_ph: [1]})
                    action_dict[a[0]] += 1
                    # save and log
                    buf.store(o_rescaled, a, r, v_t, logp_t)
                    logger.store(VVals=v_t)
                    o, r, d, _ = env.step(a[0])
                    step_counter += 1
                    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
                    ep_ret += r
                    ep_len += 1

                    last_a = a[0]
                    last_r = np.array(r)
                    last_rnn_state = rnn_state_t

                    terminal = d or (ep_len == max_ep_len)
                    if terminal or (step==n-1):
                        if not(terminal):
                            print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                        # if trajectory didn't reach terminal state, bootstrap value target
                        last_val = r if d else sess.run(
                            v, feed_dict={
                                rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                                a_ph: last_a.reshape(-1,),
                                rew_ph: last_r.reshape(-1, 1),
                                rnn_state_ph: last_rnn_state,
                                max_seq_len_ph: 1,
                                seq_len_ph: [1]})
                        buf.finish_path(last_val)
                        logger.store(EpRet=ep_ret, EpLen=ep_len)


                        print(f'episode terminated with {step} steps. epoch:{epoch} trial:{trial} episode:{episode}')
                        break
                print(action_dict)
            # pad zeros to the sequence buffer after each trial
            if step_counter < episodes_per_trial * max_ep_len:
                buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter)
            buf.seq_len_buf[trial] = step_counter

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)
        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*trials_per_epoch*episodes_per_trial*max_ep_len)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
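
A tiny numeric illustration (not from the project) of the clipped surrogate described in the docstring above: with clip_ratio = 0.2 and a positive advantage, probability ratios beyond 1.2 stop adding to the objective.

clip_ratio, adv = 0.2, 1.0
for ratio in (0.9, 1.1, 1.5):
    min_adv = (1 + clip_ratio) * adv if adv > 0 else (1 - clip_ratio) * adv
    print(ratio, min(ratio * adv, min_adv))   # objective term: 0.9, 1.1, then capped at 1.2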
Example #9
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise = OU_noise(self.output_size, 1)
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            self.pi_targ, _, _, _ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target', reuse=True):
            self.eps = tf.random_normal(tf.shape(self.pi_targ),
                                        stddev=self.target_noise)
            self.epsilon = tf.clip_by_value(self.eps, -self.noise_clip,
                                            self.noise_clip)
            self.a_prev = self.pi_targ + self.epsilon
            self.a2 = tf.clip_by_value(self.a_prev, -self.action_limit,
                                       self.action_limit)
            _, self.q1_targ, self.q2_targ, self.q1_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a2,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        self.min_q_targ = tf.minimum(self.q1_targ, self.q2_targ)
        self.backup = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.min_q_targ)
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.q_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.pi_train = self.pi_optimizer.minimize(self.pi_loss,
                                                   var_list=self.pi_params)
        self.v_train = self.q_optimizer.minimize(self.v_loss,
                                                 var_list=self.q_params)

        self.target_update = tf.group([
            tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main) for
            v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
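
A hedged sketch of the TD3-style update the ops above are built for: frequent critic updates with delayed actor/target updates. It is not part of the original class; the replay-buffer sample() signature and the policy_delay default are assumptions.

    def train_step(self, step, policy_delay=2):
        s, a, r, s2, d = self.memory.sample(self.batch_size)   # hypothetical signature
        feed = {self.x_ph: s, self.a_ph: a, self.r_ph: r,
                self.x2_ph: s2, self.d_ph: d}
        self.sess.run(self.v_train, feed_dict=feed)            # update both critics
        if step % policy_delay == 0:                           # delayed actor/target update
            self.sess.run([self.pi_train, self.target_update], feed_dict=feed)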
Example #10
File: ddpg.py Project: zhc134/l2s
def ddpg(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
         batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
         test_max_ep_len, number_of_tests_per_epoch, act_noise, logger_kwargs,
         seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    actor_critic = core.get_ddpg_actor_critic(ac_type)
    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        pi_a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    actions = []
    epoch_actions = []
    rewards = []
    rets = []
    test_rets = []
    max_ret = None
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            pi_a, real_a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1
        epoch_actions.append(pi_a)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            actions.append(np.mean(epoch_actions))
            epoch_actions = []
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)[0]
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('QVals', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            util.plot_actions(test_actions, act_high,
                              logger.output_dir + '/actions%s.png' % epoch)

    logger.save_state(
        {
            "actions": actions,
            "rewards": rewards,
            "best_test_actions": best_test_actions,
            "rets": rets,
            "test_rets": test_rets,
            "max_ret": max_ret
        }, None)

    util.plot_actions(best_test_actions, act_high,
                      logger.output_dir + '/best_test_actions.png')
    logger.log("max ret: %f" % max_ret)
Example #11
File: iac.py Project: zhc134/l2s
def iac(env_config, ac_type, ac_kwargs, rb_type, rb_kwargs, gamma, lr, polyak,
        batch_size, epochs, start_steps, steps_per_epoch, inc_ep, max_ep_len,
        test_max_ep_len, number_of_tests_per_epoch, q_pi_sample_size, z_dim,
        z_type, act_noise, test_without_state, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = make_env(env_config), make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_high = env.action_space.high

    # Inputs to computation graph
    x_ph, a_ph, z_ph, x2_ph, r_ph, d_ph = core.placeholders(
        obs_dim, act_dim, z_dim, obs_dim, None, None)

    actor_critic = core.get_iac_actor_critic(ac_type)
    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, z_ph,
                                                   **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, z_ph, **ac_kwargs)

    # Experience buffer
    RB = get_replay_buffer(rb_type)
    replay_buffer = RB(obs_dim, act_dim, **rb_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q and V function
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    v_backup = tf.stop_gradient(min_q_pi)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = 0.5 * tf.reduce_mean((q2 - q_backup)**2)
    v_loss = 0.5 * tf.reduce_mean((v - v_backup)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Separate train ops for pi, q
    policy_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_policy_op = policy_optimizer.minimize(pi_loss,
                                                var_list=get_vars('main/pi'))
    if ac_kwargs["pi_separate"]:
        train_policy_emb_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/emb'))
        train_policy_d_op = policy_optimizer.minimize(
            pi_loss, var_list=get_vars('main/pi/d'))
    train_value_op = value_optimizer.minimize(value_loss,
                                              var_list=get_vars('main/q') +
                                              get_vars('main/v'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def sample_z(size):
        if z_type == "uniform":
            return np.random.random_sample(size=size)
        elif z_type == "gaussian":
            return np.random.normal(size=size)
        else:
            raise Exception("z_type error")

    def get_action(o, noise_scale):
        pi_a = sess.run(pi,
                        feed_dict={
                            x_ph: o.reshape(1, -1),
                            z_ph: sample_z((1, z_dim))
                        })[0]
        pi_a += noise_scale * np.random.randn(act_dim)
        pi_a = np.clip(pi_a, 0, 1)
        real_a = pi_a * act_high
        return pi_a, real_a

    def test_agent(n=10):
        test_actions = []
        for j in range(n):
            test_actions_ep = []
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == test_max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                if test_without_state:
                    _, real_a = get_action(np.zeros(o.shape), 0)
                else:
                    _, real_a = get_action(o, 0)
                test_actions_ep.append(real_a)
                o, r, d, _ = test_env.step(real_a)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_actions.append(test_actions_ep)
        return test_actions

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    rewards = []
    rets = []
    test_rets = []
    max_ret = None
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            pi_a, real_a = get_action(o, act_noise)
        else:
            pi_a, real_a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(real_a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, pi_a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):

            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                feed_dict[z_ph] = sample_z((batch_size, z_dim))

                # Policy Learning update
                for key in feed_dict:
                    feed_dict[key] = np.repeat(feed_dict[key],
                                               q_pi_sample_size,
                                               axis=0)
                feed_dict[z_ph] = sample_z(
                    (batch_size * q_pi_sample_size, z_dim))
                if ac_kwargs["pi_separate"]:
                    if len(rewards) % 2 == 0:
                        outs = sess.run([pi_loss, train_policy_emb_op],
                                        feed_dict)
                    else:
                        outs = sess.run([pi_loss, train_policy_d_op],
                                        feed_dict)
                else:
                    outs = sess.run([pi_loss, train_policy_op], feed_dict)
                logger.store(LossPi=outs[0])

                # Q-learning update
                outs = sess.run([q1_loss, v_loss, q1, v, train_value_op],
                                feed_dict)
                logger.store(LossQ=outs[0],
                             LossV=outs[1],
                             ValueQ=outs[2],
                             ValueV=outs[3])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            rewards.append(ep_ret)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Test the performance of the deterministic version of the agent.
            test_actions = test_agent(number_of_tests_per_epoch)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            ret = logger.log_tabular('EpRet', average_only=True)[0]
            test_ret = logger.log_tabular('TestEpRet', average_only=True)[0]
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('ValueQ', average_only=True)
            logger.log_tabular('ValueV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            rets.append(ret)
            test_rets.append(test_ret)

            if max_ret is None or test_ret > max_ret:
                max_ret = test_ret
                best_test_actions = test_actions

            max_ep_len += inc_ep
            sess.run(target_update, feed_dict)

    logger.save_state(
        {
            "rewards": rewards,
            "best_test_actions": best_test_actions,
            "rets": rets,
            "test_rets": test_rets,
            "max_ret": max_ret
        }, None)

    util.plot_actions(best_test_actions, act_high,
                      logger.output_dir + '/best_test_actions.png')
    logger.log("max ret: %f" % max_ret)
Example #12
def td3(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target
            policy.

        noise_clip (float): Limit for absolute value of target policy
            smoothing noise.

        policy_delay (int): Policy will only be updated once every
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):

        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values, using action from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # Bellman backup for Q functions, using Clipped Double-Q targets
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                q_step_ops = [q_loss, q1, q2, train_q_op]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    # Delayed policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
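
The target-smoothing and Clipped Double-Q logic above is the core of the TD3 backup. As a minimal NumPy sketch of that same computation (shapes and hyperparameter values below are made up for illustration, not taken from the example):

import numpy as np

batch, act_dim = 4, 2
act_limit, target_noise, noise_clip, gamma = 1.0, 0.2, 0.5, 0.99

# Stand-ins for the target policy's actions and the transition data
pi_targ = np.random.uniform(-act_limit, act_limit, size=(batch, act_dim))
r = np.random.randn(batch)
d = np.zeros(batch)

# Target policy smoothing: clipped Gaussian noise, then clip to the action bound
epsilon = np.clip(target_noise * np.random.randn(batch, act_dim), -noise_clip, noise_clip)
a2 = np.clip(pi_targ + epsilon, -act_limit, act_limit)

# Stand-ins for q1_targ(x2, a2) and q2_targ(x2, a2)
q1_targ = np.random.randn(batch)
q2_targ = np.random.randn(batch)

# Clipped Double-Q backup: bootstrap from the smaller of the two target estimates
backup = r + gamma * (1 - d) * np.minimum(q1_targ, q2_targ)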
Example #13
def asac_v2(actor_critic=core.mlp_actor_critic,
            seed=0,
            ac_kwargs=dict(),
            steps_per_epoch=5000,
            epochs=200,
            replay_size=int(1e6),
            gamma=0.99,
            polyak=0.995,
            lr=0.001,
            alpha_start=0.2,
            batch_size=100,
            start_steps=10000,
            max_ep_len=1000,
            logger_kwargs=dict(),
            save_freq=1,
            loss_threshold=0.0001,
            delta=0.02,
            sample_step=2000):

    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = baxter()
    obs_dim = env.obs_dim
    act_dim = env.act_dim

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = 0.1

    # Share information about action space with policy architecture

    # Inputs to computation graph
    #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)
    alpha_ph = core.scale_holder()
    # Main outputs from computation graph
    #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(
            x_ph, a_ph, **ac_kwargs)
    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic(
            x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in [
            'main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R',
            'main'
        ])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts)
    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    adv = Q_pi - R
    dQ = Q_backup * (R - Q)

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5 * tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss
    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars(
        'main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)
    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """
    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update, R_loss, Q_loss, v_targ
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    obs1_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32)
    obs2_epi = np.zeros([2 * max_ep_len, obs_dim], dtype=np.float32)
    act_epi = np.zeros([2 * max_ep_len, act_dim], dtype=np.float32)
    rew_epi = np.zeros([2 * max_ep_len], dtype=np.float32)
    done_epi = np.zeros([2 * max_ep_len], dtype=np.float32)
    ptr_epi = 0
    alpha_update = False
    epi_num = 0
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o["feature"])
        else:
            a = 0.1 - np.random.sample(act_dim) * 0.2
        # Step the env
        o2, r = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o["feature"], a, r, o2["feature"], d)
        obs1_epi[ptr_epi] = o["feature"]
        obs2_epi[ptr_epi] = o2["feature"]
        act_epi[ptr_epi] = a
        rew_epi[ptr_epi] = r
        done_epi[ptr_epi] = d
        ptr_epi += 1

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            epi_num += 1
            print("epi : {}, alpha : {}, return : {}".format(
                epi_num, alpha_t, ep_ret))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            """
            rew_epi[ptr_epi] = sess.run(R, feed_dict={x_ph: [o]})[0]
            rets_epi = scipy.signal.lfilter([1], [1, float(-gamma)], rew_epi[::-1], axis=0)[::-1]
            rets_epi = rets_epi[:-1]
            """
            """
            v_epi = sess.run(R, feed_dict={x_ph: obs_epi})
            q_epi, adv_epi = sess.run([Q, adv], feed_dict={x_ph: obs_epi[:-1], a_ph: act_epi})
            rets_epi = rew_epi + gamma*v_epi[1:]
            if t > start_steps:
                alpha.update_alpha(adv_epi, np.mean(rets_epi*(v_epi[:-1]-q_epi)) > 0)
                alpha_t = alpha()
            print("{} {}".format(np.mean(rets_epi*(v_epi[:-1]-q_epi)), alpha_t))
            """
            if ptr_epi >= max_ep_len:
                feed_dict = {
                    x_ph: obs1_epi[:ptr_epi],
                    x2_ph: obs2_epi[:ptr_epi],
                    a_ph: act_epi[:ptr_epi],
                    r_ph: rew_epi[:ptr_epi],
                    d_ph: done_epi[:ptr_epi]
                }
                adv_epi, Q_epi, R_epi = sess.run([adv, Q, R], feed_dict)
                R_next_epi = sess.run(R, feed_dict={x_ph: obs2_epi[:ptr_epi]})
                dQ_epi = (rew_epi[:ptr_epi] + gamma *
                          (1 - done_epi[:ptr_epi]) * R_next_epi) * (R_epi -
                                                                    Q_epi)
                """
                ret_epi = np.zeros([ptr_epi], dtype=np.float32)
                for i in np.arange(ptr_epi)[::-1]:
                    if i == ptr_epi - 1:
                        R_next_epi = sess.run(R, feed_dict={x_ph: [obs2_epi[i]]})[0]
                        ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*R_next_epi
                    else:
                        ret_epi[i] = rew_epi[i] + gamma*(1 - done_epi[i])*ret_epi[i+1]
                dQ_epi = ret_epi * (R_epi - Q_epi)
                """
                if t > start_steps:
                    alpha.update_alpha(adv_epi, np.mean(dQ_epi) > 0)
                    alpha_t = alpha()
                    print("{} {}".format(np.mean(dQ_epi), alpha_t))
                obs1_epi = np.zeros([max_ep_len * 2, obs_dim],
                                    dtype=np.float32)
                obs2_epi = np.zeros([max_ep_len * 2, obs_dim],
                                    dtype=np.float32)
                act_epi = np.zeros([max_ep_len * 2, act_dim], dtype=np.float32)
                rew_epi = np.zeros([max_ep_len * 2], dtype=np.float32)
                done_epi = np.zeros([max_ep_len * 2], dtype=np.float32)
                ptr_epi = 0
            """
            batch = replay_buffer.sample_batch(1000)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done'],
                         alpha_ph: alpha_t}
            dQ_epi = sess.run(dQ, feed_dict)
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    alpha_ph: alpha_t
                }
                outs = sess.run(step_ops, feed_dict)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
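
The `Alpha` helper used above (constructed with `alpha_start` and `delta`, read by calling it, and adjusted with `update_alpha(adv, flag)`) is defined elsewhere in the project and not shown here. Purely as a hypothetical stand-in for that interface, an adaptive coefficient that steps up or down by `delta` might look like this sketch:

import numpy as np

class Alpha:
    # Hypothetical stand-in, not the project's actual implementation.
    def __init__(self, alpha_start=0.2, delta=0.02, alpha_min=0.0, alpha_max=1.0):
        self.alpha = alpha_start
        self.delta = delta
        self.alpha_min, self.alpha_max = alpha_min, alpha_max

    def __call__(self):
        return self.alpha

    def update_alpha(self, adv, increase):
        # `adv` is accepted only to match the call site; this sketch uses the flag alone.
        step = self.delta if increase else -self.delta
        self.alpha = float(np.clip(self.alpha + step, self.alpha_min, self.alpha_max))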
Example #14
def s2vg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=1000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, model_lr=3e-4, value_lr=1e-3, pi_lr=3e-4, alpha=0.4,
        batch_size=100, start_steps=1000,max_ep_len=1000, save_freq=1,
        train_model_epoch=1, test_freq=10, exp_name='',env_name='',save_epoch=100):


    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_limit = env.action_space.high[0]
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
        transition , r_rm, transition_pi ,r_rm_pi, v_prime = core.reward_dynamic_model(x_ph, a_ph, pi, **ac_kwargs)

    # Target value network for updates
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _,v_targ  = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # TD3 style Q function updates

    min_q_pi = tf.minimum(q1_pi, q2_pi)

    q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    r_backup = r_ph
    transition_backup = x2_ph

    r_loss = 0.5 * tf.reduce_mean((r_backup-r_rm)**2)
    transition_loss = 0.5 * tf.reduce_mean((transition_backup - transition)**2)
    model_loss = r_loss+transition_loss

    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    pi_loss = r_rm_pi - alpha*logp_pi + gamma*(1-d_ph)*v_prime


    # model train op
    model_optimizer = tf.train.AdamOptimizer(learning_rate=model_lr)
    model_params = get_vars('main/dm') + get_vars('main/rm')
    train_model_op = model_optimizer.minimize(model_loss, var_list=model_params)

    # policy train op
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    with tf.control_dependencies([train_model_op]):
        train_pi_op = pi_optimizer.minimize(-pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    value_optimizer = tf.train.AdamOptimizer(learning_rate=value_lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update]

    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    saver = tf.compat.v1.train.Saver()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)


    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0]

    def test_agent(epoch, n=1):
        total_reward = 0
        for j in range(n): # repeat n times
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            total_reward += ep_ret
        print('The '+str(epoch)+' epoch is finished!')
        print('The test reward is '+str(total_reward/n))
        return total_reward/n

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    reward_recorder = []


    for t in range(total_steps):
        """
        The algorithm would take total_steps totally in the training
        """

        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len==max_ep_len else d

        replay_buffer.store(o, a, r, o2, d)

        o = o2

        if t // steps_per_epoch > train_model_epoch:
            # train 5 steps of Q, V, and pi.
            # train 1 step of model
            for j in range(5):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                            x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done']}
                _ = sess.run(step_ops, feed_dict)
            outs = sess.run(train_model_op, feed_dict)
        else:
            # pretrain the model
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         a_ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done'],
                         }
            outs = sess.run(train_model_op, feed_dict)

        if d or (ep_len == max_ep_len):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if epoch > train_model_epoch and epoch % test_freq == 0:
                # test the agent when we reach the test_freq, save the experiment result
                reward_test = test_agent(epoch)
                reward_recorder.append(reward_test)
                reward_nparray = np.asarray(reward_recorder)
                np.save(str(exp_name)+'_'+str(env_name)+'_'+str(save_freq)+'.npy',reward_nparray)

            if epoch % save_epoch == 0:
                # save the model
                saver.save(sess, str(exp_name)+'_'+str(env_name),global_step=epoch)
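
The model losses above are plain regressions of the learned reward and dynamics heads onto the observed reward and next state. A NumPy sketch of those two targets with arbitrary shapes (nothing here comes from the example itself):

import numpy as np

batch, obs_dim = 8, 3
r = np.random.randn(batch)                     # observed rewards
x2 = np.random.randn(batch, obs_dim)           # observed next states
r_rm = np.random.randn(batch)                  # reward-model prediction r_hat(s, a)
transition = np.random.randn(batch, obs_dim)   # dynamics-model prediction s_hat'

r_loss = 0.5 * np.mean((r - r_rm) ** 2)
transition_loss = 0.5 * np.mean((x2 - transition) ** 2)
model_loss = r_loss + transition_loss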
Example #15
    def add_place_holders(self):
        self.x_ph, self.a_ph = core.placeholders(self.obs_dim, self.act_dim)
Example #16
def sac(env_name='Ant-v2',
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        max_ep_len=1000,
        save_freq=1):
    """
    Args:
        env_name (str): Name of the Gym environment created with ``gym.make``.
            The environment must satisfy the OpenAI Gym API.
        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:
            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``.
            ===========  ================  ======================================
        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.
        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.
        epochs (int): Number of epochs to run and train agent.
        replay_size (int): Maximum length of replay buffer.
        gamma (float): Discount factor. (Always between 0 and 1.)
        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:
            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta
            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)
        lr (float): Learning rate (used for both policy and value learning).
        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)
        batch_size (int): Minibatch size for SGD.
        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
            x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n') % var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    total_steps = steps_per_epoch * epochs

    tf.summary.FileWriter('./log/', graph=tf.get_default_graph())

    replay_buffer = ReplayBuffer(obs_dim=env.observation_space.shape[0],
                                 act_dim=env.action_space.shape[0],
                                 size=replay_size)

    episode = 0

    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)[0]
        else:
            a = np.clip(env.action_space.sample(), -1, 1)

        # Step the env
        env.render()
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.

            """

            episode += 1
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                outs = sess.run(step_ops, feed_dict)
            print("episode %d, reward %d" % (episode, ep_ret))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    sess.close()
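
The two `stop_gradient` targets above are the standard SAC regression targets: the Q networks regress onto a one-step backup through the target value network, and V regresses onto the entropy-regularized minimum of the two Q estimates. A NumPy sketch with made-up values:

import numpy as np

batch, gamma, alpha = 5, 0.99, 0.2
r = np.random.randn(batch)
d = np.zeros(batch)
v_targ = np.random.randn(batch)             # V_targ(s') from the target network
q1_pi = np.random.randn(batch)              # Q1(s, pi(s))
q2_pi = np.random.randn(batch)              # Q2(s, pi(s))
logp_pi = -np.abs(np.random.randn(batch))   # log pi(a|s) for a ~ pi

q_backup = r + gamma * (1 - d) * v_targ
v_backup = np.minimum(q1_pi, q2_pi) - alpha * logp_pi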
Example #17
def vpg(env_config, ac_type, ac_kwargs, gamma, lam, epochs, steps_per_epoch,
        lr, train_v_iters, max_ep_len, logger_kwargs, seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(
        obs_dim, act_dim, None, None, None)

    actor_critic = gaussian_mlp_actor_critic
    pi, logp, logp_pi, v = actor_critic(obs_ph, a_ph, **ac_kwargs)

    all_phs = [obs_ph, a_ph, adv_ph, ret_ph, logp_old_ph]
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    buf = VPGBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss)
    train_v = tf.train.AdamOptimizer(learning_rate=lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def update():
        buffer_data = buf.get()
        #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch)
        inputs = {k: v for k, v in zip(all_phs, buffer_data)}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        sess.run(train_pi, feed_dict=inputs)

        # Training
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, v_new = sess.run(
            [pi_loss, v_loss, approx_kl, v], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    real_action = env.action_space.default()

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={obs_ph: o.reshape(1, -1)})

            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            delta = np.exp(a[0])
            delta = np.clip(delta, 0.9, 1.1)
            real_action = env.action_space.clip(real_action * delta)

            o, r, d, _ = env.step(real_action)
            ep_ret += r
            ep_len += 1

            if ep_len == max_ep_len or t == steps_per_epoch - 1:
                last_val = sess.run(v, feed_dict={obs_ph: o.reshape(1, -1)})
                #print(last_val)
                buf.finish_path(last_val)
                logger.store(EpRet=ep_ret, EpLen=ep_len)

                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                real_action = env.action_space.default()

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
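
The VPG objective above is just the advantage-weighted log-likelihood, and the KL and entropy diagnostics are single-sample estimates. The same three quantities in NumPy, on toy numbers:

import numpy as np

logp = np.array([-1.2, -0.8, -2.0])      # log pi_new(a|s) for the sampled actions
logp_old = np.array([-1.0, -0.9, -1.8])  # log pi_old(a|s) recorded at collection time
adv = np.array([0.5, -0.3, 1.2])         # advantage estimates

pi_loss = -np.mean(logp * adv)           # policy-gradient surrogate loss
approx_kl = np.mean(logp_old - logp)     # sample estimate of the KL divergence
approx_ent = np.mean(-logp)              # sample estimate of the policy entropy

Example #18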
    def __init__(self, opt, job):
        self.opt = opt
        with tf.Graph().as_default():
            tf.set_random_seed(opt.seed)
            np.random.seed(opt.seed)

            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph = core.placeholders(
                opt.obs_shape, opt.act_shape, opt.obs_shape)
            self.r_ph, self.d_ph, self.logp_pi_ph = core.placeholders(
                (opt.Ln, ), (opt.Ln, ), (opt.Ln, ))

            # ------
            if opt.alpha == 'auto':
                log_alpha = tf.get_variable('log_alpha',
                                            dtype=tf.float32,
                                            initializer=0.0)
                alpha_v = tf.exp(log_alpha)
            else:
                alpha_v = opt.alpha
            # ------

            # Main outputs from computation graph
            with tf.variable_scope('main'):
                mu, pi, logp_pi, self.logp_pi2, q1, q2, q1_pi, q2_pi, q1_mu, q2_mu \
                    = actor_critic(self.x_ph, self.x2_ph, self.a_ph, alpha_v,
                                   use_bn=opt.use_bn, phase=True, coefficent_regularizer=opt.c_regularizer,
                                   hidden_sizes=opt.hidden_size,
                                   action_space=opt.act_space,
                                   model=opt.model)

            # Target value network
            with tf.variable_scope('target'):
                _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_, q1_mu_, q2_mu_ \
                    = actor_critic(self.x2_ph, self.x2_ph, self.a_ph, alpha_v,
                                   use_bn=opt.use_bn, phase=True, coefficent_regularizer=opt.c_regularizer,
                                   hidden_sizes=opt.hidden_size,
                                   action_space=opt.act_space,
                                   model=opt.model)

            # Count variables
            var_counts = tuple(
                core.count_vars(scope)
                for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
            print(('\nNumber of parameters: \t pi: %d, \t' +
                   'q1: %d, \t q2: %d, \t total: %d\n') % var_counts)

            # ------
            if isinstance(alpha_v, tf.Tensor):
                alpha_loss = tf.reduce_mean(
                    -log_alpha *
                    tf.stop_gradient(logp_pi_ + opt.target_entropy))

                alpha_optimizer = tf.train.AdamOptimizer(
                    learning_rate=opt.lr, name='alpha_optimizer')
                train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss,
                                                          var_list=[log_alpha])
            # ------

            # Min Double-Q:
            if opt.use_max:
                min_q_pi = tf.minimum(q1_mu_, q2_mu_)
            else:
                min_q_pi = tf.minimum(q1_pi_, q2_pi_)  # x2

            # get rid of abnormal explosion
            # min_q_pi = tf.clip_by_value(min_q_pi, -300.0, 900.0)

            #### n-step backup
            q_backup = tf.stop_gradient(min_q_pi)
            for step_i in reversed(range(opt.Ln)):
                q_backup = self.r_ph[:, step_i] + \
                           opt.gamma * (1 - self.d_ph[:, step_i]) * (-alpha_v * self.logp_pi_ph[:, step_i] + q_backup)
            ####

            # Soft actor-critic losses
            q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
            q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
            self.value_loss = q1_loss + q2_loss

            value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)
            value_params = get_vars('main/q')

            bn_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(bn_update_ops):
                train_value_op = value_optimizer.minimize(
                    self.value_loss, var_list=value_params)

            # Polyak averaging for target variables
            # (control flow because sess.run otherwise evaluates in nondeterministic order)
            with tf.control_dependencies([train_value_op]):
                target_update = tf.group([
                    tf.assign(v_targ,
                              opt.polyak * v_targ + (1 - opt.polyak) * v_main)
                    for v_main, v_targ in zip(get_vars('main'),
                                              get_vars('target'))
                ])

            # All ops to call during one training step
            if isinstance(alpha_v, Number):
                self.step_ops = [
                    q1_loss, q2_loss, q1, q2, logp_pi_,
                    tf.identity(alpha_v), train_value_op, target_update
                ]
            else:
                self.step_ops = [
                    q1_loss, q2_loss, q1, q2, logp_pi_, alpha_v,
                    train_value_op, target_update, train_alpha_op
                ]

            # Initializing targets to match main variables
            self.target_init = tf.group([
                tf.assign(v_targ, v_main)
                for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
            ])

            if job == "learner":
                config = tf.ConfigProto()
                config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction
                config.inter_op_parallelism_threads = 1
                config.intra_op_parallelism_threads = 1
                self.sess = tf.Session(config=config)
            else:
                self.sess = tf.Session(config=tf.ConfigProto(
                    # device_count={'GPU': 0},
                    intra_op_parallelism_threads=1,
                    inter_op_parallelism_threads=1))

            self.sess.run(tf.global_variables_initializer())

            if job == "learner":
                # Set up summary Ops
                self.train_ops, self.train_vars = self.build_summaries()
                self.writer = tf.summary.FileWriter(
                    opt.summary_dir + "/" + "^^^^^^^^^^" +
                    str(datetime.datetime.now()) + opt.env_name + "-" +
                    opt.exp_name + "-workers_num:" + str(opt.num_workers) +
                    "%" + str(opt.a_l_ratio), self.sess.graph)

            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                self.value_loss, self.sess)
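
The learner above builds an n-step, entropy-regularized target by unrolling the backup backwards over the `Ln` stored steps, starting from the bootstrapped target-network value. The same recursion in NumPy with placeholder data:

import numpy as np

batch, Ln, gamma, alpha = 4, 3, 0.99, 0.2
r = np.random.randn(batch, Ln)                 # per-step rewards, as in r_ph
d = np.zeros((batch, Ln))                      # per-step done flags, as in d_ph
logp_pi = -np.abs(np.random.randn(batch, Ln))  # per-step log-probs, as in logp_pi_ph
min_q_pi = np.random.randn(batch)              # bootstrap value at the segment's end

q_backup = min_q_pi
for step_i in reversed(range(Ln)):
    q_backup = r[:, step_i] + gamma * (1 - d[:, step_i]) * (
        -alpha * logp_pi[:, step_i] + q_backup)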
Example #19
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.target_noise = 0.2
        self.noise_clip = 0.5


        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi, _ = cr.td3_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
            )

        with tf.variable_scope('target'):
            self.pi_targ, self.q1_double_targ, self.q2_double_targ, self.q1_pi_targ, self.q2_pi_targ = cr.td3_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                pi_q_noise=self.target_noise,
                noise_clip=self.noise_clip)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')

        self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
        #self.min_q_targ = tf.minimum(self.q1_double_targ,self.q2_double_targ)
        self.backup = tf.stop_gradient(self.r_ph + self.gamma *
                                       (1 - self.d_ph) * self.min_q_targ)
        self.pi_loss = -tf.reduce_mean(self.q1_pi)
        self.q1_loss = tf.reduce_mean((self.q1 - self.backup)**2)
        self.q2_loss = tf.reduce_mean((self.q2 - self.backup)**2)
        self.v_loss = self.q1_loss + self.q2_loss

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]
        self.value_ops = [self.v_loss, self.train_value_op]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
        self.saver = tf.train.Saver()
Example #20
def ddpg(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         n_episodes=10000,
         replay_size=int(1e6),
         gamma=0.99,
         show_steps=50,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=200,
         logger_kwargs=dict(),
         save_freq=1):

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=5):
        total_ret, total_len, total_cost = 0, 0, 0
        for j in range(n):
            o, r, d, ep_ret, ep_len, ep_cost = test_env.reset(
            ), 0, False, 0, 0, 0
            while not (d or (ep_len == 5 * max_ep_len)):
                # Query the policy without exploration noise (noise_scale=0)
                test_env.render()
                a = get_action(o, 0)
                o, r, d, _, c = test_env.step(a + 0.5 * np.random.rand(), 1)
                ep_ret += (r - c)
                ep_len += 1
                ep_cost += c
            # Accumulate per-episode totals so the averages below cover all trials
            total_ret += ep_ret
            total_len += ep_len
            total_cost += ep_cost
        test_env.close()
        print(
            "\n avg reward {} and episode length {} over {} trials, cost/step {}"
            .format(total_ret / n, total_len / n, n,
                    total_cost / max(total_len, 1)))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    for t in range(start_steps):
        a = env.action_space.sample()
        o2, r, d, _, c = env.step(a, 1)
        r -= c
        replay_buffer.store(o, a, r, o2, d)
        o = o2
        if d:
            o = env.reset()

    fails = 0

    # Main loop: collect experience in env and update/log each epoch
    for t in itertools.count():
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        a = get_action(o, act_noise)

        # Step the env
        o2, r, d, _, c = env.step(a, 1)
        r -= c
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        print("\rSteps {:3}, fails {}".format(t, fails), end="")

        if t % max_ep_len == 0:
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(max_ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
        if d:
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            fails += 1

        # End of epoch wrap-up
        if t > 0 and t % (show_steps * max_ep_len) == 0:
            # Test the performance of the deterministic version of the agent.
            test_agent()
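
The `target_update` group above is plain polyak averaging: after each update, every target variable moves a small fraction (1 - polyak) towards its main counterpart. A one-array NumPy sketch of that soft-update rule (values are made up):

import numpy as np

polyak = 0.995
theta_main = np.random.randn(4, 4)
theta_targ = np.random.randn(4, 4)

# theta_targ <- polyak * theta_targ + (1 - polyak) * theta_main
theta_targ = polyak * theta_targ + (1 - polyak) * theta_main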
Example #21
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 8
        self.target_update_tau = 0.995
        self.gamma = 0.99
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = 1e-4
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.target_noise = 0.2
        self.noise_clip = 0.1

        self.x_ph, self.a_ph, self.tau_ph,self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.support_size,self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.q, self.q_pi = cr.dipg_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit)

        with tf.variable_scope('target'):
            _, _, self.q_pi_targ = cr.dipg_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                pi_q_noise=self.target_noise)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')
        self.backup = tf.stop_gradient(tf.tile(tf.expand_dims(self.r_ph,axis=1),[1,self.support_size])\
                    + self.gamma*tf.tile(tf.expand_dims(1-self.d_ph,axis=1),[1,self.support_size])*self.q_pi_targ)
        self.pi_loss = -tf.reduce_mean(self.q_pi)

        self.clip_tau = 5e-2
        theta_loss_tile = tf.tile(tf.expand_dims(self.q, axis=2),
                                  [1, 1, self.support_size])
        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        Huber_loss = tf.losses.huber_loss(logit_valid_tile,
                                          theta_loss_tile,
                                          reduction=tf.losses.Reduction.NONE)
        tau = tf.tile(tf.expand_dims(self.tau_ph, axis=2),
                      [1, 1, self.support_size])
        bellman_errors = logit_valid_tile - theta_loss_tile
        Loss = (
            tf.abs(tau - tf.stop_gradient(tf.to_float(bellman_errors < 0))) *
            Huber_loss)
        self.v_loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(Loss, axis=1)))

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        grad = self.pi_optimizer.compute_gradients(self.pi_loss,
                                                   var_list=self.pi_params)
        grad = [(gr / self.support_size, var) for gr, var in grad]
        self.train_pi_op = self.pi_optimizer.apply_gradients(grad)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.v_loss, var_list=self.q_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(
                    v_targ, self.target_update_tau * v_targ +
                    (1 - self.target_update_tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])
        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]
        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
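
The value loss above is a quantile-regression Huber loss: each predicted quantile theta_i is trained against every target sample with asymmetric weight |tau_i - 1[delta < 0]|. A NumPy sketch of that weighting (the reduction order here follows the usual quantile-regression formulation and may differ slightly from the block above):

import numpy as np

def huber(x, k=1.0):
    return np.where(np.abs(x) <= k, 0.5 * x ** 2, k * (np.abs(x) - 0.5 * k))

batch, n_support = 2, 8
theta = np.random.randn(batch, n_support)         # predicted quantiles of Q(s, a)
target = np.random.randn(batch, n_support)        # backup samples
tau = np.random.uniform(size=(batch, n_support))  # quantile fractions

# delta[b, i, j] = target[b, j] - theta[b, i]
delta = target[:, None, :] - theta[:, :, None]
weight = np.abs(tau[:, :, None] - (delta < 0).astype(np.float32))
loss = np.mean(np.sum(np.mean(weight * huber(delta), axis=2), axis=1))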
Example #22
File: ppo.py  Project: zhc134/l2s
def ppo(env_config, ac_type, ac_kwargs, clip_ratio, epochs, steps_per_epoch,
        optimizer, lr, train_pi_iters, max_ep_len, target_kl, logger_kwargs,
        seed):
    logger = EpochLogger(**logger_kwargs)
    configs = locals().copy()
    configs.pop("logger")
    logger.save_config(configs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = make_env(env_config)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high

    obs_ph, a_ph, adv_ph, logp_old_ph = core.placeholders(
        obs_dim, act_dim, None, None)
    all_phs = [obs_ph, a_ph, adv_ph, logp_old_ph]

    actor_critic = get_ppo_actor_critic(ac_type)
    pi, logp, logp_pi = actor_critic(obs_ph, a_ph, **ac_kwargs)

    # Experience buffer
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
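    # min_adv is the clipped branch of the PPO-Clip surrogate: positive advantages
    # are capped at (1 + clip_ratio) * adv, negative ones at (1 - clip_ratio) * adv;
    # pi_loss below takes the pessimistic minimum of the clipped and unclipped terms.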

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))

    # Optimizers
    if optimizer == "adam":
        train_pi = tf.train.AdamOptimizer(learning_rate=lr).minimize(pi_loss)
    elif optimizer == "sgd":
        train_pi = tf.train.GradientDescentOptimizer(
            learning_rate=lr).minimize(pi_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def update():

        print(sess.run(tf.trainable_variables()))

        data = buf.get()
        #util.plot_adv(data[0] * act_high, data[1], logger.output_dir + "/ep_adv%s.png" % epoch)
        inputs = {k: v for k, v in zip(all_phs, data[:4])}
        pi_l_old, ent = sess.run([pi_loss, approx_ent], feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)

        # Log changes from update
        pi_l_new, kl, cf = sess.run([pi_loss, approx_kl, clipfrac],
                                    feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    real_action = env.action_space.default()
    o, r, d, _ = env.step(real_action)

    episode_actions = []
    episode_obs = []
    episode_actions.append(real_action)
    episode_obs.append(o)

    print(tf.trainable_variables())
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        episode_count = 0
        ep_actions = []
        for t in range(steps_per_epoch):
            a, logp_t = sess.run([pi, logp_pi],
                                 feed_dict={obs_ph: o.reshape(1, -1)})
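            # The policy outputs a log-multiplier: the executed action is the previous
            # real_action scaled by exp(a), with the per-step change clipped to +/-5%.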
            delta = np.exp(a[0])
            delta = np.clip(delta, 0.95, 1.05)
            real_action = env.action_space.clip(real_action * delta)

            o, r, d, _ = env.step(real_action)

            buf.store(o, a, r, logp_t)

            ep_actions.append(real_action)
            episode_actions.append(real_action)
            episode_obs.append(o)
            ep_ret += r
            ep_len += 1

            if ep_len == max_ep_len or t == steps_per_epoch - 1:
                buf.finish_path()
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                real_action = env.action_space.default()
                o, r, d, _ = env.step(real_action)

                util.plot_seq_obs_and_actions(
                    episode_obs, episode_actions, act_high, logger.output_dir +
                    '/episode_actions_%d_%d.png' % (epoch, episode_count))
                episode_count += 1
                episode_actions = []
                episode_obs = []
                episode_actions.append(real_action)
                episode_obs.append(o)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

        util.plot_actions(ep_actions, act_high,
                          logger.output_dir + '/ep_actions%d.png' % epoch)
Example #23
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = 33
        self.output_size = 4
        self.tau = 0.995
        self.gamma = 0.99
        self.hidden = [400, 300]
        self.batch_size = 64
        self.pi_lr = 1e-3
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(1e5)
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5
        self.num_worker = 20
        self.noise = OU_noise(self.output_size, self.num_worker)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.mu, self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi, self.v = \
                cr.sac_mlp_actor_critic(
                    x=self.x_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, self.v_targ = \
                cr.sac_mlp_actor_critic(
                    x=self.x2_ph,
                    a=self.a_ph,
                    hidden=self.hidden,
                    activation=tf.nn.relu,
                    output_activation=tf.tanh,
                    output_size=self.output_size,
                    action_limit=self.action_limit
                )

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')

        self.min_q_pi = tf.minimum(self.q1_pi, self.q2_pi)
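        # SAC targets: each Q network regresses toward r + gamma * (1 - d) * V_targ(s'),
        # while V regresses toward min(Q1, Q2)(s, a~pi) - alpha * log pi(a|s),
        # i.e. the entropy-regularized soft value estimate.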
        self.q_backup = tf.stop_gradient(self.r_ph + self.gamma *
                                         (1 - self.d_ph) * self.v_targ)
        self.v_backup = tf.stop_gradient(self.min_q_pi -
                                         self.alpha * self.logp_pi)

        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - self.q1_pi)
        self.q1_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q1)**2)
        self.q2_loss = 0.5 * tf.reduce_mean((self.q_backup - self.q2)**2)
        self.v_loss = 0.5 * tf.reduce_mean((self.v_backup - self.v)**2)
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss,
                                                      var_list=self.pi_params)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(
                self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.q1_loss, self.q2_loss, self.v_loss, self.q1,
            self.q2, self.v, self.logp_pi, self.train_pi_op,
            self.train_value_op, self.target_update
        ]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
Example #24
def ddpg(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         control_policy=ControlPolicy,
         n_episodes=10000,
         replay_size=int(1e6),
         gamma=0.99,
         show_steps=50,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=200,
         logger_kwargs=dict(),
         save_freq=1):

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    ctrl_pol = control_policy(env)

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=5):
        tot_len, tot_ret = 0, 0
        cost, cost_ctrl = 0, 0
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            o_ctrl = np.array(o)
            while not (d or (ep_len == 5 * max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                test_env.render()
                a_ctrl = np.array([ctrl_pol.predict(o_ctrl)])
                o_ctrl, _, _, info = test_env.step(a_ctrl, 0)
                cost_ctrl += info["cost"]
                a = get_action(o, 0)
                o, r, d, info = test_env.step(a, 1)
                cost += info["cost"]
                ep_len += 1
            tot_len += ep_len
        test_env.close()
        print(
            "\n avg reward {:.5} and episode length {} over {} trials, cost/step rl/lqr {:.5}/{:.5}"
            .format((tot_len - cost) / n, tot_len / n, n, cost / tot_len,
                    cost_ctrl / tot_len))

    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    o_ctrl = np.array(o)  #env.state[0]

    for t in range(start_steps):
        #a = env.action_space.sample()
        a = np.array([ctrl_pol.predict(o)])
        o2, r, d, info = env.step(a, 1)
        r -= info["cost"]
        replay_buffer.store(o, a, r, o2, d)
        o = o2
        if d:
            o = env.reset()

    fails = 0
    takeover = False
    cost, cost_ctrl = 0, 0
    retrain_steps = 0
    show = 0  # countdown of render-enabled update cycles after each test phase

    # Setup plotting
    # times = []
    # plt.ion()
    # fig, ax = plt.subplots()
    # plot = ax.plot([], [])
    # costs = []
    # plot_ctrl = ax.plot([], [])
    # ctrl_costs = []
    # ax.legend(["ddpg cost", "lqr cost"])
    # ax.set_xlabel("time")
    # ax.set_ylabel("cost")

    # Main loop: collect experience in env and update/log each epoch
    for t in itertools.count():
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy (with some noise, via act_noise).
        """
        if show > 0:
            env.render(takeover=takeover)

        # Step lqr
        a_ctrl = np.array([ctrl_pol.predict(o_ctrl)])
        o_ctrl, _, _, info = env.step(a_ctrl, 0)
        cost_ctrl += info["cost"]

        # Step ddpg
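        # Hand-off heuristic: the takeover threshold grows with t (via scaler), so the
        # LQR controller intervenes less often as training progresses and the learned
        # policy is trusted over a widening region of the state space.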
        scaler = min(1, 0.1 + t / 100000)
        takeover = np.abs(o[2]) > 0.5 * scaler or np.abs(o[0]) > 0.7 * scaler
        # takeover = False
        if takeover:
            a = np.array([ctrl_pol.predict(o)])
        else:
            a = get_action(o, act_noise)
        o2, r, d, info = env.step(a, 1)

        cost += info["cost"]
        r -= info["cost"]
        retrain_steps += 1
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if t==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        print(
            "\rSteps {:5}, fails {:3}, ep_len {:5}, disturbance {:7.3}, cost rl/lqr {:7.3}/{:7.3}"
            .format(t, fails, ep_len,
                    info["disturbance"] if info["push"] else 0.0,
                    cost / retrain_steps, cost_ctrl / retrain_steps),
            end="")

        if np.random.rand() * max_ep_len < 1:
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(max_ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)

            # cost /= retrain_steps
            # cost_ctrl /= retrain_steps

            # costs.append(cost)
            # ctrl_costs.append(cost_ctrl)
            # times.append(0.02 * (t + start_steps))

            # ax.plot(times, costs, 'r-', times, ctrl_costs, 'b--')

            # fig.canvas.draw()
            # plt.pause(0.005)

            cost = 0
            cost_ctrl = 0
            retrain_steps = 0

            show -= 1

            env.state[0] = np.array(env.state[1])
            o_ctrl = env.state[0]
            print()
        if d:
            o, r, d, ep_len = env.reset(), 0, False, 0
            o_ctrl = np.array(o)
            fails += 1

    # End of epoch wrap-up
        if t > 0 and t % (show_steps * max_ep_len) == 0:
            # Test the performance of the deterministic version of the agent.
            test_agent()
            show = 5
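
Most of the snippets in this collection lean on a few helpers (core.placeholders / cr.placeholders, get_vars, core.count_vars) whose definitions are not reproduced. A minimal sketch of what they are assumed to look like, following the OpenAI Spinning Up conventions (each project ships its own core module, so treat this as an illustration rather than the exact code):

import numpy as np
import tensorflow as tf

def combined_shape(length, shape=None):
    # (batch, dim)-style shapes; a None dim yields a scalar-per-sample placeholder
    if shape is None:
        return (length,)
    return (length, shape) if np.isscalar(shape) else (length, *shape)

def placeholder(dim=None):
    return tf.placeholder(dtype=tf.float32, shape=combined_shape(None, dim))

def placeholders(*args):
    return [placeholder(dim) for dim in args]

def get_vars(scope=''):
    # every trainable variable whose name contains the scope prefix
    return [x for x in tf.trainable_variables() if scope in x.name]

def count_vars(scope=''):
    return sum(int(np.prod(v.shape.as_list())) for v in get_vars(scope))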
Example #25
def asac(env_fn, actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99,
         polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000,
         max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001,
         delta=0.02, sample_step=2000):

    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
    alpha_ph = core.scale_holder()
    # Main outputs from computation graph

    #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(x_ph, a_ph, **ac_kwargs)
    # Target value network
    with tf.variable_scope('target'):
        _,_,_,_,_,_,_,v_targ, _, _, R_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts)
    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph *logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma*(1 - d_ph)*R_targ)
    R_backup = tf.stop_gradient(Q_pi)
    adv = Q_pi - R
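    # R is regressed toward Q_pi (see R_backup above), so it acts as a state-dependent
    # baseline of the soft action value; adv = Q_pi - R is the advantage estimate used
    # later to adapt the entropy coefficient alpha.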

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5*tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5*tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss
    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars('main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)
    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """
    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi,
                train_pi_op, train_value_op, target_update, R_loss, Q_loss]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    config = tf.ConfigProto(inter_op_parallelism_threads=30,intra_op_parallelism_threads=5)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    def test_agent(n=10):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
    total_steps = steps_per_epoch * epochs

    counter = 0
    ret_epi = []
    obs_epi = []
    loss_old = 10000
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             alpha_ph: alpha_t
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                             VVals=outs[6], LogPi=outs[7], LossR=outs[11])
                counter += 1
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
            logger.store(RetEst=ret_est)
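            # Adaptive entropy coefficient: once at least 1000 updates have accumulated
            # and the policy loss has stopped improving, roll the current policy out for
            # sample_step steps, evaluate adv = Q_pi - R on the visited states, and let
            # alpha.update_alpha() adjust the temperature from those advantages.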
            if counter >= 1000:
                loss_new, _ = logger.get_stats('LossPi')
                counter = 0
                if (loss_old - loss_new)/np.absolute(loss_old) < loss_threshold and t > start_steps:
                    rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32)
                    rho_ptr = 0
                    for sample_t in range(sample_step):
                        a = get_action(o)
                        o2, r, d, _ = env.step(a)
                        ep_len += 1
                        d = False if ep_len == max_ep_len else d
                        rho_s[rho_ptr] = o
                        o = o2
                        if d or (ep_len == max_ep_len):
                            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    advantages = sess.run(adv, feed_dict={x_ph: rho_s})
                    alpha.update_alpha(advantages)
                    #alpha.update_alpha(rho_q-rho_v)
                    alpha_t = alpha()
                    print(alpha_t)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    loss_old = 10000
                else:
                    loss_old = loss_new
        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('RetEst', average_only=True)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossR', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Example #26
    def __init__(self, opt, job):
        self.opt = opt
        with tf.Graph().as_default():
            tf.set_random_seed(opt.seed)
            np.random.seed(opt.seed)

            # Inputs to computation graph
            self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = core.placeholders(opt.obs_dim, None, opt.obs_dim, None, None)

            # Main outputs from computation graph
            with tf.variable_scope('main'):
                self.q, self.q_x2 = core.q_function(self.x_ph, self.x2_ph, opt.hidden_size, opt.act_dim)

            # Target value network
            with tf.variable_scope('target'):
                self.q_next, _ = core.q_function(self.x2_ph, self.x2_ph, opt.hidden_size, opt.act_dim)

            # Count variables
            var_counts = tuple(core.count_vars(scope) for scope in ['main'])
            print('\nNumber of parameters: total: %d\n' % var_counts)

            a_one_hot = tf.one_hot(tf.cast(self.a_ph, tf.int32), depth=opt.act_dim)
            q_value = tf.reduce_sum(self.q * a_one_hot, axis=1)

            # DDQN
            online_q_x2_a_one_hot = tf.one_hot(tf.argmax(self.q_x2, axis=1), depth=opt.act_dim)
            q_target = tf.reduce_sum(self.q_next * online_q_x2_a_one_hot, axis=1)
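            # Double DQN: the online network selects the argmax action for s', while the
            # target network evaluates it, which reduces Q-value overestimation.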

            # DQN
            # q_target = tf.reduce_max(self.q_next, axis=1)

            # Bellman backup for the Q function, using the Double-DQN target above
            q_backup = tf.stop_gradient(self.r_ph + opt.gamma * (1 - self.d_ph) * q_target)

            # q losses
            q_loss = 0.5 * tf.reduce_mean((q_backup - q_value) ** 2)

            # Value train op (there is no separate policy op in this DQN-style learner)
            value_optimizer = tf.train.AdamOptimizer(learning_rate=opt.lr)
            value_params = get_vars('main/q')
            train_value_op = value_optimizer.minimize(q_loss, var_list=value_params)

            # Polyak averaging for target variables
            # (control flow because sess.run otherwise evaluates in nondeterministic order)
            with tf.control_dependencies([train_value_op]):
                target_update = tf.group([tf.assign(v_targ, opt.polyak * v_targ + (1 - opt.polyak) * v_main)
                                          for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

            # All ops to call during one training step
            self.step_ops = [q_loss, self.q, train_value_op, target_update]

            # Initializing targets to match main variables
            self.target_init = tf.group([tf.assign(v_targ, v_main)
                                    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

            if job == "learner":
                config = tf.ConfigProto()
                config.gpu_options.per_process_gpu_memory_fraction = opt.gpu_fraction
                config.inter_op_parallelism_threads = 1
                config.intra_op_parallelism_threads = 1
                self.sess = tf.Session(config=config)
            else:
                self.sess = tf.Session(
                    config=tf.ConfigProto(
                        # device_count={'GPU': 0},
                        intra_op_parallelism_threads=1,
                        inter_op_parallelism_threads=1))

            self.sess.run(tf.global_variables_initializer())

            if job == "learner":
                # Set up summary Ops
                self.train_ops, self.train_vars = self.build_summaries()
                self.writer = tf.summary.FileWriter(
                    opt.summary_dir + "/" + "^^^^^^^^^^" + str(datetime.datetime.now()) + opt.env_name + "-" +
                    opt.exp_name + "-workers_num:" + str(opt.num_workers) + "%" + str(opt.a_l_ratio), self.sess.graph)

            self.variables = ray.experimental.tf_utils.TensorFlowVariables(
                q_loss, self.sess)
Example #27
def ddpg(env_name,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         test=False):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        #irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })
    saver = tf.train.Saver()
    save_path = './saved_model/' + env_name + '/test'

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def save(saver, sess):
        if not os.path.exists('./saved_model/' + env_name):
            os.mkdir('./saved_model/' + env_name)
        ckpt_path = saver.save(sess, save_path)
        #print('Save ckpt file: {}'.format(ckpt_path))

    def load(saver, sess):
        if os.path.exists('./saved_model/' + env_name):
            saver.restore(sess, save_path)
            print('Load model complete.')
        else:
            print('There is no saved model.')

    if test is False:
        start_time = time.time()
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        total_steps = steps_per_epoch * epochs

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):
            """
            Until start_steps have elapsed, randomly sample actions
            from a uniform distribution for better exploration. Afterwards, 
            use the learned policy (with some noise, via act_noise). 
            """
            if t > start_steps:
                a = get_action(o, act_noise)
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if d or (ep_len == max_ep_len):
                """
                Perform all DDPG updates at the end of the trajectory,
                in accordance with tuning done by TD3 paper authors.
                """
                for _ in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }

                    # Q-learning update
                    outs = sess.run([q_loss, q, train_q_op], feed_dict)
                    logger.store(LossQ=outs[0], QVals=outs[1])

                    # Policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            # End of epoch wrap-up
            if t > 0 and t % steps_per_epoch == 0:
                epoch = t // steps_per_epoch

                # Save model
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    #logger.save_state({'env': env}, None)
                    save(saver, sess)

                # Test the performance of the deterministic version of the agent.
                test_agent()

                # Log info about epoch
                logger.log_tabular('Epoch', epoch)
                logger.log_tabular('EpRet', with_min_and_max=True)
                logger.log_tabular('TestEpRet', with_min_and_max=True)
                logger.log_tabular('EpLen', average_only=True)
                logger.log_tabular('TestEpLen', average_only=True)
                logger.log_tabular('TotalEnvInteracts', t)
                logger.log_tabular('QVals', with_min_and_max=True)
                logger.log_tabular('LossPi', average_only=True)
                logger.log_tabular('LossQ', average_only=True)
                logger.log_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        #save(saver, sess)

    else:
        load(saver, sess)

        test_logger = EpochLogger()
        o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0

        num_episodes = 100
        render = True
        max_ep_len = 0
        while n < num_episodes:
            if render:
                env.render()
                time.sleep(1e-3)

            a = get_action(o, 0)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            if d or (ep_len == max_ep_len):
                test_logger.store(EpRet=ep_ret, EpLen=ep_len)
                print('Episode %d \t EpRet %.3f \t EpLen %d' %
                      (n, ep_ret, ep_len))
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                n += 1

        test_logger.log_tabular('EpRet', with_min_and_max=True)
        test_logger.log_tabular('EpLen', average_only=True)
        test_logger.dump_tabular()
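
A hypothetical driver for the ddpg function above; the environment id, hidden sizes, and logger directory are illustrative assumptions rather than part of the original snippet:

if __name__ == '__main__':
    # Train first; re-run with test=True to restore ./saved_model/<env_name>/test and render.
    ddpg('Pendulum-v0',
         ac_kwargs=dict(hidden_sizes=(400, 300)),
         epochs=50,
         logger_kwargs=dict(output_dir='./logs/ddpg_pendulum', exp_name='ddpg'),
         test=False)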
Example #28
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 8
        self.target_update_tau = 0.995
        self.gamma = 0.99
        self.hidden = env_set['hidden']
        self.batch_size = 64
        self.pi_lr = 1e-4
        self.q_lr = 1e-3
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.target_noise = 0.2
        self.noise_clip = 0.1
        self.alpha = 1e-5
        
        self.x_ph, self.a_ph, self.tau_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.support_size, self.state_size, None, None)

        with tf.variable_scope('main'):
            self.pi, self.logp_pi, self.q1, self.q2, self.q1_pi, self.q2_pi,  self.v = cr.dipg_sac_mlp_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                tau= self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size
            )

        with tf.variable_scope('target'):
            _, _, _, _, _, _, self.v_targ = cr.dipg_sac_mlp_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                tau=self.tau_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size
            )

        self.pi_params = cr.get_vars('main/pi')
        self.value_params = cr.get_vars('main/q') + cr.get_vars('main/v')
        self.min_q = tf.where(tf.less(tf.reduce_mean(self.q1_pi), tf.reduce_mean(self.q2_pi)),
                              self.q1_pi, self.q2_pi)
        self.q_backup = tf.stop_gradient(
            tf.tile(tf.expand_dims(self.r_ph, axis=1), [1, self.support_size])
            + self.gamma * tf.tile(tf.expand_dims(1 - self.d_ph, axis=1), [1, self.support_size]) * self.v_targ)
        self.v_backup = tf.stop_gradient(
            self.min_q
            - self.alpha * tf.tile(tf.expand_dims(self.logp_pi, axis=1), [1, self.support_size]))
        self.pi_loss = tf.reduce_mean(self.alpha * self.logp_pi - tf.reduce_mean(self.q1_pi*tf.square(self.tau_ph)))
        tau = self.tau_ph
        inv_tau = 1 - tau
        tau = tf.tile(tf.expand_dims(tau, axis=1), [1, self.support_size, 1])
        inv_tau = tf.tile(tf.expand_dims(inv_tau, axis=1), [1, self.support_size, 1])
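        # Quantile-regression weighting: target samples that land below the predicted
        # quantile are scaled by (1 - tau) and those above by tau, hence the tau and
        # inv_tau tiles used in the tf.where terms below.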
        logit_valid_tile = tf.tile(tf.expand_dims(self.q_backup, axis=1), [1, self.support_size, 1])

        theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2), [1, 1, self.support_size])
        Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE)
        error_loss = logit_valid_tile - theta_loss_tile
        Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
        self.q1_loss = 0.5*tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

        theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2), [1, 1, self.support_size])
        Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE)
        error_loss = logit_valid_tile - theta_loss_tile
        Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
        self.q2_loss = 0.5*tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))

        theta_loss_tile = tf.tile(tf.expand_dims(self.v, axis=2), [1, 1, self.support_size])
        logit_valid_tile = tf.tile(tf.expand_dims(self.v_backup, axis=1), [1, self.support_size, 1])
        Huber_loss = tf.losses.mean_squared_error(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE)
        error_loss = logit_valid_tile - theta_loss_tile
        Loss = tf.where(tf.less(error_loss, 0.0), inv_tau * Huber_loss, tau * Huber_loss)
        self.v_loss = 0.5*tf.reduce_mean(tf.reduce_sum(tf.reduce_mean(Loss, axis=2), axis=1))
        self.value_loss = self.q1_loss + self.q2_loss + self.v_loss

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        self.train_pi_op = self.pi_optimizer.minimize(self.pi_loss, var_list=self.pi_params)

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        with tf.control_dependencies([self.train_pi_op]):
            self.train_value_op = self.value_optimizer.minimize(self.value_loss, var_list=self.value_params)

        with tf.control_dependencies([self.train_value_op]):
            self.target_update = tf.group([tf.assign(v_targ, self.target_update_tau * v_targ + (1 - self.target_update_tau) * v_main)
                                           for v_main, v_targ in zip(cr.get_vars('main'), cr.get_vars('target'))])

        self.step_ops = [self.pi_loss, self.value_loss, self.train_pi_op, self.train_value_op, self.target_update]
        self.target_init = tf.group([tf.assign(v_targ, v_main)
                                    for v_main, v_targ in zip(cr.get_vars('main/v'), cr.get_vars('target/v'))])

        self.sess.run(tf.global_variables_initializer())

        self.sess.run(self.target_init)
Example #29
    def __init__(self):
        self.sess = tf.Session()
        self.state_size = env_set['state']
        self.output_size = env_set['action']
        self.worker_size = env_set['worker']
        self.support_size = 64
        self.tau = 0.995
        self.gamma = env_set['gamma']
        self.hidden = env_set['hidden']
        self.batch_size = env_set['batch_size']
        self.pi_lr = env_set['pi_lr']
        self.q_lr = env_set['q_lr']
        self.action_limit = 1.0
        self.memory = replay_buffer(env_set['mem_size'])
        self.kappa = 1.0
        self.risk_factor = -1.0
        self.random_risk = False
        self.target_noise = 0.2
        self.noise_clip = 0.5
        tf.set_random_seed(10)

        self.x_ph, self.a_ph, self.x2_ph, self.r_ph, self.d_ph = \
            cr.placeholders(self.state_size, self.output_size, self.state_size, None, None)
        self.risk_factor_ph = tf.placeholder(tf.float32)

        with tf.variable_scope('main'):
            self.pi, self.q1, self.q2, self.q1_pi, self.q2_pi = cr.dqpg_td3_actor_critic(
                x=self.x_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size)

        with tf.variable_scope('target'):
            _, _, _, self.q1_pi_targ, self.q2_pi_targ = cr.dqpg_td3_actor_critic(
                x=self.x2_ph,
                a=self.a_ph,
                hidden=self.hidden,
                activation=tf.nn.relu,
                output_activation=tf.tanh,
                output_size=self.output_size,
                action_limit=self.action_limit,
                support_size=self.support_size,
                pi_q_noise=self.target_noise,
                noise_clip=self.noise_clip)

        self.pi_params = cr.get_vars('main/pi')
        self.q_params = cr.get_vars('main/q')
        self.min_q_targ = tf.minimum(self.q1_pi_targ, self.q2_pi_targ)
        self.backup = tf.stop_gradient(
            tf.expand_dims(self.r_ph, axis=1)
            + self.gamma * tf.expand_dims(1 - self.d_ph, axis=1) * self.min_q_targ)
        self.quantile_weight = 1.0 - self.risk_factor_ph * \
            (2.0 * tf.reshape(tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
                              [1, self.support_size]) - 1.0)
        self.pi_loss = -tf.reduce_mean(
            tf.reduce_mean(self.q1_pi * self.quantile_weight))
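        # Risk-sensitive actor objective: the quantile midpoints tau in (0, 1) are
        # re-weighted by 1 - risk_factor * (2 * tau - 1); risk_factor = 0 recovers the
        # risk-neutral mean, positive values emphasize the low quantiles (risk-averse),
        # and negative values (as configured above) emphasize the high quantiles.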

        logit_valid_tile = tf.tile(tf.expand_dims(self.backup, axis=1),
                                   [1, self.support_size, 1])
        tau = tf.reshape(
            tf.range(0.5 / self.support_size, 1, 1 / self.support_size),
            [1, self.support_size])
        tau = tf.tile(tf.expand_dims(tau, axis=2), [1, 1, self.support_size])

        theta_loss_tile = tf.tile(tf.expand_dims(self.q1, axis=2),
                                  [1, 1, self.support_size])
        #Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE,delta=self.kappa)/self.kappa
        bellman_errors = logit_valid_tile - theta_loss_tile
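        # log(cosh(u)) = u + softplus(-2u) - log(2): a smooth, Huber-like penalty used
        # in place of the commented-out tf.losses.huber_loss above.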
        Logcosh = bellman_errors + tf.math.softplus(
            -2. * bellman_errors) - tf.log(2.)
        Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(
            bellman_errors < 0))) * Logcosh
        self.v1_loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))

        theta_loss_tile = tf.tile(tf.expand_dims(self.q2, axis=2),
                                  [1, 1, self.support_size])
        #Huber_loss = tf.compat.v1.losses.huber_loss(logit_valid_tile, theta_loss_tile, reduction=tf.losses.Reduction.NONE,delta=self.kappa)/self.kappa
        bellman_errors = logit_valid_tile - theta_loss_tile
        Logcosh = bellman_errors + tf.math.softplus(
            -2. * bellman_errors) - tf.log(2.)
        Loss = tf.abs(tau - tf.stop_gradient(tf.to_float(
            bellman_errors < 0))) * Logcosh
        self.v2_loss = tf.reduce_mean(
            tf.reduce_sum(tf.reduce_mean(Loss, axis=1), axis=1))

        self.v_loss = self.v1_loss + self.v2_loss

        self.value_optimizer = tf.train.AdamOptimizer(self.q_lr)
        self.train_value_op = self.value_optimizer.minimize(
            self.v_loss, var_list=self.q_params)

        self.pi_optimizer = tf.train.AdamOptimizer(self.pi_lr)
        with tf.control_dependencies([self.train_value_op]):
            self.train_pi_op = self.pi_optimizer.minimize(
                self.pi_loss, var_list=self.pi_params)

        with tf.control_dependencies([self.train_pi_op]):
            self.target_update = tf.group([
                tf.assign(v_targ, self.tau * v_targ + (1 - self.tau) * v_main)
                for v_main, v_targ in zip(cr.get_vars('main'),
                                          cr.get_vars('target'))
            ])

        self.step_ops = [
            self.pi_loss, self.v_loss, self.train_pi_op, self.train_value_op,
            self.target_update
        ]
        self.value_ops = [self.v_loss, self.train_value_op]

        self.target_init = tf.group([
            tf.assign(v_targ, v_main) for v_main, v_targ in zip(
                cr.get_vars('main'), cr.get_vars('target'))
        ])

        self.sess.run(tf.global_variables_initializer())
        self.sess.run(self.target_init)
        print(
            self.sess.run(self.quantile_weight,
                          feed_dict={self.risk_factor_ph: self.risk_factor}))
        self.saver = tf.train.Saver()
Example #30
File: vpg.py  Project: Baichenjia/PPO
def vpg(
        env_fn,
        actor_critic,
        ac_kwargs=dict(),  # ac_kwargs stores the network architecture parameters
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        lam=0.97,  # gamma and lambda (GAE) settings
        pi_lr=3e-4,
        vf_lr=1e-3,  # learning-rate settings
        train_v_iters=80,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols for state,
            ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
                          function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)
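    # adv_ph receives the (GAE) advantage estimates, ret_ph the returns-to-go
    # used as the value-function target, and logp_old_ph the log-probabilities
    # recorded at collection time (used below only for the KL diagnostic).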

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch /
                                num_procs())  # num_procs is the number of MPI processes
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
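    # When finish_path is called, the buffer turns the stored rewards and
    # values into GAE-Lambda advantages,
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
    #   A_t = sum_l (gamma * lam)^l * delta_{t+l},
    # and keeps the discounted returns-to-go as the value regression target.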

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)
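    # Minimizing pi_loss ascends the policy-gradient objective: its gradient
    # is -E[ adv * grad log pi(a|s) ]. v_loss is a plain MSE regression of the
    # value estimate v onto the returns-to-go in ret_ph.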

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
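    # approx_kl = E_old[log pi_old - log pi] is a first-order sample estimate
    # of KL(pi_old || pi); in VPG it is logged purely as a diagnostic and does
    # not constrain the update (unlike TRPO/PPO).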

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
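    # MpiAdamOptimizer averages gradients across MPI processes before the Adam
    # step, so every process applies the same update to its parameter copy
    # (initial parameters are synced by sync_all_params below).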

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Debug: inspect log-probs and the KL diagnostic. Reuse the pre-built
        # approx_kl tensor; constructing new reduce ops here would add nodes
        # to the graph on every update call.
        print(sess.run([logp, logp_old_ph, approx_kl], feed_dict=inputs))

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            if epoch == epochs - 1:
                env.render()

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
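For reference, a minimal actor_critic satisfying the interface documented in the vpg docstring might look like the sketch below. It assumes a Box (continuous) action space and TF1-style layers; the helper names (mlp, gaussian_likelihood, mlp_actor_critic) are illustrative and comparable to SpinningUp's core module rather than taken from this project.

import numpy as np
import tensorflow as tf


def mlp(x, hidden_sizes=(64,), activation=tf.tanh, output_activation=None):
    # Simple fully connected stack; the last layer uses output_activation.
    for h in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=h, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)


def gaussian_likelihood(x, mu, log_std):
    # Log-density of a diagonal Gaussian, summed over action dimensions.
    pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + 1e-8))**2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)


def mlp_actor_critic(x, a, hidden_sizes=(64, 64), activation=tf.tanh,
                     action_space=None):
    act_dim = a.shape.as_list()[-1]
    # Scopes 'pi' and 'v' match the count_vars calls in vpg above.
    with tf.variable_scope('pi'):
        mu = mlp(x, list(hidden_sizes) + [act_dim], activation, None)
        log_std = tf.get_variable(
            'log_std', initializer=-0.5 * np.ones(act_dim, dtype=np.float32))
        std = tf.exp(log_std)
        pi = mu + tf.random_normal(tf.shape(mu)) * std   # sampled action
        logp = gaussian_likelihood(a, mu, log_std)       # log pi(a|s)
        logp_pi = gaussian_likelihood(pi, mu, log_std)   # log pi(pi|s)
    with tf.variable_scope('v'):
        # Critical: squeeze to shape (batch,) as the docstring requires.
        v = tf.squeeze(mlp(x, list(hidden_sizes) + [1], activation, None), axis=1)
    return pi, logp, logp_pi, v


# Hypothetical usage: train on a continuous-control Gym task.
# vpg(lambda: gym.make('HalfCheetah-v2'), actor_critic=mlp_actor_critic,
#     ac_kwargs=dict(hidden_sizes=(64, 64)), epochs=50)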