Пример #1
0
def ddpg(env_name,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         test=False):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = gym.make(env_name), gym.make(env_name)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        #irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })
    saver = tf.train.Saver()
    save_path = './saved_model/' + env_name + '/test'

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    def save(saver, sess):
        if not os.path.exists('./saved_model/' + env_name):
            os.mkdir('./saved_model/' + env_name)
        ckpt_path = saver.save(sess, save_path)
        #print('Save ckpt file: {}'.format(ckpt_path))

    def load(saver, sess):
        if os.path.exists('./saved_model/' + env_name):
            saver.restore(sess, save_path)
            print('Load model complete.')
        else:
            print('There is no saved model.')

    if test is False:
        start_time = time.time()
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        total_steps = steps_per_epoch * epochs

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):
            """
            Until start_steps have elapsed, randomly sample actions
            from a uniform distribution for better exploration. Afterwards, 
            use the learned policy (with some noise, via act_noise). 
            """
            if t > start_steps:
                a = get_action(o, act_noise)
            else:
                a = env.action_space.sample()

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len == max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if d or (ep_len == max_ep_len):
                """
                Perform all DDPG updates at the end of the trajectory,
                in accordance with tuning done by TD3 paper authors.
                """
                for _ in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }

                    # Q-learning update
                    outs = sess.run([q_loss, q, train_q_op], feed_dict)
                    logger.store(LossQ=outs[0], QVals=outs[1])

                    # Policy update
                    outs = sess.run([pi_loss, train_pi_op, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            # End of epoch wrap-up
            if t > 0 and t % steps_per_epoch == 0:
                epoch = t // steps_per_epoch

                # Save model
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    #logger.save_state({'env': env}, None)
                    save(saver, sess)

                # Test the performance of the deterministic version of the agent.
                test_agent()

                # Log info about epoch
                logger.log_tabular('Epoch', epoch)
                logger.log_tabular('EpRet', with_min_and_max=True)
                logger.log_tabular('TestEpRet', with_min_and_max=True)
                logger.log_tabular('EpLen', average_only=True)
                logger.log_tabular('TestEpLen', average_only=True)
                logger.log_tabular('TotalEnvInteracts', t)
                logger.log_tabular('QVals', with_min_and_max=True)
                logger.log_tabular('LossPi', average_only=True)
                logger.log_tabular('LossQ', average_only=True)
                logger.log_tabular('Time', time.time() - start_time)
                logger.dump_tabular()
        #save(saver, sess)

    else:
        load(saver, sess)

        test_logger = EpochLogger()
        o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0

        num_episodes = 100
        render = True
        max_ep_len = 0
        while n < num_episodes:
            if render:
                env.render()
                time.sleep(1e-3)

            a = get_action(o, 0)
            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            if d or (ep_len == max_ep_len):
                test_logger.store(EpRet=ep_ret, EpLen=ep_len)
                print('Episode %d \t EpRet %.3f \t EpLen %d' %
                      (n, ep_ret, ep_len))
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                n += 1

        test_logger.log_tabular('EpRet', with_min_and_max=True)
        test_logger.log_tabular('EpLen', average_only=True)
        test_logger.dump_tabular()
Пример #2
0
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, gru_units=256,
        trials_per_epoch=100, episodes_per_trial=2, n = 100, epochs=100, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
        vf_lr=1e-3, train_pi_iters=1000, train_v_iters=80, lam=0.97, max_ep_len=1000,
        target_kl=0.01, logger_kwargs=dict(), save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph\
    raw_input_ph = tf.placeholder(dtype=tf.float32, shape=obs_dim, name='raw_input_ph')
    rescale_image_op = tf.image.resize_images(raw_input_ph, [30, 40])
    max_seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(), name='max_seq_len_ph')
    seq_len_ph = tf.placeholder(dtype=tf.int32, shape=(None,))

    # Because we pad zeros at the end of every sequence of length less than max length, we need to mask these zeros out
    # when computing loss
    seq_len_mask_ph = tf.placeholder(dtype=tf.int32, shape=(trials_per_epoch, episodes_per_trial * max_ep_len))

    # rescaled_image_ph This is a ph  because we want to be able to pass in value to this node manually
    rescaled_image_in_ph = tf.placeholder(dtype=tf.float32, shape=[None, 30, 40, 3], name='rescaled_image_in_ph')
    a_ph = core.placeholders_from_spaces( env.action_space)[0]
    conv1 = slim.conv2d(activation_fn=tf.nn.relu, inputs=rescaled_image_in_ph, num_outputs=16, kernel_size=[5,5],
                        stride=2)
    image_out = slim.flatten(slim.conv2d(activation_fn=tf.nn.relu, inputs=conv1, num_outputs=16, kernel_size=[5,5],
                        stride=2))

    rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None)
    rnn_state_ph = tf.placeholder(tf.float32, [None, gru_units], name='pi_rnn_state_ph')
    # Main outputs from computation graph

    action_encoder_matrix = np.load(r'encoder.npy')
    pi, logp, logp_pi, v, rnn_state, logits, seq_len_vec, tmp_vec = actor_critic(
            image_out, a_ph, rew_ph, rnn_state_ph, gru_units,
            max_seq_len_ph, action_encoder_matrix, seq_len=seq_len_ph, action_space=env.action_space)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [rescaled_image_in_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi, rnn_state, logits]

    # Experience buffer
    buffer_size = trials_per_epoch * episodes_per_trial * max_ep_len
    buf = PPOBuffer(rescaled_image_in_ph.get_shape().as_list()[1:], act_dim, buffer_size, trials_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph>0, (1+clip_ratio)*adv_ph, (1-clip_ratio)*adv_ph)

    # Need to mask out the padded zeros when computing loss
    sequence_mask = tf.sequence_mask(seq_len_ph, episodes_per_trial*max_ep_len)
    # Convert bool tensor to int tensor with 1 and 0
    sequence_mask = tf.where(sequence_mask,
                             np.ones(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)),
                             np.zeros(dtype=np.float32, shape=(trials_per_epoch, episodes_per_trial*max_ep_len)))

    # need to reshape because ratio is a 1-D vector (it is a concatnation of all sequence) for masking and then reshape
    # it back
    pi_loss_vec = tf.multiply(sequence_mask, tf.reshape(tf.minimum(ratio * adv_ph, min_adv), tf.shape(sequence_mask)))
    pi_loss = -tf.reduce_mean(tf.reshape(pi_loss_vec, tf.shape(ratio)))
    aaa = (ret_ph - v)**2

    v_loss_vec = tf.multiply(sequence_mask, tf.reshape((ret_ph - v)**2, tf.shape(sequence_mask)))
    ccc = tf.reshape(v_loss_vec, tf.shape(v))

    v_loss = tf.reduce_mean(tf.reshape(v_loss_vec, tf.shape(v)))


    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(logp_old_ph - logp)      # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)                  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    train = MpiAdamOptimizer(learning_rate=1e-4).minimize(pi_loss + 0.01 * v_loss - 0.001 * approx_ent)


    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'rescaled_image_in': rescaled_image_in_ph}, outputs={'pi': pi, 'v': v})



    def update():
        print(f'Start updating at {datetime.now()}')
        inputs = {k:v for k,v in zip(all_phs, buf.get())}

        inputs[rnn_state_ph] = np.zeros((trials_per_epoch, gru_units), np.float32)
        inputs[max_seq_len_ph] = int(episodes_per_trial * max_ep_len)
        inputs[seq_len_ph] = buf.seq_len_buf
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)

        buf.reset()

        
        # Training
        print(f'sequence length = {sess.run(seq_len_vec, feed_dict=inputs)}')


        for i in range(train_pi_iters):
            _, kl, pi_loss_i, v_loss_i, ent = sess.run([train_pi, approx_kl, pi_loss, v_loss, approx_ent], feed_dict=inputs)
            print(f'i: {i}, pi_loss: {pi_loss_i}, v_loss: {v_loss_i}, entropy: {ent}')


        logger.store(StopIter=i)


        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
                [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old, LossV=v_l_old, 
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))
        print(f'Updating finished at {datetime.now()}')


    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0

    def recenter_rgb(image, min=0.0, max=255.0):
        '''

        :param image:
        :param min:
        :param max:
        :return: an image with rgb value re-centered to [-1, 1]
        '''
        mid = (min + max) / 2.0
        return np.apply_along_axis(func1d=lambda x: (x - mid) / mid, axis=2, arr=image)

    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            # TODO: tweek settings to match the paper

            # TODO: find a way to generate mazes
            last_a = np.array(0)
            last_r = np.array(r)
            last_rnn_state = np.zeros((1, gru_units), np.float32)

            step_counter = 0
            for episode in range(episodes_per_trial):
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))

                action_dict = defaultdict(int)

                # dirty hard coding to make it print in order
                action_dict[0] = 0
                action_dict[1] = 0
                action_dict[2] = 0

                for step in range(max_ep_len):
                    a, v_t, logp_t, rnn_state_t, logits_t = sess.run(
                            get_action_ops, feed_dict={
                                    rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                                    a_ph: last_a.reshape(-1,),
                                    rew_ph: last_r.reshape(-1,1),
                                    rnn_state_ph: last_rnn_state,
                                    # v_rnn_state_ph: last_v_rnn_state,
                                    max_seq_len_ph: 1,
                        seq_len_ph: [1]})
                    action_dict[a[0]] += 1
                    # save and log
                    buf.store(o_rescaled, a, r, v_t, logp_t)
                    logger.store(VVals=v_t)
                    o, r, d, _ = env.step(a[0])
                    step_counter += 1
                    o_rescaled = recenter_rgb(sess.run(rescale_image_op, feed_dict={raw_input_ph: o}))
                    ep_ret += r
                    ep_len += 1

                    last_a = a[0]
                    last_r = np.array(r)
                    last_rnn_state = rnn_state_t

                    terminal = d or (ep_len == max_ep_len)
                    if terminal or (step==n-1):
                        if not(terminal):
                            print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                        # if trajectory didn't reach terminal state, bootstrap value target
                        last_val = r if d else sess.run(v, feed_dict={rescaled_image_in_ph: np.expand_dims(o_rescaled, 0),
                                    a_ph: last_a.reshape(-1,),
                                    rew_ph: last_r.reshape(-1,1),
                                    rnn_state_ph: last_rnn_state,
                                    max_seq_len_ph: 1,
                                    seq_len_ph: [1]})
                        buf.finish_path(last_val)
                        logger.store(EpRet=ep_ret, EpLen=ep_len)


                        print(f'episode terminated with {step} steps. epoch:{epoch} trial:{trial} episode:{episode}')
                        break
                print(action_dict)
            if step_counter < episodes_per_trial * max_ep_len:
                buf.pad_zeros(episodes_per_trial * max_ep_len - step_counter)
            buf.seq_len_buf[trial] = step_counter



            # pad zeros to sequence buffer after each trial
        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)
        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*trials_per_epoch*episodes_per_trial*max_ep_len)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
Пример #3
0
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.0,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=1000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    env = env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    # act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x,
            hidden_sizes=(32, ),
            activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x,
                               units=hidden_sizes[-1],
                               activation=output_activation)

    def mlp_categorical_policy(x, a, hidden_sizes, activation,
                               output_activation, action_space):
        act_dim = action_space.n
        logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None)
        pi_all = tf.nn.softmax(logits)
        logpi_all = tf.nn.log_softmax(logits)
        # pi = tf.squeeze(tf.random.categorical(logits,1), axis=1)
        pi = tf.random.categorical(logits, 1)
        # a = tf.cast( a, tf.uint8)
        # logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1)
        # logp_pi = tf.reduce_sum(tf.one_hot( tf.squeeze( pi, axis=1), depth=act_dim) * logp_all, axis=1)

        return pi, pi_all, logpi_all

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            pi, pi_all, logpi_all = mlp_categorical_policy(
                x_ph, a_ph, hidden_sizes, activation, None, env.action_space)

        print("### DEBUG @ main-discrete.py pi and others' dimensions")
        print(pi)
        print(pi_all)
        print(logpi_all)
        input()

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (act_dim, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], axis=-1),
                hidden_sizes + (act_dim, ), activation, None),
                               axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (act_dim, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], -1),
                hidden_sizes + (act_dim, ), activation, None),
                               axis=-1)

        with tf.variable_scope("v"):
            # v = mlp( x_ph, hidden_sizes+(1,), activation, None)
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):

        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation,
                                    None),
                                axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(
        v_backup_prestop)

    # Q Loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V Loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Pol loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5
    # Target update op
    init_v_target = tf.group([
        tf.assign(v_target, v_main) for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # print( o.reshape(-1, 1))
            # input()
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0])
                ep_ret += r
                ep_len += 1

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    #Buffer init
    buffer = ReplayBuffer(obs_dim, 1, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False or (ep_len == max_ep_len)

        # Still needed ?
        o2 = np.squeeze(o2)

        buffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # DEBUG:
                # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict)
                # print( v_backup_prestop_out.shape)
                # print( v_backup_prestop_out)
                # input()

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q Gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Пример #4
0
def ddpg(env_fn,
         actor_critic=a2c,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=.99,
         polyak=.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_limit = env.action_space.high[0]

    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder( name='x_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name='a_ph', shape=[None, act_dim], dtype=tf.float32), \
        tf.placeholder( name='x2_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name='r_ph', shape=[None], dtype=tf.float32), \
        tf.placeholder( name='d_ph', shape=[None], dtype=tf.float32)

    # Main networks
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(
        count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # Losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Optimizer and train ops
    train_pi_op = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss, var_list=get_vars('main/pi'))
    train_q_op = tf.train.AdamOptimizer(q_loss).minimize(
        q_loss, var_list=get_vars('main/q'))

    # Update target networks
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Init targets
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })

    def get_actions(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)

        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_actions(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop:
    for t in range(total_steps):
        if t > start_steps:
            a = get_actions(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        d = False if ep_len == max_ep_len else d

        # Storing experience
        replaybuffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for _ in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Пример #5
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        batch_size=250000,
        n=100,
        epochs=100,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=1000,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    sequence_length = n * max_ep_len
    trials = batch_size // sequence_length

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    # rew_ph, adv_ph, ret_ph, logp_old_ph = core.placeholders(1, None, None, None)
    x_ph = tf.placeholder(dtype=tf.int32,
                          shape=(None, sequence_length),
                          name='x_ph')
    t_ph = tf.placeholder(dtype=tf.int32,
                          shape=(None, sequence_length),
                          name='t_ph')
    a_ph = tf.placeholder(dtype=tf.int32,
                          shape=(None, sequence_length),
                          name='a_ph')
    r_ph = tf.placeholder(dtype=tf.float32,
                          shape=(None, sequence_length),
                          name='r_ph')
    #    input_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, n, None), name='rew_ph')
    adv_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='adv_ph')
    ret_ph = tf.placeholder(dtype=tf.float32, shape=(None), name='ret_ph')
    logp_old_ph = tf.placeholder(dtype=tf.float32,
                                 shape=(None),
                                 name='logp_old_ph')
    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, t_ph, a_ph, r_ph,
                                        sequence_length, env.action_space.n,
                                        env.observation_space.shape[0])

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, t_ph, a_ph, r_ph, adv_ph, ret_ph, logp_old_ph]
    #    for ph in all_phs:
    #        print(ph.shape)

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    buf = PPOBuffer(obs_dim, act_dim, batch_size, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    model_inputs = {'x': x_ph, 't': t_ph, 'a': a_ph, 'r': r_ph}
    model_outputs = {'pi': pi}
    logger.setup_tf_saver(sess, inputs=model_inputs, outputs=model_outputs)

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        #        inputs[a_ph] = np.tril(np.transpose(np.repeat(inputs[a_ph], n).reshape(trials, n, n), [0, 2, 1]))
        #        inputs[rew_ph] = np.tril(np.transpose(np.repeat(inputs[rew_ph], n).reshape(trials, n, n), [0, 2, 1]))
        #        print(inputs[x_ph])
        #        print(inputs[t_ph])
        #        print(inputs[a_ph])
        #        print(inputs[r_ph])
        inputs[x_ph] = inputs[x_ph].reshape(trials, sequence_length)
        inputs[t_ph] = inputs[t_ph].reshape(trials, sequence_length)
        inputs[a_ph] = inputs[a_ph].reshape(trials, sequence_length)
        inputs[r_ph] = inputs[r_ph].reshape(trials, sequence_length)
        #        print('x:', inputs[x_ph])
        #        print('t:', inputs[t_ph])
        #        print('a:', inputs[a_ph])
        #        print('r:', inputs[r_ph])
        #        print('ret:', inputs[ret_ph])
        #        print('adv:', inputs[adv_ph])
        #        print('logp_old:', inputs[logp_old_ph])
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)


#            kl = mpi_avg(kl)
#            if kl > 1.5 * target_kl:
#                logger.log('Early stopping at step %d due to reaching max kl.'%i)
#                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    save_itr = 0
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for trail in range(trials):
            print('trial:', trail)
            #            last_a = np.zeros(n).reshape(1, n)
            #            last_r = np.zeros(n).reshape(1, n)
            o_deque = deque(sequence_length * [0], sequence_length)
            t_deque = deque(sequence_length * [0], sequence_length)
            last_a = deque(sequence_length * [0], sequence_length)
            last_r = deque(sequence_length * [0], sequence_length)
            means = env.sample_tasks(1)[0]
            #            print('task means:', means)
            action_dict = defaultdict(int)
            total_reward = 0
            env.reset_task(means)
            o, r, d, ep_ret, ep_len = env.reset(), np.zeros(1), False, 0, 0

            for episode in range(sequence_length):
                #                print('episode:', episode)
                #                print('o:', o_deque)
                #                print('d:', t_deque)
                #                print('a:', last_a)
                #                print('r:', last_r)
                a, v_t, logp_t = sess.run(
                    get_action_ops,
                    feed_dict={
                        x_ph: np.array(o_deque).reshape(1, sequence_length),
                        t_ph: np.array(t_deque).reshape(1, sequence_length),
                        a_ph: np.array(last_a).reshape(1, sequence_length),
                        r_ph: np.array(last_r).reshape(1, sequence_length)
                    })
                #                print("a shape:", a.shape)
                #                print("v_t shape:", v_t.shape)
                #                print("logp_t shape:", logp_t.shape)
                #                choosen_a = a[episode, 0]
                #                choosen_v_t = v_t[0, episode]
                #                choosen_logp_t = logp_t[episode]
                #                print('a:', a)
                choosen_a = a[-1]
                choosen_v_t = v_t[-1]
                choosen_logp_t = logp_t[-1]
                action_dict[choosen_a] += 1
                o, r, d, _ = env.step(choosen_a)

                ep_ret += r
                ep_len += 1
                t = ep_len == max_ep_len
                total_reward += r

                o_deque.append(o)
                t_deque.append(int(d))
                last_a.append(choosen_a)
                last_r.append(r)

                # save and log
                buf.store(o, int(t), choosen_a, r, choosen_v_t, choosen_logp_t)
                logger.store(VVals=v_t)

                terminal = d or t
                if terminal or (episode == sequence_length - 1):
                    if not (terminal):
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    if d:
                        last_val = r
                    else:
                        last_val = sess.run(
                            v,
                            feed_dict={
                                x_ph:
                                np.array(o_deque).reshape(1, sequence_length),
                                t_ph:
                                np.array(t_deque).reshape(1, sequence_length),
                                a_ph:
                                np.array(last_a).reshape(1, sequence_length),
                                r_ph:
                                np.array(last_r).reshape(1, sequence_length)
                            })
                        last_val = last_val[-1]
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    o_deque[-1] = 0
                    t_deque[-1] = 0
                    last_a[-1] = 0
                    last_r[-1] = 0
            print(action_dict)
            print('average reward:', total_reward / sequence_length)
        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, save_itr)
            save_itr += 1
        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * batch_size)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Пример #6
0
def ppo(env_fn,
        actor_critic=a2c,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=.99,
        clip_ratio=.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=.97,
        max_ep_len=1000,
        target_kl=.01,
        logger_kwargs=dict(),
        save_freq=10):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Share action space structure with the actor_critic
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32)
    adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \
        tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \
        tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32)

    # Main outputs from computation graph
    # print( actor_critic( x_ph, a_ph, **ac_kwargs))
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    get_action_ops = [pi, v, logp_pi]

    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO Objectives
    ratio = tf.exp(logp - logp_old_ph)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Stats to watch
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(-logp)

    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)

            def mpi_avg(x):
                """Average a scalar or vector over MPI processes."""
                return mpi_sum(x) / num_procs()

            kl = mpi_avg(kl)

            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break

        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Пример #7
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        trials_per_epoch=2500,
        steps_per_trial=100,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=1000,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    # x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    x_ph = tf.placeholder(dtype=tf.float32, shape=(None, None, 1), name='x_ph')
    a_ph = tf.placeholder(dtype=tf.int32, shape=(None, None), name='a_ph')
    # adv_ph, ret_ph, logp_old_ph, rew_ph = core.placeholders(None, None, None, 1)
    adv_ph = tf.placeholder(dtype=tf.float32,
                            shape=(None, None),
                            name='adv_ph')
    ret_ph = tf.placeholder(dtype=tf.float32,
                            shape=(None, None),
                            name='ret_ph')
    logp_old_ph = tf.placeholder(dtype=tf.float32,
                                 shape=(None, None),
                                 name='logp_old_ph')
    rew_ph = tf.placeholder(dtype=tf.float32,
                            shape=(None, None, 1),
                            name='rew_ph')
    pi_state_ph = tf.placeholder(dtype=tf.float32,
                                 shape=(None, NUM_GRU_UNITS),
                                 name='pi_state_ph')
    v_state_ph = tf.placeholder(dtype=tf.float32,
                                shape=(None, NUM_GRU_UNITS),
                                name='v_state_ph')

    # Initialize rnn states for pi and v

    # Main outputs from computation graph
    pi, logp, logp_pi, v, new_pi_state, new_v_state = actor_critic(
        x_ph,
        a_ph,
        rew_ph,
        pi_state_ph,
        v_state_ph,
        NUM_GRU_UNITS,
        action_space=env.action_space)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, rew_ph]

    # Every step, get: action, value, and logprob and reward
    get_action_ops = [pi, v, logp_pi, new_pi_state, new_v_state]

    # Experience buffer
    steps_per_epoch = trials_per_epoch * steps_per_trial
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(
        learning_rate=pi_lr).minimize(pi_loss - 0.01 * approx_ent)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # tf.reset_default_graph()
    # restore_tf_graph(sess, '..//data//ppo//ppo_s0//simple_save')

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        inputs[pi_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS))
        inputs[v_state_ph] = np.zeros((trials_per_epoch, NUM_GRU_UNITS))
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)
        print(pi_l_old, v_l_old)
        # Training
        for i in range(train_pi_iters):
            # print(f'pi:{i}')
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            # print(sess.run(pi_loss, feed_dict=inputs))
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            # print(f'v:{_}')
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        import datetime
        print(f'finish one batch training at {datetime.datetime.now()}')
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch

    for epoch in range(epochs):
        for trial in range(trials_per_epoch):
            print(f'trial: {trial}')
            old_a = np.array([0]).reshape(1, 1)
            old_r = np.array([0]).reshape((1, 1, 1))
            means = env.sample_tasks(1)[0]
            action_dict = defaultdict(int)
            for i in range(env.action_space.n):
                action_dict[i] = 0

            env.reset_task_simple(means)
            task_avg = 0.0
            pi_state_t = np.zeros((1, NUM_GRU_UNITS))
            v_state_t = np.zeros((1, NUM_GRU_UNITS))
            for step in range(steps_per_trial):
                a, v_t, logp_t, pi_state_t, v_state_t = sess.run(
                    get_action_ops,
                    feed_dict={
                        x_ph: o.reshape(1, 1, -1),
                        a_ph: old_a,
                        rew_ph: old_r,
                        pi_state_ph: pi_state_t,
                        v_state_ph: v_state_t
                    })
                # save and log
                buf.store(o, a, r, v_t, logp_t)
                logger.store(VVals=v_t)

                try:
                    o, r, d, _ = env.step(a[0][0])
                except:
                    print(a)
                    raise AssertionError

                action_dict[a[0][0]] += 1

                old_a = np.array(a).reshape(1, 1)
                old_r = np.array([r]).reshape(1, 1, 1)
                ep_ret += r
                task_avg += r
                ep_len += 1

                terminal = d or (ep_len == max_ep_len)
                if terminal or (step == local_steps_per_epoch - 1):
                    if not (terminal):
                        print(
                            'Warning: trajectory cut off by epoch at %d steps.'
                            % ep_len)
                    # if trajectory didn't reach terminal state, bootstrap value target
                    last_val = r if d else sess.run(
                        v, feed_dict={x_ph: o.reshape(1, -1)})
                    buf.finish_path(last_val)
                    if terminal:
                        # only save EpRet / EpLen if trajectory finished
                        logger.store(EpRet=ep_ret, EpLen=ep_len)

                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

            # logger.log_tabular('Epoch', epoch)
            # logger.log_tabular('EpRet', with_min_and_max=True)
            # logger.log_tabular('Means', means)
            # logger.dump_tabular()
            print(f'avg in trial {trial}: {task_avg / steps_per_trial}')
            print(f'Means in trial {trial}: {means}')

            print(action_dict)

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)
            # saved_path = saver.save(sess, f"/tmp/model_epoch{epoch}.ckpt")
            # print(f'Model saved in {saved_path}')
        # Perform PPO update!

        update()
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Пример #8
0
def td3( env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0, steps_per_epoch=5000,
    epochs=100, replay_size=int(1e6), gamma=.99, polyak=.995, pi_lr=1e-3, q_lr=1e-3,
    batch_size=100, start_steps=10000, act_noise=.1, target_noise=.2, noise_clip=.5,
    policy_delay=2, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):

    logger = EpochLogger( **logger_kwargs)
    logger.save_config( locals())

    tf.set_random_seed(seed)
    np.random.seed( seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping
    act_limit = env.action_space.high[0]

    # Share action sapce info with A2C
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder( name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder( name='a_ph', shape=(None, act_dim), dtype=tf.float32), \
        tf.placeholder( name='x2_ph', shape=(None, obs_dim), dtype=tf.float32),\
        tf.placeholder( name='r_ph', shape=(None), dtype=tf.float32), \
        tf.placeholder( name='d_ph', shape=(None), dtype=tf.float32)

    # Actor policy and value
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic( x_ph, a_ph, **ac_kwargs)

    # Tghis seems a bit memory inneficient: what happens to the q values created
    # along with the target policy ? the poluicy created along the q targets ?
    # Not referenced, but still declared right, a the cost of GPU memory
    # Target policy
    with tf.variable_scope( 'target'):
        pi_targ, _, _, _  = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope( 'target', reuse=True):
        epsilon = tf.random_normal( tf.shape( pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value( epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value( a2, -act_limit, act_limit)

        # Target Q-Values using actions from target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    replaybuffer = ReplayBuffer( obs_dim, act_dim, size=replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple( count_vars( scope) for scope in ['main/pi',
        'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    # CLiped Double Q-Learning with Bellman backup
    min_q_targ = tf.minimum( q1_targ, q2_targ)
    backup = tf.stop_gradient( r_ph + gamma * (1 -d_ph) * min_q_targ)

    # TD3 Losses
    pi_loss = - tf.reduce_mean( q1_pi)
    q1_loss = tf.reduce_mean( (q1 - backup)**2)
    q2_loss = tf.reduce_mean( (q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Trainin ops
    pi_train = tf.train.AdamOptimizer(pi_lr).minimize( pi_loss)
    q_train = tf.train.AdamOptimizer(q_lr).minimize( q_loss)

    # Polyak wise target update
    target_update = tf.group( [ tf.assign( v_targ, polyak * v_targ + (1-polyak)
        * v_main) for v_main, v_targ in zip( get_vars('main'), get_vars('target'))])

    target_init = tf.group( [ tf.assign( v_targ, v_main) for v_targ, v_main in
        zip( get_vars('target'), get_vars('main'))])

    sess = tf.Session()
    sess.run( tf.global_variables_initializer())
    sess.run( target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action( o, noise_scale):
        a = sess.run( pi, feed_dict={ x_ph: o.reshape(1,-1)})
        a += noise_scale * np.random.randn( act_dim)

        return np.clip( a, -act_limit, act_limit)

    def test_agent( n=10):
        for j in range( n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0 ,0
            while not ( d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step( get_action( o, 0))
                ep_ret += r
                ep_len += 1

            logger.store( TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0 , 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range( total_steps):
        if t > start_steps:
            a = get_action( o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step( a)
        ep_ret += r
        ep_len += 1

        d = False or ( ep_len == max_ep_len)

        o2 = np.squeeze( o2)

        # print( "O2: ", o2)
        replaybuffer.store( o, a, r, o2, d)

        o = o2

        if d or ( ep_len == max_ep_len):
            for j in range( ep_len):
                batch = replaybuffer.sample_batch( batch_size)
                feed_dict = {x_ph: batch['obs1'],
                                 x2_ph: batch['obs2'],
                                 a_ph: batch['acts'],
                                 r_ph: batch['rews'],
                                 d_ph: batch['done']
                                }
                q_step_ops = [q_loss, q1, q2, q_train]
                outs = sess.run( q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                if j % policy_delay == 0:
                    outs = sess.run( [pi_loss, pi_train, target_update], feed_dict)
                    logger.store( LossPi=outs[0])

            logger.store( EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or ( epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
Пример #9
0
def trpo(env_fn,
         actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         gamma=.99,
         delta=.01,
         vf_lr=1e-3,
         train_v_iters=80,
         damping_coeff=.1,
         cg_iters=10,
         backtrack_iters=10,
         backtrack_coeff=.8,
         lam=.97,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=10,
         algo="trpo"):

    # LOgger tools
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Seed inits
    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Environment recreation
    env = env_fn()

    # Getting obs dims
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    ac_kwargs['action_space'] = env.action_space

    # Placeholders
    x_ph, a_ph = tf.placeholder( name="x_ph", shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder( name="a_ph", shape=[None, act_dim], dtype=tf.float32)
    adv_ph, ret_ph, logp_old_ph = tf.placeholder( name="adv_ph", shape=[None], dtype=tf.float32), \
        tf.placeholder( name="ret_ph", shape=[None], dtype=tf.float32), \
        tf.placeholder( name="logp_old_ph", shape=[None], dtype=tf.float32)

    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(
        x_ph, a_ph, **ac_kwargs)

    def keys_as_sorted_list(dict):
        return sorted(list(dict.keys()))

    def values_as_sorted_list(dict):
        return [dict[k] for k in keys_as_sorted_list(dict)]

    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph
               ] + values_as_sorted_list(info_phs)

    get_action_ops = [pi, v, logp_pi] + values_as_sorted_list(info)

    # Experience buffer init
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {k: v.shape.as_list()[1:] for k, v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes,
                    gamma, lam)

    # Count variables
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ["pi", "v"])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # TRPO Losses
    ratio = tf.exp(logp - logp_old_ph)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # CG solver requirements
    pi_params = get_vars("pi")

    # Some helpers
    def flat_concat(xs):
        return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0)

    def flat_grad(f, params):
        return flat_concat(tf.gradients(xs=params, ys=f))

    def hessian_vector_product(f, params):
        g = flat_grad(f, params)
        x = tf.placeholder(tf.float32, shape=g.shape)

        return x, flat_grad(tf.reduce_sum(g * x), params)

    def assign_params_from_flat(x, params):
        flat_size = lambda p: int(np.prod(p.shape.as_list())
                                  )  # the 'int' is important for scalars
        splits = tf.split(x, [flat_size(p) for p in params])
        new_params = [
            tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)
        ]

        return tf.group(
            [tf.assign(p, p_new) for p, p_new in zip(params, new_params)])

    gradient = flat_grad(pi_loss, pi_params)
    v_ph, hvp = hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = flat_concat(pi_params)
    set_pi_params = assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        x = np.zeros_like(b)
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)

        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        # Always so elegant haha
        inputs = {k: v for k, v in zip(all_phs, buf.get())}

        def mpi_avg(x):
            """Average a scalar or vector over MPI processes."""
            return mpi_sum(x) / num_procs()

        Hx = lambda x: mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss],
                                        feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2 * delta / (np.dot(x, Hx(x)) + EPS))  # OK
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params,
                     feed_dict={v_ph: old_params - alpha * x * step})

            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo == 'npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)
        elif algo == "trpo":
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log(
                        'Accepting new params at step %d of line search.' % j)
                    logger.store(BacktrackIters=j)
                    break

                if j == backtrack_iters - 1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
            v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops,
                                  feed_dict={x_ph: o.reshape(1, -1)})
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[
                1], agent_outs[2], agent_outs[3:]

            # Save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not terminal:
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)

                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo == 'trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
Пример #10
0
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.5,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=10000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    env = env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x,
            hidden_sizes=(32, ),
            activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x,
                               units=hidden_sizes[-1],
                               activation=output_activation)

    # Why isn't the k used here ?
    def gaussian_likelihood(x, mu, log_std):
        EPS = 1e-8
        pre_sum = -0.5 * (
            ((x - mu) /
             (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1)

    def clip_but_pass_gradient(x, l=-1., u=1.):
        clip_up = tf.cast(x > u, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
        act_dim = a.shape.as_list()[-1]
        net = mlp(x, list(hidden_sizes), activation, activation)
        mu = tf.layers.dense(net, act_dim, activation=output_activation)
        """
        Because algorithm maximizes trade-off of reward and entropy,
        entropy must be unique to state---and therefore log_stds need
        to be a neural network output instead of a shared-across-states
        learnable parameter vector. But for deep Relu and other nets,
        simply sticking an activationless dense layer at the end would
        be quite bad---at the beginning of training, a randomly initialized
        net could produce extremely large values for the log_stds, which
        would result in some actions being either entirely deterministic
        or too random to come back to earth. Either of these introduces
        numerical instability which could break the algorithm. To
        protect against that, we'll constrain the output range of the
        log_stds, to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is
        slightly different from the trick used by the original authors of
        SAC---they used tf.clip_by_value instead of squashing and rescaling.
        I prefer this approach because it allows gradient propagation
        through log_std where clipping wouldn't, but I don't know if
        it makes much of a difference.
        """
        log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std +
                                                                     1)

        std = tf.exp(log_std)
        pi = mu + tf.random_normal(tf.shape(mu)) * std
        logp_pi = gaussian_likelihood(pi, mu, log_std)
        return mu, pi, logp_pi

    def apply_squashing_func(mu, pi, logp_pi):
        mu = tf.tanh(mu)
        pi = tf.tanh(pi)
        # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range.
        logp_pi -= tf.reduce_sum(
            tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)
        return mu, pi, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            # mu = mlp( x_ph, hidden_sizes, activation, None)
            # log_std = mlp( mu, (act_dim,), activation, None)
            # # Avoid out of range log_std. Refer to Github for explanation.
            # log_std = LOG_STD_MIN + .5 * ( LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)
            #
            # mu = mlp( mu, (act_dim,), activation, None)
            #
            # pi = mu + tf.exp( log_std) * tf.random_normal( tf.shape(mu))
            # logp_pi = gaussian_likelihood( pi, mu, log_std)
            #
            # # Follow SpinningUp Implementation
            # mu = tf.tanh(mu)
            # pi = tf.tanh(pi)
            #
            # def clip_but_pass_gradient(x, l=-1., u=1.):
            #     clip_up = tf.cast(x > u, tf.float32)
            #     clip_low = tf.cast(x < l, tf.float32)
            #     # What is this supposed to mean even ?
            #     return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low)
            #
            # # Shameless copy paste
            # logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)

            # Not working version bak
            # squashed_pi = tf.tanh( pi)
            #
            # # To be sure
            # pi = tf.clip_by_value( pi, -act_limit, act_limit)
            #
            # # Must take in the squased polic
            # log_squash_pi = gaussian_likelihood( squashed_pi, mu, log_std)

            # Shamefull plug
            mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes,
                                                  tf.tanh, None)
            mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1, ), activation, None),
                               axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1, ), activation, None),
                            axis=-1)

        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1, ), activation, None),
                               axis=-1)

        with tf.variable_scope("v"):
            # v = mlp( x_ph, hidden_sizes+(1,), activation, None)
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1, ), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):

        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1, ), activation,
                                    None),
                                axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, \t total: %d\n'
        % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(
        v_backup_prestop)

    # Q Loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V Loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Pol loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5
    # Target update op
    init_v_target = tf.group([
        tf.assign(v_target, v_main) for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # print( o.reshape(-1, 1))
            # input()
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)}))
                ep_ret += r
                ep_len += 1

            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    #Buffer init
    buffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        o2, r, d, _ = env.step(o)

        d = False or (ep_len == max_ep_len)

        # Still needed ?
        o2 = np.squeeze(o2)

        buffer.store(o, a, r, o2, d)

        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }
                # DEBUG:
                # v_backup_prestop_out = sess.run( v_backup_prestop, feed_dict=feed_dict)
                # print( v_backup_prestop_out.shape)
                # print( v_backup_prestop_out)
                # input()

                # Value gradient steps
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q Gradient steps
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient steps
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()