Example #1
def trpo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
         steps_per_epoch=4000, epochs=50, gamma=0.99, delta=0.01, vf_lr=1e-3,
         train_v_iters=80, damping_coeff=0.1, cg_iters=10, backtrack_iters=10, 
         backtrack_coeff=0.8, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), 
         save_freq=10, algo='trpo'):
    """
    Trust Region Policy Optimization 

    (with support for Natural Policy Gradient)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ============  ================  ========================================
            Symbol        Shape             Description
            ============  ================  ========================================
            ``pi``        (batch, act_dim)  | Samples actions from policy given 
                                            | states.
            ``logp``      (batch,)          | Gives log probability, according to
                                            | the policy, of taking actions ``a_ph``
                                            | in states ``x_ph``.
            ``logp_pi``   (batch,)          | Gives log probability, according to
                                            | the policy, of the action sampled by
                                            | ``pi``.
            ``info``      N/A               | A dict of any intermediate quantities
                                            | (from calculating the policy or log 
                                            | probabilities) which are needed for
                                            | analytically computing KL divergence.
                                            | (eg sufficient statistics of the
                                            | distributions)
            ``info_phs``  N/A               | A dict of placeholders for old values
                                            | of the entries in ``info``.
            ``d_kl``      ()                | A symbol for computing the mean KL
                                            | divergence between the current policy
                                            | (``pi``) and the old policy (as 
                                            | specified by the inputs to 
                                            | ``info_phs``) over the batch of 
                                            | states given in ``x_ph``.
            ``v``         (batch,)          | Gives the value estimate for states
                                            | in ``x_ph``. (Critical: make sure 
                                            | to flatten this!)
            ============  ================  ========================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TRPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        delta (float): KL-divergence limit for TRPO / NPG update. 
            (Should be small for stability. Values like 0.01, 0.05.)

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        damping_coeff (float): Artifact for numerical stability, should be 
            smallish. Adjusts Hessian-vector product calculation:
            
            .. math:: Hv \\rightarrow (\\alpha I + H)v

            where :math:`\\alpha` is the damping coefficient. 
            Probably don't play with this hyperparameter.

        cg_iters (int): Number of iterations of conjugate gradient to perform. 
            Increasing this will lead to a more accurate approximation
            to :math:`H^{-1} g`, and possibly slightly-improved performance,
            but at the cost of slowing things down. 

            Also probably don't play with this hyperparameter.

        backtrack_iters (int): Maximum number of steps allowed in the 
            backtracking line search. Since the line search usually doesn't 
            backtrack, and usually only steps back once when it does, this
            hyperparameter doesn't often matter.

        backtrack_coeff (float): How far back to step during backtracking line
            search. (Always between 0 and 1, usually above 0.5.)

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        algo: Either 'trpo' or 'npg': this code supports both, since they are 
            almost the same.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape
    
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph, plus placeholders for old pdist (for KL)
    pi, logp, logp_pi, info, info_phs, d_kl, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] + core.values_as_sorted_list(info_phs)

    # Every step, get: action, value, logprob, & info for pdist (for computing kl div)
    get_action_ops = [pi, v, logp_pi] + core.values_as_sorted_list(info)

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    info_shapes = {k: v.shape.as_list()[1:] for k,v in info_phs.items()}
    buf = GAEBuffer(obs_dim, act_dim, local_steps_per_epoch, info_shapes, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts)

    # TRPO losses
    ratio = tf.exp(logp - logp_old_ph)          # pi(a|s) / pi_old(a|s)
    pi_loss = -tf.reduce_mean(ratio * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Optimizer for value function
    train_vf = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # Symbols needed for CG solver
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph, hvp = core.hessian_vector_product(d_kl, pi_params)
    if damping_coeff > 0:
        hvp += damping_coeff * v_ph

    # Symbols for getting and setting params
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def cg(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        r = b.copy() # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0. Change if doing warm start.
        p = r.copy()
        r_dot_old = np.dot(r,r)
        for _ in range(cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + EPS)
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r,r)
            p = r + (r_dot_new / r_dot_old) * p
            r_dot_old = r_dot_new
        return x

    def update():
        # Prepare hessian func, gradient eval
        inputs = {k:v for k,v in zip(all_phs, buf.get())}
        Hx = lambda x : mpi_avg(sess.run(hvp, feed_dict={**inputs, v_ph: x}))
        g, pi_l_old, v_l_old = sess.run([gradient, pi_loss, v_loss], feed_dict=inputs)
        g, pi_l_old = mpi_avg(g), mpi_avg(pi_l_old)

        # Core calculations for TRPO or NPG
        x = cg(Hx, g)
        alpha = np.sqrt(2*delta/(np.dot(x, Hx(x))+EPS))
        old_params = sess.run(get_pi_params)

        def set_and_eval(step):
            sess.run(set_pi_params, feed_dict={v_ph: old_params - alpha * x * step})
            return mpi_avg(sess.run([d_kl, pi_loss], feed_dict=inputs))

        if algo=='npg':
            # npg has no backtracking or hard kl constraint enforcement
            kl, pi_l_new = set_and_eval(step=1.)

        elif algo=='trpo':
            # trpo augments npg with backtracking line search, hard kl
            for j in range(backtrack_iters):
                kl, pi_l_new = set_and_eval(step=backtrack_coeff**j)
                if kl <= delta and pi_l_new <= pi_l_old:
                    logger.log('Accepting new params at step %d of line search.'%j)
                    logger.store(BacktrackIters=j)
                    break

                if j==backtrack_iters-1:
                    logger.log('Line search failed! Keeping old params.')
                    logger.store(BacktrackIters=j)
                    kl, pi_l_new = set_and_eval(step=0.)

        # Value function updates
        for _ in range(train_v_iters):
            sess.run(train_vf, feed_dict=inputs)
        v_l_new = sess.run(v_loss, feed_dict=inputs)

        # Log changes from update
        logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            agent_outs = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)})
            a, v_t, logp_t, info_t = agent_outs[0][0], agent_outs[1], agent_outs[2], agent_outs[3:]

            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v_t, logp_t, info_t)
            logger.store(VVals=v_t)

            # Update obs (critical!)
            o = o2

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t==local_steps_per_epoch-1):
                if not(terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.'%ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = 0 if d else sess.run(v, feed_dict={x_ph: o.reshape(1,-1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs-1):
            logger.save_state({'env': env}, None)

        # Perform TRPO or NPG update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('KL', average_only=True)
        if algo=='trpo':
            logger.log_tabular('BacktrackIters', average_only=True)
        logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
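
A minimal launch sketch for the TRPO listing above, assuming the usual Spinning Up layout where core, EpochLogger, and the MPI helpers are importable and gym supplies the environment; the experiment name, environment, and hyperparameter values are illustrative, not part of the original example:

if __name__ == '__main__':
    import gym

    # Hypothetical run configuration; adjust the output directory and
    # hyperparameters to taste.
    logger_kwargs = dict(output_dir='data/trpo_demo', exp_name='trpo_demo')
    trpo(lambda: gym.make('CartPole-v1'),
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(hidden_sizes=(64, 64)),
         seed=0, steps_per_epoch=4000, epochs=50,
         logger_kwargs=logger_kwargs, algo='trpo')
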
Example #2
def sac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()):

    # env params
    thresh          = rl_params['thresh']

    # control params
    seed            = rl_params['seed']
    epochs          = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size     = rl_params['replay_size']
    batch_size      = rl_params['batch_size']
    start_steps     = rl_params['start_steps']
    max_ep_len      = rl_params['max_ep_len']
    save_freq       = rl_params['save_freq']
    render          = rl_params['render']

    # rl params
    gamma           = rl_params['gamma']
    polyak          = rl_params['polyak']
    lr              = rl_params['lr']
    grad_clip_val   = rl_params['grad_clip_val']

    # entropy params
    alpha           = rl_params['alpha']
    target_entropy  = rl_params['target_entropy']

    logger = EpochLogger(**logger_kwargs)
    if save_freq is not None:
        logger.save_config(locals())

    train_env, test_env = env_fn(), env_fn()
    obs = train_env.observation_space
    act = train_env.action_space

    tf.set_random_seed(seed)
    np.random.seed(seed)
    train_env.seed(seed)
    train_env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    # get the size after resize
    obs_dim = network_params['input_dims']
    act_dim = act.shape[0]

    # init a state buffer for storing last m states
    train_state_buffer = StateBuffer(m=obs_dim[2])
    test_state_buffer  = StateBuffer(m=obs_dim[2])

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1_a, q2_a = build_models(x_ph, a_ph, act, act_dim, network_params)

    with tf.variable_scope('main', reuse=True):
        # compose q with pi, for pi-learning
        _, _, _, q1_pi, q2_pi = build_models(x_ph, pi, act, act_dim, network_params)

        # get actions and log probs of actions for next states, for Q-learning
        _, pi_next, logp_pi_next, _, _ = build_models(x2_ph, a_ph, act, act_dim, network_params)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, q1_pi_targ, q2_pi_targ  = build_models(x2_ph, pi_next, act, act_dim, network_params)

    # alpha Params
    if target_entropy == 'auto':
        target_entropy = tf.cast(-act_dim, tf.float32)
    else:
        target_entropy = tf.cast(target_entropy, tf.float32)

    log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)

    if alpha == 'auto': # auto tune alpha
        alpha = tf.exp(log_alpha)
    else: # fixed alpha
        alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha)

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in ['log_alpha',
                                                       'main/pi',
                                                       'main/q1',
                                                       'main/q2',
                                                       'main'])
    print("""\nNumber of other parameters:
             alpha: %d,
             pi: %d,
             q1: %d,
             q2: %d,
             total: %d\n"""%var_counts)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)
    min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*(min_q_pi_targ - alpha*logp_pi_next))

    # critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
    value_loss = q1_loss + q2_loss

    # Soft actor losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi)

    # alpha loss for temperature parameter
    alpha_backup = tf.stop_gradient(logp_pi + target_entropy)
    alpha_loss  = -tf.reduce_mean(log_alpha * alpha_backup)

    # Policy train op
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    if grad_clip_val is not None:
        gvs = pi_optimizer.compute_gradients(pi_loss,  var_list=get_vars('main/pi'))
        capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs]
        train_pi_op = pi_optimizer.apply_gradients(capped_gvs)
    else:
        train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_pi_op]):
        if grad_clip_val is not None:
            gvs = value_optimizer.compute_gradients(value_loss, var_list=get_vars('main/q'))
            capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var) for grad, var in gvs]
            train_value_op = value_optimizer.apply_gradients(capped_gvs)
        else:
            train_value_op = value_optimizer.minimize(value_loss, var_list=get_vars('main/q'))

    alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_value_op]):
        train_alpha_op = alpha_optimizer.minimize(alpha_loss, var_list=get_vars('log_alpha'))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    step_ops = [pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy, alpha_loss, alpha,
                train_pi_op, train_value_op, train_alpha_op, target_update]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    if save_freq is not None:
        logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph},
                                    outputs={'mu': mu, 'pi': pi, 'q1_a': q1_a, 'q2_a': q2_a})

    def get_action(state, deterministic=False):
        state = state.astype('float32') / 255.
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: [state]})[0]

    def reset(env, state_buffer):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        o = process_image_observation(o, obs_dim, thresh)
        state = state_buffer.init_state(init_obs=o)
        return o, r, d, ep_ret, ep_len, state

    def test_agent(n=10, render=True):
        for j in range(n):
            o, r, d, ep_ret, ep_len, test_state = reset(test_env, test_state_buffer)

            if render: test_env.render()

            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(test_state, True))
                o = process_image_observation(o, obs_dim, thresh)
                test_state = test_state_buffer.append_state(o)
                ep_ret += r
                ep_len += 1

                if render: test_env.render()

            if render: test_env.close()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer)
    total_steps = steps_per_epoch * epochs
    save_iter = 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(state)
        else:
            a = train_env.action_space.sample()

        # Step the env
        o2, r, d, _ = train_env.step(a)
        o2        = process_image_observation(o2, obs_dim, thresh)

        next_state = train_state_buffer.append_state(o2)

        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(state, a, r, next_state, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        state = next_state

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                            }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], TargEntropy=outs[6],
                             LossAlpha=outs[7], Alpha=outs[8])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len, state = reset(train_env, train_state_buffer)


        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if save_freq is not None:
                if (epoch % save_freq == 0) or (epoch == epochs-1):
                    logger.save_state({'env': train_env}, itr=save_iter)
                    save_iter+=1

            # Test the performance of the deterministic version of the agent.
            test_agent(n=2, render=render)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', average_only=True)
            logger.log_tabular('TargEntropy', average_only=True)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
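
A hedged configuration sketch for the image-based SAC variant above. Every key below is one the function actually reads from rl_params or network_params, but the concrete values, the CarRacing environment, and the logger settings are placeholders; build_models, process_image_observation, and StateBuffer are assumed to come from the same codebase:

# Illustrative parameter dictionaries; the values are placeholders.
network_params = dict(input_dims=[96, 96, 4])   # (H, W, m) -- m stacked frames
rl_params = dict(
    thresh=False, seed=0, epochs=100, steps_per_epoch=5000,
    replay_size=int(1e5), batch_size=64, start_steps=1000,
    max_ep_len=1000, save_freq=5, render=False,
    gamma=0.99, polyak=0.995, lr=3e-4, grad_clip_val=None,
    alpha='auto', target_entropy='auto')

sac(lambda: gym.make('CarRacing-v0'),
    logger_kwargs=dict(output_dir='data/sac_pixels', exp_name='sac_pixels'),
    network_params=network_params, rl_params=rl_params)
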
Example #3
def bcq_learn(env_set="Hopper-v2", seed=0, buffer_type="FinalSigma0.5_env_0_1000K",
              batch_size=100, eval_freq=int(1e2), max_timesteps=float(2e6), lr=1e-3,
              save_freq=int(1e2), logger_kwargs=dict()):

    eval_freq = save_freq if eval_freq=="save_freq" else eval_freq

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)

    """set up logger"""
    global logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    file_name = "BCQbatchpolicy_%s_%s" % (env_set, seed)

    print("---------------------------------------")
    print
    ("Task: " + file_name)
    print("Evaluate Policy every", eval_freq * batch_size / 1e6,
          'epoches; Total', max_timesteps * batch_size / 1e6, 'epoches')
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_set)
    test_env = gym.make(env_set)

    # Set seeds
    env.seed(seed)
    test_env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.action_space.np_random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = BCQ_batchpolicy.BCQ(state_dim, action_dim, max_action, lr=lr)

    # Load buffer
    if 'sac' in buffer_type:
        replay_buffer = utils.BEAR_ReplayBuffer()
        desire_stop_dict = {'Hopper-v2': 1000, 'Walker2d-v2': 500, 'HalfCheetah-v2': 4000, 'Ant-v2': 750}
        buffer_name = buffer_type.replace('env', env_set).replace('crt', str(desire_stop_dict[env_set]))
        replay_buffer.load(buffer_name)
        buffer_name += '_1000K'
        #setting_name = setting_name.replace('crt', str(desire_stop_dict[env_set]))
    elif 'Final' in buffer_type or 'sigma' in buffer_type:
        replay_buffer = utils.ReplayBuffer()
        buffer_name = buffer_type.replace('env', env_set)
        replay_buffer.load(buffer_name)
    elif 'optimal' in buffer_type:
        buffer_name = buffer_type.replace('env', env_set)
        replay_buffer = utils.ReplayBuffer()
        replay_buffer.load(buffer_name)
    else:
        raise FileNotFoundError('! Unknown type of dataset %s' % buffer_type)


    training_iters, epoch = 0, 0
    while training_iters < max_timesteps:
        epoch += eval_freq * batch_size / 1e6
        bcq_state_dict = policy.train(replay_buffer, iterations=int(eval_freq), batch_size=batch_size, logger=logger)

        if training_iters % save_freq == 0:
            logger.save_state(bcq_state_dict, training_iters)

        avgtest_reward = evaluate_policy(policy, test_env)
        training_iters += eval_freq

        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('AverageTestEpRet', avgtest_reward)
        logger.log_tabular('TotalSteps', training_iters)
        logger.log_tabular('QLoss', average_only=True)
        logger.log_tabular('Q1Vals', with_min_and_max=True)
        logger.log_tabular('Q2Vals', with_min_and_max=True)
        logger.log_tabular('ActLoss', with_min_and_max=True)
        logger.dump_tabular()
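
A possible way to launch this offline BCQ training loop, assuming BCQ_batchpolicy, utils, and the logger utilities from the same repository are importable and that a replay buffer matching buffer_type has already been saved to disk; all values shown are illustrative:

if __name__ == '__main__':
    # Hypothetical call; buffer_type must name an existing saved buffer.
    bcq_learn(env_set="Hopper-v2", seed=0,
              buffer_type="FinalSigma0.5_env_0_1000K",
              batch_size=100, eval_freq=int(1e2), max_timesteps=float(2e6),
              lr=1e-3, logger_kwargs=dict(exp_name='bcq_hopper'))
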
Example #4
def ppo(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Proximal Policy Optimization (by clipping), 

    with early stopping based on approximate KL

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================


        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.) Typically
            denoted by :math:`\epsilon`. 

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        def update():

            # Set up function for computing PPO policy loss
            def compute_loss_pi(data):
                obs, act, adv, logp_old = data['obs'], data['act'], data[
                    'adv'], data['logp']

                # Policy loss
                pi, logp = ac.pi(obs, act)
                ratio = torch.exp(logp - logp_old)
                clip_adv = torch.clamp(ratio, 1 - clip_ratio,
                                       1 + clip_ratio) * adv
                loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

                # Useful extra info
                approx_kl = (logp_old - logp).mean().item()
                ent = pi.entropy().mean().item()
                clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
                clipfrac = torch.as_tensor(clipped,
                                           dtype=torch.float32).mean().item()
                pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

                return loss_pi, pi_info

            # Set up function for computing value loss
            def compute_loss_v(data):
                obs, ret = data['obs'], data['ret']
                return ((ac.v(obs) - ret)**2).mean()

            data = buf.get()

            pi_l_old, pi_info_old = compute_loss_pi(data)
            pi_l_old = pi_l_old.item()
            v_l_old = compute_loss_v(data).item()

            # Train policy with multiple steps of gradient descent
            for i in range(train_pi_iters):
                pi_optimizer.zero_grad()
                loss_pi, pi_info = compute_loss_pi(data)
                kl = mpi_avg(pi_info['kl'])
                if kl > 1.5 * target_kl:
                    logger.log(
                        'Early stopping at step %d due to reaching max kl.' %
                        i)
                    break
                loss_pi.backward()
                mpi_avg_grads(ac.pi)  # average grads across MPI processes
                pi_optimizer.step()

            logger.store(StopIter=i)

            # Value function learning
            for i in range(train_v_iters):
                vf_optimizer.zero_grad()
                loss_v = compute_loss_v(data)
                loss_v.backward()
                mpi_avg_grads(ac.v)  # average grads across MPI processes
                vf_optimizer.step()

            # Log changes from update
            kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
            logger.store(LossPi=pi_l_old,
                         LossV=v_l_old,
                         KL=kl,
                         Entropy=ent,
                         ClipFrac=cf,
                         DeltaLossPi=(loss_pi.item() - pi_l_old),
                         DeltaLossV=(loss_v.item() - v_l_old))

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
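
For reference, a minimal sketch of how this PyTorch PPO is typically launched, assuming core.MLPActorCritic and the MPI utilities are importable from the same package and gym provides the environment; the environment name and settings below are placeholders:

if __name__ == '__main__':
    import gym

    # Single-process example; mpi_fork(n) could be used to spread the run
    # across n processes before calling ppo().
    ppo(lambda: gym.make('HalfCheetah-v2'),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(64, 64)),
        gamma=0.99, seed=0, steps_per_epoch=4000, epochs=50,
        logger_kwargs=dict(output_dir='data/ppo_demo', exp_name='ppo_demo'))
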
Example #5
def vpg_linesearch_penalty(env_fn,
                           actor_critic=core.mlp_actor_critic,
                           ac_kwargs=dict(),
                           seed=0,
                           steps_per_epoch=4000,
                           epochs=50,
                           gamma=0.99,
                           pi_lr=3e-3,
                           vf_lr=1e-3,
                           train_v_iters=80,
                           lam=0.97,
                           max_ep_len=1000,
                           backtrack_iters=500,
                           logger_kwargs=dict(),
                           save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    ##train_pi is not used
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    ##op for taking a gradient step
    ## use AdamOptimizer as it adjusts the learning rate
    opt = tf.train.AdamOptimizer(learning_rate=pi_lr)
    pi_name = "pi"
    scope_variable = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope=pi_name)
    grads_and_vars = opt.compute_gradients(pi_loss, scope_variable)
    pi_grad_step_op = opt.apply_gradients(grads_and_vars)
    pi_grad_norm = tf.global_norm([item[0] for item in grads_and_vars])

    #policy params - need to get and save policy params
    pi_params = core.get_vars('pi')
    gradient = core.flat_grad(pi_loss, pi_params)
    v_ph = tf.placeholder(tf.float32, shape=gradient.shape)
    get_pi_params = core.flat_concat(pi_params)
    set_pi_params = core.assign_params_from_flat(v_ph, pi_params)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def batch_run(inputs):
        penalty = 0
        aa, v_t, logp_t = sess.run(get_action_ops, feed_dict=inputs)
        for aindex in range(len(aa)):
            o, r, d, _ = env.step(aa[aindex])
            penalty = penalty + env.penalty_sa(o, aa[aindex])
            if d:
                o = env.reset()
                break
        #print("batch_run, size {}, Penaty {} ".format(aindex,env.penalty_sa(o,aa[aindex])))
        return penalty

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Policy gradient step
        #sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        ##linesearch - backtracking iteration using penalty
        save_params = sess.run(get_pi_params)
        for j in range(backtrack_iters):
            old_penalty = batch_run(inputs)
            pi_l_old = sess.run([pi_loss], feed_dict=inputs)
            sess.run(pi_grad_step_op, feed_dict=inputs)
            new_penalty = batch_run(inputs)
            pi_l_new = sess.run([pi_loss], feed_dict=inputs)
            if new_penalty < old_penalty:
                #if pi_l_new <= pi_l_old:
                #print("Accepting params at iter {} pi_l_new={} pi_l_old={}".format(j, pi_l_new, pi_l_old))
                #print("Accepting params at iter {} new_penalty={} old_penalty={}".format(j, new_penalty, old_penalty))
                save_params = sess.run(get_pi_params)
                sess.run(set_pi_params, feed_dict={v_ph: save_params})
                #break

        sess.run(set_pi_params, feed_dict={v_ph: save_params})

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        # Log info about epoch
        #logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', average_only=True)
        #logger.log_tabular('EpLen', average_only=True)
        #logger.log_tabular('VVals', with_min_and_max=True)
        #logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
        #logger.log_tabular('LossPi', average_only=True)
        #logger.log_tabular('LossV', average_only=True)
        #logger.log_tabular('DeltaLossPi', average_only=True)
        #logger.log_tabular('DeltaLossV', average_only=True)
        #logger.log_tabular('Entropy', average_only=True)
        #logger.log_tabular('KL', average_only=True)
        #logger.log_tabular('Time', time.time()-start_time)
        logger.dump_tabular()
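
A hedged launch sketch for the penalty line-search VPG above. Note that the environment returned by env_fn must expose the custom penalty_sa(obs, act) method used in batch_run, so a plain Gym environment will not work unmodified; PenaltyEnv below is a hypothetical stand-in:

# Illustrative only: PenaltyEnv is assumed to implement penalty_sa(obs, act)
# in addition to the standard Gym API.
vpg_linesearch_penalty(lambda: PenaltyEnv(),
                       actor_critic=core.mlp_actor_critic,
                       ac_kwargs=dict(hidden_sizes=(64, 64)),
                       steps_per_epoch=4000, epochs=50,
                       backtrack_iters=500,
                       logger_kwargs=dict(exp_name='vpg_penalty'))
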
Example #6
def klucb_bs_sac(env_fn,
                 actor_critic=core.mlp_actor_critic,
                 ac_kwargs=dict(),
                 seed=0,
                 steps_per_epoch=5000,
                 epochs=100,
                 replay_size=int(1e6),
                 gamma=0.99,
                 polyak=0.995,
                 lr=1e-3,
                 alpha=0.2,
                 batch_size=100,
                 start_steps=10000,
                 max_ep_len=1000,
                 logger_kwargs=dict(),
                 save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. 
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(
            x_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # mu = tf.squeeze(mu,axis=1)
    # pi = tf.squeeze(pi,axis=1)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)

    print(mu.shape, pi.shape, logp_pi.shape, q1.shape, q2.shape, q1_pi.shape,
          q2_pi.shape, v.shape,
          tf.expand_dims(d_ph, 1).shape,
          tf.expand_dims(d_ph, 1).shape, v_targ.shape)

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(
        tf.expand_dims(r_ph, 1) + gamma *
        (1 - tf.expand_dims(d_ph, 1)) * v_targ)
    # q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph))
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    value_loss = q1_loss + q2_loss + v_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)
    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v
                          })

    def get_action(o, head, deterministic=False):
        # act_op = mu[:,p_head,:] if deterministic else pi[:,p_head,:]
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0, head, :]

    def test_agent(n, head):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            # head = np.random.randint(num_heads, size = 1)[0]
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, head, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # LM_DSEE acts as a multi-armed bandit over the policy heads: policy.choice()
    # picks the head to act with for the next episode, and policy.getReward(head,
    # ep_ret) feeds the episode return back as the bandit reward.
    policy = LM_DSEE(ac_kwargs['num_heads'],
                     rho=0.49,
                     lower=-50,
                     amplitude=4450)

    policy.startGame()
    returns = []
    choices = []
    head = policy.choice()

    # print ('Total number of heads', ac_kwargs['num_heads'])
    # Main loop: collect experience in env and update/log each epoch
    train_end = start_time
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """

        if t > start_steps:
            a = get_action(o, head)
        else:
            a = env.action_space.sample()
        # a = env.action_space.sample()
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """

            train_start = time.time()
            # print (t//steps_per_epoch, "Playing time", train_start - train_end)
            policy.getReward(head, ep_ret)
            returns.append(ep_ret)
            choices.append(head)
            head = policy.choice()
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }
                # tic = time.time()
                outs = sess.run(step_ops, feed_dict)
                # toc = time.time()
                # print (toc-tic)

                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             LossV=outs[3],
                             Q1Vals=outs[4],
                             Q2Vals=outs[5],
                             VVals=outs[6],
                             LogPi=outs[7])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            train_end = time.time()
            # print (t//steps_per_epoch, "Training time", train_end - train_start)
        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:

            test_start = time.time()
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            head = policy.choice()
            test_agent(n=10, head=head)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            test_end = time.time()
            # print (t//steps_per_epoch, "Testing time", test_end - test_start)
            # print ("*"*30)
    print(returns, choices)
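
# --- Illustrative sketch (not part of the example above) ---
# A minimal NumPy rendering of the regression targets built in the TF graph above:
#   q_backup = r + gamma * (1 - d) * V_targ(s')
#   v_backup = min(Q1(s, a~pi), Q2(s, a~pi)) - alpha * log pi(a|s)
# All values below are random placeholders, purely for illustration.
import numpy as np

gamma, alpha, batch = 0.99, 0.2, 4
r = np.random.randn(batch)
d = np.random.randint(0, 2, size=batch).astype(np.float64)
v_targ = np.random.randn(batch)                  # V_targ(s') from the target net
q1_pi, q2_pi = np.random.randn(batch), np.random.randn(batch)
logp_pi = np.random.randn(batch)

q_backup = r + gamma * (1.0 - d) * v_targ               # target for Q1 and Q2
v_backup = np.minimum(q1_pi, q2_pi) - alpha * logp_pi   # target for V
print(q_backup.shape, v_backup.shape)
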
Example #7
0
def ddpg(env_fn,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         update_after=1000,
         update_every=50,
         act_noise=0.1,
         num_test_episodes=10,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """
    Deep Deterministic Policy Gradient (DDPG)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act``
            method, a ``pi`` module, and a ``q`` module. The ``act`` method and
            ``pi`` module should accept batches of observations as inputs,
            and ``q`` should accept a batch of observations and a batch of
            actions as inputs. When called, these should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each
                                           | observation.
            ``pi``       (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``q``        (batch,)          | Tensor containing the current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object
            you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long
            you wait between updates, the ratio of env steps to gradient steps
            is locked to 1.

        act_noise (float): Stddev for Gaussian exploration noise added to
            policy at training time. (At test time, no noise is added.)

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts)

    # Set up function for computing DDPG Q-loss
    def compute_loss_q(data):
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                          data['obs2'], data['done'])

        q = ac.q(o, a)
        assert len(q.shape) == 1 and q.shape[0] == batch_size, \
            'Expected shape (%d,), got %s' % (batch_size, str(q.shape))

        # Bellman backup for Q function
        with torch.no_grad():
            q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2))
            backup = r + gamma * (1 - d) * q_pi_targ

        # MSE loss against Bellman backup
        loss_q = ((q - backup)**2).mean()

        # Useful info for logging
        loss_info = dict(QVals=q.detach().numpy())

        return loss_q, loss_info

    # Set up function for computing DDPG pi loss
    def compute_loss_pi(data):
        o = data['obs']
        q_pi = ac.q(o, ac.pi(o))
        return -q_pi.mean()

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(ac.q.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q.
        q_optimizer.zero_grad()
        loss_q, loss_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Freeze Q-network so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in ac.q.parameters():
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-network so you can optimize it at the next DDPG step.
        for p in ac.q.parameters():
            p.requires_grad = True

        # Record things
        logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, noise_scale):
        a = ac.act(torch.as_tensor(o, dtype=torch.float32))
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy (with some noise, via act_noise).
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
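
# --- Illustrative sketch (not part of the example above) ---
# The in-place polyak update performed at the end of update(), shown on two
# tiny stand-alone modules:
#   theta_targ <- polyak * theta_targ + (1 - polyak) * theta
import torch
import torch.nn as nn

main_net, targ_net = nn.Linear(3, 2), nn.Linear(3, 2)
targ_net.load_state_dict(main_net.state_dict())   # start from identical weights
polyak = 0.995

with torch.no_grad():
    for p, p_targ in zip(main_net.parameters(), targ_net.parameters()):
        # In-place mul_/add_ avoid allocating new tensors for the target params.
        p_targ.data.mul_(polyak)
        p_targ.data.add_((1 - polyak) * p.data)
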
Example #8
0
def sad(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=100,
        demo_file=''):
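    """
    SAC-style learner seeded from demonstrations.

    Loads a demonstration buffer from ``demo_file``, pre-trains the twin-Q,
    entropy-regularized actor-critic on demonstration batches, and then runs
    standard online SAC updates from the replay buffer. The remaining
    arguments mirror the SAC/DDPG implementations above.
    """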
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)
    ac_targ = deepcopy(ac).to(device)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Expert replay buffer
    demo_buffer = DemoBuffer()
    demo_buffer.load(demo_file)
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = (data['obs'], data['act'], data['rew'],
                          data['obs2'], data['done'])
        # Variable() is a legacy wrapper; in modern PyTorch it is a no-op on tensors.
        o, a, r, o2, d = (Variable(o), Variable(a), Variable(r),
                          Variable(o2), Variable(d))
        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)
        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.cpu().detach().numpy(),
                      Q2Vals=q2.cpu().detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        o = Variable(o)
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)
        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.cpu().detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next SAC step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use in-place operations "mul_", "add_" to update target
                # params, as opposed to "mul" and "add", which would make new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    # all_obs, all_action = torch.tensor(demo_buffer.obs_buf, device=device), demo_buffer.act_buf
    # stack = []
    # global counter
    # counter = 0
    def get_action(o, deterministic=False):
        o = torch.as_tensor(o, device=device, dtype=torch.float32)
        # norm = torch.norm(all_obs - o, dim=1)
        # idx_min = torch.argmin(norm)
        # if norm[idx_min] < 0.3:
        #     action = all_action[idx_min.item()]
        #     stack.append(1)
        # else:
        action = ac.act(o, deterministic).cpu().numpy()
        return action

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            writer.add_scalar(tag='test_reward',
                              scalar_value=ep_ret,
                              global_step=t)
            test_reward_buffer.append((t, ep_ret))
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    # Pre-train the actor-critic on demonstration batches before any online
    # environment interaction.
    for i in range(update_every * 10):
        batch = demo_buffer.sample_batch(batch_size)
        update(batch)
        # if i % update_every == 1:
        #     logger.log_tabular('LossQ', average_only=True)
        #     logger.dump_tabular()
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            writer.add_scalar(tag='train_reward',
                              scalar_value=ep_ret,
                              global_step=t)
            train_reward_buffer.append((t, ep_ret))
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                # batch = core.merge_batch(batch1, batch2)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)
                output_dir = logger_kwargs['output_dir'] + '/'
                test_rewards = np.array(test_reward_buffer)
                train_rewards = np.array(train_reward_buffer)
                train_file_name = os.path.join(
                    output_dir, '{}_train_rewards.npy'.format(seed))
                test_file_name = os.path.join(
                    output_dir, '{}_test_rewards.npy'.format(seed))
                np.save(train_file_name, train_rewards)
                np.save(test_file_name, test_rewards)
            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            # logger.log_tabular('Demo action', len(stack))
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
    writer.close()
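
# --- Illustrative sketch (not part of the example above) ---
# The entropy-regularized, clipped double-Q Bellman backup from
# compute_loss_q(), computed on made-up tensors:
#   backup = r + gamma * (1 - d) * (min(Q1_targ, Q2_targ) - alpha * log pi(a'|s'))
import torch

gamma, alpha, batch = 0.99, 0.2, 4
r = torch.randn(batch)
d = torch.randint(0, 2, (batch,)).float()
q1_pi_targ, q2_pi_targ = torch.randn(batch), torch.randn(batch)
logp_a2 = torch.randn(batch)

with torch.no_grad():
    q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
    backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)

q1 = torch.randn(batch, requires_grad=True)
loss_q1 = ((q1 - backup) ** 2).mean()   # MSE against the fixed backup
print(loss_q1.item())
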
def vpg(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10,
        n_test_episodes=100):
    """
    Vanilla Policy Gradient 

    (with GAE-Lambda for advantage estimation)

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with a 
            ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` 
            module. The ``step`` method should accept a batch of observations 
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.

            The ``pi`` module's forward call should accept a batch of 
            observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations
            and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        n_test_episodes (int): Number of episodes for test agent evaluation at
            the end of each epoch.
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    #setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()

    def space_dim(space):
        if isinstance(space, Box):
            return space.shape
        elif isinstance(space, Discrete):
            return space.n
        else:
            raise ValueError

    obs_dim = space_dim(env.observation_space)
    act_dim = space_dim(env.action_space)

    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    # if torch.cuda.device_count() > 1:
    #     ac.pi = nn.DataParallel(ac.pi)
    #     ac.v = nn.DataParallel(ac.v)

    ac.to(device)

    # Sync params across processes
    #sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer
    local_steps_per_epoch = steps_per_epoch
    #local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing VPG policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = (data['obs'], data['act'], data['adv'],
                                   data['logp'])

        # Policy loss
        pi, logp = ac.pi(obs, act)
        loss_pi = -(logp * adv).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        pi_info = dict(kl=approx_kl, ent=ent)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = data['obs'], data['ret']
        return ((ac.v(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        # Get loss and info values before update
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with a single step of gradient descent
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        #mpi_avg_grads(ac.pi)    # average grads across MPI processes
        pi_optimizer.step()

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            #mpi_avg_grads(ac.v)    # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent = pi_info['kl'], pi_info_old['ent']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    def test_agent():
        test_env = env_fn()
        o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0

        num_episodes = 0
        while num_episodes < n_test_episodes:
            a, _, _ = ac.step(
                torch.as_tensor(o, dtype=torch.float32, device=device))
            o2, r, d, _ = test_env.step(a)
            test_ep_ret += r
            test_ep_len += 1

            o = o2
            timeout = test_ep_len == max_ep_len
            terminal = d or timeout

            if terminal:
                logger.store(TestEpRet=test_ep_ret)
                num_episodes += 1
                o, test_ep_ret, test_ep_len = test_env.reset(), 0, 0

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(
                torch.as_tensor(o, dtype=torch.float32, device=device))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(
                        torch.as_tensor(o, dtype=torch.float32, device=device))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update()

        test_agent()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('TestEpRet', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        wandb.log(logger.log_current_row, step=epoch)
        logger.dump_tabular()
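
# --- Illustrative sketch (not part of the example above) ---
# A stand-alone version of GAE-Lambda advantage estimation, the scheme named
# in the docstring above (the actual VPGBuffer.finish_path() implementation is
# not shown in this listing):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = sum_{l >= 0} (gamma * lam)^l * delta_{t+l}
# Values below are made up purely for illustration.
import numpy as np

def gae_advantages(rews, vals, gamma=0.99, lam=0.97):
    """rews has length T; vals has length T+1 (last entry is the bootstrap value)."""
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

rews = np.array([1.0, 0.0, 1.0])
vals = np.array([0.5, 0.4, 0.3, 0.0])   # V(s_0..s_2) plus bootstrap V(s_3)
print(gae_advantages(rews, vals))
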
Example #10
0
def ppo(workload_file,
        model_path,
        ac_kwargs=dict(),
        seed=0,
        traj_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        pre_trained=0,
        trained_model=None,
        attn=False,
        shuffle=False,
        backfil=False,
        skip=False,
        score_type=0,
        batch_job_slice=0,
        sched_algo=4):
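    """
    Proximal Policy Optimization for an HPC batch-job scheduling environment
    (HPCEnvSkip) driven by the workload trace in ``workload_file``.

    Collects ``traj_per_epoch`` trajectories per epoch and optimizes the
    clipped surrogate objective (``clip_ratio``), stopping policy updates
    early once the approximate KL exceeds ``1.5 * target_kl``. If
    ``pre_trained`` is set, the TensorFlow graph is restored from
    ``trained_model`` instead of being built from scratch.
    """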
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = HPCEnvSkip(shuffle=shuffle,
                     backfil=backfil,
                     skip=skip,
                     job_score_type=score_type,
                     batch_job_slice=batch_job_slice,
                     build_sjf=False,
                     sched_algo=sched_algo)
    env.seed(seed)
    env.my_init(workload_file=workload_file, sched_file=model_path)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    ac_kwargs['attn'] = attn

    # Inputs to computation graph

    buf = PPOBuffer(obs_dim, act_dim, traj_per_epoch * JOB_SEQUENCE_SIZE,
                    gamma, lam)

    if pre_trained:
        sess = tf.Session()
        model = restore_tf_graph(sess, trained_model)
        logger.log('load pre-trained model')
        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                   var_counts)

        x_ph = model['x']
        a_ph = model['a']
        mask_ph = model['mask']
        adv_ph = model['adv']
        ret_ph = model['ret']
        logp_old_ph = model['logp_old_ph']

        pi = model['pi']
        v = model['v']
        # logits = model['logits']
        out = model['out']
        logp = model['logp']
        logp_pi = model['logp_pi']
        pi_loss = model['pi_loss']
        v_loss = model['v_loss']
        approx_ent = model['approx_ent']
        approx_kl = model['approx_kl']
        clipfrac = model['clipfrac']
        clipped = model['clipped']

        # Optimizers
        # graph = tf.get_default_graph()
        # op = sess.graph.get_operations()
        # [print(m.values()) for m in op]
        # train_pi = graph.get_tensor_by_name('pi/conv2d/kernel/Adam:0')
        # train_v = graph.get_tensor_by_name('v/conv2d/kernel/Adam:0')
        train_pi = tf.get_collection("train_pi")[0]
        train_v = tf.get_collection("train_v")[0]
        # train_pi_optimizer = MpiAdamOptimizer(learning_rate=pi_lr, name='AdamLoad')
        # train_pi = train_pi_optimizer.minimize(pi_loss)
        # train_v_optimizer = MpiAdamOptimizer(learning_rate=vf_lr, name='AdamLoad')
        # train_v = train_v_optimizer.minimize(v_loss)
        # sess.run(tf.variables_initializer(train_pi_optimizer.variables()))
        # sess.run(tf.variables_initializer(train_v_optimizer.variables()))
        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]
        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

    else:
        x_ph, a_ph = placeholders_from_spaces(env.observation_space,
                                              env.action_space)
        # y_ph = placeholder(JOB_SEQUENCE_SIZE*3) # 3 is the number of sequence features
        mask_ph = placeholder(env.action_space.n)
        adv_ph, ret_ph, logp_old_ph = placeholders(None, None, None)

        # Main outputs from computation graph
        pi, logp, logp_pi, v, out = actor_critic(x_ph, a_ph, mask_ph,
                                                 **ac_kwargs)

        # Need all placeholders in *this* order later (to zip with data from buffer)
        all_phs = [x_ph, a_ph, mask_ph, adv_ph, ret_ph, logp_old_ph]

        # Every step, get: action, value, and logprob
        get_action_ops = [pi, v, logp_pi, out]

        # Experience buffer

        # Count variables
        var_counts = tuple(count_vars(scope) for scope in ['pi', 'v'])
        logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' %
                   var_counts)

        # PPO objectives
        ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                           (1 - clip_ratio) * adv_ph)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
        v_loss = tf.reduce_mean((ret_ph - v)**2)

        # Info (useful to watch during learning)
        # a sample estimate for KL-divergence, easy to compute
        approx_kl = tf.reduce_mean(logp_old_ph - logp)
        # a sample estimate for entropy, also easy to compute
        approx_ent = tf.reduce_mean(-logp)
        clipped = tf.logical_or(ratio > (1 + clip_ratio),
                                ratio < (1 - clip_ratio))
        clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

        # Optimizers
        train_pi = tf.train.AdamOptimizer(
            learning_rate=pi_lr).minimize(pi_loss)
        train_v = tf.train.AdamOptimizer(learning_rate=vf_lr).minimize(v_loss)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        tf.add_to_collection("train_pi", train_pi)
        tf.add_to_collection("train_v", train_v)

    # Setup model saving
    # logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'action_probs': action_probs, 'log_picked_action_prob': log_picked_action_prob, 'v': v})
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph,
                              'adv': adv_ph,
                              'mask': mask_ph,
                              'ret': ret_ph,
                              'logp_old_ph': logp_old_ph
                          },
                          outputs={
                              'pi': pi,
                              'v': v,
                              'out': out,
                              'pi_loss': pi_loss,
                              'logp': logp,
                              'logp_pi': logp_pi,
                              'v_loss': v_loss,
                              'approx_ent': approx_ent,
                              'approx_kl': approx_kl,
                              'clipped': clipped,
                              'clipfrac': clipfrac
                          })

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = \
        env.reset(), 0, False, 0, 0, 0, 0, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        t = 0
        discard_times = 0
        while True:
            # [no_skip, skip]
            lst = [1, 1]
            #for i in range(0, MAX_QUEUE_SIZE * JOB_FEATURES, JOB_FEATURES):
            #    job = o[i:i + JOB_FEATURES]
            #    # the skip time of will_skip job exceeds MAX_SKIP_TIME
            #    if job[-2] == 1.0:
            #        lst = [1,0]

            a, v_t, logp_t, output = sess.run(
                get_action_ops,
                feed_dict={x_ph: o.reshape(1, -1),
                           mask_ph: np.array(lst).reshape(1, -1)})
            # print(a, end=" ")
            '''
            action = np.random.choice(np.arange(MAX_QUEUE_SIZE), p=action_probs)
            log_action_prob = np.log(action_probs[action])
            '''
            if buf.ptr - buf.path_start_idx >= 10 * JOB_SEQUENCE_SIZE or buf.ptr >= buf.max_size:
                discard_times += 1
                buf.ptr = buf.path_start_idx
                [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = \
                    env.reset(), 0, False, 0, 0, 0, 0, 0, 0
                continue
            # save and log
            buf.store(o, None, a, np.array(lst), r, v_t, logp_t)
            logger.store(VVals=v_t)
            if a[0] == 1:
                skip_count += 1
            o, r, d, r2, sjf_t, f1_t = env.step(a[0])
            ep_ret += r
            ep_len += 1
            show_ret += r2
            sjf += sjf_t
            f1 += f1_t

            if d:
                t += 1
                buf.finish_path(r)
                logger.store(EpRet=ep_ret,
                             EpLen=ep_len,
                             ShowRet=show_ret,
                             SJF=sjf,
                             F1=f1,
                             SkipRatio=skip_count / ep_len)
                [o, co], r, d, ep_ret, ep_len, show_ret, sjf, f1, skip_count = \
                    env.reset(), 0, False, 0, 0, 0, 0, 0, 0
                if t >= traj_per_epoch:
                    # print ("state:", state, "\nlast action in a traj: action_probs:\n", action_probs, "\naction:", action)
                    break
        # print("Sample time:", (time.time()-start_time)/num_total, num_total)
        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        # start_time = time.time()
        update()
        # print("Train time:", time.time()-start_time)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', with_min_and_max=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts',
                           (epoch + 1) * traj_per_epoch * JOB_SEQUENCE_SIZE)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('ShowRet', average_only=True)
        logger.log_tabular('SJF', average_only=True)
        logger.log_tabular('F1', average_only=True)
        logger.log_tabular('SkipRatio', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
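
# --- Illustrative sketch (not part of the example above) ---
# The PPO clipped surrogate objective built in the TF graph above, written out
# in NumPy. The tf.where() form of min_adv is equivalent to using
# clip(ratio, 1 - eps, 1 + eps) * adv inside the min():
#   L = -E[ min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) ]
# Values are made up purely for illustration.
import numpy as np

clip_ratio = 0.2
ratio = np.array([0.7, 1.0, 1.4])    # pi(a|s) / pi_old(a|s)
adv = np.array([1.0, -0.5, 2.0])     # advantage estimates

min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
pi_loss = -np.mean(np.minimum(ratio * adv, min_adv))
print(pi_loss)
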
Example #11
0
def ude_td3_ConcreteD_batchP(
        env_fn,
        render_env=False,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        reward_scale=5,
        without_start_steps=True,
        batch_size=100,
        start_steps=10000,
        without_delay_train=False,
        act_noise=0.1,
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1,
        n_post_action=10,
        uncertainty_method='dropout',
        sample_obs_std=1,
        uncertainty_driven_exploration=False,
        uncertainty_policy_delay=5000,
        dropout_rate=0.1,
        concentration_factor=0.1,
        minimum_exploration_level=0):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    # TODO: Test no start steps
    if without_start_steps:
        start_steps = batch_size
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    print('Creating networks ...')
    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, _, pi_dropout_mask_generator, pi_dropout_mask_phs, \
        q1, _, q1_dropout_mask_generator, q1_dropout_mask_phs, q1_pi, _, \
        q2, _, q2_dropout_mask_generator, q2_dropout_mask_phs = actor_critic(x_ph, a_ph, **ac_kwargs,
                                                                             dropout_rate=0)

    # Random Network Distillation
    with tf.variable_scope('random_net_distill'):
        # RND Target and Predictor Network
        rnd_lr = 1e-3
        rnd_targ_act, \
        rnd_pred_act, rnd_pred_act_reg, rnd_pred_act_dropout_mask_generator, rnd_pred_act_dropout_mask_phs, \
        rnd_targ_cri, \
        rnd_pred_cri, rnd_pred_cri_reg, rnd_pred_cri_dropout_mask_generator, rnd_pred_cri_dropout_mask_phs = core.random_net_distill(x_ph, a_ph, **ac_kwargs, dropout_rate=0)

    # TODO: add environment model learning transition dynamics

    # TODO: Calculate Uncertainty of Q-value function
    # Initialize uncertainty module
    obs_set_size = 10
    track_obs_set_unc_frequency = 100  # every 100 steps
    pi_unc_module = DropoutUncertaintyModule(
        act_dim,
        obs_dim,
        n_post_action,
        obs_set_size,
        track_obs_set_unc_frequency,
        x_ph,
        a_ph,
        ac_kwargs,
        dropout_rate,
        logger_kwargs,
        tf_var_scope_main='main',
        tf_var_scope_target='target',
        tf_var_scope_rnd='random_net_distill')

    # Target policy network
    with tf.variable_scope('target'):
        pi_targ, _, pi_dropout_mask_generator_targ, pi_dropout_mask_phs_targ, \
        _, _, _, _, _, _, \
        _, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs, dropout_rate=dropout_rate)
        pi_targ = pi_targ[0]

    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # TODO: add with_out_policy_smoothing
        # Target policy smoothing, by adding clipped noise to target actions
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)
        # Target Q-values, using action from target policy
        _, _, _, _, \
        q1_targ, _, q1_dropout_mask_generator_targ, q1_dropout_mask_phs_targ, _, _, \
        q2_targ, _, q2_dropout_mask_generator_targ, q2_dropout_mask_phs_targ = actor_critic(x2_ph, a2, **ac_kwargs, dropout_rate=dropout_rate)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size,
                                 logger_fname='experiences_log.txt',
                                 **logger_kwargs)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    # TODO: use conservative estimation of Q
    # Bellman backup for Q functions, using Clipped Double-Q targets
    def post_sample_q1_and_q2(feed_dictionary, batch_size):
        dropout_masks_set_q1 = q1_dropout_mask_generator_targ.generate_dropout_mask(
            n_post_action)
        dropout_masks_set_q2 = q2_dropout_mask_generator_targ.generate_dropout_mask(
            n_post_action)
        q1_targ_post = np.zeros((n_post_action, batch_size))
        q2_targ_post = np.zeros((n_post_action, batch_size))

        for mask_i in range(len(q1_dropout_mask_phs_targ)):
            feed_dictionary[q1_dropout_mask_phs_targ[
                mask_i]] = dropout_masks_set_q1[mask_i]
            feed_dictionary[q2_dropout_mask_phs_targ[
                mask_i]] = dropout_masks_set_q2[mask_i]
        q1_targ_post = sess.run(q1_targ, feed_dict=feed_dictionary)
        q2_targ_post = sess.run(q2_targ, feed_dict=feed_dictionary)
        min_q_targ = np.minimum(q1_targ_post.mean(axis=1),
                                q2_targ_post.mean(axis=1))
        return min_q_targ

    # min_q_targ = tf.placeholder(dtype=tf.float32)
    # backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*min_q_targ)
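    # Clipped double-Q target (as in TD3): the Bellman backup below uses the
    # smaller of the two target critics' estimates at the smoothed target action
    # a2, i.e. y = r + gamma * (1 - d) * min(Q1_targ(s', a2), Q2_targ(s', a2)),
    # which reduces the overestimation bias of a single learned critic.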

    min_q_targ = tf.minimum(q1_targ[0], q2_targ[0])
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi[0])
    q1_loss = tf.reduce_mean((q1[0] - backup)**2)
    q2_loss = tf.reduce_mean((q2[0] - backup)**2)
    q_loss = q1_loss + q2_loss

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # RND losses and train ops
    rnd_loss_act = tf.reduce_mean(
        (rnd_pred_act[0] - rnd_targ_act)**2) + rnd_pred_act_reg / batch_size
    rnd_optimizer_act = tf.train.AdamOptimizer(learning_rate=rnd_lr)
    train_rnd_op_act = rnd_optimizer_act.minimize(
        rnd_loss_act, var_list=get_vars('random_net_distill/rnd_pred_act'))

    rnd_loss_cri = tf.reduce_mean(
        (rnd_pred_cri[0] - rnd_targ_cri)**2) + rnd_pred_cri_reg / batch_size
    rnd_optimizer_cri = tf.train.AdamOptimizer(learning_rate=rnd_lr)
    train_rnd_op_cri = rnd_optimizer_cri.minimize(
        rnd_loss_cri, var_list=get_vars('random_net_distill/rnd_pred_cri'))
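    # Random Network Distillation in brief: each predictor network is trained to
    # match the output of a fixed, randomly initialised target network on the
    # visited states (and actions). Inputs the predictor has rarely been trained
    # on produce a large prediction error, so that error can serve as a novelty /
    # uncertainty signal for exploration.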

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])
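    # For example, with polyak = 0.995 each target weight moves only 0.5% of the
    # way towards its main-network counterpart per update, i.e. the targets track
    # the main networks as an exponential moving average with a time constant of
    # roughly 1 / (1 - polyak) = 200 updates.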

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q1': q1,
                              'q2': q2
                          })

    def set_dropout_mask_and_post_number_to_one(feed_dictionary, *argv):
        """Set all dropout masks and post sample number in argv to one."""
        for dropout_mask_ph in argv:
            for mask_i in range(len(dropout_mask_ph)):
                feed_dictionary[dropout_mask_ph[mask_i]] = np.ones(
                    [1, dropout_mask_ph[mask_i].shape.as_list()[1]])
        return feed_dictionary

    def set_dropout_mask_randomly_and_post_number_to_one(
            feed_dictionary, mask_phs, mask_generators):
        if len(mask_phs) != len(mask_generators):
            raise ValueError('mask_phs and mask_generators do not match.')
        else:
            for i in range(len(mask_phs)):
                dropout_mask_ph = mask_phs[i]
                dropout_masks = mask_generators[i].generate_dropout_mask(
                    post_size=1)
                for mask_i in range(len(dropout_mask_ph)):
                    feed_dictionary[
                        dropout_mask_ph[mask_i]] = dropout_masks[mask_i]
        return feed_dictionary

    def get_action_train(o, noise_scale, pi_unc_module, step_index):
        feed_dictionary = {x_ph: o.reshape(1, -1)}
        # Set dropout masks to one
        feed_dictionary = set_dropout_mask_and_post_number_to_one(
            feed_dictionary, rnd_pred_act_dropout_mask_phs,
            rnd_pred_cri_dropout_mask_phs, pi_dropout_mask_phs,
            q1_dropout_mask_phs, q2_dropout_mask_phs)
        # RND actor
        rnd_t_act, rnd_p_act = sess.run([rnd_targ_act, rnd_pred_act],
                                        feed_dict=feed_dictionary)
        rnd_t_act = rnd_t_act[0]
        rnd_p_act = rnd_p_act[0]
        rnd_e_act = np.sqrt(np.sum((rnd_p_act - rnd_t_act) ** 2))

        # Generate action
        if uncertainty_driven_exploration:
            # 1. Generate action Prediction
            a_prediction = sess.run(pi, feed_dict=feed_dictionary)[0][0]

            # 2. Generate post sampled actions
            # TODO: get covariance based on online and target policy respectively and calculate the difference
            a_post = pi_unc_module.get_post_samples_act(o, sess, step_index)
            rnd_a_post = pi_unc_module.get_post_samples_rnd_act(
                o, sess, step_index)

            # 3. Generate uncertainty-driven exploratory action
            a = np.zeros((act_dim, ))
            if act_dim > 1:
                # TODO: compute correlation rather than covariance
                a_cov = np.cov(a_post, rowvar=False)
                a_cov_shaped = concentration_factor * a_cov

                rnd_a_cov = np.cov(rnd_a_post, rowvar=False)

                a = np.random.multivariate_normal(a_prediction, a_cov_shaped,
                                                  1)[0]

                unc_a = a_cov
                unc_rnd_a = rnd_a_cov
            else:
                a_std = np.std(a_post, axis=0)
                a_std_shaped = concentration_factor * a_std + minimum_exploration_level * np.ones(
                    a_std.shape)
                rnd_a_cov = np.std(rnd_a_post, axis=0)
                # TODO: only keep one
                a = np.random.normal(a_prediction, a_std_shaped, 1)[0]

                unc_a = a_std
                unc_rnd_a = rnd_a_cov
        else:
            for mask_i in range(len(pi_dropout_mask_phs)):
                feed_dictionary[pi_dropout_mask_phs[mask_i]] = np.ones(
                    [1, pi_dropout_mask_phs[mask_i].shape.as_list()[1]])
            a = sess.run(pi, feed_dict=feed_dictionary)[0][0]
            a += noise_scale * np.random.randn(act_dim)
            unc_a = 0
            unc_rnd_a = 0
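        # In short: with uncertainty_driven_exploration the exploratory action is
        # drawn from a Gaussian centred on the policy's deterministic prediction,
        # with a (co)variance estimated from dropout posterior samples of the
        # policy (scaled by concentration_factor); otherwise plain additive
        # Gaussian action noise of scale `noise_scale` is used, as in vanilla TD3.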

        a = np.clip(a, -act_limit, act_limit)
        # TODO: use uncertainty as intrinsic reward
        unc_based_reward = np.mean(np.abs(unc_a))

        # TODO: should the a_ph be a or a_prediction??
        feed_dictionary[a_ph] = a.reshape(1, -1)

        # Generate post sampled q values
        q1_post, q2_post = pi_unc_module.get_post_samples_q(
            o, a, sess, step_index)
        rnd_q_post = pi_unc_module.get_post_samples_rnd_cri(
            o, a, sess, step_index)

        unc_q1 = np.std(q1_post, axis=0)
        unc_q2 = np.std(q2_post, axis=0)
        unc_rnd_q = np.std(rnd_q_post, axis=0)

        q1_pred = sess.run(q1, feed_dict=feed_dictionary)[0][0]
        q2_pred = sess.run(q2, feed_dict=feed_dictionary)[0][0]

        # RND critic
        rnd_t_cri, rnd_p_cri = sess.run([rnd_targ_cri, rnd_pred_cri],
                                        feed_dict=feed_dictionary)
        rnd_t_cri = rnd_t_cri[0]
        rnd_p_cri = rnd_p_cri[0]
        rnd_e_cri = np.sqrt(np.sum((rnd_p_cri - rnd_t_cri) ** 2))

        return a, \
               q1_pred, q2_pred, q1_post, q2_post,\
               unc_a, unc_based_reward, unc_q1, unc_q2,\
               unc_rnd_a, unc_rnd_q,\
               rnd_t_act, rnd_p_act, rnd_e_act,\
               rnd_t_cri, rnd_p_cri, rnd_e_cri

    def get_action_test(o):
        """Get deterministic action without exploration."""
        feed_dictionary = {x_ph: o.reshape(1, -1)}
        for mask_i in range(len(pi_dropout_mask_phs)):
            feed_dictionary[pi_dropout_mask_phs[mask_i]] = np.ones(
                [1, pi_dropout_mask_phs[mask_i].shape.as_list()[1]])
        a = sess.run(pi, feed_dict=feed_dictionary)[0][0]
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action_test(o))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ep_q1_var, ep_q2_var, ep_unc_a, ep_unc_q1, ep_unc_q2, \
    ep_unc_rnd_a, ep_unc_rnd_q, ep_rnd_e_act, ep_rnd_e_cri = 0, 0, 0, 0, 0, 0, 0, 0, 0
    total_steps = steps_per_epoch * epochs

    # No dropout and no post sample for training phase: set all dropout masks to 1 and post_size to 1
    feed_dict_train = {}
    feed_dict_train = set_dropout_mask_and_post_number_to_one(
        feed_dict_train, pi_dropout_mask_phs, q1_dropout_mask_phs,
        q2_dropout_mask_phs, pi_dropout_mask_phs_targ,
        q1_dropout_mask_phs_targ, q2_dropout_mask_phs_targ,
        rnd_pred_act_dropout_mask_phs, rnd_pred_cri_dropout_mask_phs)

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            # import pdb; pdb.set_trace()
            a, \
            q1_pred, q2_pred, q1_post, q2_post,\
            unc_a, unc_based_reward, unc_q1, unc_q2, \
            unc_rnd_a, unc_rnd_q, \
            rnd_t_act, rnd_p_act, rnd_e_act,\
            rnd_t_cri, rnd_p_cri, rnd_e_cri = get_action_train(o, act_noise, pi_unc_module, step_index=t)
        else:
            a = env.action_space.sample()
            # TODO:keep the same dimension with real covariance
            if uncertainty_driven_exploration:
                unc_a = np.zeros((act_dim, act_dim))
                unc_rnd_a = np.zeros((act_dim, act_dim))
            else:
                unc_a = 0
                unc_rnd_a = 0
            q1_pred, q2_pred = 0, 0
            q1_post = np.zeros((n_post_action, ))
            q2_post = np.zeros((n_post_action, ))
            unc_q1, unc_q2, unc_rnd_q = 0, 0, 0
            unc_based_reward = 0
            rnd_t_act, rnd_p_act, rnd_e_act, rnd_t_cri, rnd_p_cri, rnd_e_cri = 0, 0, 0, 0, 0, 0

        # Sample an observation set to track their uncertainty trajectories
        if t > start_steps:
            if pi_unc_module.obs_set_is_empty:
                pi_unc_module.sample_obs_set_from_replay_buffer(replay_buffer)

            if t % pi_unc_module.track_obs_set_unc_frequency == 0:
                pi_unc_module.calculate_obs_set_uncertainty(
                    sess, t // steps_per_epoch, t)

            # TODO: try more frequent update to avoid bad dropout masks (perhaps not necessary because we can get
            #  larger sample size now.)
            # Update uncertainty policy to current policy
            if t % uncertainty_policy_delay == 0:
                # pi_unc_module.uncertainty_policy_update(sess)
                pi_unc_module.update_weights_of_main_unc(sess)

        # Step the env
        if render_env:
            env.render()
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        ep_q1_var += np.var(q1_post)
        ep_q2_var += np.var(q2_post)
        # TODO: we cannot use this as uncertainty, because if the policy learns some dimension is correlated then the
        #   corresponding element will be 1 in covariance matrix.
        ep_unc_a += np.sum(unc_a)
        ep_unc_rnd_a += np.sum(unc_rnd_a)
        ep_unc_q1 += unc_q1
        ep_unc_q2 += unc_q2
        ep_unc_rnd_q += unc_rnd_q
        ep_rnd_e_act += rnd_e_act
        ep_rnd_e_cri += rnd_e_cri

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o,
                            a,
                            reward_scale * r,
                            o2,
                            d,
                            t,
                            steps_per_epoch,
                            start_time,
                            unc_a=unc_a,
                            unc_rnd_a=unc_rnd_a,
                            unc_q1=unc_q1,
                            unc_q2=unc_q2,
                            unc_rnd_q=unc_rnd_q,
                            q1_pred=q1_pred,
                            q2_pred=q2_pred,
                            q1_post=q1_post,
                            q2_post=q2_post,
                            rnd_e_act=rnd_e_act,
                            rnd_e_cri=rnd_e_cri)
        # replay_buffer.store(o, a, r + unc_based_reward, o2, d, uncertainty, t, steps_per_epoch, start_time)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if without_delay_train:
            batch = replay_buffer.sample_batch(batch_size)
            feed_dict_train[x_ph] = batch['obs1']
            feed_dict_train[x2_ph] = batch['obs2']
            feed_dict_train[a_ph] = batch['acts']
            feed_dict_train[r_ph] = batch['rews']
            feed_dict_train[d_ph] = batch['done']

            # Train Random Net Distillation (actor and critic predictor networks)
            rnd_step_ops_act = [
                rnd_loss_act, rnd_targ_act, rnd_pred_act, train_rnd_op_act
            ]
            rnd_outs_act = sess.run(rnd_step_ops_act, feed_dict_train)
            rnd_step_ops_cri = [
                rnd_loss_cri, rnd_targ_cri, rnd_pred_cri, train_rnd_op_cri
            ]
            rnd_outs_cri = sess.run(rnd_step_ops_cri, feed_dict_train)
            logger.store(LossRndAct=rnd_outs_act[0], LossRndCri=rnd_outs_cri[0])

            # Train q
            q_step_ops = [q_loss, q1, q2, train_q_op]
            outs = sess.run(q_step_ops, feed_dict_train)
            logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

            # Policy and target network update (performed every step in this branch)
            outs = sess.run([pi_loss, train_pi_op, target_update],
                            feed_dict_train)
            logger.store(LossPi=outs[0])

        if d or (ep_len == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            if not without_delay_train:
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict_train[x_ph] = batch['obs1']
                    feed_dict_train[x2_ph] = batch['obs2']
                    feed_dict_train[a_ph] = batch['acts']
                    feed_dict_train[r_ph] = batch['rews']
                    feed_dict_train[d_ph] = batch['done']

                    # Train Random Net Distillation
                    # change dropout masks every training step
                    mask_phs = [
                        rnd_pred_act_dropout_mask_phs,
                        rnd_pred_cri_dropout_mask_phs
                    ]
                    mask_generators = [
                        rnd_pred_act_dropout_mask_generator,
                        rnd_pred_cri_dropout_mask_generator
                    ]
                    feed_dict_train = set_dropout_mask_randomly_and_post_number_to_one(
                        feed_dict_train, mask_phs, mask_generators)
                    rnd_step_ops_act = [
                        rnd_loss_act, rnd_targ_act, rnd_pred_act,
                        train_rnd_op_act
                    ]
                    rnd_outs_act = sess.run(rnd_step_ops_act, feed_dict_train)
                    logger.store(LossRndAct=rnd_outs_act[0])

                    rnd_step_ops_cri = [
                        rnd_loss_cri, rnd_targ_cri, rnd_pred_cri,
                        train_rnd_op_cri
                    ]
                    rnd_outs_cri = sess.run(rnd_step_ops_cri, feed_dict_train)
                    logger.store(LossRndCri=rnd_outs_cri[0])

                    # Train Q-value function
                    # feed_dict_train[min_q_targ] = post_sample_q1_and_q2(feed_dict_train, batch_size)
                    q_step_ops = [q_loss, q1, q2, train_q_op]
                    outs = sess.run(q_step_ops, feed_dict_train)
                    logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                    if j % policy_delay == 0:
                        # Delayed policy update
                        outs = sess.run([pi_loss, train_pi_op, target_update],
                                        feed_dict_train)
                        logger.store(LossPi=outs[0])
                # No weight update delay
                pi_unc_module.update_weights_of_rnd_unc(sess)

            logger.store(EpRet=ep_ret,
                         EpLen=ep_len,
                         EpQ1Var=ep_q1_var,
                         EpQ2Var=ep_q2_var,
                         EpUncAct=ep_unc_a,
                         EpUncRndAct=ep_unc_rnd_a,
                         EpUncQ1=ep_unc_q1,
                         EpUncQ2=ep_unc_q2,
                         EpUncRndQ=ep_unc_rnd_q,
                         EpRndErrorAct=ep_rnd_e_act,
                         EpRndErrorCri=ep_rnd_e_cri)

            o, r, d, ep_ret, ep_len, ep_q1_var, ep_q2_var,\
            ep_unc_a, ep_unc_q1, ep_unc_q2,\
            ep_unc_rnd_a, ep_unc_rnd_q,\
            ep_rnd_e_act, ep_rnd_e_cri = env.reset(), 0, False, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossRndAct', average_only=True)
            logger.log_tabular('LossRndCri', average_only=True)
            logger.log_tabular('EpQ1Var', with_min_and_max=True)
            logger.log_tabular('EpQ2Var', with_min_and_max=True)
            logger.log_tabular('EpUncAct', with_min_and_max=True)
            logger.log_tabular('EpUncRndAct', with_min_and_max=True)
            logger.log_tabular('EpUncQ1', with_min_and_max=True)
            logger.log_tabular('EpUncQ2', with_min_and_max=True)
            logger.log_tabular('EpUncRndQ', with_min_and_max=True)
            logger.log_tabular('EpRndErrorAct', with_min_and_max=True)
            logger.log_tabular('EpRndErrorCri', with_min_and_max=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Example #12
0
def oac(env_fn, logger_kwargs=dict(), network_params=dict(), rl_params=dict()):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # env params
    thresh = rl_params['thresh']

    # control params
    seed = rl_params['seed']
    epochs = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size = rl_params['replay_size']
    batch_size = rl_params['batch_size']
    start_steps = rl_params['start_steps']
    max_ep_len = rl_params['max_ep_len']
    max_noop = rl_params['max_noop']
    save_freq = rl_params['save_freq']
    render = rl_params['render']

    # rl params
    gamma = rl_params['gamma']
    polyak = rl_params['polyak']
    lr = rl_params['lr']
    grad_clip_val = rl_params['grad_clip_val']

    # entropy params
    alpha = rl_params['alpha']
    target_entropy_start = rl_params['target_entropy_start']
    target_entropy_stop = rl_params['target_entropy_stop']
    target_entropy_steps = rl_params['target_entropy_steps']

    # optimistic exploration params
    use_opt = rl_params['use_opt']
    beta_UB = rl_params['beta_UB']
    beta_LB = rl_params['beta_LB']
    delta = rl_params['delta']
    opt_lr = rl_params['opt_lr']
    max_opt_steps = rl_params['max_opt_steps']

    train_env, test_env = env_fn(), env_fn()
    obs_space = train_env.observation_space
    act_space = train_env.action_space

    # get the size after resize
    obs_dim = network_params['input_dims']
    act_dim = act_space.n

    # set the seed
    tf.set_random_seed(seed)
    np.random.seed(seed)
    train_env.seed(seed)
    train_env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # init a state buffer for storing last m states
    train_state_buffer = StateBuffer(m=obs_dim[2])
    test_state_buffer = StateBuffer(m=obs_dim[2])

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = placeholders(obs_dim, act_dim, obs_dim,
                                                 None, None)

    # alpha and entropy setup
    max_target_entropy = tf.log(tf.cast(act_dim, tf.float32))
    target_entropy_prop_ph = tf.placeholder(dtype=tf.float32, shape=())
    target_entropy = max_target_entropy * target_entropy_prop_ph

    log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)

    if alpha == 'auto':  # auto tune alpha
        alpha = tf.exp(log_alpha)
    else:  # fixed alpha
        alpha = tf.get_variable('alpha', dtype=tf.float32, initializer=alpha)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, action_probs, log_action_probs, action_logits, q1_logits, q2_logits = build_models(
            x_ph, a_ph, act_dim, network_params)

    with tf.variable_scope('main', reuse=True):
        _, _, action_probs_next, log_action_probs_next, _, _, _ = build_models(
            x2_ph, a_ph, act_dim, network_params)

    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, q1_logits_targ, q2_logits_targ = build_models(
            x2_ph, a_ph, act_dim, network_params)

    # Count variables
    var_counts = tuple(
        count_vars(scope)
        for scope in ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main'])
    print("""\nNumber of parameters:
             alpha: %d,
             pi: %d,
             q1: %d,
             q2: %d,
             total: %d\n""" % var_counts)

    if use_opt:

        # Optimistic Exploration
        mu_Q = (q1_logits + q2_logits) / 2.0
        sigma_Q = tf.math.abs(q1_logits - q2_logits) / 2.0

        Q_UB = mu_Q + beta_UB * sigma_Q
        Q_LB = mu_Q + beta_LB * sigma_Q

        Q_UB_sm = tf.nn.softmax(
            Q_UB, axis=-1
        )  # needed to make EV and penalty proportional for optimisation

        R = tf.get_variable('R',
                            dtype=tf.float32,
                            shape=[1, act_dim],
                            initializer=tf.random_normal_initializer(
                                mean=0.0, stddev=0.01))
        assign_R = R.assign(
            action_logits
        )  # initialises P as the same "pessimistic" action distribution
        P = tf.nn.softmax(R, axis=-1)

        expected_value = tf.reduce_sum(tf.multiply(P, Q_UB_sm))
        KL_P_PT = tf.reduce_sum(
            tf.multiply(P, tf.log(tf.divide(P, action_probs))))
        penalty = KL_P_PT - delta
        relu_penalty = tf.nn.relu(penalty)
        penalised_opt_function = -expected_value + relu_penalty

        optpi_optimizer = tf.train.AdamOptimizer(learning_rate=opt_lr)
        train_optpi_op = optpi_optimizer.minimize(penalised_opt_function,
                                                  var_list=get_vars('R'))

        optimistic_policy_dist = tf.distributions.Categorical(probs=P)
        optimistic_pi = optimistic_policy_dist.sample()
    else:
        optimistic_pi = pi  # use standard SAC policy
        Q_LB = tf.minimum(q1_logits, q2_logits)
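    # Optimistic exploration in brief: the two critics give a mean estimate
    # mu_Q = (Q1 + Q2) / 2 and a disagreement sigma_Q = |Q1 - Q2| / 2 that acts
    # as an epistemic-uncertainty proxy. Acting greedily w.r.t. the upper bound
    # Q_UB = mu_Q + beta_UB * sigma_Q (subject to a KL constraint of size delta
    # against the current policy) encourages exploration, while the lower bound
    # Q_LB = mu_Q + beta_LB * sigma_Q is used for the conservative policy update.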

    # Min Double-Q:
    min_q_logits_targ = tf.minimum(q1_logits_targ, q2_logits_targ)

    # Targets for Q regression
    q_backup = r_ph + gamma * (1 - d_ph) * tf.stop_gradient(
        tf.reduce_sum(action_probs_next *
                      (min_q_logits_targ - alpha * log_action_probs_next),
                      axis=-1))
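    # Discrete-action soft Bellman target: instead of sampling a next action, the
    # expectation over the next-state policy is computed exactly, i.e.
    #   y = r + gamma * (1 - d) * sum_a' pi(a'|s') * (min_i Q_i_targ(s', a') - alpha * log pi(a'|s')).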

    # critic losses
    q1_a = tf.reduce_sum(tf.multiply(q1_logits, a_ph), axis=1)
    q2_a = tf.reduce_sum(tf.multiply(q2_logits, a_ph), axis=1)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
    value_loss = q1_loss + q2_loss

    # policy loss
    pi_backup = tf.reduce_sum(action_probs * (alpha * log_action_probs - Q_LB),
                              axis=-1)
    pi_loss = tf.reduce_mean(pi_backup)

    # alpha loss for temperature parameter
    pi_entropy = -tf.reduce_sum(action_probs * log_action_probs, axis=-1)
    alpha_backup = tf.stop_gradient(target_entropy - pi_entropy)
    alpha_loss = -tf.reduce_mean(log_alpha * alpha_backup)
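    # Minimising alpha_loss w.r.t. log_alpha raises the temperature when the
    # policy's entropy drops below the (annealed) target entropy and lowers it
    # when the entropy is above the target, automatically tuning exploration.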

    # Policy train op
    # (has to be separate from value train op, because q1_logits appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    if grad_clip_val is not None:
        gvs = pi_optimizer.compute_gradients(pi_loss,
                                             var_list=get_vars('main/pi'))
        capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var)
                      for grad, var in gvs]
        train_pi_op = pi_optimizer.apply_gradients(capped_gvs)
    else:
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_pi_op]):
        if grad_clip_val is not None:
            gvs = value_optimizer.compute_gradients(
                value_loss, var_list=get_vars('main/q'))
            capped_gvs = [(ClipIfNotNone(grad, grad_clip_val), var)
                          for grad, var in gvs]
            train_value_op = value_optimizer.apply_gradients(capped_gvs)
        else:
            train_value_op = value_optimizer.minimize(
                value_loss, var_list=get_vars('main/q'))

    # Alpha train op
    alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, epsilon=1e-04)
    with tf.control_dependencies([train_value_op]):
        train_alpha_op = alpha_optimizer.minimize(
            alpha_loss, var_list=get_vars('log_alpha'))

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, q1_a, q2_a, pi_entropy, target_entropy,
        alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op,
        target_update
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1_a': q1_a,
                              'q2_a': q2_a
                          })

    def get_action(state, deterministic=False):
        state = state.astype('float32') / 255.

        # # record data for printing
        # _ =  sess.run(assign_R, feed_dict={x_ph: [state]})
        # ins = sess.run([action_probs, Q_UB, P, KL_P_PT], feed_dict={x_ph: [state]})

        if deterministic:
            act_op = mu
        else:
            if use_opt:
                # run a few optimisation steps to set optimistic policy
                _ = sess.run(assign_R, feed_dict={x_ph: [state]})
                for i in range(max_opt_steps):
                    _ = sess.run([train_optpi_op], feed_dict={x_ph: [state]})
            act_op = optimistic_pi

        # # print difference between pessimistic and optimistic policy probabilities
        # outs = sess.run([P, KL_P_PT], feed_dict={x_ph: [state]})
        # print('ap:     ', ins[0])
        # print('Q:      ', ins[1])
        # print('P_in:   ', ins[2])
        # print('P_out:  ', outs[0])
        # print('KL_in:  ', ins[3])
        # print('KL_out: ', outs[1])
        # print('')
        return sess.run(act_op, feed_dict={x_ph: [state]})[0]

    def reset(env, state_buffer):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # fire to start game and perform no-op for some frames to randomise start
        o, _, _, _ = env.step(1)  # Fire action to start game
        for _ in range(np.random.randint(1, max_noop)):
            o, _, _, _ = env.step(0)  # Action 'NOOP'

        o = process_image_observation(o, obs_dim, thresh)
        r = process_reward(r)
        old_lives = env.ale.lives()
        state = state_buffer.init_state(init_obs=o)
        return o, r, d, ep_ret, ep_len, old_lives, state

    def test_agent(n=10, render=True):
        for j in range(n):
            o, r, d, ep_ret, ep_len, test_old_lives, test_state = reset(
                test_env, test_state_buffer)
            terminal_life_lost_test = False

            if render: test_env.render()

            while not (d or (ep_len == max_ep_len)):

                # start by firing
                if terminal_life_lost_test:
                    a = 1
                else:
                    # otherwise take the greedy (deterministic) action
                    a = get_action(test_state, True)

                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(a)
                o = process_image_observation(o, obs_dim, thresh)
                r = process_reward(r)
                test_state = test_state_buffer.append_state(o)
                ep_ret += r
                ep_len += 1

                if test_env.ale.lives() < test_old_lives:
                    test_old_lives = test_env.ale.lives()
                    terminal_life_lost_test = True
                else:
                    terminal_life_lost_test = False

                if render: test_env.render()

            if render: test_env.close()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # ================== Main training Loop  ==================

    start_time = time.time()
    o, r, d, ep_ret, ep_len, old_lives, state = reset(train_env,
                                                      train_state_buffer)
    total_steps = steps_per_epoch * epochs

    target_entropy_prop = linear_anneal(current_step=0,
                                        start=target_entropy_start,
                                        stop=target_entropy_stop,
                                        steps=target_entropy_steps)
    save_iter = 0
    terminal_life_lost = False

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # press fire to start
        if terminal_life_lost:
            a = 1
        else:
            if t > start_steps:
                a = get_action(state)
            else:
                a = train_env.action_space.sample()

        # Step the env
        o2, r, d, _ = train_env.step(a)
        o2 = process_image_observation(o2, obs_dim, thresh)
        r = process_reward(r)
        one_hot_a = process_action(a, act_dim)

        next_state = train_state_buffer.append_state(o2)

        ep_ret += r
        ep_len += 1

        if train_env.ale.lives() < old_lives:
            old_lives = train_env.ale.lives()
            terminal_life_lost = True
        else:
            terminal_life_lost = False

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(state, one_hot_a, r, next_state,
                            terminal_life_lost)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        state = next_state

        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    target_entropy_prop_ph: target_entropy_prop
                }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3],
                             Q2Vals=outs[4],
                             PiEntropy=outs[5],
                             TargEntropy=outs[6],
                             LossAlpha=outs[7],
                             Alpha=outs[8])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len, old_lives, state = reset(
                train_env, train_state_buffer)

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # update target entropy every epoch
            target_entropy_prop = linear_anneal(current_step=t,
                                                start=target_entropy_start,
                                                stop=target_entropy_stop,
                                                steps=target_entropy_steps)

            # Save model
            if save_freq is not None:
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    print('Saving...')
                    logger.save_state({'env': train_env}, itr=save_iter)
                    save_iter += 1

            # Test the performance of the deterministic version of the agent.
            test_agent(n=10, render=render)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('PiEntropy', average_only=True)
            logger.log_tabular('TargEntropy', average_only=True)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'),
                  show_plot=True)
Example #13
0
def simple_dqn(env_fn = lambda : gym.make('CartPole-v1')
    , actor_critic=None
    , ac_kwargs=dict()
    , seed=0
    , episodes_per_epoch=1000
    , epochs=1000
    , gamma=0.99
    , logger_kwargs=dict()
    , save_freq=1000
    , hidden_dim=32
    , n_layers=2
    , lr=1e-4
    , batch_size=32
    , target_update_freq=2500
    , final_epsilon=0.05
    , finish_decay=50000
    , replay_buffer_size=25000
    , steps_before_training=5000
    , n_test_eps = 10
    ):


    max_steps_per_epoch  = 5000

    # Global variables
    num_of_train_epochs = epochs
    # `number_of_layers` hidden layers with `hidden_dim` units each

    number_of_layers = n_layers
    learning_rate = lr
    discount_factor = gamma
    
 

    # init log
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # make gym environment
    env = env_fn()
    obs_dim = env.observation_space.shape[0]
    number_of_actions = env.action_space.n

    #define evaluation network
    with tf.variable_scope('evaluation_network'):
        #input layer
        obs_ph = tf.placeholder(dtype=tf.float32, shape=(None,obs_dim), name='obs_ph')
        # MLP (Multi-Layer Perceptron) hidden layers
        hidden_sizes = [hidden_dim] * number_of_layers
        x = obs_ph
        for h in hidden_sizes:
            x = tf.layers.dense(x, units=h, activation=tf.tanh)
        #output layer
        eval_net = tf.layers.dense(x,units=number_of_actions,activation=None)
    # define target network
    with tf.variable_scope('target_network'):
        #input layer
        obs_target_ph = tf.placeholder(dtype=tf.float32, shape=(None,obs_dim), name='obs_target_ph')
        # MLP (Multi-Layer Perceptron) hidden layers
        hidden_sizes = [hidden_dim] * number_of_layers
        x = obs_target_ph
        for h in hidden_sizes:
            x = tf.layers.dense(x, units=h, activation=tf.tanh)
        #output layer
        target_net = tf.layers.dense(x,units=number_of_actions,activation=None)


    #define loss function
    selected_action_ph = tf.placeholder(shape=(None,), dtype=tf.int32, name='selected_action_ph')
    reward_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='reward_ph')
    done_ph = tf.placeholder(shape=(None,), dtype=tf.float32, name='done_ph')
    actions_one_hot = tf.one_hot(selected_action_ph, number_of_actions)
    q_a = tf.reduce_sum(actions_one_hot * eval_net,axis=1)
    #use target network to approximate TD
    target = reward_ph + discount_factor * (1-done_ph) * tf.stop_gradient(tf.reduce_max(target_net, axis=1))
    loss = tf.reduce_mean((q_a - target)**2)
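    # This is the standard one-step TD target y = r + gamma * (1 - done) * max_a' Q_target(s', a'),
    # with the gradient stopped through the target network so that only the
    # evaluation network is updated by the squared-error loss.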

    #init replay buffer
    # observations are continuous, so store them as float32 (int32 would truncate them)
    replay_current_obs = np.zeros([replay_buffer_size, obs_dim], dtype=np.float32)
    replay_next_obs = np.zeros([replay_buffer_size, obs_dim], dtype=np.float32)
    replay_selected_action = np.zeros(replay_buffer_size, dtype=np.int32)
    replay_reward =np.zeros(replay_buffer_size, dtype=np.float32)
    replay_done = np.zeros(replay_buffer_size, dtype=np.float32)

    # update op for target network
    main_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='evaluation_network')
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_network')
    assign_ops = [tf.assign(target_var, main_var) for target_var, main_var in zip(target_vars, main_vars)]
    target_update_op = tf.group(*assign_ops)

    # define train optimizer_operation
    optimizer_operation = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    # init session
    session = tf.InteractiveSession()
    session.run(tf.global_variables_initializer())

    logger.setup_tf_saver(session, inputs={'x': obs_ph}, outputs={'q': eval_net})


    current_index = replay_buffer_size - 1
    #reset train data
    epoch, step, training_finished, epsilon = 0, 0, False, 1
    #reset epoch data
    epoch_rews, epoch_lens, epoch_losses,  epoch_qs = [], [], [], []
    #reset episodic data
    obs, reward, done, ep_rews, ep_len, episode_num, end_of_epoch = env.reset(), 0, False, 0, 0, 0, False
    last_number_steps = 0 
    while not training_finished:
        step += 1
        # select an action with an epsilon-greedy policy
        selected_action = 0
        if np.random.rand() < epsilon :
            #exploration
            selected_action = np.random.randint(number_of_actions)
        else:
            #exploitation
            estimated_q = session.run(eval_net, feed_dict={obs_ph: obs.reshape(1,-1)})
            selected_action = np.argmax(estimated_q)
        
        # perform one step in the gym environment
        #  receive observation, reward, and whether the episode has ended
        prev_obs = obs
        obs, reward, done, _ = env.step(selected_action)

        # store the full transition (s, a, r, s', done) in the replay buffer
        # (indices start at 0 because step is incremented before this point)
        current_index = (step - 1) % replay_buffer_size
        replay_current_obs[current_index] = prev_obs
        replay_next_obs[current_index] = obs
        replay_selected_action[current_index] = selected_action
        replay_reward[current_index] = reward
        replay_done[current_index] = done

        ep_rews += reward
        ep_len += 1

        if done:
            episode_num += 1
            #save episodic data 
            epoch_rews.append(ep_rews)
            epoch_lens.append(ep_len)
            #reset episodic data
            obs, reward, done, ep_rews, ep_len, end_of_epoch = env.reset(), 0, False, 0, 0, episode_num % episodes_per_epoch == 0
         
            
        # do not train for the first `steps_before_training` steps - the replay buffer is still too small
        if step > steps_before_training:
            # single train iteration
            #get data from replay
            trained_indices = np.random.randint(min(replay_buffer_size, step), size = batch_size)
            trained_observation = replay_current_obs[trained_indices]
            trained_next_observation = replay_next_obs[trained_indices]
            trained_selected_action = replay_selected_action[trained_indices]
            trained_reward = replay_reward[trained_indices]
            trained_done = replay_done[trained_indices]


            #if (step % save_freq == 0) or (step >= total_number_of_steps - 1):
            #    logger.save_state({'env': env}, None)
            # train eval network
            step_loss, curr_q, _  = session.run([loss, q_a, optimizer_operation], feed_dict={obs_ph: trained_observation, 
                                                                     obs_target_ph: trained_next_observation, 
                                                                     selected_action_ph: trained_selected_action,
                                                                     reward_ph: trained_reward,
                                                                     done_ph: trained_done})

            #just for logging
            epoch_losses.append(step_loss)
            epoch_qs.append(curr_q)
            
            
        
            if end_of_epoch:
                logger.save_state({'env': env}, None)
                # update target network
                session.run(target_update_op)
                
                epoch += 1
                training_finished = epoch >= num_of_train_epochs
                
                #test epoch
                ep_rets, ep_lens = [], []
                for _ in range(n_test_eps):
                    obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    while not(done):
                        #env.render()
                        estimated_q = session.run(eval_net, feed_dict={obs_ph: obs.reshape(1,-1)})
                        selected_action = np.argmax(estimated_q)
                        obs, rew, done, _ = env.step(selected_action)
                        ep_ret += rew
                        ep_len += 1
                    ep_rets.append(ep_ret)
                    ep_lens.append(ep_len)
                
                test_ep_ret =  np.mean(ep_rets)
                test_ep_len =  np.mean(ep_lens)

                obs, rew, done, ep_ret, ep_len, end_of_epoch = env.reset(), 0, False, 0, 0, False

                # log epoch results
                logger.log_tabular('Epoch', epoch)
                logger.log_tabular('TotalEnvInteracts', step - last_number_steps)
                logger.log_tabular('loss', np.mean(epoch_losses))
                logger.log_tabular('AverageEpRet', test_ep_ret)
                logger.log_tabular('AverageEpLen', test_ep_len)
                logger.dump_tabular()

        
                epoch_rews, epoch_lens, epoch_losses,  epoch_qs, last_number_steps= [], [], [], [], step
                
            
                #adapt epsilon
                epsilon = 1 + (final_epsilon - 1)*min(1, step/finish_decay)
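                # epsilon is annealed linearly from 1.0 down to final_epsilon over
                # the first `finish_decay` environment steps and then held constant.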
Example #14
0
def my_ddpg(env_fn, seed=0, steps_per_epoch=4000, epochs=100, max_ep_len=1000,
        hidden_sizes=[256,256], 
        logger_kwargs=dict(), save_freq=1,
        batch_size=100, start_steps=10000,
        update_after=1000, update_every=50, num_test_episodes=10,
        gamma=0.99, polyak=0.995, act_noise=0.1,
        pi_lr=1e-3, q_lr=1e-3, buffer_size=int(1e6)):
    """
    My DDPG implementation
    """

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    test_env = env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print("env.observation_space", env.observation_space)
    print("env.observation_space.shape", env.observation_space.shape)
    print("env.action_space", env.action_space)
    action_min = env.action_space.low[0]
    action_max = env.action_space.high[0]
    if isinstance(env.action_space, gym.spaces.Discrete):
        print("Discrete action space not supported for my-ddpg!")
        return

    # Set up experience buffer
    buf = ReplayBuffer(obs_dim, act_dim, buffer_size)

    # Instantiate models
    assert action_max == abs(action_min)
    policy = DeterministicPolicyNet(obs_dim, act_dim, hidden_sizes, action_max)
    policy_target = copy.deepcopy(policy)
    policy_optimizer = torch.optim.Adam(policy.mu_net.parameters(), lr=pi_lr)

    q_function = QNet(obs_dim, act_dim, hidden_sizes)
    q_function_target = copy.deepcopy(q_function)
    q_optimizer = torch.optim.Adam(q_function.q_net.parameters(), lr=q_lr)

    # Set up model saving
    logger.setup_pytorch_saver(policy)
    # TODO: Save value network as well

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p_targ in policy_target.parameters():
        p_targ.requires_grad = False
    for q_targ in q_function_target.parameters():
        q_targ.requires_grad = False

    # Prepare for interaction with environment
    num_steps = epochs * steps_per_epoch
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for step in range(num_steps): # TODO: Change to for loop over range(epochs) and range(steps_per_epoch)

        with torch.no_grad():
            if step < start_steps:
                # Until start_steps have elapsed, randomly sample actions
                # from a uniform distribution for better exploration. Afterwards, 
                # use the learned policy (with some noise, via act_noise). 
                a = env.action_space.sample()
            else:
                assert o.shape == (obs_dim,)
                a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0))
                assert a.shape == (1, act_dim)
                a = a[0] # Remove batch dimension
                a = torch.clamp(a + act_noise * torch.randn(act_dim), action_min, action_max) # Add exploration noise
                a = a.numpy() # Convert to numpy

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            d = False if ep_len==max_ep_len else d

            buf.store(o, a, r, next_o, d)
            
            # Update obs (critical!)
            o = next_o

            # Trajectory finished
            if d or (ep_len == max_ep_len):
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        if step >= update_after and step % update_every == 0:
            for _ in range(update_every):

                def update():
                    o, a, r, next_o, d = buf.sample_batch(batch_size)

                    # Compute targets
                    with torch.no_grad():
                        next_a_targ = policy_target(next_o)
                        next_q_targ = q_function_target(next_o, next_a_targ)
                        q_targ = r + gamma * (1 - d) * next_q_targ

                    # Update Q function
                    q_optimizer.zero_grad()
                    q_loss = ((q_function(o, a) - q_targ)**2).mean()
                    q_loss.backward()
                    q_optimizer.step()

                    # Freeze Q-network so you don't waste computational effort 
                    # computing gradients for it during the policy learning step.
                    for p in q_function.parameters():
                        p.requires_grad = False

                    # Policy function update
                    policy_optimizer.zero_grad()
                    policy_loss = -(q_function(o, policy(o))).mean()
                    policy_loss.backward()
                    policy_optimizer.step()

                    # Unfreeze Q-network so you can optimize it at next DDPG step.
                    for p in q_function.parameters():
                        p.requires_grad = True

                    # Update target networks with polyak
                    with torch.no_grad():
                        for p, p_targ in zip(policy.parameters(), policy_target.parameters()):
                            p_targ.data.mul_(polyak)
                            p_targ.data.add_((1 - polyak) * p.data)
                        for q, q_targ in zip(q_function.parameters(), q_function_target.parameters()):
                            q_targ.data.mul_(polyak)
                            q_targ.data.add_((1 - polyak) * q.data)
                update()
                
        if (step + 1) % steps_per_epoch == 0:
            epoch = (step + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            def test_agent():
                with torch.no_grad():
                    for j in range(num_test_episodes):
                        o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
                        while not(d or (ep_len == max_ep_len)):
                            # Take deterministic actions at test time
                            a = policy(torch.tensor(o, dtype=torch.float32).unsqueeze(0))
                            assert a.shape == (1, act_dim)
                            a = a[0] # Remove batch dimension
                            a = a.numpy() # Convert to numpy
                            o, r, d, _ = test_env.step(a)
                            ep_ret += r
                            ep_len += 1
                        logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', step)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
def ppo(env_fn,
        expert=None,
        policy_path=None,
        actor_critic=core.mlp_actor_critic_m,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=10000,
        dagger_epochs=500,
        pretrain_epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=1e-4,
        dagger_noise=0.01,
        batch_size=64,
        replay_size=int(5e3),
        vf_lr=1e-4,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.999,
        max_ep_len=500,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10,
        test_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while 
            still profiting (improving the objective function)? The new policy 
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take 
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used 
            for early stopping. (Usually small, 0.01 or 0.05.)

        policy_path (str): path of pretrained policy model
            train from scratch if None

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
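
        expert: Expert / reference controller that is queried through
            ``call_ref_controller`` to label visited states with expert
            actions during the DAgger phase.

        dagger_epochs (int): Number of DAgger pretraining epochs to run
            before the main PPO training loop.

        pretrain_epochs (int): During the first ``pretrain_epochs`` of the
            DAgger phase the executed actions come directly from the expert
            controller instead of the learned policy.

        dagger_noise (float): Scale of the Gaussian exploration noise
            (relative to the action-space high limit) added to the policy
            mean while collecting DAgger data.

        batch_size (int): Minibatch size for the DAgger supervised updates.

        replay_size (int): Maximum size of the DAgger replay buffer.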

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    test_logger_kwargs = dict()
    test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'],
                                                "test")
    test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    test_logger = EpochLogger(**test_logger_kwargs)
    test_logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    act_high_limit = env.action_space.high
    act_low_limit = env.action_space.low

    sess = tf.Session()
    if policy_path is None:
        # Inputs to computation graph
        x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                                   env.action_space)
        adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)
        tfa_ph = core.placeholder(act_dim)

        # Main outputs from computation graph
        mu, pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
        sess.run(tf.global_variables_initializer())

    else:
        # load pretrained model
        # sess, x_ph, a_ph, mu, pi, logp, logp_pi, v = load_policy(policy_path, itr='last', deterministic=False, act_high=env.action_space.high)
        # # get_action_2 = lambda x : sess.run(mu, feed_dict={x_ph: x[None,:]})[0]
        # adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)
        model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save'))
        x_ph, a_ph, adv_ph, ret_ph, logp_old_ph = model['x_ph'], model[
            'a_ph'], model['adv_ph'], model['ret_ph'], model['logp_old_ph']
        mu, pi, logp, logp_pi, v = model['mu'], model['pi'], model[
            'logp'], model['logp_pi'], model['v']
        # tfa_ph = core.placeholder(act_dim)
        tfa_ph = model['tfa_ph']

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    print("---------------", local_steps_per_epoch)
    buf = PPOBuffer(obs_dim, act_dim, steps_per_epoch, gamma, lam)
    # print(obs_dim)
    # print(act_dim)
    dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim[0],
                                              act_dim=act_dim[0],
                                              size=replay_size)
    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    if policy_path is None:
        ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
        min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                           (1 - clip_ratio) * adv_ph)
        pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
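        # PPO-Clip surrogate in the "min_adv" form: min(ratio * A, clip(ratio, 1-eps, 1+eps) * A)
        # reduces to min(ratio * A, min_adv), since only the bound selected by the sign of A
        # can ever be the smaller term.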
        v_loss = tf.reduce_mean((ret_ph - v)**2)
        dagger_pi_loss = tf.reduce_mean(tf.square(mu - tfa_ph))
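        # DAgger objective: behavior cloning via mean-squared error between the
        # policy mean mu(s) and the expert action fed through tfa_ph.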

        # Info (useful to watch during learning)
        approx_kl = tf.reduce_mean(
            logp_old_ph -
            logp)  # a sample estimate for KL-divergence, easy to compute
        approx_ent = tf.reduce_mean(
            -logp)  # a sample estimate for entropy, also easy to compute
        clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio <
                                (1 - clip_ratio))
        clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

        # Optimizers
        dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
        optimizer_pi = tf.train.AdamOptimizer(learning_rate=pi_lr)
        optimizer_v = tf.train.AdamOptimizer(learning_rate=vf_lr)
        train_dagger_pi_op = dagger_pi_optimizer.minimize(
            dagger_pi_loss, name='train_dagger_pi_op')
        train_pi = optimizer_pi.minimize(pi_loss, name='train_pi_op')
        train_v = optimizer_v.minimize(v_loss, name='train_v_op')

        sess.run(tf.variables_initializer(optimizer_pi.variables()))
        sess.run(tf.variables_initializer(optimizer_v.variables()))
        sess.run(tf.variables_initializer(dagger_pi_optimizer.variables()))
    else:
        graph = tf.get_default_graph()
        dagger_pi_loss = model['dagger_pi_loss']
        pi_loss = model['pi_loss']
        v_loss = model['v_loss']
        approx_ent = model['approx_ent']
        approx_kl = model['approx_kl']
        clipfrac = model['clipfrac']

        train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op')
        train_pi = graph.get_operation_by_name('train_pi_op')
        train_v = graph.get_operation_by_name('train_v_op')
    # sess = tf.Session()
    # sess.run(tf.global_variables_initializer())

    # Sync params across processes
    # sess.run(sync_all_params())

    tf.summary.FileWriter("log/", sess.graph)
    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'adv_ph': adv_ph, 'ret_ph': ret_ph, 'logp_old_ph': logp_old_ph}, \
        outputs={'mu': mu, 'pi': pi, 'v': v, 'logp': logp, 'logp_pi': logp_pi, 'clipfrac': clipfrac, 'approx_kl': approx_kl, \
            'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'approx_ent': approx_ent})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
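            # Early-stopping heuristic from Spinning Up's PPO: halt the policy
            # updates once the sampled KL exceeds 1.5 * target_kl.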
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    def choose_action(s, add_noise=False):
        s = s[np.newaxis, :]
        a = sess.run(mu, {x_ph: s})[0]
        if add_noise:
            noise = dagger_noise * act_high_limit * np.random.normal(
                size=a.shape)
            a = a + noise
        return np.clip(a, act_low_limit, act_high_limit)

    def test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, info = env.step(choose_action(np.array(o), 0))
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(
                        arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        # time.sleep(10)
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    def ref_test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                a = call_ref_controller(env, expert)
                o, r, d, info = env.step(a)
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(
                        arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    ref_test_agent(test_num=-1)
    test_logger.log_tabular('epoch', -1)
    test_logger.log_tabular('TestEpRet', average_only=True)
    test_logger.log_tabular('TestEpLen', average_only=True)
    test_logger.log_tabular('arrive_des', average_only=True)
    test_logger.log_tabular('arrive_des_appro', average_only=True)
    test_logger.log_tabular('converge_dis', average_only=True)
    test_logger.log_tabular('out_of_range', average_only=True)
    test_logger.dump_tabular()

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    test_policy_epochs = 91
    episode_steps = 500
    total_env_t = 0
    test_num = 0
    print(colorize("begin dagger training", 'green', bold=True))
    for epoch in range(1, dagger_epochs + 1, 1):
        # test policy
        if (epoch > 0 and epoch % save_freq == 0) or (epoch == epochs):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('arrive_des_appro', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

        # train policy
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        env.unwrapped._set_test_mode(False)
        obs, acs, rewards = [], [], []
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            # a = get_action_2(np.array(o))
            # save and log
            obs.append(o)
            ref_action = call_ref_controller(env, expert)
            if (epoch < pretrain_epochs):
                action = ref_action
            else:
                action = choose_action(np.array(o), True)

            buf.store(o, action, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(action)
            acs.append(ref_action)
            rewards.append(r)

            ep_ret += r
            ep_len += 1
            total_env_t += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Perform the DAgger and partial PPO update!
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        # pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        max_step = len(np.array(rewards))
        dagger_replay_buffer.stores(obs, acs, rewards)
        for _ in range(int(local_steps_per_epoch / 10)):
            batch = dagger_replay_buffer.sample_batch(batch_size)
            feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']}
            q_step_ops = [dagger_pi_loss, train_dagger_pi_op]
            for j in range(10):
                outs = sess.run(q_step_ops, feed_dict)
            logger.store(LossPi=outs[0])

        c_v_loss = sess.run(v_loss, feed_dict=inputs)
        logger.store(LossV=c_v_loss,
                     KL=0,
                     Entropy=0,
                     ClipFrac=0,
                     DeltaLossPi=0,
                     DeltaLossV=0,
                     StopIter=0)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()

    # Main loop: collect experience in env and update/log each epoch
    print(colorize("begin ppo training", 'green', bold=True))
    for epoch in range(1, epochs + 1, 1):
        # test policy
        if (epoch > 0 and epoch % save_freq == 0) or (epoch == epochs) or (epoch == 1):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('arrive_des_appro', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

        # train policy
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        env.unwrapped._set_test_mode(False)
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(
                get_action_ops, feed_dict={x_ph: np.array(o).reshape(1, -1)})
            # a = a[0]
            # a = get_action_2(np.array(o))
            # a = np.clip(a, act_low_limit, act_high_limit)
            # if epoch < pretrain_epochs:
            #     a = env.action_space.sample()
            # a = np.clip(a, act_low_limit, act_high_limit)
            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: np.array(o).reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
def sac_pytorch(
        env_fn,
        hidden_sizes=[256, 256],
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=3e-4,
        alpha=0.2,
        batch_size=256,
        start_steps=10000,
        max_ep_len=1000,
        save_freq=1,
        dont_save=True,
        regularization_weight=0,
        grad_clip=-1,
        logger_kwargs=dict(),
):
    """
    Largely follows the OpenAI Spinning Up documentation, but differs slightly
    from the TensorFlow implementation.
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        hidden_sizes (list): Sizes of the hidden layers. The number of entries
            is the number of hidden layers, and each entry gives the size of
            that layer. Applies to all networks.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch. Note that the "epoch"
            here is only a logging epoch: every this many steps, diagnostics are
            written to stdout and to the output file. Not to be confused with the
            training epochs used, with various meanings, elsewhere in the literature.

        epochs (int): Number of epochs to run and train the agent. Usage of this
            term differs between algorithms, so use caution; here, every epoch
            simply produces a new set of logs.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running the real policy. Helps exploration. During testing,
            however, the actions always come from the policy.

        max_ep_len (int): Maximum length of trajectory / episode / rollout. The
            environment is reset if the timestep within an episode exceeds this number.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        logger_kwargs (dict): Keyword args for EpochLogger.

    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("running on device:", device)
    """set up logger"""
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env, test_env = env_fn(), env_fn()

    ## seed torch and numpy
    torch.manual_seed(seed)
    np.random.seed(seed)

    ## seed environment along with env action space so that everything about env is seeded
    env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.seed(seed)
    test_env.action_space.np_random.seed(seed)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # if environment has a smaller max episode length, then use the environment's max episode length
    max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    # we need .item() to convert it from numpy float to python float
    act_limit = env.action_space.high[0].item()

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    def test_agent(n=5):
        """
        Test the agent's performance by running n episodes.
        During these runs the agent only takes deterministic actions: the
        actions are not drawn from a distribution, but are just the mean.
        :param n: number of episodes to run the agent
        """
        ep_return_list = np.zeros(n)
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a = policy_net.get_env_action(o, deterministic=True)
                o, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
            ep_return_list[j] = ep_ret
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    """init all networks"""
    # see line 1
    policy_net = TanhGaussianPolicy(obs_dim,
                                    act_dim,
                                    hidden_sizes,
                                    action_limit=act_limit).to(device)
    value_net = Mlp(obs_dim, 1, hidden_sizes).to(device)
    target_value_net = Mlp(obs_dim, 1, hidden_sizes).to(device)
    q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes).to(device)
    q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes).to(device)
    # see line 2: copy parameters from value_net to target_value_net
    target_value_net.load_state_dict(value_net.state_dict())

    # set up optimizers
    policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    value_optimizer = optim.Adam(value_net.parameters(), lr=lr)
    q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr)
    q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr)

    # mean squared error loss for v and q networks
    mse_criterion = nn.MSELoss()

    # Main loop: collect experience in env and update/log each epoch
    # NOTE: t here is the current number of total timesteps used
    # it is not the number of timesteps passed in the current episode
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = policy_net.get_env_action(o, deterministic=False)
        else:
            a = env.action_space.sample()

        # Step the env, get next observation, reward and done signal
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience (observation, action, reward, next observation, done) to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            Quoted from the original SAC paper: 'In practice, we take a single environment step
            followed by one or several gradient step' after a single environment step,
            the number of gradient steps is 1 for SAC. (see paper for reference)
            """
            for j in range(ep_len):
                # get data from replay buffer
                batch = replay_buffer.sample_batch(batch_size)
                obs_tensor = Tensor(batch['obs1']).to(device)
                obs_next_tensor = Tensor(batch['obs2']).to(device)
                acts_tensor = Tensor(batch['acts']).to(device)
                # unsqueeze makes the reward and done tensors shape (n, 1) instead of (n,),
                # to avoid broadcasting problems later
                rews_tensor = Tensor(batch['rews']).unsqueeze(1).to(device)
                done_tensor = Tensor(batch['done']).unsqueeze(1).to(device)
                """
                now we do a SAC update, following the OpenAI spinup doc
                check the openai sac document psudocode part for reference
                line nubmers indicate lines in psudocode part
                we will first compute each of the losses
                and then update all the networks in the end
                """
                # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer)
                a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward(
                    obs_tensor)
                """get q loss"""
                # see line 12: first equation
                v_from_target_v_net = target_value_net(obs_next_tensor)
                y_q = rews_tensor + gamma * (1 -
                                             done_tensor) * v_from_target_v_net
                # see line 13: compute loss for the 2 q networks, note that we want to detach the y_q value
                # since we only want to update q networks here, and don't want other gradients
                q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1))
                q1_loss = mse_criterion(q1_prediction, y_q.detach())
                q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1))
                q2_loss = mse_criterion(q2_prediction, y_q.detach())
                """get v loss"""
                # see line 12: second equation
                q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1))
                q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1))
                min_q1_q2_a_tilda = torch.min(
                    torch.cat([q1_a_tilda, q2_a_tilda], 1),
                    1)[0].reshape(-1, 1)
                y_v = min_q1_q2_a_tilda - alpha * log_prob_a_tilda
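                # Value target: V(s) regresses toward min(Q1, Q2)(s, a~) - alpha * log pi(a~|s),
                # where a~ is freshly sampled from the current policy (not taken from the buffer).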

                # see line 14: compute loss for value network
                v_prediction = value_net(obs_tensor)
                v_loss = mse_criterion(v_prediction, y_v.detach())
                """policy loss"""
                # line 15: note that here we are doing gradient ascent, so we add a minus sign in the front
                policy_loss = -(q1_a_tilda - alpha * log_prob_a_tilda).mean()
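                # Equivalent to maximizing E[ Q1(s, a~) - alpha * log pi(a~|s) ],
                # i.e. the entropy-regularized policy objective.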
                """
                add policy regularization loss, this is not in openai's minimal version, but
                they are in the original sac code, see https://github.com/vitchyr/rlkit for reference
                this part is not necessary but might improve performance
                """
                if regularization_weight > 0:
                    policy_mean_reg_weight = regularization_weight
                    policy_std_reg_weight = regularization_weight
                    mean_reg_loss = policy_mean_reg_weight * (mean_a_tilda**
                                                              2).mean()
                    std_reg_loss = policy_std_reg_weight * (log_std_a_tilda**
                                                            2).mean()
                    policy_loss = policy_loss + mean_reg_loss + std_reg_loss
                """update networks"""
                q1_optimizer.zero_grad()
                q1_loss.backward()
                if grad_clip > 0:
                    nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip)
                q1_optimizer.step()

                q2_optimizer.zero_grad()
                q2_loss.backward()
                if grad_clip > 0:
                    nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip)
                q2_optimizer.step()

                value_optimizer.zero_grad()
                v_loss.backward()
                if grad_clip > 0:
                    nn.utils.clip_grad_norm_(value_net.parameters(), grad_clip)
                value_optimizer.step()

                policy_optimizer.zero_grad()
                policy_loss.backward()
                if grad_clip > 0:
                    nn.utils.clip_grad_norm_(policy_net.parameters(),
                                             grad_clip)
                policy_optimizer.step()

                # see line 16: update target value network with value network
                soft_update_model1_with_model2(target_value_net, value_net,
                                               polyak)

                # store diagnostic info to logger
                logger.store(LossPi=policy_loss.cpu().item(),
                             LossQ1=q1_loss.cpu().item(),
                             LossQ2=q2_loss.cpu().item(),
                             LossV=v_loss.cpu().item(),
                             Q1Vals=q1_prediction.detach().cpu().numpy(),
                             Q2Vals=q2_prediction.detach().cpu().numpy(),
                             VVals=v_prediction.detach().cpu().numpy(),
                             LogPi=log_prob_a_tilda.detach().cpu().numpy())

            ## store episode return and length to logger
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            ## reset environment
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            """
            Save pytorch model, very different from tensorflow version
            We need to save the environment, the state_dict of each network
            and also the state_dict of each optimizer
            """
            if not dont_save:
                sac_state_dict = {
                    'env': env,
                    'policy_net': policy_net.state_dict(),
                    'value_net': value_net.state_dict(),
                    'target_value_net': target_value_net.state_dict(),
                    'q1_net': q1_net.state_dict(),
                    'q2_net': q2_net.state_dict(),
                    'policy_opt': policy_optimizer,
                    'value_opt': value_optimizer,
                    'q1_opt': q1_optimizer,
                    'q2_opt': q2_optimizer
                }
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    logger.save_state(sac_state_dict, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Example #17
0
def sac(env_fn,
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=10000,
        update_after=1000,
        update_every=50,
        num_test_episodes=10,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):
    """
    Soft Actor-Critic (SAC)


    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: The constructor method for a PyTorch Module with an ``act`` 
            method, a ``pi`` module, a ``q1`` module, and a ``q2`` module.
            The ``act`` method and ``pi`` module should accept batches of 
            observations as inputs, and ``q1`` and ``q2`` should accept a batch 
            of observations and a batch of actions as inputs. When called, 
            ``act``, ``q1``, and ``q2`` should return:

            ===========  ================  ======================================
            Call         Output Shape      Description
            ===========  ================  ======================================
            ``act``      (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``q1``       (batch,)          | Tensor containing one current estimate
                                           | of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ``q2``       (batch,)          | Tensor containing the other current 
                                           | estimate of Q* for the provided observations
                                           | and actions. (Critical: make sure to
                                           | flatten this!)
            ===========  ================  ======================================

            Calling ``pi`` should return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Tensor containing actions from policy
                                           | given observations.
            ``logp_pi``  (batch,)          | Tensor containing log probabilities of
                                           | actions in ``a``. Importantly: gradients
                                           | should be able to flow back into ``a``.
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object 
            you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        update_after (int): Number of env interactions to collect before
            starting to do gradient descent updates. Ensures replay buffer
            is full enough for useful updates.

        update_every (int): Number of env interactions that should elapse
            between gradient descent updates. Note: Regardless of how long 
            you wait between updates, the ratio of env steps to gradient steps 
            is locked to 1.

        num_test_episodes (int): Number of episodes to test the deterministic
            policy at the end of each epoch.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Create actor-critic module and target networks
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs)
    ac_targ = deepcopy(ac)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in ac_targ.parameters():
        p.requires_grad = False

    # List of parameters for both Q-networks (save this for convenience)
    q_params = itertools.chain(ac.q1.parameters(), ac.q2.parameters())

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables (protip: try to get a feel for how different size networks behave!)
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.q1, ac.q2])
    logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
               var_counts)

    # Set up function for computing SAC Q-losses
    def compute_loss_q(data):
        o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[
            'obs2'], data['done']

        q1 = ac.q1(o, a)
        q2 = ac.q2(o, a)

        # Bellman backup for Q functions
        with torch.no_grad():
            # Target actions come from *current* policy
            a2, logp_a2 = ac.pi(o2)

            # Target Q-values
            q1_pi_targ = ac_targ.q1(o2, a2)
            q2_pi_targ = ac_targ.q2(o2, a2)
            q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ)
            backup = r + gamma * (1 - d) * (q_pi_targ - alpha * logp_a2)
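            # Entropy-regularized Bellman backup:
            # y = r + gamma * (1 - d) * (min(Q1_targ, Q2_targ)(s', a') - alpha * log pi(a'|s')),
            # with a' sampled from the *current* policy at s'.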

        # MSE loss against Bellman backup
        loss_q1 = ((q1 - backup)**2).mean()
        loss_q2 = ((q2 - backup)**2).mean()
        loss_q = loss_q1 + loss_q2

        # Useful info for logging
        q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy())

        return loss_q, q_info

    # Set up function for computing SAC pi loss
    def compute_loss_pi(data):
        o = data['obs']
        pi, logp_pi = ac.pi(o)
        q1_pi = ac.q1(o, pi)
        q2_pi = ac.q2(o, pi)
        q_pi = torch.min(q1_pi, q2_pi)

        # Entropy-regularized policy loss
        loss_pi = (alpha * logp_pi - q_pi).mean()
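        # Minimizing alpha * log pi - min(Q1, Q2) is gradient ascent on the
        # entropy-regularized objective E[ min(Q1, Q2)(s, a~) - alpha * log pi(a~|s) ].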

        # Useful info for logging
        pi_info = dict(LogPi=logp_pi.detach().numpy())

        return loss_pi, pi_info

    # Set up optimizers for policy and q-function
    pi_optimizer = Adam(ac.pi.parameters(), lr=lr)
    q_optimizer = Adam(q_params, lr=lr)

    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update(data):
        # First run one gradient descent step for Q1 and Q2
        q_optimizer.zero_grad()
        loss_q, q_info = compute_loss_q(data)
        loss_q.backward()
        q_optimizer.step()

        # Record things
        logger.store(LossQ=loss_q.item(), **q_info)

        # Freeze Q-networks so you don't waste computational effort
        # computing gradients for them during the policy learning step.
        for p in q_params:
            p.requires_grad = False

        # Next run one gradient descent step for pi.
        pi_optimizer.zero_grad()
        loss_pi, pi_info = compute_loss_pi(data)
        loss_pi.backward()
        pi_optimizer.step()

        # Unfreeze Q-networks so you can optimize them at the next SAC step.
        for p in q_params:
            p.requires_grad = True

        # Record things
        logger.store(LossPi=loss_pi.item(), **pi_info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)

    def get_action(o, deterministic=False):
        return ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            success, goalDist, reachDist = False, None, None
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, info = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if 'success' in info:
                    success = info['success'] or success
                if 'goalDist' in info and info['goalDist'] is not None:
                    goalDist = info['goalDist']
                if 'reachDist' in info and info['reachDist'] is not None:
                    reachDist = info['reachDist']
            if goalDist is not None:
                logger.store(TestGoalDist=goalDist)
            if reachDist is not None:
                logger.store(TestReachDist=reachDist)
            logger.store(TestEpRet=ep_ret,
                         TestEpLen=ep_len,
                         TestSuccess=success)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions
        # from a uniform distribution for better exploration. Afterwards,
        # use the learned policy.
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after and t % update_every == 0:
            for j in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                update(data=batch)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            if 'TestGoalDist' in logger.epoch_dict:
                logger.log_tabular('TestGoalDist', with_min_and_max=True)
            if 'TestReachDist' in logger.epoch_dict:
                logger.log_tabular('TestReachDist', with_min_and_max=True)
            if 'TestSuccess' in logger.epoch_dict:
                logger.log_tabular('TestSuccess', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
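
# A minimal usage sketch for sac(): it assumes gym is available, and the
# environment id, hyperparameters, and output paths below are illustrative
# choices rather than values taken from the original experiment.
if __name__ == '__main__':
    import gym

    sac(lambda: gym.make('Pendulum-v0'),
        actor_critic=core.MLPActorCritic,
        ac_kwargs=dict(hidden_sizes=(256, 256)),
        gamma=0.99, lr=1e-3, alpha=0.2, epochs=50,
        logger_kwargs=dict(output_dir='data/sac_pendulum',
                           exp_name='sac_pendulum'))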
Example #18
0
def pretrain(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0,
                steps_per_epoch=4000, epochs=50, pi_epochs=100, vf_epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4,
                vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000,
                target_kl=0.01, logger_kwargs=dict(), save_freq=10, demo_file=""):

    setup_pytorch_for_mpi()
    logger = EpochLogger(**logger_kwargs)
    # locals() return all local variable
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # demo environment
    demo_env = DemoGymEnv(demo_file=demo_file, seed=seed)
    demo_env.check_env(env)
    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs).to(device)

    sync_params(ac)
    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)
    vf_pi_optimizer = Adam(ac.v_pi.parameters(), lr=vf_lr)

    logger.setup_pytorch_saver(ac)
    def compute_loss_v(data):
        obs, ret = Variable(data['obs']), Variable(data['ret'])
        return ((ac.v(obs) - ret)**2).mean()
    def compute_loss_v_pi(data):
        obs, ret = Variable(data['obs']), Variable(data['ret'])
        return ((ac.v_pi(obs) - ret)**2).mean()
    def demo_update():
        data = buf.get()
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()
        pi_info, loss_pi, loss_v = {}, 0, 0
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                # logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()
        logger.store(StopIter=i)
        for i in range(train_v_iters):
            vf_pi_optimizer.zero_grad()
            loss_v = compute_loss_v_pi(data)
            loss_v.backward()
            mpi_avg_grads(ac.v_pi)
            vf_pi_optimizer.step()
        print("Pi loss:     {}".format(pi_l_old))
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old, LossV=v_l_old,
                     KL=kl, Entropy=ent, ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    def compute_loss_pi(data):
        obs, act, adv, logp_old = Variable(data['obs']), Variable(data['act']), Variable(data['adv']), Variable(data['logp'])

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
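        # Standard PPO-Clip surrogate: take the elementwise minimum of the unclipped
        # and clipped ratio-weighted advantages, then negate it for gradient descent.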

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1+clip_ratio) | ratio.lt(1-clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32, device=device).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    # pretraining epochs
    # demonstration training: main loop, for policy network
    o, ep_ret, ep_len = demo_env.reset(), 0, 0
    start_time = time.time()
    for epoch in range(pi_epochs):

        pi_old_data = [deepcopy(p.data) for p in ac.pi.parameters()]
        vf_old_data = [deepcopy(p.data) for p in ac.v.parameters()]
        vf_pi_old_data = [deepcopy(p.data) for p in ac.v_pi.parameters()]
        for t in range(local_steps_per_epoch):
            a, v, logp_a, m, std = ac.pretrain_step(torch.as_tensor(o, dtype=torch.float32, device=device))
            next_o, r, d, _ = demo_env.step(a, std)
            ep_ret += r
            ep_len += 1

            buf.store(o, a, r, v, logp_a, std=std)
            logger.store(VVals=v)
            o = next_o
            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1
            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _, _, _ = ac.pretrain_step(torch.as_tensor(o, dtype=torch.float32, device=device))
                else:
                    v = 0
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                buf.finish_path(v)
                o, ep_ret, ep_len = demo_env.reset(), 0, 0


        # Save model
        # if (epoch % save_freq == 0) or (epoch == pi_epochs-1):
        if (epoch in SAVE_FREQ) or (epoch == pi_epochs - 1):
            save_pi(logger_kwargs.get('output_dir', "model"), itr=epoch, paramenters=ac.pi)
            logger.save_state({'env': env}, None)
        demo_update()
        delta_v, delta_v_pi, delta_pi = 0, 0, 0
        for i, param in enumerate(ac.v_pi.parameters()):
            delta_v_pi += torch.norm(param.data - vf_pi_old_data[i])
        for i, param in enumerate(ac.v.parameters()):
            delta_v += torch.norm(param.data - vf_old_data[i])
        for i, param in enumerate(ac.pi.parameters()):
            delta_pi += torch.norm(param.data - pi_old_data[i])
        print("delta v_pi: {}; delta vf: {}; delta pi: {}".format(delta_v_pi, delta_v, delta_pi))
        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
    logger.save_state({'env': env}, pi_epochs)
    def update_vf():
        data = buf.get()
        v_l_old = compute_loss_v(data).item()
        print("Loss for Value function: {}".format(v_l_old))
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)
            vf_optimizer.step()
    # for the value function pre-training
    o, ep_ret, ep_len = demo_env.reset(), 0, 0
    start_time = time.time()
    for epoch in range(vf_epochs):
        pi_old_data = [deepcopy(p.data) for p in ac.pi.parameters()]
        vf_old_data = [deepcopy(p.data) for p in ac.v.parameters()]
        vf_pi_old_data = [deepcopy(p.data) for p in ac.v_pi.parameters()]
        for t in range(local_steps_per_epoch):
            next_o, r, d, _, a = demo_env.free_step()
            v = ac.v(torch.as_tensor(o, dtype=torch.float32, device=device)).cpu().detach().numpy()
            ep_ret += r
            ep_len += 1
            buf.store(o, a, r, v, 1)
            # logger.store(VVals=v)
            o = next_o
            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1
            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    v = ac.v(torch.as_tensor(o, dtype=torch.float32, device=device)).cpu().detach().numpy()
                else:
                    v = 0
                buf.finish_path(v)
                o, ep_ret, ep_len = demo_env.reset(), 0, 0
        print("Pretraining for value function at Epoch: {}".format(epoch))
        update_vf()
        delta_v, delta_v_pi, delta_pi = 0, 0, 0
        for i, param in enumerate(ac.v_pi.parameters()):
            delta_v_pi += torch.norm(param.data - vf_pi_old_data[i])
        for i, param in enumerate(ac.v.parameters()):
            delta_v += torch.norm(param.data - vf_old_data[i])
        for i, param in enumerate(ac.pi.parameters()):
            delta_pi += torch.norm(param.data - pi_old_data[i])
        print("delta v_pi: {}; delta vf: {}; delta pi: {}".format(delta_v_pi, delta_v, delta_pi))
        if (epoch in SAVE_FREQ) or (epoch == vf_epochs - 1):
            save_vf(logger_kwargs.get('output_dir', "model"), itr=epoch, paramenters=ac.v)
            logger.save_state({'env': env}, None)
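
The pretraining loops above cut trajectories off at epoch boundaries and hand a bootstrap value to buf.finish_path(v). As a minimal sketch (assuming a plain list of rewards and a scalar bootstrap value, not the actual ACDFBuffer internals), the discounted reward-to-go computation behind that call looks like this:

import numpy as np

def discounted_returns(rews, last_val=0.0, gamma=0.99):
    # Append the bootstrap value, then accumulate discounted sums backwards.
    rews = np.append(np.asarray(rews, dtype=np.float32), last_val)
    rets = np.zeros_like(rews)
    running = 0.0
    for t in reversed(range(len(rews))):
        running = rews[t] + gamma * running
        rets[t] = running
    return rets[:-1]  # drop the entry that belongs to the bootstrap value itself

# e.g. a 3-step trajectory cut off by the epoch, bootstrapped with V(s_T) = 2.0
print(discounted_returns([1.0, 1.0, 1.0], last_val=2.0))
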
Example #19
0
def ddpg(env_fn,
         actor_critic=core.ActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A reference to the ActorCritic class which, after instantiation,
            takes state, ``x``, and action, ``a``, and returns:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x`` and actions in
                                           | ``a``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x``:
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    # https://pytorch.org/docs/master/notes/randomness.html#cudnn
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    actor_critic_main = actor_critic(obs_dim, **ac_kwargs).to(device)
    # Note that the action input going to the target actor_critic here
    # is irrelevant, because we only need q_targ(s, pi_targ(s)).
    actor_critic_target = actor_critic(obs_dim, **ac_kwargs).to(device)

    # Count variables
    var_counts = tuple(
        core.count_vars(model) for model in
        [actor_critic_main.policy, actor_critic_main.q, actor_critic_main])
    logger.log('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
               var_counts)

    # Optimizers
    pi_optimizer = optim.Adam(actor_critic_main.policy.parameters(), lr=pi_lr)
    q_optimizer = optim.Adam(actor_critic_main.q.parameters(), lr=q_lr)

    def get_action(o, noise_scale):
        a = actor_critic_main(Tensor(o.reshape(1, -1)).to(device))
        a = a.cpu().detach().numpy()
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                x, x2, a, r, d = [
                    Tensor(batch[k]).to(device)
                    for k in ['obs1', 'obs2', 'acts', 'rews', 'done']
                ]

                _, q, q_pi = actor_critic_main(x, a)
                _, _, q_pi_targ = actor_critic_target(x2, a)

                # Bellman backup for Q function
                backup = (r + gamma * (1 - d) * q_pi_targ).detach()

                # DDPG losses
                pi_loss = -q_pi.mean()
                q_loss = ((q - backup)**2).mean()

                # Q-learning update
                q_optimizer.zero_grad()
                q_loss.backward()
                q_optimizer.step()
                logger.store(LossQ=q_loss.item(), QVals=q.cpu().detach().numpy())

                # Policy update
                pi_optimizer.zero_grad()
                pi_loss.backward()
                pi_optimizer.step()
                logger.store(LossPi=pi_loss.item())

                # Polyak averaging for target variables
                # Credits: https://github.com/ghliu/pytorch-ddpg/blob/master/util.py
                params = zip(actor_critic_target.parameters(),
                             actor_critic_main.parameters())
                for ac_target, ac_main in params:
                    ac_target.data.copy_(ac_main.data * (1.0 - polyak) +
                                         ac_target.data * polyak)

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, actor_critic_main, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
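
A hedged usage sketch for the ddpg function above; the environment name, epoch count, and logger settings are illustrative assumptions, not values taken from this example:

import gym

if __name__ == '__main__':
    # Pendulum-v0 is just an illustrative continuous-action Gym task.
    ddpg(lambda: gym.make('Pendulum-v0'),
         epochs=10,
         logger_kwargs=dict(output_dir='data/ddpg_demo', exp_name='ddpg_demo'))
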
def td3(env_fn,
        expert=None,
        policy_path=None,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=500,
        epochs=1000,
        replay_size=int(5e3),
        gamma=0.99,
        polyak=0.995,
        pi_lr=1e-4,
        q_lr=1e-4,
        batch_size=64,
        start_epochs=500,
        dagger_epochs=500,
        pretrain_epochs=50,
        dagger_noise=0.02,
        act_noise=0.02,
        target_noise=0.02,
        noise_clip=0.5,
        policy_delay=2,
        max_ep_len=500,
        logger_kwargs=dict(),
        save_freq=50,
        UPDATE_STEP=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to TD3.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_epochs (int): Number of epochs of uniform-random action selection,
            before running the learned policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        target_noise (float): Stddev for smoothing noise added to target 
            policy.

        noise_clip (float): Limit for absolute value of target policy 
            smoothing noise.

        policy_delay (int): Policy will only be updated once every 
            policy_delay times for each update of the Q-networks.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    test_logger_kwargs = dict()
    test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'],
                                                "test")
    test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    test_logger = EpochLogger(**test_logger_kwargs)

    # test_logger_kwargs = dict()
    # test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test")
    # test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    # test_logger = EpochLogger(**test_logger_kwargs)

    # pretrain_logger_kwargs = dict()
    # pretrain_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "pretrain")
    # pretrain_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    # pretrain_logger = EpochLogger(**pretrain_logger_kwargs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, this does not assume all dimensions share the same bound!
    act_limit = env.action_space.high / 2
    act_high_limit = env.action_space.high
    act_low_limit = env.action_space.low

    act_noise_limit = act_noise * act_limit
    sess = tf.Session()
    if policy_path is None:
        # Share information about action space with policy architecture
        ac_kwargs['action_space'] = env.action_space

        # Inputs to computation graph
        x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(
            obs_dim, act_dim, obs_dim, None, None)
        tfa_ph = core.placeholder(act_dim)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

        # Target policy network
        with tf.variable_scope('target'):
            pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

        # Target Q networks
        with tf.variable_scope('target', reuse=True):

            # Target policy smoothing, by adding clipped noise to target actions
            epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
            epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
            a2 = pi_targ + epsilon
            a2 = tf.clip_by_value(a2, act_low_limit, act_high_limit)

            # Target Q-values, using action from target policy
            _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    else:
        # sess = tf.Session()
        model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save'))
        x_ph, a_ph, x2_ph, r_ph, d_ph = model['x_ph'], model['a_ph'], model[
            'x2_ph'], model['r_ph'], model['d_ph']
        pi, q1, q2, q1_pi = model['pi'], model['q1'], model['q2'], model[
            'q1_pi']
        pi_targ, q1_targ, q2_targ = model['pi_targ'], model['q1_targ'], model[
            'q2_targ']
        tfa_ph = core.placeholder(act_dim)
        dagger_epochs = 0

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)
    dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim,
                                              act_dim=act_dim,
                                              size=replay_size)
    # Count variables
    var_counts = tuple(
        core.count_vars(scope)
        for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(
        '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
        % var_counts)

    if policy_path is None:
        # Bellman backup for Q functions, using Clipped Double-Q targets
        min_q_targ = tf.minimum(q1_targ, q2_targ)
        backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

        # dagger loss
        dagger_pi_loss = tf.reduce_mean(tf.square(pi - tfa_ph))
        # TD3 losses
        pi_loss = -tf.reduce_mean(q1_pi)
        q1_loss = tf.reduce_mean((q1 - backup)**2)
        q2_loss = tf.reduce_mean((q2 - backup)**2)
        q_loss = tf.add(q1_loss, q2_loss)
        pi_loss = tf.identity(pi_loss, name="pi_loss")
        q1_loss = tf.identity(q1_loss, name="q1_loss")
        q2_loss = tf.identity(q2_loss, name="q2_loss")
        q_loss = tf.identity(q_loss, name="q_loss")

        # Separate train ops for pi, q
        dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
        pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
        q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
        train_dagger_pi_op = dagger_pi_optimizer.minimize(
            dagger_pi_loss,
            var_list=get_vars('main/pi'),
            name='train_dagger_pi_op')
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars('main/pi'),
                                            name='train_pi_op')
        train_q_op = q_optimizer.minimize(q_loss,
                                          var_list=get_vars('main/q'),
                                          name='train_q_op')

        # Polyak averaging for target variables
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        # Initializing targets to match main variables
        target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])
        sess.run(tf.global_variables_initializer())
    else:
        graph = tf.get_default_graph()
        # opts = graph.get_operations()
        # print (opts)
        pi_loss = model['pi_loss']
        q1_loss = model['q1_loss']
        q2_loss = model['q2_loss']
        q_loss = model['q_loss']
        train_q_op = graph.get_operation_by_name('train_q_op')
        train_pi_op = graph.get_operation_by_name('train_pi_op')
        # target_update = graph.get_operation_by_name('target_update')
        # target_init = graph.get_operation_by_name('target_init')
        # Polyak averaging for target variables
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        # Initializing targets to match main variables
        target_init = tf.group([
            tf.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph}, \
         outputs={'pi': pi, 'q1': q1, 'q2': q2, 'q1_pi': q1_pi, 'pi_targ': pi_targ, 'q1_targ': q1_targ, 'q2_targ': q2_targ, \
             'pi_loss': pi_loss, 'q1_loss': q1_loss, 'q2_loss': q2_loss, 'q_loss': q_loss})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        # todo: add act_limit scale noise
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, act_low_limit, act_high_limit)

    def choose_action(s, add_noise=False):
        s = s[np.newaxis, :]
        a = sess.run(pi, {x_ph: s})[0]
        if add_noise:
            noise = dagger_noise * act_high_limit * np.random.normal(
                size=a.shape)
            a = a + noise
        return np.clip(a, act_low_limit, act_high_limit)

    def test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, info = env.step(choose_action(np.array(o), 0))
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(
                        arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        # time.sleep(10)
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    start_time = time.time()
    env.unwrapped._set_test_mode(False)
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    test_num = 0

    total_env_t = 0
    print(colorize("begin dagger training", 'green', bold=True))
    # Main loop for dagger pretrain
    for epoch in range(1, dagger_epochs + 1, 1):
        obs, acs, rewards = [], [], []
        # number of timesteps
        for t in range(steps_per_epoch):
            # action = env.action_space.sample()
            # action = ppo.choose_action(np.array(observation))
            obs.append(o)
            ref_action = call_ref_controller(env, expert)
            if (epoch < pretrain_epochs):
                action = ref_action
            else:
                action = choose_action(np.array(o), True)

            o2, r, d, info = env.step(action)
            ep_ret += r
            ep_len += 1
            total_env_t += 1

            acs.append(ref_action)
            rewards.append(r)
            # Store experience to replay buffer
            replay_buffer.store(o, action, r, o2, d)

            o = o2

            if (t == steps_per_epoch - 1):
                # print ("reached the end")
                d = True

            if d:
                # collected data to replaybuffer
                max_step = len(np.array(rewards))
                q = [
                    np.sum(
                        np.power(gamma, np.arange(max_step - t)) * rewards[t:])
                    for t in range(max_step)
                ]
                dagger_replay_buffer.stores(obs, acs, rewards, q)

                # update policy
                for _ in range(int(max_step / 5)):
                    batch = dagger_replay_buffer.sample_batch(batch_size)
                    feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']}
                    q_step_ops = [dagger_pi_loss, train_dagger_pi_op]
                    for j in range(UPDATE_STEP):
                        outs = sess.run(q_step_ops, feed_dict)
                    logger.store(LossPi=outs[0])

                # train q function
                for j in range(int(max_step / 5)):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }
                    q_step_ops = [q_loss, q1, q2, train_q_op]
                    # for _ in range(UPDATE_STEP):
                    outs = sess.run(q_step_ops, feed_dict)
                    logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                    if j % policy_delay == 0:
                        # Delayed target update
                        outs = sess.run([target_update], feed_dict)
                        # logger.store(LossPi=outs[0])

                # logger.store(LossQ=1000000, Q1Vals=1000000, Q2Vals=1000000)
                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                break

        # End of epoch wrap-up
        if epoch > 0 and (epoch % save_freq == 0) or (epoch == dagger_epochs):
            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            # Log info about epoch
            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

    sess.run(target_init)
    print(colorize("begin td3 training", 'green', bold=True))
    # Main loop: collect experience in env and update/log each epoch
    # total_env_t = 0
    for epoch in range(1, epochs + 1, 1):

        # End of epoch wrap-up
        if epoch > 0 and (epoch % save_freq == 0) or (epoch == epochs):

            # Save model
            logger.save_state({}, None)

            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)

            # Log info about epoch
            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        # o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        for t in range(steps_per_epoch):
            if epoch > start_epochs:
                a = get_action(np.array(o), act_noise_limit)
            else:
                a = env.action_space.sample()
                # ref_action = call_ref_controller(env, expert)

            # Step the env
            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            total_env_t += 1

            # Ignore the "done" signal if it comes from hitting the time
            # horizon (that is, when it's an artificial terminal signal
            # that isn't based on the agent's state)
            # d = False if ep_len==max_ep_len else d

            # Store experience to replay buffer
            replay_buffer.store(o, a, r, o2, d)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            o = o2

            if (t == steps_per_epoch - 1):
                # print ("reached the end")
                d = True

            if d:
                """
                Perform all TD3 updates at the end of the trajectory
                (in accordance with source code of TD3 published by
                original authors).
                """
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {
                        x_ph: batch['obs1'],
                        x2_ph: batch['obs2'],
                        a_ph: batch['acts'],
                        r_ph: batch['rews'],
                        d_ph: batch['done']
                    }
                    q_step_ops = [q_loss, q1, q2, train_q_op]
                    # for _ in range(UPDATE_STEP):
                    outs = sess.run(q_step_ops, feed_dict)
                    logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                    if j % policy_delay == 0:
                        # Delayed policy update
                        outs = sess.run([pi_loss, train_pi_op, target_update],
                                        feed_dict)
                        logger.store(LossPi=outs[0])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                break
Example #21
0
def acdf(env_fn,
         actor_critic=core.MLPActorCritic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=4000,
         epochs=50,
         pi_epochs=100,
         vf_epochs=50,
         gamma=0.99,
         clip_ratio=0.2,
         pi_lr=3e-4,
         vf_lr=1e-3,
         train_pi_iters=80,
         train_v_iters=80,
         lam=0.97,
         max_ep_len=1000,
         target_kl=0.01,
         logger_kwargs=dict(),
         save_freq=10,
         demo_file=""):

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Random seed
    seed += 10000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # demo environment
    demo_env = DemoGymEnv(demo_file=demo_file, seed=seed)
    demo_env.check_env(env)
    # Create actor-critic module
    ac = actor_critic(env.observation_space, env.action_space,
                      **ac_kwargs).to(device)

    # Sync params across processes
    sync_params(ac)
    # Count variables
    var_counts = tuple(
        core.count_vars(module) for module in [ac.pi, ac.v, ac.v_pi])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d and v_pi: %d\n' %
               var_counts)

    # Set up experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Set up function for computing PPO policy loss
    def compute_loss_pi(data):
        obs, act, adv, logp_old = Variable(data['obs']), Variable(
            data['act']), Variable(data['adv']), Variable(data['logp'])

        # Policy loss
        pi, logp = ac.pi(obs, act)
        ratio = torch.exp(logp - logp_old)
        clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()

        # Useful extra info
        approx_kl = (logp_old - logp).mean().item()
        ent = pi.entropy().mean().item()
        clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio)
        clipfrac = torch.as_tensor(clipped, dtype=torch.float32,
                                   device=device).mean().item()
        pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac)

        return loss_pi, pi_info

    # Set up function for computing value loss
    def compute_loss_v(data):
        obs, ret = Variable(data['obs']), Variable(data['ret'])
        return ((ac.v(obs) - ret)**2).mean()

    def compute_loss_v_pi(data):
        obs, ret = Variable(data['obs']), Variable(data['ret'])
        return ((ac.v_pi(obs) - ret)**2).mean()

    # Set up optimizers for policy and value function
    pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr)
    vf_pi_optimizer = Adam(ac.v_pi.parameters(), lr=vf_lr)
    # Set up model saving
    logger.setup_pytorch_saver(ac)

    def update():
        data = buf.get()

        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v(data).item()

        # Train policy with multiple steps of gradient descent
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()

        logger.store(StopIter=i)

        # Value function learning
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)  # average grads across MPI processes
            vf_optimizer.step()

        # Log changes from update
        kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(loss_pi.item() - pi_l_old),
                     DeltaLossV=(loss_v.item() - v_l_old))

    def demo_update():
        data = buf.get()
        pi_l_old, pi_info_old = compute_loss_pi(data)
        pi_l_old = pi_l_old.item()
        v_l_old = compute_loss_v_pi(data).item()
        for i in range(train_pi_iters):
            pi_optimizer.zero_grad()
            loss_pi, pi_info = compute_loss_pi(data)
            kl = mpi_avg(pi_info['kl'])
            if kl > 1.5 * target_kl:
                # logger.log('Early stopping at step %d due to reaching max kl.' % i)
                break
            loss_pi.backward()
            mpi_avg_grads(ac.pi)  # average grads across MPI processes
            pi_optimizer.step()
        # logger.store(StopIter=i)
        for i in range(train_v_iters):
            vf_pi_optimizer.zero_grad()
            loss_v = compute_loss_v_pi(data)
            loss_v.backward()
            mpi_avg_grads(ac.v_pi)
            vf_pi_optimizer.step()
        print("Pi loss:     {}".format(pi_l_old))
        # kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf']
        # logger.store(LossPi=pi_l_old, LossV=v_l_old,
        #              KL=kl, Entropy=ent, ClipFrac=cf,
        #              DeltaLossPi=(loss_pi.item() - pi_l_old),
        #              DeltaLossV=(loss_v.item() - v_l_old))

    def update_vf():
        data = buf.get()
        v_l_old = compute_loss_v(data).item()
        print("Loss for Value function: {}".format(v_l_old))
        for i in range(train_v_iters):
            vf_optimizer.zero_grad()
            loss_v = compute_loss_v(data)
            loss_v.backward()
            mpi_avg_grads(ac.v)
            vf_optimizer.step()

    # pretraining epochs
    # pi_epochs, vf_epochs = 100, 50

    # demonstration training: main loop, for policy network
    o, ep_ret, ep_len = demo_env.reset(), 0, 0
    start_time = time.time()
    for epoch in range(pi_epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp_a, m, std = ac.pretrain_step(
                torch.as_tensor(o, dtype=torch.float32, device=device))
            next_o, r, d, _ = demo_env.step(a, std)
            ep_ret += r
            ep_len += 1

            buf.store(o, a, r, v, logp_a, std=std)
            # logger.store(VVals=v)
            o = next_o
            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1
            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _, _, _ = ac.pretrain_step(
                        torch.as_tensor(o, dtype=torch.float32, device=device))
                else:
                    v = 0
                # if terminal:
                #     # only save EpRet / EpLen if trajectory finished
                #     # logger.store(EpRet=ep_ret, EpLen=ep_len)
                buf.finish_path(v)
                o, ep_ret, ep_len = demo_env.reset(), 0, 0
        demo_update()
        # # Log info about epoch
        # logger.log_tabular('Epoch', epoch)
        # logger.log_tabular('EpRet', with_min_and_max=True)
        # logger.log_tabular('EpLen', average_only=True)
        # logger.log_tabular('VVals', with_min_and_max=True)
        # logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        # logger.log_tabular('LossPi', average_only=True)
        # logger.log_tabular('LossV', average_only=True)
        # logger.log_tabular('DeltaLossPi', average_only=True)
        # logger.log_tabular('DeltaLossV', average_only=True)
        # logger.log_tabular('Entropy', average_only=True)
        # logger.log_tabular('KL', average_only=True)
        # logger.log_tabular('ClipFrac', average_only=True)
        # logger.log_tabular('StopIter', average_only=True)
        # logger.log_tabular('Time', time.time() - start_time)
        # logger.dump_tabular()

    # for the value function pre-training
    o, ep_ret, ep_len = demo_env.reset(), 0, 0
    start_time = time.time()
    for epoch in range(vf_epochs):
        for t in range(local_steps_per_epoch):
            next_o, r, d, _, a = demo_env.free_step()
            v = ac.v(torch.as_tensor(o, dtype=torch.float32,
                                     device=device)).cpu().detach().numpy()
            ep_ret += r
            ep_len += 1
            buf.store(o, a, r, v, 1)
            # logger.store(VVals=v)
            o = next_o
            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1
            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    v = ac.v(
                        torch.as_tensor(o, dtype=torch.float32,
                                        device=device)).cpu().detach().numpy()
                else:
                    v = 0
                buf.finish_path(v)
                o, ep_ret, ep_len = demo_env.reset(), 0, 0
        print("Pretraining for value function at Epoch: {}".format(epoch))
        update_vf()

    # Prepare for interaction with environment
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0
    buf = ACDFBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)
    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(
                torch.as_tensor(o, dtype=torch.float32, device=device))

            next_o, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1

            # save and log
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # Update obs (critical!)
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1

            if terminal or epoch_ended:
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(
                        torch.as_tensor(o, dtype=torch.float32, device=device))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
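
A minimal PyTorch sketch of the PPO clipped surrogate objective that compute_loss_pi implements in the example above; the toy batch below is illustrative only:

import torch

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    # min(ratio * A, clip(ratio, 1-eps, 1+eps) * A), negated so it can be minimized.
    ratio = torch.exp(logp - logp_old)
    clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv
    return -(torch.min(ratio * adv, clip_adv)).mean()

# Toy batch: the first action became twice as likely (ratio 2.0) with positive
# advantage, so its contribution is clipped at 1 + clip_ratio.
logp_old = torch.log(torch.tensor([0.2, 0.5]))
logp = torch.log(torch.tensor([0.4, 0.4]))
adv = torch.tensor([1.0, -1.0])
print(ppo_clip_loss(logp, logp_old, adv))  # tensor(-0.2000)
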
Example #22
0
def sac(env,
        logger_kwargs=dict(),
        network_params=dict(),
        rl_params=dict(),
        resume_training=False,
        resume_params=dict()):

    logger = EpochLogger(**logger_kwargs)

    if not resume_training:
        save_vars = locals().copy()
        save_vars.pop('env')
        logger.save_config(save_vars)

    # ==== control params ====
    seed = rl_params['seed']
    epochs = rl_params['epochs']
    steps_per_epoch = rl_params['steps_per_epoch']
    replay_size = rl_params['replay_size']
    update_freq = rl_params['update_freq']
    n_updates = rl_params['n_updates']
    batch_size = rl_params['batch_size']
    start_steps = rl_params['start_steps']
    max_ep_len = rl_params['max_ep_len']
    num_tests = rl_params['num_tests']
    save_freq = rl_params['save_freq']

    # ==== rl params ====
    use_HER = rl_params['use_HER']
    use_prev_a = rl_params['use_prev_a']
    gamma = rl_params['gamma']
    polyak = rl_params['polyak']
    act_lr = rl_params['act_lr']
    crit_lr = rl_params['crit_lr']
    alph_lr = rl_params['alph_lr']

    # ==== exploration params ====
    alpha = rl_params['alpha']
    target_entropy = rl_params['target_entropy']

    if not resume_training:
        sess = tf.compat.v1.Session(config=tf_config)

    # set seeding (still not perfectly deterministic)
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.np_random.seed(seed)

    # get required gym spaces
    obs = env.observation_space
    act = env.action_space

    # get the obs size after resize of raw image
    obs_dim = network_params['input_dims']
    act_dim = env.action_space.shape[0]
    act_low = env.action_space.low[0]
    act_high = env.action_space.high[0]
    goal_dim = len(env.goal_list)

    if not resume_training:
        # init a state buffer for storing last m states
        train_state_buffer = StateBuffer(m=obs_dim[2])
        test_state_buffer = StateBuffer(m=obs_dim[2])

        # Experience buffer
        replay_buffer = ContReplayBuffer(obs_dim=obs_dim,
                                         act_dim=act_dim,
                                         goal_dim=goal_dim,
                                         size=replay_size)

        # Inputs to computation graph
        x_ph, a_ph, prev_a_ph, x2_ph, r_ph, d_ph, g_ph = placeholders(
            obs_dim, act_dim, act_dim, obs_dim, None, None, goal_dim)

        # alpha Params
        if target_entropy == 'auto':
            target_entropy = tf.cast(-act_dim, tf.float32)
        else:
            target_entropy = tf.cast(target_entropy, tf.float32)

        log_alpha = tf.get_variable('log_alpha',
                                    dtype=tf.float32,
                                    initializer=0.0)

        if alpha == 'auto':  # auto tune alpha
            alpha = tf.exp(log_alpha)
        else:  # fixed alpha
            alpha = tf.get_variable('alpha',
                                    dtype=tf.float32,
                                    initializer=alpha)

        # Main outputs from computation graph
        with tf.variable_scope('main'):
            mu, pi, logp_pi, q1_a, q2_a = create_rl_networks(
                x_ph, a_ph, use_prev_a, prev_a_ph, g_ph, act_high,
                network_params)

        with tf.variable_scope('main', reuse=True):
            # compose q with pi, for pi-learning
            _, _, _, q1_pi, q2_pi = create_rl_networks(x_ph, pi, use_prev_a,
                                                       prev_a_ph, g_ph,
                                                       act_high,
                                                       network_params)

            # get actions and log probs of actions for next states, for Q-learning
            _, pi_next, logp_pi_next, _, _ = create_rl_networks(
                x2_ph, a_ph, use_prev_a, prev_a_ph, g_ph, act_high,
                network_params)

        # Target networks
        with tf.variable_scope('target'):
            _, _, _, q1_pi_targ, q2_pi_targ = create_rl_networks(
                x2_ph, pi_next, use_prev_a, a_ph, g_ph, act_high,
                network_params)

        var_counts = tuple(
            count_vars(scope) for scope in
            ['log_alpha', 'main/pi', 'main/q1', 'main/q2', 'main'])
        print("""\nNumber of parameters:
                 alpha: %d,
                 pi: %d,
                 q1: %d,
                 q2: %d,
                 total: %d\n""" % var_counts)

        # Min Double-Q:
        min_q_pi = tf.minimum(q1_pi, q2_pi)
        min_q_pi_targ = tf.minimum(q1_pi_targ, q2_pi_targ)

        # Targets for Q and V regression
        q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) *
                                    (min_q_pi_targ - alpha * logp_pi_next))

        # critic losses
        q1_loss = 0.5 * tf.reduce_mean((q_backup - q1_a)**2)
        q2_loss = 0.5 * tf.reduce_mean((q_backup - q2_a)**2)
        value_loss = q1_loss + q2_loss

        # Soft actor losses
        pi_loss = tf.reduce_mean(alpha * logp_pi - min_q_pi)

        # alpha loss for temperature parameter
        alpha_backup = tf.stop_gradient(logp_pi + target_entropy)
        alpha_loss = -tf.reduce_mean((log_alpha * alpha_backup))

        # Policy train op
        # (has to be separate from value train op, because q1_pi appears in pi_loss)
        pi_optimizer = tf.train.AdamOptimizer(learning_rate=act_lr,
                                              epsilon=1e-04)
        train_pi_op = pi_optimizer.minimize(pi_loss,
                                            var_list=get_vars('main/pi'),
                                            name='train_pi_op')

        # Value train op
        value_optimizer = tf.train.AdamOptimizer(learning_rate=crit_lr,
                                                 epsilon=1e-04)
        with tf.control_dependencies([train_pi_op]):
            train_value_op = value_optimizer.minimize(
                value_loss, var_list=get_vars('main/q'), name='train_value_op')

        # Alpha train op
        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=alph_lr,
                                                 epsilon=1e-04)
        with tf.control_dependencies([train_value_op]):
            train_alpha_op = alpha_optimizer.minimize(
                alpha_loss,
                var_list=get_vars('log_alpha'),
                name='train_alpha_op')

        # Polyak averaging for target variables
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ],
                                 name='target_update')

        # Initializing targets to match main variables
        target_init = tf.group([
            tf.compat.v1.assign(v_targ, v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(target_init)

    else:
        # if resuming define all the ph and outputs from saved model
        # inputs
        x_ph = resume_params['model']['x_ph']
        a_ph = resume_params['model']['a_ph']
        prev_a_ph = resume_params['model']['prev_a_ph']
        x2_ph = resume_params['model']['x2_ph']
        r_ph = resume_params['model']['r_ph']
        d_ph = resume_params['model']['d_ph']
        g_ph = resume_params['model']['g_ph']

        # outputs
        mu = resume_params['model']['mu']
        pi = resume_params['model']['pi']
        pi_loss = resume_params['model']['pi_loss']
        q1_loss = resume_params['model']['q1_loss']
        q2_loss = resume_params['model']['q2_loss']
        q1_a = resume_params['model']['q1_a']
        q2_a = resume_params['model']['q2_a']
        logp_pi = resume_params['model']['logp_pi']
        target_entropy = resume_params['model']['target_entropy']
        alpha_loss = resume_params['model']['alpha_loss']
        alpha = resume_params['model']['alpha']

        # buffers
        replay_buffer = resume_params['resume_state']['replay_buffer']
        train_state_buffer = resume_params['resume_state'][
            'train_state_buffer']
        test_state_buffer = resume_params['resume_state']['test_state_buffer']

        # get needed operations from graph by name (trouble saving these)
        train_pi_op = tf.get_default_graph().get_operation_by_name(
            "train_pi_op")
        train_value_op = tf.get_default_graph().get_operation_by_name(
            "train_value_op")
        train_alpha_op = tf.get_default_graph().get_operation_by_name(
            "train_alpha_op")
        target_update = tf.get_default_graph().get_operation_by_name(
            "target_update")

        sess = resume_params['sess']

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, q1_a, q2_a, logp_pi, target_entropy,
        alpha_loss, alpha, train_pi_op, train_value_op, train_alpha_op,
        target_update
    ]

    # Setup model saving
    if save_freq is not None:
        logger.setup_tf_saver(sess,
                              inputs={
                                  'x_ph': x_ph,
                                  'a_ph': a_ph,
                                  'prev_a_ph': prev_a_ph,
                                  'x2_ph': x2_ph,
                                  'r_ph': r_ph,
                                  'd_ph': d_ph,
                                  'g_ph': g_ph
                              },
                              outputs={
                                  'mu': mu,
                                  'pi': pi,
                                  'pi_loss': pi_loss,
                                  'q1_loss': q1_loss,
                                  'q2_loss': q2_loss,
                                  'q1_a': q1_a,
                                  'q2_a': q2_a,
                                  'logp_pi': logp_pi,
                                  'target_entropy': target_entropy,
                                  'alpha_loss': alpha_loss,
                                  'alpha': alpha
                              })

    def get_action(state, one_hot_goal, prev_a, deterministic=False):
        state = state.astype('float32') / 255.
        act_op = mu if deterministic else pi
        a = sess.run(act_op,
                     feed_dict={
                         x_ph: [state],
                         g_ph: [one_hot_goal],
                         prev_a_ph: [prev_a]
                     })[0]
        return a

    def reset(state_buffer):
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        o = process_image_observation(o, obs_dim)
        r = process_reward(r)
        state = state_buffer.init_state(init_obs=o)
        prev_a = np.zeros(act_dim)

        # new random goal when the env is reset
        goal_id = np.random.randint(goal_dim)
        one_hot_goal = np.eye(goal_dim)[goal_id]
        goal = env.goal_list[goal_id]
        env.goal_button = goal
        # print('Goal Button: {}'.format(goal))

        return o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a

    def test_agent(n=1):
        print('Testing...')
        for j in range(n):
            test_o, test_r, test_d, test_ep_ret, test_ep_len, test_state, test_one_hot_goal, test_prev_a = reset(
                test_state_buffer)

            while not (test_d or (test_ep_len == max_ep_len)):

                test_a = get_action(test_state, test_one_hot_goal, test_prev_a,
                                    True)

                test_o, test_r, test_d, _ = env.step(test_a)
                test_o = process_image_observation(test_o, obs_dim)
                test_r = process_reward(test_r)
                test_state = test_state_buffer.append_state(test_o)

                test_ep_ret += test_r
                test_ep_len += 1

                test_prev_a = test_a

            logger.store(TestEpRet=test_ep_ret, TestEpLen=test_ep_len)

    # ================== Main training Loop  ==================
    if not resume_training:
        start_time = time.time()
        o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset(
            train_state_buffer)

        total_steps = steps_per_epoch * epochs
        resume_t = 0

        # array for storing states used with HER
        if use_HER:
            HER_buffer = ContHERBuffer(obs_dim=obs_dim,
                                       act_dim=act_dim,
                                       goal_dim=goal_dim,
                                       size=max_ep_len)

    # resuming training
    else:
        start_time = time.time()
        total_steps = steps_per_epoch * (epochs +
                                         resume_params['additional_epochs'])
        HER_buffer = resume_params['resume_state']['HER_buffer']
        resume_t = resume_params['resume_state']['resume_t']
        o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = resume_params[
            'resume_state']['rl_state']

        # reset the environment to the state set before saving
        env.set_env_state(resume_params['resume_state']['env_state'])

    # Main loop: collect experience in env and update/log each epoch
    for t in range(resume_t, total_steps):

        if t > start_steps:
            a = get_action(state, one_hot_goal, prev_a, False)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        o2 = process_image_observation(o2, obs_dim)  # thresholding done in env
        r = process_reward(r)

        next_state = train_state_buffer.append_state(o2)

        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        # if life is lost then store done as true
        replay_buffer.store(state, a, prev_a, r, next_state, d, one_hot_goal)

        # append to HER buffer
        if use_HER:
            HER_buffer.store(state, a, prev_a, r, next_state, d, one_hot_goal)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        state = next_state
        prev_a = a

        # store additional states in replay buffer where the goal
        # is given by the final state, if the final state was incorrect
        if use_HER:
            if d and (ep_len != max_ep_len):

                # get actual goal achieved
                achieved_goal = np.eye(goal_dim)[env.goal_list.index(
                    env.latest_button)]

                # if an incorrect goal was reached
                if (achieved_goal != one_hot_goal).any():

                    for j in range(ep_len):
                        # pull data from HER buffer
                        sample = HER_buffer.sample(j)

                        # change this to calc_rew function in env
                        if j == ep_len - 1:
                            new_rew = env.max_rew
                        else:
                            new_rew = sample['rews']

                        # add to replay buffer
                        replay_buffer.store(sample['obs1'], sample['acts'],
                                            sample['prev_acts'], new_rew,
                                            sample['obs2'], sample['done'],
                                            achieved_goal)

        # do a single update
        if t > 0 and t % update_freq == 0:
            for i in range(n_updates):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    prev_a_ph: batch['prev_acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    g_ph: batch['goal']
                }

                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             Q1Vals=outs[3],
                             Q2Vals=outs[4],
                             LogPPi=outs[5],
                             TargEntropy=outs[6],
                             LossAlpha=outs[7],
                             Alpha=outs[8])

        if d or (ep_len == max_ep_len):
            # store episode values
            logger.store(EpRet=ep_ret, EpLen=ep_len)

            # reset the environment
            o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a = reset(
                train_state_buffer)

            if use_HER:
                # reset HER buffer
                HER_buffer.reset()

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:

            epoch = t // steps_per_epoch

            # save everything necessary for restarting training from the current position
            env_state = env.get_env_state()

            # Save model
            if save_freq is not None:
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    print('Saving...')
                    rl_state = [
                        o, r, d, ep_ret, ep_len, state, one_hot_goal, prev_a
                    ]
                    logger.save_state(
                        state_dict={
                            'env_state': env_state,
                            'replay_buffer': replay_buffer,
                            'train_state_buffer': train_state_buffer,
                            'test_state_buffer': test_state_buffer,
                            'HER_buffer': HER_buffer,
                            'resume_t': t + 1,
                            'rl_state': rl_state
                        })

            # Test the performance of the deterministic version of the agent. (resets the env)
            test_agent(n=num_tests)

            # set params for resuming training
            env.set_env_state(env_state)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LogPPi', average_only=True)
            logger.log_tabular('TargEntropy', average_only=True)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

    plot_progress(os.path.join(logger_kwargs['output_dir'], 'progress.txt'),
                  show_plot=False)
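
The use_HER branch above implements hindsight relabeling: when an episode ends at the wrong button, its transitions are re-stored with the goal swapped for the goal that was actually achieved, and the final step is given the maximum reward. A minimal sketch of that relabeling step, using a hypothetical Transition namedtuple instead of the ContHERBuffer/ReplayBuffer classes from the example:

import numpy as np
from collections import namedtuple

# Hypothetical transition container; the example above uses ContHERBuffer instead.
Transition = namedtuple('Transition', 'obs act rew next_obs done goal')

def relabel_episode(episode, achieved_goal, max_rew):
    """Return a copy of the episode with every goal replaced by the achieved goal.

    The final transition is given the maximum reward (it did reach
    achieved_goal); earlier transitions keep their original rewards,
    mirroring the loop in the example above.
    """
    relabeled = []
    for j, tr in enumerate(episode):
        new_rew = max_rew if j == len(episode) - 1 else tr.rew
        relabeled.append(tr._replace(rew=new_rew, goal=achieved_goal.copy()))
    return relabeled

# Tiny usage example with one-hot goals over 3 buttons.
goal_dim = 3
intended = np.eye(goal_dim)[0]   # the agent was asked to press button 0
achieved = np.eye(goal_dim)[2]   # it actually pressed button 2
episode = [Transition(obs=None, act=None, rew=-1.0, next_obs=None,
                      done=(j == 4), goal=intended) for j in range(5)]
if (achieved != intended).any():                 # same check as in the example
    extra = relabel_episode(episode, achieved, max_rew=1.0)
    # extra can now be stored in the replay buffer alongside the original episode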
Example #23
0
def ddpg(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=0.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Deterministically computes actions
                                           | from policy given states.
            ``q``        (batch,)          | Gives the current estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q_pi``     (batch,)          | Gives the composition of ``q`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to DDPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        pi_lr (float): Learning rate for policy.

        q_lr (float): Learning rate for Q-networks.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        act_noise (float): Stddev for Gaussian exploration noise added to 
            policy at training time. (At test time, no noise is added.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        # Note that the action placeholder going to actor_critic here is
        # irrelevant, because we only need q_targ(s, pi_targ(s)).
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' %
          var_counts)

    # Bellman backup for Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # DDPG losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Separate train ops for pi, q
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr)
    q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))
    train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q'))

    # Polyak averaging for target variables
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'pi': pi,
                              'q': q
                          })

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        if d or (ep_len == max_ep_len):
            """
            Perform all DDPG updates at the end of the trajectory,
            in accordance with tuning done by TD3 paper authors.
            """
            for _ in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
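
The DDPG example above (like the SAC example before it) maintains target networks by Polyak averaging, theta_targ <- rho*theta_targ + (1-rho)*theta, built as a tf.group of tf.assign ops. A small NumPy sketch of the same arithmetic, with made-up parameter arrays standing in for get_vars('main') / get_vars('target'):

import numpy as np

def polyak_update(target_params, main_params, polyak=0.995):
    """In-place soft update: theta_targ <- polyak*theta_targ + (1-polyak)*theta."""
    for p_targ, p_main in zip(target_params, main_params):
        p_targ *= polyak
        p_targ += (1.0 - polyak) * p_main

# Made-up parameter arrays standing in for the main and target network variables.
main = [np.ones((2, 2)), np.full(3, 2.0)]
target = [np.zeros((2, 2)), np.zeros(3)]

for _ in range(10):                      # ten "training steps"
    polyak_update(target, main)

# Each target entry has moved the fraction 1 - 0.995**10 ~= 0.049 of the way
# towards the corresponding main value.
print(target[0][0, 0], target[1][0])     # ~0.0489 and ~0.0978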
Example #24
0
def sqn(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
        polyak=0.995, lr=1e-3, alpha=0.2, batch_size=100, start_steps=10000,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) / 'auto': alpha is automated.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """
    # print(max_ep_len,type(max_ep_len))
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)


    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    obs_space = env.observation_space
    act_dim = env.action_space.n
    act_space = env.action_space


    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders_from_space(obs_space, act_space, obs_space, None, None)


    ######
    if alpha == 'auto':
        # target_entropy = (-np.prod(env.action_space.n))
        # target_entropy = (np.prod(env.action_space.n))/4/10
        target_entropy = 0.15

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)
    ######


    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, _, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, a_ph, alpha, **ac_kwargs)
    
    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _,q1_pi_, q2_pi_= actor_critic(x2_ph, a_ph, alpha,  **ac_kwargs)

    # Experience buffer
    if isinstance(act_space, Box):
        a_dim = act_dim
    elif isinstance(act_space, Discrete):
        a_dim = 1
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=a_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in 
                       ['main/pi', 'main/q1', 'main/q2', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t total: %d\n')%var_counts)


    # Automatic entropy tuning (only active when alpha == 'auto', i.e. alpha is a Tensor)
    if isinstance(alpha, tf.Tensor):
        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi_ + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi_)
    q_backup = r_ph + gamma*(1-d_ph)*v_backup


    # Soft actor-critic losses
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    value_loss = q1_loss + q2_loss

    # # Policy train op
    # # (has to be separate from value train op, because q1_pi appears in pi_loss)
    # pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    # train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    #with tf.control_dependencies([train_pi_op]):
    train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, tf.identity(alpha),
                train_value_op, target_update]
    else:
        step_ops = [q1_loss, q2_loss, q1, q2, logp_pi_, alpha,
                train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                              for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph}, 
                                outputs={'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: np.expand_dims(o, axis=0)})[0]

    def test_agent(n=20):  # n: number of test episodes
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):  # max_ep_len
                # Take deterministic actions at test time 
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()


    # o = env.reset()                                                     #####################
    # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0            #####################
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0



    total_steps = steps_per_epoch * epochs

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        # if t > start_steps and 100*t/total_steps > np.random.random(): # greedy, avoid falling into sub-optimum
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()


        # Step the env
        o2, r, d, _ = env.step(a)
        #print(a,o2)
        # o2, r, _, d = env.step(a)                     #####################
        # d = d['ale.lives'] < 5                        #####################

        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len==max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update 
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len):   # make sure: max_ep_len < steps_per_epoch
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                            }
                # step_ops = [q1_loss, q2_loss, q1, q2, logp_pi, alpha, train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossQ1=outs[0], LossQ2=outs[1],
                            Q1Vals=outs[2], Q2Vals=outs[3],
                            LogPi=outs[4], Alpha=outs[5])

            #if d:
            logger.store(EpRet=ep_ret, EpLen=ep_len)


            # o = env.reset()                                              #####################
            # o, r, d, ep_ret, ep_len = env.step(1)[0], 0, False, 0, 0     #####################
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0



        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs-1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # logger.store(): store the data; logger.log_tabular(): log the data; logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True) 
            logger.log_tabular('Q2Vals', with_min_and_max=True) 
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            # logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()
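
Example #24 regresses both Q-networks towards a clipped-double-Q soft backup, q_backup = r + gamma*(1-d)*(min(q1', q2') - alpha*logp'), and, when alpha='auto', adjusts log_alpha so the policy's entropy tracks target_entropy. A NumPy sketch of both computations on a made-up batch, with a plain SGD step standing in for the Adam update used above:

import numpy as np

gamma, alpha, target_entropy, lr = 0.99, 0.2, 0.15, 1e-3

# Made-up batch of target-network outputs and transition data.
q1_pi_targ = np.array([1.0, 2.0, 0.5])
q2_pi_targ = np.array([1.2, 1.8, 0.7])
logp_pi    = np.array([-0.9, -1.1, -0.5])   # log pi(a'|s') for the sampled a'
r          = np.array([0.0, 1.0, -1.0])
d          = np.array([0.0, 0.0, 1.0])      # done flags

# Soft Bellman backup with clipped double-Q (mirrors q_backup in the example).
min_q_pi = np.minimum(q1_pi_targ, q2_pi_targ)
v_backup = min_q_pi - alpha * logp_pi
q_backup = r + gamma * (1.0 - d) * v_backup   # regression target for q1 and q2

# Automatic temperature tuning: the gradient of
#   alpha_loss = mean(-log_alpha * (logp_pi + target_entropy))
# with respect to log_alpha is simply -mean(logp_pi + target_entropy).
log_alpha = np.log(alpha)
grad = -np.mean(logp_pi + target_entropy)
log_alpha -= lr * grad          # one SGD step; the example uses Adam instead
alpha = np.exp(log_alpha)
print(q_backup, alpha)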
Example #25
0
def a2c(env_fn,
        model: IActorCritic,
        seed=0,
        num_cpu=1,
        device=torch.device("cpu"),
        epochs=1000,
        steps_per_epoch=100,
        episode_len_limit=None,
        gamma=0.99,
        use_gae=True,
        tau=0.95,
        max_grad_norm=0.5,
        polyak=0.995,
        learning_rate=1e-3,
        value_loss_coef=0.5,
        policy_loss_coef=1,
        entropy_loss_coef=0.1,
        save_every=100,
        log_every=10,
        logger_kwargs=dict(),
        test_every=100,
        num_test_episodes=5,
        test_episode_len_limit=None,
        deterministic=False,
        save_freq=1,
        solved_score=None,
        ):
    use_MPI = num_cpu > 1

    if use_MPI:
        # Special function to avoid certain slowdowns from PyTorch + MPI combo.
        mpi_pytorch.setup_pytorch_for_mpi()
    else:
        torch.set_num_threads(torch.get_num_threads())

    # Set up logger and save configuration
    logger = EpochLogger(**logger_kwargs)
    config = locals()
    del config['env_fn']
    del config['model']
    del config['logger']
    logger.save_config(config)

    test_logger_kwargs = deepcopy(logger_kwargs)
    test_logger_kwargs['output_dir'] = pathlib.Path(test_logger_kwargs['output_dir']) / 'evaluation'
    test_logger = EpochLogger(**test_logger_kwargs)

    # Random seed
    if use_MPI:
        seed += 10000 * mpi_tools.proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Instantiate environment
    env = env_fn()
    test_env = env_fn()
    obs_shape = env.observation_space.shape
    act_dim = env.action_space.n

    # episode length limit
    if episode_len_limit is None:
        if env.unwrapped.spec and env.unwrapped.spec.max_episode_steps:
            episode_len_limit = env.spec.max_episode_steps
        else:
            raise ValueError("Episode length limit must be specified")

    if test_episode_len_limit is None:
        test_episode_len_limit = episode_len_limit

    # training model and target model
    actor_critic = model
    target_actor_critic = deepcopy(actor_critic)
    if use_MPI:
        # Sync params across processes
        mpi_pytorch.sync_params(actor_critic)
        mpi_pytorch.sync_params(target_actor_critic)

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    for p in target_actor_critic.parameters():
        p.requires_grad = False

    # Utilize GPU
    actor_critic.to(device)
    target_actor_critic.to(device)

    # Set up optimizers for policy and q-function
    optimizer = Adam(actor_critic.parameters(), lr=learning_rate)

    # Set up model saving
    logger.setup_pytorch_saver(actor_critic, name='model')

    def update(episode_buffer):
        # Update
        if episode_buffer.dones[-1]:
            next_value = 0.0
        else:
            last_obs = episode_buffer.next_observations[-1]
            last_obs_tensor = torch.tensor(last_obs, dtype=torch.float32).unsqueeze(0)
            context = actor_critic.get_context()
            next_value = target_actor_critic.predict_value(last_obs_tensor, context=context).cpu().item()

        # Super critical!!
        optimizer.zero_grad()

        # Compute value and policy losses
        loss, info = actor_critic.compute_loss(rewards=np.array(episode_buffer.rewards),
                                               dones=np.array(episode_buffer.dones),
                                               next_value=next_value,
                                               discount_factor=gamma,
                                               use_gae=use_gae,
                                               tau=tau,
                                               value_loss_coef=value_loss_coef,
                                               policy_loss_coef=policy_loss_coef,
                                               entropy_reg_coef=entropy_loss_coef)
        loss.backward()
        if use_MPI:
            mpi_pytorch.mpi_avg_grads(actor_critic)

        # Optimize
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        # Log losses and info
        logger.store(**info)

        # Finally, update target networks by polyak averaging.
        with torch.no_grad():
            for p, p_targ in zip(actor_critic.parameters(), target_actor_critic.parameters()):
                # NB: We use the in-place operations "mul_" and "add_" to update target
                # params, as opposed to "mul" and "add", which would create new tensors.
                p_targ.data.mul_(polyak)
                p_targ.data.add_((1 - polyak) * p.data)
        if use_MPI:
            mpi_pytorch.sync_params(target_actor_critic)

    # Prepare for interaction with environment
    start_time = time.time()
    # Main loop: collect experience in env and update/log each epoch
    total_steps = 0
    # Reset env
    obs = env.reset()
    # Reset episode stats
    episode_return = 0
    episode_length = 0
    # Pre-fill episode stats so the first logging call has something to report
    for _ in range(5):
        logger.store(EpRet=0, EpLen=0)
    for epoch in range(1, epochs + 1):
        actor_critic.reset_for_training()
        epoch_history = EpisodeHistory()
        for t in range(steps_per_epoch):
            total_steps += 1

            # Get action from the model
            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            action = actor_critic.step(obs_tensor)

            # Step the env
            obs2, reward, done, _ = env.step(action.detach().cpu().item())
            episode_return += reward
            episode_length += 1

            # Store transition to history
            epoch_history.store(observation=obs, action=action, reward=reward, done=done, next_observation=obs2)

            # Super critical, easy to overlook step: make sure to update
            # most recent observation!
            obs = obs2

            # End of trajectory handling
            if done or episode_length > episode_len_limit:
                break

        update(epoch_history)

        # if done
        if epoch_history.dones[-1]:
            logger.store(EpRet=episode_return, EpLen=episode_length)
            # Reset env
            obs = env.reset()
            actor_critic.reset()
            # Reset episode stats
            episode_return = 0
            episode_length = 0

        # End of epoch handling
        if epoch % log_every == 0:
            total_interactions = mpi_tools.mpi_sum(total_steps) if use_MPI else total_steps

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('Value', average_only=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossEntropy', average_only=True)
            logger.log_tabular('TotalEnvInteracts', total_interactions)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

        # Test agent
        solved = False
        if epoch % test_every == 0:
            # Test the performance of the deterministic version of the agent.
            context = actor_critic.get_context()
            actor_critic.eval()
            episode_info = evaluate_agent(env=test_env,
                                          agent=actor_critic,
                                          deterministic=deterministic,
                                          num_episodes=num_test_episodes,
                                          episode_len_limit=test_episode_len_limit,
                                          render=False,
                                          logger=test_logger)
            actor_critic.train()
            actor_critic.set_context(context)
            if solved_score is not None:
                solved = all(r >= solved_score for (t, r) in episode_info)

        # Save model
        if (epoch % save_every == 0) or (epoch == epochs) or solved:
            logger.save_state({'env': env})

        # Check environment is solved
        if solved:
            plog = lambda msg: logger.log(msg, color='green')
            plog("=" * 40)
            plog(f"ENVIRONMENT SOLVED!")
            plog("=" * 40)
            plog(f'    TotalEnvInteracts {total_steps}')
            plog(f'    Time {time.time() - start_time}')
            plog(f'    Epoch {epoch}')
            break
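
In the A2C example above, advantage estimation is delegated to actor_critic.compute_loss via the use_gae and tau arguments, with next_value bootstrapped from the target network (or set to 0 when the last transition is terminal). Assuming compute_loss follows the usual GAE(lambda) recursion, a standalone NumPy sketch of that computation looks like this:

import numpy as np

def gae_advantages(rewards, values, dones, next_value, gamma=0.99, tau=0.95):
    """GAE(lambda): delta_t = r_t + gamma*V(s_{t+1})*(1-d_t) - V(s_t),
    A_t = delta_t + gamma*tau*(1-d_t)*A_{t+1}."""
    adv = np.zeros_like(rewards, dtype=np.float64)
    last_adv, last_value = 0.0, next_value
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * last_value * nonterminal - values[t]
        last_adv = delta + gamma * tau * nonterminal * last_adv
        adv[t] = last_adv
        last_value = values[t]
    returns = adv + values          # targets for the value loss
    return adv, returns

# Made-up 4-step episode that terminates on the last step (so next_value = 0).
rewards = np.array([0.0, 0.0, 0.0, 1.0])
values  = np.array([0.1, 0.2, 0.4, 0.7])
dones   = np.array([0.0, 0.0, 0.0, 1.0])
adv, ret = gae_advantages(rewards, values, dones, next_value=0.0)
print(adv, ret)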
def sac(env_fn,  expert=None, policy_path=None, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, 
        steps_per_epoch=500, epochs=100000, replay_size=int(5e3), gamma=0.99, 
        dagger_noise=0.02, polyak=0.995, lr=1e-4, alpha=0.2, batch_size=64, dagger_epochs=200, pretrain_epochs=50,
        max_ep_len=500, logger_kwargs=dict(), save_freq=50, update_steps=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for 
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and 
                                           | ``pi`` for states in ``x_ph``: 
                                           | q2(x, pi(x)).
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. 
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target 
            networks. Target networks are updated towards main networks 
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow 
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually 
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to 
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())
    test_logger_kwargs = dict()
    test_logger_kwargs['output_dir'] = osp.join(logger_kwargs['output_dir'], "test")
    test_logger_kwargs['exp_name'] = logger_kwargs['exp_name']
    test_logger = EpochLogger(**test_logger_kwargs)

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim)
    print(act_dim)
    
    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space
    act_high_limit = env.action_space.high
    act_low_limit = env.action_space.low

    sess = tf.Session()
    if policy_path is None:
        # Inputs to computation graph
        x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)
        tfa_ph = core.placeholder(act_dim)
        # Main outputs from computation graph
        with tf.variable_scope('main'):
            mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)
        
        # Target value network
        with tf.variable_scope('target'):
            _, _, _, _, _, _, _, v_targ  = actor_critic(x2_ph, a_ph, **ac_kwargs)
        # sess.run(tf.global_variables_initializer())
    
    else:
        # load pretrained model
        model = restore_tf_graph(sess, osp.join(policy_path, 'simple_save'))
        x_ph, a_ph, x2_ph, r_ph, d_ph = model['x_ph'], model['a_ph'], model['x2_ph'], model['r_ph'], model['d_ph']
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v = model['mu'], model['pi'], model['logp_pi'], model['q1'], model['q2'], model['q1_pi'], model['q2_pi'], model['v']
        # tfa_ph = core.placeholder(act_dim)
        tfa_ph = model['tfa_ph']

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
    dagger_replay_buffer = DaggerReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)
    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in 
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t total: %d\n')%var_counts)


    # print(obs_dim)
    # print(act_dim)

    # SAC objectives
    if policy_path is None:
        # Min Double-Q:
        min_q_pi = tf.minimum(q1_pi, q2_pi)

        # Targets for Q and V regression
        q_backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*v_targ)
        v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi)

        # Soft actor-critic losses
        dagger_pi_loss = tf.reduce_mean(tf.square(mu-tfa_ph))
        pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
        q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
        q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
        v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
        value_loss = q1_loss + q2_loss + v_loss

        # Policy train op 
        # (has to be separate from value train op, because q1_pi appears in pi_loss)
        dagger_pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_dagger_pi_op = dagger_pi_optimizer.minimize(dagger_pi_loss, name='train_dagger_pi_op')

        pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'), name='train_pi_op')
        # sess.run(tf.variables_initializer(pi_optimizer.variables()))

        # Value train op
        # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
        value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        value_params = get_vars('main/q') + get_vars('main/v')
        with tf.control_dependencies([train_pi_op]):
            train_value_op = value_optimizer.minimize(value_loss, var_list=value_params, name='train_value_op')
            # sess.run(tf.variables_initializer(value_optimizer.variables()))

        # Polyak averaging for target variables
        # (control flow because sess.run otherwise evaluates in nondeterministic order)
        with tf.control_dependencies([train_value_op]):
            target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

        # All ops to call during one training step
        step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
                    train_pi_op, train_value_op, target_update]

        # Initializing targets to match main variables
        target_init = tf.group([tf.assign(v_targ, v_main)
                                for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
        sess.run(tf.global_variables_initializer())
    else:
        graph = tf.get_default_graph()
        dagger_pi_loss = model['dagger_pi_loss']
        pi_loss = model['pi_loss']
        q1_loss = model['q1_loss']
        q2_loss = model['q2_loss']        
        v_loss = model['v_loss']

        train_dagger_pi_op = graph.get_operation_by_name('train_dagger_pi_op')
        train_value_op = graph.get_operation_by_name('train_value_op')
        train_pi_op = graph.get_operation_by_name('train_pi_op')
        
        # Polyak averaging for target variables
        # (control flow because sess.run otherwise evaluates in nondeterministic order)
        with tf.control_dependencies([train_value_op]):
            target_update = tf.group([tf.assign(v_targ, polyak*v_targ + (1-polyak)*v_main)
                                    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

        # All ops to call during one training step
        step_ops = [pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, 
                    train_pi_op, train_value_op, target_update]

        # Initializing targets to match main variables
        target_init = tf.group([tf.assign(v_targ, v_main)
                                for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])
    # sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    dagger_step_ops = [q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_value_op, target_update]
    tf.summary.FileWriter("log/", sess.graph)
    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x_ph': x_ph, 'a_ph': a_ph, 'tfa_ph': tfa_ph, 'x2_ph': x2_ph, 'r_ph': r_ph, 'd_ph': d_ph}, \
        outputs={'mu': mu, 'pi': pi, 'v': v, 'logp_pi': logp_pi, 'q1': q1, 'q2': q2, 'q1_pi': q1_pi, 'q2_pi': q2_pi, \
            'pi_loss': pi_loss, 'v_loss': v_loss, 'dagger_pi_loss': dagger_pi_loss, 'q1_loss': q1_loss, 'q2_loss': q2_loss})
    
    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        a = sess.run(act_op, feed_dict={x_ph: o.reshape(1,-1)})[0]
        return np.clip(a, act_low_limit, act_high_limit)

    def choose_action(s, add_noise=False):
        s = s[np.newaxis, :]
        a = sess.run(mu, {x_ph: s})[0]
        if add_noise:
            noise = dagger_noise * act_high_limit * np.random.normal(size=a.shape)
            a = a + noise
        return np.clip(a, act_low_limit, act_high_limit)

    def test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (no exploration noise)
                o, r, d, info = env.step(choose_action(np.array(o), add_noise=False))
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        # time.sleep(10)
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    def ref_test_agent(n=81, test_num=1):
        n = env.unwrapped._set_test_mode(True)
        con_flag = False
        for j in range(n):
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            while not(d or (ep_len == max_ep_len)):
                # Query the reference controller at test time
                a = call_ref_controller(env, expert)
                o, r, d, info = env.step(a)
                ep_ret += r
                ep_len += 1
                if d:
                    test_logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
                    test_logger.store(arrive_des=info['arrive_des'])
                    test_logger.store(arrive_des_appro=info['arrive_des_appro'])
                    if not info['out_of_range']:
                        test_logger.store(converge_dis=info['converge_dis'])
                        con_flag = True
                    test_logger.store(out_of_range=info['out_of_range'])
                    # print(info)
        # test_logger.dump_tabular()
        if not con_flag:
            test_logger.store(converge_dis=10000)
        env.unwrapped._set_test_mode(False)

    # ref_test_agent(test_num = -1)
    # test_logger.log_tabular('epoch', -1)
    # test_logger.log_tabular('TestEpRet', average_only=True)
    # test_logger.log_tabular('TestEpLen', average_only=True)
    # test_logger.log_tabular('arrive_des', average_only=True)
    # test_logger.log_tabular('arrive_des_appro', average_only=True)
    # test_logger.log_tabular('converge_dis', average_only=True)
    # test_logger.log_tabular('out_of_range', average_only=True)
    # test_logger.dump_tabular()



    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    episode_steps = 500
    total_env_t = 0
    test_num = 0
    print(colorize("begin dagger training", 'green', bold=True))
    for epoch in range(1, dagger_epochs + 1, 1):
        # test policy
        if (epoch > 0 and epoch % save_freq == 0) or (epoch == epochs):
            # Save model
            logger.save_state({}, None)
            
            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)
            
            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            test_logger.log_tabular('arrive_des_appro', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', total_env_t)
            logger.log_tabular('Q1Vals', with_min_and_max=True) 
            logger.log_tabular('Q2Vals', with_min_and_max=True) 
            logger.log_tabular('VVals', with_min_and_max=True) 
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()

        # train policy
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        env.unwrapped._set_test_mode(False)
        obs, acs, rewards = [], [], []
        for t in range(steps_per_epoch):
            obs.append(o)
            ref_action = call_ref_controller(env, expert)
            if(epoch < pretrain_epochs):
                action = ref_action
            else:
                action = choose_action(np.array(o), True)
            
            o2, r, d, _ = env.step(action)
            acs.append(ref_action)
            rewards.append(r)

            if (t == steps_per_epoch-1):
                # print ("reached the end")
                d = True

            # Store experience to replay buffer
            replay_buffer.store(o, action, r, o2, d)
            # Advance the observation only after storing, so (o, o2) is a real transition
            o = o2

            ep_ret += r
            ep_len += 1
            total_env_t += 1

            if d:
                # Perform partial SAC update!
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {x_ph: batch['obs1'],
                                x2_ph: batch['obs2'],
                                a_ph: batch['acts'],
                                r_ph: batch['rews'],
                                d_ph: batch['done'],
                                }
                    outs = sess.run(dagger_step_ops, feed_dict)
                    logger.store(LossQ1=outs[0], LossQ2=outs[1],
                                LossV=outs[2], Q1Vals=outs[3], Q2Vals=outs[4],
                                VVals=outs[5], LogPi=outs[6])

                # Perform dagger policy update
                dagger_replay_buffer.stores(obs, acs, rewards)
                for _ in range(int(ep_len/5)):
                    batch = dagger_replay_buffer.sample_batch(batch_size)
                    feed_dict = {x_ph: batch['obs1'], tfa_ph: batch['acts']}
                    q_step_ops = [dagger_pi_loss, train_dagger_pi_op]
                    for j in range(10):
                        outs = sess.run(q_step_ops, feed_dict)
                    logger.store(LossPi = outs[0])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                break

    # Main loop: collect experience in env and update/log each epoch
    print(colorize("begin sac training", 'green', bold=True))
    for epoch in range(1, epochs + 1, 1):
        # test policy
        if (epoch > 0 and epoch % save_freq == 0) or (epoch == epochs):
            # Save model
            logger.save_state({}, None)
            
            # Test the performance of the deterministic version of the agent.
            test_num += 1
            test_agent(test_num=test_num)
            
            test_logger.log_tabular('epoch', epoch)
            test_logger.log_tabular('TestEpRet', average_only=True)
            test_logger.log_tabular('TestEpLen', average_only=True)
            test_logger.log_tabular('arrive_des', average_only=True)
            # test_logger.log_tabular('arrive_des_appro', average_only=True)
            test_logger.log_tabular('converge_dis', average_only=True)
            test_logger.log_tabular('out_of_range', average_only=True)
            test_logger.dump_tabular()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            # logger.log_tabular('DeltaLossPi', average_only=True)
            # logger.log_tabular('DeltaLossV', average_only=True)
            # logger.log_tabular('Entropy', average_only=True)
            # logger.log_tabular('KL', average_only=True)
            # logger.log_tabular('ClipFrac', average_only=True)
            # logger.log_tabular('StopIter', average_only=True)
            logger.log_tabular('Time', time.time()-start_time)
            logger.dump_tabular()

        # train policy
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
        env.unwrapped._set_test_mode(False)
        for t in range(steps_per_epoch):
            a = get_action(np.array(o))

            o2, r, d, _ = env.step(a)
            ep_ret += r
            ep_len += 1
            if (t == steps_per_epoch-1):
                # print ("reached the end")
                d = True

            replay_buffer.store(o, a, r, o2, d)
            o = o2
            if d:
                """
                Perform all SAC updates at the end of the trajectory.
                This is a slight difference from the SAC specified in the
                original paper.
                """
                for j in range(ep_len):
                    batch = replay_buffer.sample_batch(batch_size)
                    feed_dict = {x_ph: batch['obs1'],
                                x2_ph: batch['obs2'],
                                a_ph: batch['acts'],
                                r_ph: batch['rews'],
                                d_ph: batch['done'],
                                }
                    outs = sess.run(step_ops, feed_dict)
                    logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                                LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5],
                                VVals=outs[6], LogPi=outs[7])

                logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
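
The DAgger phase above rolls out the current policy, relabels every visited state with the reference controller's action, and then regresses the policy toward those expert labels (the dagger_pi_loss / train_dagger_pi_op step). The snippet below is only a minimal NumPy sketch of that supervised step on synthetic data; the linear policy, the random "expert" and all names here are illustrative and are not taken from the code above.

import numpy as np

rng = np.random.default_rng(0)
obs_dim, act_dim, n = 8, 2, 256

# Synthetic "aggregated dataset": states visited by the learner,
# labeled with the expert's actions (here a random linear expert).
W_expert = rng.normal(size=(obs_dim, act_dim))
states = rng.normal(size=(n, obs_dim))
expert_actions = states @ W_expert

# Linear policy trained by plain gradient descent on the MSE imitation
# loss, analogous to the role dagger_pi_loss plays above.
W = np.zeros((obs_dim, act_dim))
lr = 1e-2
for _ in range(500):
    pred = states @ W
    grad = 2.0 / n * states.T @ (pred - expert_actions)
    W -= lr * grad

print("imitation MSE:", np.mean((states @ W - expert_actions) ** 2))
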
Example #27
0
def vpg(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10,
        custom_h=None,
        do_checkpoint_eval=False,
        env_name=None):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols 
            for state, ``x_ph``, and action, ``a_ph``, and returns the main 
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given 
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure 
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic 
            function you provided to VPG.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs) 
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_v_iters (int): Number of gradient descent steps to take on 
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # create logger for tensorboard
    tb_logdir = "{}/tb_logs/".format(logger.output_dir)
    tb_logger = Logger(log_dir=tb_logdir)

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    if custom_h is not None:
        hidden_layers_str_list = custom_h.split('-')
        hidden_layers_int_list = [int(h) for h in hidden_layers_str_list]
        ac_kwargs['hidden_sizes'] = hidden_layers_int_list

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # VPG objectives
    pi_loss = -tf.reduce_mean(logp * adv_ph)
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    # create a tf session with GPU memory usage option to be allow_growth so that one program will not use up the
    # whole GPU memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    # log tf graph
    tf.summary.FileWriter(tb_logdir, sess.graph)

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    # for saving the best models and performances during train and evaluate
    best_eval_AverageEpRet = 0.0
    best_eval_StdEpRet = 1.0e20

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Policy gradient step
        sess.run(train_pi, feed_dict=inputs)

        # Value function learning
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl],
                                         feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            # Save a new model every save_freq and at the last epoch. Do not overwrite the previous save.
            logger.save_state({'env': env}, epoch)

            # Evaluate and save best model
            if do_checkpoint_eval and epoch > 0:
                # below is a hack. best model related stuff is saved at itr 999999, therefore, simple_save999999.
                # Doing this way, I can use test_policy and plot directly to test the best models.
                # saved best models includes:
                # 1) a copy of the env
                # 2) the best rl model with parameters
                # 3) a pickle file "best_eval_performance_n_structure" storing best_performance, best_structure and epoch
                # note that 1) and 2) are spinningup defaults, and 3) is a custom save
                best_eval_AverageEpRet, best_eval_StdEpRet = eval_and_save_best_model(
                    best_eval_AverageEpRet,
                    best_eval_StdEpRet,
                    # a new logger is created and passed in so that the new logger can leverage the directory
                    # structure without messing up the logger in the training loop
                    eval_logger=EpochLogger(
                        **dict(exp_name=logger_kwargs['exp_name'],
                               output_dir=os.path.join(logger.output_dir,
                                                       "simple_save999999"))),
                    train_logger=logger,
                    tb_logger=tb_logger,
                    epoch=epoch,
                    # the env_name is passed in so that an env can be created when and where it is
                    # needed. This avoids the logx.save_state() error where an env pointer cannot be pickled
                    env_name=env_name,
                    get_action=lambda x: sess.run(
                        pi, feed_dict={x_ph: x[None, :]})[0])

        # Perform VPG update!
        update()

        # Log epoch statistics to tensorboard
        for key, with_mm in [("EpRet", True), ("EpLen", False), ("VVals", True),
                             ("LossPi", False), ("LossV", False),
                             ("DeltaLossPi", False), ("DeltaLossV", False),
                             ("Entropy", False), ("KL", False)]:
            log_key_to_tb(tb_logger,
                          logger,
                          epoch,
                          key=key,
                          with_min_and_max=with_mm)
        tb_logger.log_scalar(tag="TotalEnvInteracts",
                             value=(epoch + 1) * steps_per_epoch,
                             step=epoch)
        tb_logger.log_scalar(tag="Time",
                             value=time.time() - start_time,
                             step=epoch)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
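
The VPGBuffer used above is expected to compute GAE-Lambda advantages and discounted rewards-to-go when finish_path(last_val) is called; its implementation is not shown here, so the sketch below only illustrates that standard calculation for a single trajectory, assuming gamma and lam play the roles described in the argument list.

import numpy as np

def discount_cumsum(x, discount):
    # y[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...
    out = np.zeros_like(x, dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

def gae_advantages(rews, vals, last_val, gamma=0.99, lam=0.97):
    # rews: rewards r_0..r_{T-1}; vals: value estimates V(s_0)..V(s_{T-1});
    # last_val: bootstrap value for the state after the last step (0 if terminal)
    rews = np.append(rews, last_val)
    vals = np.append(vals, last_val)
    deltas = rews[:-1] + gamma * vals[1:] - vals[:-1]
    adv = discount_cumsum(deltas, gamma * lam)   # GAE-Lambda advantages
    rets = discount_cumsum(rews, gamma)[:-1]     # rewards-to-go targets for V
    return adv, rets

adv, rets = gae_advantages(rews=[1.0, 0.0, 1.0], vals=[0.5, 0.4, 0.3], last_val=0.2)
print(adv, rets)
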
Example #28
0
def sac_multistep(
        env_fn,
        hidden_sizes=[256, 256],
        seed=0,
        steps_per_epoch=1000,
        epochs=1000,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=3e-4,
        alpha=0.2,
        batch_size=256,
        start_steps=10000,
        max_ep_len=1000,
        save_freq=1,
        save_model=False,
        auto_alpha=True,
        grad_clip=-1,
        logger_store_freq=100,
        multistep_k=1,
        debug=False,
        use_single_variant=False,
        logger_kwargs=dict(),
):
    """
    Soft Actor-Critic with multi-step (k-step) Q targets.
    Largely follows the OpenAI Spinning Up documentation, with a few differences.
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        hidden_sizes: number of entries is number of hidden layers
            each entry in this list indicate the size of that hidden layer.
            applies to all networks

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch. Note that an epoch here is
            just a logging epoch: every this many steps, statistics are written to stdout
            and to the output file. It is not to be confused with a training epoch, a
            term used in the literature for many different things.

        epochs (int): Number of epochs to run and train the agent. Usage of this term
            differs between algorithms, so use caution. Here, each epoch produces a new
            block of logs.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for both policy and value learning).

        alpha (float): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.)

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running the real policy. Helps exploration. During testing, however,
            actions always come from the policy.

        max_ep_len (int): Maximum length of trajectory / episode / rollout. The environment
            is reset if the timestep within an episode exceeds this number.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_model (bool): set to True to save the trained agent.

        auto_alpha: set to True to use the adaptive alpha scheme; the target entropy
            is then set automatically.

        grad_clip: gradient clipping threshold; a value < 0 means no clipping.

        logger_store_freq: how often (in update steps) to store debugging info to the
            logger; typically does not need to be changed.

    """
    if debug:
        hidden_sizes = [2, 2]
        batch_size = 2
        start_steps = 1000
        multistep_k = 5
        use_single_variant = True
    """set up logger"""
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    env, test_env = env_fn(), env_fn()

    ## seed torch and numpy
    torch.manual_seed(seed)
    np.random.seed(seed)

    ## seed environment along with env action space so that everything about env is seeded
    env.seed(seed)
    env.action_space.np_random.seed(seed)
    test_env.seed(seed + 10000)
    test_env.action_space.np_random.seed(seed + 10000)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # if environment has a smaller max episode length, then use the environment's max episode length
    max_ep_len = env._max_episode_steps if max_ep_len > env._max_episode_steps else max_ep_len

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    # we need .item() to convert it from numpy float to python float
    act_limit = env.action_space.high[0].item()

    # Experience buffer
    replay_buffer = MultistepReplayBuffer(obs_dim=obs_dim,
                                          act_dim=act_dim,
                                          size=replay_size)
    """
    Auto tuning alpha
    """
    if auto_alpha:
        target_entropy = -np.prod(env.action_space.shape).item()  # H
        log_alpha = torch.zeros(1, requires_grad=True)
        alpha_optim = optim.Adam([log_alpha], lr=lr)
    else:
        target_entropy, log_alpha, alpha_optim = None, None, None

    def test_agent(n=1):
        """
        This will test the agent's performance by running n episodes
        During the runs, the agent only take deterministic action, so the
        actions are not drawn from a distribution, but just use the mean
        :param n: number of episodes to run the agent
        """
        ep_return_list = np.zeros(n)
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                a = policy_net.get_env_action(o, deterministic=True)
                o, r, d, _ = test_env.step(a)
                ep_ret += r
                ep_len += 1
            ep_return_list[j] = ep_ret
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs
    """init all networks"""
    # see line 1
    policy_net = TanhGaussianPolicySACAdapt(obs_dim,
                                            act_dim,
                                            hidden_sizes,
                                            action_limit=act_limit)
    q1_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)
    q2_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)

    q1_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)
    q2_target_net = Mlp(obs_dim + act_dim, 1, hidden_sizes)

    # see line 2: copy parameters from value_net to target_value_net
    q1_target_net.load_state_dict(q1_net.state_dict())
    q2_target_net.load_state_dict(q2_net.state_dict())

    # set up optimizers
    policy_optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    q1_optimizer = optim.Adam(q1_net.parameters(), lr=lr)
    q2_optimizer = optim.Adam(q2_net.parameters(), lr=lr)

    # mean squared error loss for v and q networks
    mse_criterion = nn.MSELoss()

    # Main loop: collect experience in env and update/log each epoch
    # NOTE: t here is the current number of total timesteps used
    # it is not the number of timesteps passed in the current episode
    current_update_index = 0
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = policy_net.get_env_action(o, deterministic=False)
        else:
            a = env.action_space.sample()
        # Step the env, get next observation, reward and done signal
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience (observation, action, reward, next observation, done) to replay buffer
        # the multi-step buffer (given to you) will store the data in a fashion that
        # they can be easily used for multi-step update
        replay_buffer.store(o, a, r, o2, d, ep_len, max_ep_len, multistep_k,
                            gamma)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        """perform update"""
        if replay_buffer.size >= batch_size:
            # get data from replay buffer
            batch = replay_buffer.sample_batch(batch_size)
            obs_tensor = Tensor(batch['obs1'])
            # NOTE: given the multi-step buffer, obs_next_tensor now contains the observation that are
            # k-step away from current observation
            obs_next_tensor = Tensor(batch['obs2'])
            acts_tensor = Tensor(batch['acts'])
            # NOTE: given the multi-step buffer, rewards tensor now contain the sum of discounted rewards in the next
            # k steps (or up until termination, if terminated in less than k steps)
            rews_tensor = Tensor(batch['rews']).unsqueeze(1)
            # NOTE: given the multi-step buffer, done_tensor now shows whether the data's episode terminated in less
            # than k steps or not
            done_tensor = Tensor(batch['done']).unsqueeze(1)
            """
            now we do a SAC update, following the OpenAI spinup doc
            check the openai sac document pseudocode part for reference
            line numbers indicate lines in pseudocode part
            we will first compute each of the losses
            and then update all the networks in the end
            """
            # see line 12: get a_tilda, which is newly sampled action (not action from replay buffer)
            """get q loss"""
            with torch.no_grad():
                a_tilda_next, _, _, log_prob_a_tilda_next, _, _ = policy_net.forward(
                    obs_next_tensor)
                q1_next = q1_target_net(
                    torch.cat([obs_next_tensor, a_tilda_next], 1))
                q2_next = q2_target_net(
                    torch.cat([obs_next_tensor, a_tilda_next], 1))
                # TODO: compute the k-step Q estimate (in the form of reward + next Q), don't worry about the entropy terms
                if use_single_variant:
                    # write code for computing the k-step estimate for the single Q estimate variant case
                    y_q = rews_tensor + (gamma**multistep_k) * (
                        1 - done_tensor) * q1_next
                else:
                    # write code for computing the k-step estimate while using double clipped Q
                    y_q = rews_tensor + (gamma**multistep_k) * (
                        1 - done_tensor) * torch.min(q1_next, q2_next)
                # add the entropy term, using a simplified heuristic
                # NOTE: you don't need to modify the following 3 lines. They deal with entropy terms
                powers = np.arange(1, multistep_k + 1)
                entropy_discounted_sum = -sum(gamma**powers) * (
                    1 - done_tensor) * alpha * log_prob_a_tilda_next
                y_q += entropy_discounted_sum

            # JQ = 𝔼(st,at)~D[0.5(Q1(st,at) - r(st,at) - γ(𝔼st+1~p[V(st+1)]))^2]
            q1_prediction = q1_net(torch.cat([obs_tensor, acts_tensor], 1))
            q1_loss = mse_criterion(q1_prediction, y_q)
            q2_prediction = q2_net(torch.cat([obs_tensor, acts_tensor], 1))
            q2_loss = mse_criterion(q2_prediction, y_q)
            """
            get policy loss
            """
            a_tilda, mean_a_tilda, log_std_a_tilda, log_prob_a_tilda, _, _ = policy_net.forward(
                obs_tensor)

            # see line 12: second equation
            q1_a_tilda = q1_net(torch.cat([obs_tensor, a_tilda], 1))
            q2_a_tilda = q2_net(torch.cat([obs_tensor, a_tilda], 1))

            # TODO write code here to compute policy loss correctly, for both variants.
            if use_single_variant:
                q_policy_part = q1_a_tilda
            else:
                q_policy_part = torch.min(q1_a_tilda, q2_a_tilda)
            # Jπ = 𝔼st∼D,εt∼N[α * logπ(f(εt;st)|st) − Q(st,f(εt;st))]
            policy_loss = (alpha * log_prob_a_tilda - q_policy_part).mean()
            """
            alpha loss, update alpha
            """
            if auto_alpha:
                alpha_loss = -(
                    log_alpha *
                    (log_prob_a_tilda + target_entropy).detach()).mean()

                alpha_optim.zero_grad()
                alpha_loss.backward()
                if grad_clip > 0:
                    nn.utils.clip_grad_norm_(log_alpha, grad_clip)
                alpha_optim.step()

                alpha = log_alpha.exp().item()
            else:
                alpha_loss = torch.zeros(1)  # keep a tensor so .item() in the logger call below works
            """update networks"""
            q1_optimizer.zero_grad()
            q1_loss.backward()
            if grad_clip > 0:
                nn.utils.clip_grad_norm_(q1_net.parameters(), grad_clip)
            q1_optimizer.step()

            q2_optimizer.zero_grad()
            q2_loss.backward()
            if grad_clip > 0:
                nn.utils.clip_grad_norm_(q2_net.parameters(), grad_clip)
            q2_optimizer.step()

            policy_optimizer.zero_grad()
            policy_loss.backward()
            if grad_clip > 0:
                nn.utils.clip_grad_norm_(policy_net.parameters(), grad_clip)
            policy_optimizer.step()

            # see line 16: update target value network with value network
            soft_update_model1_with_model2(q1_target_net, q1_net, polyak)
            soft_update_model1_with_model2(q2_target_net, q2_net, polyak)

            current_update_index += 1
            if current_update_index % logger_store_freq == 0:
                # store diagnostic info to logger
                logger.store(LossPi=policy_loss.item(),
                             LossQ1=q1_loss.item(),
                             LossQ2=q2_loss.item(),
                             LossAlpha=alpha_loss.item(),
                             Q1Vals=q1_prediction.detach().numpy(),
                             Q2Vals=q2_prediction.detach().numpy(),
                             Alpha=alpha,
                             LogPi=log_prob_a_tilda.detach().numpy())

        if d or (ep_len == max_ep_len):
            """when episode terminates, log info about this episode, then reset"""
            ## store episode return and length to logger
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            ## reset environment
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if (t + 1) % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            """
            Save pytorch model, very different from tensorflow version
            We need to save the environment, the state_dict of each network
            and also the state_dict of each optimizer
            """
            if save_model:
                sac_state_dict = {
                    'env': env,
                    'policy_net': policy_net.state_dict(),
                    'q1_net': q1_net.state_dict(),
                    'q2_net': q2_net.state_dict(),
                    'q1_target_net': q1_target_net.state_dict(),
                    'q2_target_net': q2_target_net.state_dict(),
                    'policy_opt': policy_optimizer,
                    'q1_opt': q1_optimizer,
                    'q2_opt': q2_optimizer,
                    'log_alpha': log_alpha,
                    'alpha_opt': alpha_optim,
                    'target_entropy': target_entropy
                }
                if (epoch % save_freq == 0) or (epoch == epochs - 1):
                    logger.save_state(sac_state_dict, None)
            # use joblib.load(fname) to load

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # TODO write code here to estimate the bias of the Q networks
            #  recall that we can define the Q bias to be Q value - discounted MC return
            #  initialize another environment that is only used for provide such a bias estimate
            #  store that to logger

            def estimate_bias(n=1, use_single_variant=False, k=multistep_k):
                """
                run n episodes and calculate the mc_return and estimated_q for each appearing states
                drop last multistep_k data point of each episode
                calculate the q bias using mean(mc_return)-mean(estimated_q)
                return q bias and mean(estimated_q)
                """
                state_num, mc_ret, est_q = 0, 0, 0
                for _ in range(n):
                    o, r, d, ep_len, ep_mc_ret, reward_list, q_list = bias_test_env.reset(
                    ), 0, False, 0, 0, [], []
                    while not (d or (ep_len == max_ep_len)):
                        # Take stochastic actions
                        a = policy_net.get_env_action(o, deterministic=False)
                        q1 = q1_net(torch.cat(
                            [Tensor([o]), Tensor([a])], 1)).item()
                        q2 = q2_net(torch.cat(
                            [Tensor([o]), Tensor([a])], 1)).item()
                        # add estimated q for each state to q_list
                        # if use_single_variant:
                        q_list.append(q1)
                        # else:
                        #     q_list.append(min(q1,q2))
                        o, r, d, _ = bias_test_env.step(a)
                        # store each r in reward_list
                        reward_list.append(r)
                        ep_len += 1
                    # drop last 200 terms of the reward list and q list
                    reward_list = reward_list[:-200]
                    q_list = q_list[:-200]
                    # calculate the sum of all mc_returns for each state in the episode
                    for i in range(len(reward_list)):
                        powers = np.arange(len(reward_list) - i)
                        ep_mc_ret += sum(
                            (gamma**powers) * np.array(reward_list[i:]))
                    # update the running averages of mc_ret and est_q
                    mc_ret = (state_num * mc_ret + ep_mc_ret) / (state_num +
                                                                 ep_len)
                    est_q = (state_num * est_q + sum(q_list)) / (state_num +
                                                                 ep_len)
                    state_num += ep_len
                # calculate bias
                return est_q - mc_ret, est_q

            bias_test_env = env_fn()
            bias_test_env.seed(seed + 10000)
            bias_test_env.action_space.np_random.seed(seed + 10000)
            bias, est_q = estimate_bias(n=1,
                                        use_single_variant=use_single_variant,
                                        k=multistep_k)

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('Alpha', with_min_and_max=True)
            logger.log_tabular('LossAlpha', average_only=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)

            # TODO after you store bias info to logger, you should also write code here to log them
            #  so that you can later plot them
            logger.log_tabular('QBias', bias)
            logger.log_tabular('QVals', est_q)
            logger.log_tabular('K', multistep_k)

            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
            sys.stdout.flush()
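
The target y_q in the update above combines the buffer-provided sum of the next k discounted rewards with a gamma**multistep_k-weighted bootstrap from the (optionally clipped double) target Q network, plus a discounted entropy term. The helper below is only a sketch of the reward-and-bootstrap part for a single transition; the entropy term is omitted and the function name is purely illustrative.

import numpy as np

def k_step_target(rewards_k, q_next, done_within_k, gamma=0.99, k=5):
    # rewards_k: the next k rewards r_t..r_{t+k-1} (shorter if the episode ended early)
    # q_next: min(Q1, Q2) of the target networks at the state k steps ahead
    # done_within_k: True if the episode terminated in fewer than k steps
    discounted_rewards = sum(gamma**i * r for i, r in enumerate(rewards_k))
    bootstrap = (gamma**k) * (0.0 if done_within_k else q_next)
    return discounted_rewards + bootstrap

# Episode still alive after k = 5 steps: bootstrap contributes gamma**5 * q_next
print(k_step_target([1, 1, 1, 1, 1], q_next=10.0, done_within_k=False))
# Episode terminated after 2 steps: no bootstrap term
print(k_step_target([1, 1], q_next=10.0, done_within_k=True))
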
Example #29
0
def ppo(env_fn,
        actor_critic=core.mlp_actor_critic,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        clip_ratio=0.2,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_pi_iters=80,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        target_kl=0.01,
        logger_kwargs=dict(),
        save_freq=10):
    """

    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp``     (batch,)          | Gives log probability, according to
                                           | the policy, of taking actions ``a_ph``
                                           | in states ``x_ph``.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``.
            ``v``        (batch,)          | Gives the value estimate for states
                                           | in ``x_ph``. (Critical: make sure
                                           | to flatten this!)
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to PPO.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs of interaction (equivalent to
            number of policy updates) to perform.

        gamma (float): Discount factor. (Always between 0 and 1.)

        clip_ratio (float): Hyperparameter for clipping in the policy objective.
            Roughly: how far can the new policy go from the old policy while
            still profiting (improving the objective function)? The new policy
            can still go farther than the clip_ratio says, but it doesn't help
            on the objective anymore. (Usually small, 0.1 to 0.3.)

        pi_lr (float): Learning rate for policy optimizer.

        vf_lr (float): Learning rate for value function optimizer.

        train_pi_iters (int): Maximum number of gradient descent steps to take
            on policy loss per epoch. (Early stopping may cause optimizer
            to take fewer than this.)

        train_v_iters (int): Number of gradient descent steps to take on
            value function per epoch.

        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1,
            close to 1.)

        max_ep_len (int): Maximum length of trajectory / episode / rollout.

        target_kl (float): Roughly what KL divergence we think is appropriate
            between new and old policies after an update. This will get used
            for early stopping. (Usually small, 0.01 or 0.05.)

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.

    """

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    seed += 10000 * proc_id()
    tf.set_random_seed(seed)
    np.random.seed(seed)

    env = env_fn()
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph = core.placeholders_from_spaces(env.observation_space,
                                               env.action_space)
    adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None)

    # Main outputs from computation graph
    pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Need all placeholders in *this* order later (to zip with data from buffer)
    all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph]

    # Every step, get: action, value, and logprob
    get_action_ops = [pi, v, logp_pi]

    # Experience buffer
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v'])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # PPO objectives
    ratio = tf.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
    min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph,
                       (1 - clip_ratio) * adv_ph)
    pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv))
    v_loss = tf.reduce_mean((ret_ph - v)**2)

    # Info (useful to watch during learning)
    approx_kl = tf.reduce_mean(
        logp_old_ph -
        logp)  # a sample estimate for KL-divergence, easy to compute
    approx_ent = tf.reduce_mean(
        -logp)  # a sample estimate for entropy, also easy to compute
    clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio))
    clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32))

    # Optimizers
    train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss)
    train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Sync params across processes
    sess.run(sync_all_params())

    # Setup model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v})

    def update():
        inputs = {k: v for k, v in zip(all_phs, buf.get())}
        pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent],
                                          feed_dict=inputs)

        # Training
        for i in range(train_pi_iters):
            _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs)
            kl = mpi_avg(kl)
            if kl > 1.5 * target_kl:
                logger.log(
                    'Early stopping at step %d due to reaching max kl.' % i)
                break
        logger.store(StopIter=i)
        for _ in range(train_v_iters):
            sess.run(train_v, feed_dict=inputs)

        # Log changes from update
        pi_l_new, v_l_new, kl, cf = sess.run(
            [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs)
        logger.store(LossPi=pi_l_old,
                     LossV=v_l_old,
                     KL=kl,
                     Entropy=ent,
                     ClipFrac=cf,
                     DeltaLossPi=(pi_l_new - pi_l_old),
                     DeltaLossV=(v_l_new - v_l_old))

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v_t, logp_t = sess.run(get_action_ops,
                                      feed_dict={x_ph: o.reshape(1, -1)})

            # save and log
            buf.store(o, a, r, v_t, logp_t)
            logger.store(VVals=v_t)

            o, r, d, _ = env.step(a[0])
            ep_ret += r
            ep_len += 1

            terminal = d or (ep_len == max_ep_len)
            if terminal or (t == local_steps_per_epoch - 1):
                if not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len)
                # if trajectory didn't reach terminal state, bootstrap value target
                last_val = r if d else sess.run(
                    v, feed_dict={x_ph: o.reshape(1, -1)})
                buf.finish_path(last_val)  # call finish_path at the end of the trajectory to cut it off
                if terminal:
                    # only save EpRet / EpLen if trajectory finished
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform PPO update!
        update()

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('ClipFrac', average_only=True)
        logger.log_tabular('StopIter', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
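
The pi_loss above builds the clipped surrogate with a tf.where on the sign of the advantage; per sample this is equivalent to the more common min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv) form. A quick NumPy check of that equivalence (both helpers here are illustrative):

import numpy as np

def clipped_surrogate_where(ratio, adv, clip_ratio=0.2):
    # mirrors the tf.where construction used above
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return np.minimum(ratio * adv, min_adv)

def clipped_surrogate_clip(ratio, adv, clip_ratio=0.2):
    # the textbook clip(...) form of the PPO objective
    return np.minimum(ratio * adv, np.clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv)

ratio = np.array([0.5, 0.9, 1.0, 1.1, 1.5])
adv = np.array([1.0, -1.0, 0.5, -0.5, 2.0])
print(np.allclose(clipped_surrogate_where(ratio, adv),
                  clipped_surrogate_clip(ratio, adv)))   # True
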
Example #30
0
def train_mnist(steps_per_epoch=100,
                epochs=5,
                lr=1e-3,
                layers=2,
                hidden_size=64,
                logger_kwargs=dict(),
                save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # Load and preprocess MNIST data
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    x_train = x_train.reshape(-1, 28 * 28) / 255.0

    # Define inputs & main outputs from computation graph
    x_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, 28 * 28))
    y_ph = tf.compat.v1.placeholder(tf.int32, shape=(None, ))
    logits = mlp(x_ph,
                 hidden_sizes=[hidden_size] * layers + [10],
                 activation=tf.nn.relu)
    predict = tf.argmax(input=logits, axis=1, output_type=tf.int32)

    # Define loss function, accuracy, and training op
    y = tf.one_hot(y_ph, 10)
    loss = tf.compat.v1.losses.softmax_cross_entropy(y, logits)
    acc = tf.reduce_mean(
        input_tensor=tf.cast(tf.equal(y_ph, predict), tf.float32))
    train_op = tf.compat.v1.train.AdamOptimizer().minimize(loss)

    # Prepare session
    sess = tf.compat.v1.Session()
    sess.run(tf.compat.v1.global_variables_initializer())

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph},
                          outputs={
                              'logits': logits,
                              'predict': predict
                          })

    start_time = time.time()

    # Run main training loop
    for epoch in range(epochs):
        for t in range(steps_per_epoch):
            idxs = np.random.randint(0, len(x_train), 32)
            feed_dict = {x_ph: x_train[idxs], y_ph: y_train[idxs]}
            outs = sess.run([loss, acc, train_op], feed_dict=feed_dict)
            logger.store(Loss=outs[0], Acc=outs[1])

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state(state_dict=dict(), itr=None)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('Acc', with_min_and_max=True)
        logger.log_tabular('Loss', average_only=True)
        logger.log_tabular('TotalGradientSteps', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
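
For reference, the tf.one_hot + softmax_cross_entropy loss and the accuracy op used above correspond to the plain NumPy calculation sketched below; the function names are illustrative and the snippet is only meant to make the logged quantities explicit.

import numpy as np

def softmax_cross_entropy(logits, labels, num_classes=10):
    # labels are integer class ids; mean cross-entropy over the batch
    one_hot = np.eye(num_classes)[labels]
    shifted = logits - logits.max(axis=1, keepdims=True)   # numerical stability
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -np.mean(np.sum(one_hot * log_probs, axis=1))

def accuracy(logits, labels):
    return np.mean(np.argmax(logits, axis=1) == labels)

logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 3.0]])
labels = np.array([0, 2])
print(softmax_cross_entropy(logits, labels, num_classes=3), accuracy(logits, labels))
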