Example #1
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)


    # YOUR_CODE_HERE: build the placeholders and the symbolic policy graph (a sketch follows this example)


    sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        # YOUR_CODE_HERE: collect rollouts, compute advantages, fit the value function,
        # run the policy update, and evaluate kl and ent for the diagnostics below

        if kl > desired_kl * 2: 
            stepsize /= 1.5
            print('stepsize -> %s'%stepsize)
        elif kl < desired_kl / 2: 
            stepsize *= 1.5
            print('stepsize -> %s'%stepsize)
        else:
            print('stepsize OK')


        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
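
A minimal sketch of what the first YOUR_CODE_HERE placeholder above typically builds for this task: a diagonal-Gaussian policy over the Pendulum action, the placeholders used by the surrogate loss, and the kl/ent diagnostics that the logging code expects. The hidden-layer size, the toy dimensions, and the sy_old* placeholder names are illustrative assumptions, not part of the original listing.

import tensorflow as tf

# Toy dimensions standing in for Pendulum-v0's spaces (illustrative only).
ob_dim, ac_dim = 3, 1

sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
sy_ac_n = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)
sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

# Diagonal-Gaussian policy: an MLP mean plus a state-independent log-std.
sy_h1 = tf.layers.dense(sy_ob_no, 32, activation=tf.nn.relu)
sy_mean_na = tf.layers.dense(sy_h1, ac_dim)
sy_logstd_a = tf.get_variable("logstd", shape=[ac_dim],
                              initializer=tf.zeros_initializer())
sy_std_a = tf.exp(sy_logstd_a)

# Reparameterized sampling op used during rollouts.
sy_sampled_ac = sy_mean_na + sy_std_a * tf.random_normal(tf.shape(sy_mean_na))

# Log-probability of the actions that were actually taken.
dist = tf.distributions.Normal(loc=sy_mean_na, scale=sy_std_a)
sy_logprob_n = tf.reduce_sum(dist.log_prob(sy_ac_n), axis=1)

# Diagnostics: KL(old || new) and entropy, with the previous policy's outputs
# fed back in through placeholders after each update.
sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], dtype=tf.float32)
sy_oldlogstd_a = tf.placeholder(shape=[ac_dim], dtype=tf.float32)
old_dist = tf.distributions.Normal(loc=sy_oldmean_na,
                                   scale=tf.exp(sy_oldlogstd_a))
kl = tf.reduce_mean(tf.reduce_sum(
    tf.distributions.kl_divergence(old_dist, dist), axis=1))
ent = tf.reduce_mean(tf.reduce_sum(dist.entropy(), axis=1))
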
Example #2
File: ars.py  Project: zhan0903/ARS
    def train(self, num_iter):

        start = time.time()
        for i in range(num_iter):
            
            t1 = time.time()
            self.train_step()
            t2 = time.time()
            print('total time of one step', t2 - t1)           
            print('iter ', i,' done')

            # record statistics every 10 iterations
            if ((i + 1) % 10 == 0):
                
                rewards = self.aggregate_rollouts(num_rollouts=100, evaluate=True)
                w = ray.get(self.workers[0].get_weights_plus_stats.remote())
                np.savez(self.logdir + "/lin_policy_plus", w)
                
                print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("timesteps", self.timesteps)
                logz.dump_tabular()
                
            t1 = time.time()
            # get statistics from all workers
            for j in range(self.num_workers):
                self.policy.observation_filter.update(ray.get(self.workers[j].get_filter.remote()))
            self.policy.observation_filter.stats_increment()

            # make sure master filter buffer is clear
            self.policy.observation_filter.clear_buffer()
            # sync all workers
            filter_id = ray.put(self.policy.observation_filter)
            setting_filters_ids = [worker.sync_filter.remote(filter_id) for worker in self.workers]
            # waiting for sync of all workers
            ray.get(setting_filters_ids)
         
            increment_filters_ids = [worker.stats_increment.remote() for worker in self.workers]
            # waiting for increment of all workers
            ray.get(increment_filters_ids)            
            t2 = time.time()
            print('Time to sync statistics:', t2 - t1)
                        
        return 
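
Example #2 broadcasts the master's observation filter once with ray.put and then blocks on the object IDs returned by the workers' remote calls. A stripped-down sketch of that broadcast-and-sync pattern, with a toy Worker actor standing in for the ARS workers (the Worker class and the dict filter are illustrative, not from the project):

import ray

ray.init()

@ray.remote
class Worker(object):
    """Toy stand-in for an ARS worker that keeps a local copy of a filter."""
    def __init__(self):
        self.filter = None

    def sync_filter(self, new_filter):
        # Replace the local filter with the copy broadcast by the master.
        self.filter = new_filter
        return True

workers = [Worker.remote() for _ in range(4)]

master_filter = {"mean": 0.0, "std": 1.0}    # placeholder for the real filter object
filter_id = ray.put(master_filter)           # put the object in the store once
sync_ids = [w.sync_filter.remote(filter_id) for w in workers]
ray.get(sync_ids)                            # block until every worker has synced
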
Example #3
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="advn", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#
    sy_logprob_n = None
    if discrete:
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "mlp",
                                 n_layers=n_layers,
                                 size=size)(sy_ob_no)
        sy_sampled_ac = tf.squeeze(tf.multinomial(
            sy_logits_na, 1))  # Hint: Use the tf.multinomial op
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
    else:
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            "mlp",
                            n_layers=n_layers,
                            size=size)(sy_ob_no)

        #will learn this when doing the loss
        sy_logstd = tf.get_variable("logstd", shape=[
            ac_dim
        ])  # logstd should just be a trainable variable, not a network output.

        # Alternatively, one could sample by feeding the mean and std through a
        # distribution op, but the reparameterization trick is simpler and more efficient.
        sy_sampled_ac = tf.squeeze(
            sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean)),
            axis=[1])

        # Hint: Use the log probability under a multivariate gaussian.
        sy_logprob_n = -tf.contrib.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    # Note: sy_logprob_n above holds the NEGATIVE log-probability of the taken
    # actions (in both branches), so no extra minus sign is needed here.
    loss = tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        #baseline_update_op = TODO
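        # One possible way to fill in the placeholder above (a sketch that
        # mirrors the baseline set-up in Example #4, not part of the original):
        # sy_baseline_target_n = tf.placeholder(shape=[None], dtype=tf.float32)
        # baseline_loss = tf.losses.mean_squared_error(sy_baseline_target_n,
        #                                              baseline_prediction)
        # baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
        #     baseline_loss)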

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    after_loss = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 20 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = 0

        rewards_by_episode = [path['reward'] for path in paths]

        if not reward_to_go:
            q_n = np.concatenate([[
                sum([
                    reward_path[i] * gamma**i
                    for i in range(len(reward_path))
                ])
            ] * len(reward_path) for reward_path in rewards_by_episode])
        else:
            q_n = np.concatenate([[
                sum([
                    reward_path[j] * gamma**(j - i)
                    for j in range(i, len(reward_path))
                ]) for i in range(len(reward_path))
            ] for reward_path in rewards_by_episode])

        assert len(q_n) == len(ob_no)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            assert False
            #b_n = TODO
            #adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_mean = np.mean(adv_n)
            adv_std = np.std(adv_n)
            adv_n = (adv_n - adv_mean) / adv_std

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            pass
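            # A sketch of the fit step (hypothetical; it assumes the
            # baseline_update_op and sy_baseline_target_n placeholders sketched
            # in the Optional Baseline section above, as in Example #4):
            # target_n = (q_n - q_n.mean()) / (q_n.std() + 1e-8)
            # sess.run(baseline_update_op,
            #          feed_dict={sy_ob_no: ob_no, sy_baseline_target_n: target_n})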

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.
        before_loss = after_loss  # the "before" loss logged below is the value from the previous iteration's batch
        _, after_loss = sess.run([update_op, loss],
                                 feed_dict={
                                     sy_ob_no: ob_no,
                                     sy_ac_na: ac_na,
                                     sy_adv_n: adv_n
                                 })

        #after_loss = sess.run([loss], feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na,sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("Loss before", before_loss)
        logz.log_tabular("Loss_after", after_loss)
        logz.dump_tabular()
        logz.pickle_tf_vars()
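
The nested list comprehensions used above to build q_n recompute the discount factors at every time step, which is O(T^2) per trajectory. An equivalent O(T) reward-to-go computation with a single reverse pass (a standalone sketch, not taken from the original code):

import numpy as np

def discounted_reward_to_go(rewards, gamma):
    """Return Q_t = sum_{t'>=t} gamma**(t'-t) * r_{t'} for each t, in O(T)."""
    q = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# Example: rewards [1, 1, 1] with gamma = 0.5 -> [1.75, 1.5, 1.0]
print(discounted_reward_to_go(np.array([1.0, 1.0, 1.0]), 0.5))
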
Example #4
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        n_job=1,
        epoch=1,
        gae_lambda=None,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "nn_policy", n_layers, size)
        # Hint: Use the tf.multinomial op
        sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), 1)
        sy_logprob_na = tf.nn.log_softmax(sy_logits_na)
        sy_index_n = tf.stack([tf.range(tf.shape(sy_logits_na)[0]), sy_ac_na],
                              1)
        sy_logprob_n = tf.gather_nd(sy_logprob_na, sy_index_n)
    else:
        # YOUR_CODE_HERE
        sy_mu_na = build_mlp(sy_ob_no, ac_dim, "nn_mu", n_layers, size)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable("nn_logstd",
                                    shape=[1, ac_dim],
                                    initializer=tf.zeros_initializer())
        norm_dist = tf.distributions.Normal(sy_mu_na, tf.exp(sy_logstd))
        sy_sampled_ac = norm_dist.sample()
        # Hint: Use the log probability under a multivariate gaussian.
        sy_logprob_n = tf.reduce_sum(norm_dist.log_prob(sy_ac_na), 1)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    loss = -tf.reduce_sum(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline or gae_lambda:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        baseline_target = tf.placeholder(tf.float32, [None])
        baseline_loss = tf.losses.mean_squared_error(baseline_target,
                                                     baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    # Create environments
    envs = MultiEnv(env_name, n_job)

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Start timer for sample timing
        time_start = time.time()
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            observations = envs.reset()
            paths_done = [False] * n_job
            paths_observations = [[] for _ in range(n_job)]
            paths_actions = [[] for _ in range(n_job)]
            paths_rewards = [[] for _ in range(n_job)]
            steps = 0
            while True:
                if animate_this_episode:
                    envs.render()
                    time.sleep(0.05)
                # Append observations
                for i in range(n_job):
                    if not paths_done[i]:
                        paths_observations[i].append(observations[i])
                # Get actions from current policy
                actions = sess.run(sy_sampled_ac,
                                   feed_dict={sy_ob_no: observations})
                for i in range(n_job):
                    if not paths_done[i]:
                        paths_actions[i].append(actions[i])
                # Run step
                observations, rewards, path_done_next = envs.step(actions)
                # Append rewards
                for i in range(n_job):
                    if not paths_done[i]:
                        paths_rewards[i].append(rewards[i])
                steps += 1
                paths_done = path_done_next
                if np.all(paths_done) or steps > max_path_length:
                    break
            # Append paths
            for i in range(n_job):
                path = {
                    "observation": np.array(paths_observations[i]),
                    "reward": np.array(paths_rewards[i]),
                    "action": np.array(paths_actions[i])
                }
                paths.append(path)
                timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Get sample time
        time_used = time.time() - time_start

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = np.asanyarray([])
        for path in paths:
            reward = path['reward']
            length = len(reward)
            if not reward_to_go:
                q_path = np.sum(reward *
                                np.logspace(0, length - 1, length, base=gamma))
                q_n = np.append(q_n, np.ones_like(reward) * q_path)
            else:
                q_path = np.zeros_like(reward)
                # Accumulate reward from right to left
                temp = reward.copy()
                for t in range(length):
                    q_path += np.pad(temp[t:], (0, t), 'constant')
                    temp *= gamma
                q_n = np.append(q_n, q_path)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline or gae_lambda:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = sess.run(baseline_prediction, {sy_ob_no: ob_no})
            # Rescale to normal distribution
            b_std = np.std(b_n)
            b_mean = np.mean(b_n)
            b_n = (b_n - b_mean) / b_std
            # Rescale to Q-value distribution
            q_std = np.std(q_n)
            q_mean = np.mean(q_n)
            b_n = q_mean + b_n * q_std

            if gae_lambda:  # Generalized advantage estimator
                adv_n = np.zeros_like(q_n)
                index_start = 0
                for path in paths:
                    reward = path['reward']
                    length = len(reward)
                    index_end = index_start + length
                    path_v = b_n[index_start:index_end]
                    path_v_next = b_n[index_start + 1:index_end]
                    path_v_next = np.append(path_v_next, 0)
                    delta = reward + gamma * path_v_next - path_v
                    # Accumulate critic from right to left
                    temp = delta.copy()
                    for t in range(length):
                        adv_n[index_start:index_end] += np.pad(
                            temp[t:], (0, t), 'constant')
                        temp *= gamma * gae_lambda
                    index_start = index_end
            else:  # Baseline estimator
                adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_std = np.std(adv_n) + 1e-5
            adv_mean = np.mean(adv_n)
            adv_n = (adv_n - adv_mean) / adv_std

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline or gae_lambda:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            target_n = (q_n - q_mean) / q_std
            feed_dict = {sy_ob_no: ob_no, baseline_target: target_n}
            for i in range(epoch):
                sess.run(baseline_update_op, feed_dict)

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        feed_dict = {
            sy_ob_no: ob_no,
            sy_ac_na: ac_na,
            sy_adv_n: adv_n / len(paths)
        }
        # Save the loss function before the update
        loss_before = sess.run(loss, feed_dict)
        # Train
        for i in range(epoch):
            sess.run(update_op, feed_dict)
        # Save the loss function after the update
        loss_after = sess.run(loss, feed_dict)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("LossBeforeUpdate", loss_before)
        logz.log_tabular("LossAfterUpdate", loss_after)
        logz.log_tabular("LossUpdated", loss_after - loss_before)
        logz.log_tabular("SampleTime", time_used)
        logz.dump_tabular()
        logz.pickle_tf_vars()
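
The GAE loop in Example #4 accumulates padded copies of the TD residuals; the same estimator can be written as a single reverse recursion, A_t = delta_t + gamma * lambda * A_{t+1}. A standalone sketch under the same convention as the listing (the value after the last step of a path is taken to be 0):

import numpy as np

def gae_advantages(rewards, values, gamma, lam):
    """Generalized advantage estimation for a single path.

    rewards: per-step rewards, shape (T,)
    values:  baseline predictions V(s_t), shape (T,); V(s_{T+1}) is assumed 0
    """
    T = len(rewards)
    adv = np.zeros(T, dtype=np.float64)
    last_adv = 0.0
    for t in reversed(range(T)):
        next_value = values[t + 1] if t + 1 < T else 0.0
        delta = rewards[t] + gamma * next_value - values[t]
        last_adv = delta + gamma * lam * last_adv
        adv[t] = last_adv
    return adv
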
Example #5
def deepq(env,
          max_episode_steps,
          n_experiments,
          n_total_steps,
          seed,
          gamma,
          learning_rate,
          conv_sizes,
          fc_sizes,
          n_init_buffer_size,
          n_buffer_size,
          batch_size,
          epsilon_start,
          epsilon_end,
          exploration_fraction,
          update_target_freq,
          logging_dir="log",
          isRenderding=True,
          isRecordingVideo=True,
          recordingVideo_dir="video",
          rec_per_episodes=100,
          chckp_dir="checkpoint",
          checkpt_save_freq=100,
          test_name="test",
          device="CPU"):
    # Get environment name
    env_name = env.spec.id
    if max_episode_steps > 0:
        env._max_episode_steps = max_episode_steps
    print("Env max_step_per_episode:{}".format(env._max_episode_steps))

    # Identify states and action dimensions
    isDiscrete = isinstance(env.action_space, gym.spaces.Discrete)
    n_actions = env.action_space.n if isDiscrete else env.action_space.shape[0]

    # State processor
    state_shape = env.observation_space.shape
    state_size = [84, 84, 4]  # list(state_shape)
    state_processor = StateProcessor(input_shape=state_shape,
                                     output_shape=state_size[:-1])

    if device in {"gpu", "GPU"}:
        tf_device = '/device:GPU:0'
    else:
        tf_device = '/device:CPU:0'

    with tf.device(tf_device):
        value_model = ValueEstimator(scope="q_func",
                                     state_size=state_size,
                                     action_size=n_actions,
                                     conv_sizes=conv_sizes,
                                     fc_sizes=fc_sizes,
                                     learning_rate=learning_rate,
                                     isDiscrete=isDiscrete)

        target_value_model = ValueEstimator(scope="t_q_func",
                                            state_size=state_size,
                                            action_size=n_actions,
                                            conv_sizes=conv_sizes,
                                            fc_sizes=fc_sizes,
                                            learning_rate=learning_rate,
                                            isDiscrete=isDiscrete)

    init_time = time.strftime("%d-%m-%Y_%H-%M-%S")
    for exp in range(n_experiments):

        # Set random seed
        rand_seed = seed + 10 * exp
        tf.set_random_seed(rand_seed)
        np.random.seed(rand_seed)

        # Global step
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Init TF session
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # TF saver
            saver = tf.train.Saver()
            chckp_dir = os.path.join(chckp_dir, env_name, test_name, init_time,
                                     str(exp))
            if not os.path.exists(chckp_dir):
                os.makedirs(chckp_dir)
            latest_checkpoint = tf.train.latest_checkpoint(
                checkpoint_dir=chckp_dir)
            if latest_checkpoint:
                print(
                    "Loading model checkpoint {}...".format(latest_checkpoint))
                saver.restore(sess, latest_checkpoint)

            # Configure output directory for logging
            # Data logging paths
            if isRecordingVideo:
                recordingVideo_dir = os.path.join(recordingVideo_dir, env_name,
                                                  test_name, init_time,
                                                  str(exp))
                if not os.path.exists(recordingVideo_dir):
                    os.makedirs(recordingVideo_dir)

            logging_dir = os.path.join(logging_dir, env_name, test_name,
                                       init_time)
            if not os.path.exists(logging_dir):
                os.makedirs(logging_dir)
            logz.configure_output_dir(os.path.join(logging_dir, str(exp)))

            # Log experimental parameters
            args = inspect.getargspec(deepq)[0]
            locals_ = locals()
            params = {
                k: locals_[k] if k in locals_
                and isinstance(locals_[k], (int, str, float)) else None
                for k in args
            }
            logz.save_params(params)

            print("Parameter Lists")
            for param in params:
                if params[param]:
                    print(param + ": {}".format(params[param]))

            # Global step
            total_step = tf.train.global_step(sess, global_step)

            # Epsilon decaying schedule
            epsilons = np.linspace(epsilon_start, epsilon_end,
                                   int(exploration_fraction * n_total_steps))

            # The policy we're following
            policy = make_epsilon_greedy_policy(value_model,
                                                env.action_space.n)

            # Create a replay buffer
            replay_memory = []
            print("Collecting initial replay buffer")
            state = env.reset()
            state = state_processor.process(
                state, sess)  # TODO: DO NOT PROCESS IMAGE TO GRAYSCALE
            state = np.stack([state] * 4,
                             axis=2)  # Sequential images (4 frames)
            for idx in range(n_init_buffer_size):
                action_probs = policy(state, epsilons[0], sess)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
                next_state, reward, done, _ = env.step(action)
                next_state = state_processor.process(next_state, sess)

                # Append next_state
                next_state = np.append(state[:, :, 1:],
                                       np.expand_dims(next_state, axis=2),
                                       axis=2)
                replay_memory.append(
                    Transition(state, action, reward, next_state, done))

                if done:
                    state = env.reset()
                    state = state_processor.process(state, sess)
                    state = np.stack([state] * 4, axis=2)
                else:
                    state = next_state

            print("==========================================")
            print("Exp: ", exp)
            print("==========================================")

            # Stat variables
            episode_reward_sum = 0
            episode_length = 0
            loss_sum = 0
            loss_steps = 0
            ep = 0  # Episode

            # Reset env variables
            state = env.reset()
            state = state_processor.process(
                state, sess)  # TODO: DO NOT PROCESS IMAGE TO GRAYSCALE
            state = np.stack([state] * 4,
                             axis=2)  # Sequential images (4 frames)

            video_recorder = None
            if isRenderding and isRecordingVideo and (ep == 0 or ep %
                                                      rec_per_episodes == 0):
                video_recorder = VideoRecorder(
                    env,
                    os.path.join(
                        recordingVideo_dir,
                        "vid_{}_{}_{}_{}.mp4".format(env_name, exp, test_name,
                                                     ep)),
                    enabled=True)
                print("Recording a video of this episode {} in experiment {}".
                      format(ep, exp))
            # Iterate total n steps of simulation across numerous episodes
            for total_step in range(n_total_steps):

                # Epsilon for this time step
                epsilon = epsilons[min(
                    total_step,
                    int(exploration_fraction * n_total_steps) - 1)]

                # Update target Q-function with online Q-function
                if total_step % update_target_freq == 0:
                    copy_model_parameters(value_model, target_value_model,
                                          sess)
                    print("Copied model parameters to target network.")

                # Take a step
                action_probs = policy(state, epsilon, sess)
                action = np.random.choice(np.arange(len(action_probs)),
                                          p=action_probs)
                next_state, reward, done, _ = env.step(action)
                next_state = state_processor.process(next_state, sess)
                next_state = np.append(state[:, :, 1:],
                                       np.expand_dims(next_state, axis=2),
                                       axis=2)

                if video_recorder and isRenderding:
                    env.render()
                    video_recorder.capture_frame()

                # Check whether replay buffer is full
                if len(replay_memory) == n_buffer_size:
                    replay_memory.pop(0)

                # Save transition to replay buffer
                replay_memory.append(
                    Transition(state, action, reward, next_state, done))

                # Update online Q-function
                # Sample randomized minibatch from replay buffer
                samples = random.sample(replay_memory, batch_size)
                states_batch, actions_batch, reward_batch, next_state_batch, done_batch = map(
                    np.array, zip(*samples))

                # Calculate action-values from double Q-functions
                q_values_next = value_model.predict(
                    next_state_batch,
                    sess)  # Q values per each possible actions
                selected_actions = np.argmax(
                    q_values_next, axis=1
                )  # Use Q-function (not target) to get the max action

                # Get max action-value using max action from online Q-values
                target_q_values_next = target_value_model.predict(
                    next_state_batch, sess)

                selected_target_values = gamma * target_q_values_next[
                    np.arange(batch_size), selected_actions]
                targets_batch = reward_batch + np.invert(done_batch).astype(
                    np.float32) * selected_target_values

                # Update Q(action-value) function
                states_batch = np.array(states_batch)
                loss = value_model.update(states_batch,
                                          actions_batch,
                                          targets_batch,
                                          sess=sess)
                loss_sum += loss
                loss_steps += 1

                if done:
                    # Close video recorder
                    if video_recorder:
                        video_recorder.close()
                        video_recorder = None

                    print(
                        "===================== End of Episode:{} @ step:{} ====================="
                        .format(ep, total_step))

                    # Log progress
                    logz.log_tabular("Episode", ep)
                    logz.log_tabular("Episode length", episode_length)
                    logz.log_tabular("Total steps", total_step)
                    logz.log_tabular("Mean rewards",
                                     episode_reward_sum / episode_length)
                    logz.dump_tabular()
                    logz.pickle_tf_vars()

                    # Reset env and stat variables
                    state = env.reset()
                    state = state_processor.process(
                        state, sess)  # TODO: DO NOT PROCESS IMAGE TO GRAYSCALE
                    state = np.stack([state] * 4,
                                     axis=2)  # Sequential images (4 frames)

                    episode_reward_sum = 0
                    episode_length = 0
                    loss_sum = 0
                    loss_steps = 0
                    ep += 1

                    # Save model per episode
                    if ep % checkpt_save_freq == 0 or ep == 0:
                        saver.save(tf.get_default_session(),
                                   chckp_dir,
                                   global_step=total_step)

                    # Recording videos
                    if video_recorder:
                        video_recorder.close()
                    if isRenderding and isRecordingVideo and (
                            ep == 0 or ep % rec_per_episodes == 0):
                        video_recorder = VideoRecorder(
                            env,
                            os.path.join(
                                recordingVideo_dir,
                                "vid_{}_{}_{}_{}.mp4".format(
                                    env_name, exp, test_name, ep)),
                            enabled=True)
                        print(
                            "Recording a video of this episode {} in experiment {}"
                            .format(ep, exp))

                else:
                    # Update episode stats
                    episode_reward_sum += reward
                    episode_length += 1
                    state = next_state

            print(
                "===================== End of Last Episode:{} @ step:{} ====================="
                .format(ep, total_step))

            # Log progress
            logz.log_tabular("Episode", ep)
            logz.log_tabular("Episode length", episode_length)
            logz.log_tabular("Total steps", total_step)
            logz.log_tabular("Mean rewards",
                             episode_reward_sum / episode_length)
            logz.dump_tabular()
            logz.pickle_tf_vars()

            # Save session
            saver.save(tf.get_default_session(),
                       chckp_dir,
                       global_step=total_step)
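
Example #5 refreshes the target Q-network every update_target_freq steps through a copy_model_parameters helper that is not shown in the listing. A sketch of how such a copy is commonly implemented in TF1, assuming each ValueEstimator keeps its variable scope name in a `scope` attribute (an assumption about that class, not confirmed by the listing):

import tensorflow as tf

def copy_model_parameters(src_model, dst_model, sess):
    """Assign every trainable variable in src_model's scope to dst_model's scope."""
    src_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=src_model.scope),
                      key=lambda v: v.name)
    dst_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=dst_model.scope),
                      key=lambda v: v.name)
    assign_ops = [dst.assign(src) for src, dst in zip(src_vars, dst_vars)]
    sess.run(assign_ops)
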
Example #6
def train_ppo(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
              max_path_length, learning_rate, num_target_updates,
              num_grad_steps_per_target_update, animate, logdir,
              normalize_advantages, seed, n_layers, size):
    start = time.time()

    # ========================================================================================#
    # Set Up Logger
    # ========================================================================================#
    setup_logger(logdir, locals())

    # ========================================================================================#
    # Set Up Env
    # ========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # ========================================================================================#
    # Initialize Agent
    # ========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_advantage_args, seed)  # estimate_return_args

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])
        logp = np.concatenate([path["logp"] for path in paths])

        # Call tensorflow operations to:
        # (1) update the critic, by calling agent.update_critic
        # (2) use the updated critic to compute the advantage, by calling agent.estimate_advantage
        # (3) use the estimated advantage values to update the actor, by calling agent.update_actor
        # YOUR CODE HERE
        closs = agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        aloss = agent.update_actor(ob_no, ac_na, adv, logp)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("closs", closs)
        logz.log_tabular("aloss", aloss)
        logz.dump_tabular()
        logz.pickle_tf_vars()
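
The Agent class used by train_ppo is not shown in this listing. As a rough illustration of step (2) above (a sketch under our own assumptions, not the Agent's actual implementation), the advantage in this actor-critic setup is typically the one-step bootstrapped TD error of the critic:

import numpy as np

def estimate_advantage_sketch(v_s, v_next, re_n, terminal_n, gamma, normalize=True):
    # v_s, v_next: critic predictions V(s_t) and V(s_{t+1}); terminal_n zeroes the bootstrap at episode ends.
    adv_n = re_n + gamma * v_next * (1.0 - terminal_n) - v_s
    if normalize:
        adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
    return adv_n
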
Example #7
0
def main_pendulum(logdir,
                  seed,
                  n_iter,
                  gamma,
                  min_timesteps_per_batch,
                  initial_stepsize,
                  desired_kl,
                  vf_type,
                  vf_params,
                  animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)

    #YOUR_CODE_HERE

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)  # batch of observations
    sy_ac_n = tf.placeholder(
        shape=[None, ac_dim], name="ac", dtype=tf.float32
    )  # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv",
                              dtype=tf.float32)  # advantage function estimate

    # a network mapping state to probability of action
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1",
                        weight_init=normc_initializer(1.0)))  # hidden layer
    sy_mean_na = dense(sy_h1,
                       ac_dim,
                       "mean",
                       weight_init=normc_initializer(0.1))  # mean output
    sy_logstd_a = tf.get_variable(
        "logstdev", [ac_dim], initializer=tf.zeros_initializer())  # log std

    # sample the action and calculate its log probability
    U = tf.random_normal([tf.shape(sy_ob_no)[0], ac_dim], 0.0,
                         1.0)  # samples from a standard normal distribution
    sy_sampled_ac = (
        U * tf.exp(sy_logstd_a) + sy_mean_na
    )[0]  # convert the standard normal sample to one with the given mean and std; used when stepping the env, not for the policy gradient
    #sy_logprob_n = -(sy_ac_n - sy_mean_na)**2/tf.exp(2*sy_logstd_a) - tf.log(2*np.pi)/2 - sy_logstd_a
    sy_logprob_n = tf.reduce_sum(
        -(sy_ac_n - sy_mean_na)**2 / tf.exp(2 * sy_logstd_a) / 2 - sy_logstd_a,
        axis=1)
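    # Note: the constant -0.5*log(2*pi) term per action dimension is dropped here; it does not affect the policy gradient.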

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_oldmean_na = tf.placeholder(shape=[None, ac_dim],
                                   name="oldmean",
                                   dtype=tf.float32)  # mean before update
    sy_oldlogstd_na = tf.placeholder(shape=[ac_dim],
                                     name="oldstd",
                                     dtype=tf.float32)  # log std before update
    sy_n = tf.shape(sy_ob_no)[0]

    # KL divergence
    sy_kl = tf.reduce_sum(sy_logstd_a-sy_oldlogstd_na - 0.5 + (tf.exp(sy_oldlogstd_na*2) + (sy_mean_na - \
                                 sy_oldmean_na)**2)/(2*tf.exp(sy_logstd_a*2))) / tf.to_float(sy_n)
    # entropy
    sy_ent = tf.reduce_sum(sy_logstd_a +
                           0.5 * tf.log(2 * np.pi * np.e)) / tf.to_float(sy_n)

    # end of your code

    sy_surr = -tf.reduce_mean(
        sy_adv_n * sy_logprob_n
    )  # Loss function that we'll differentiate to get the policy gradient

    sy_stepsize = tf.placeholder(
        shape=[], dtype=tf.float32
    )  # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        #YOUR_CODE_HERE

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0)
                                    and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {
                "observation": np.array(obs),
                "terminated": terminated,
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, old_mean_na, old_logstd_na = sess.run(
            [update_op, sy_mean_na, sy_logstd_a],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_n: ac_n,
                sy_adv_n: standardized_adv_n,
                sy_stepsize: stepsize
            })
        kl, ent = sess.run(
            [sy_kl, sy_ent],
            feed_dict={
                sy_ob_no: ob_no,
                sy_oldmean_na: old_mean_na,
                sy_oldlogstd_na: old_logstd_na
            })

        # end of your code

        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean",
                         np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean",
                         np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter",
                         explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
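
The discount helper called above is not defined in this listing. A common implementation of the discounted cumulative sum it is expected to return (our sketch, assuming a 1-D reward array) is:

import scipy.signal

def discount(x, gamma):
    # out[t] = sum_{t' >= t} gamma**(t' - t) * x[t']
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
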
Example #8
0
def train_PG(
        exp_name,
        #env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        learning_rate,
        reward_to_go,
        animate,
        logdir,
        normalize_advantages,
        nn_baseline,
        seed,
        n_layers,
        size,
        pg_step):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    #setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    img = io.imread("Clear.png") # Starting image
    img = color.rgb2gray(img)
    env = MRILib(img, 'SyntheticImagesRecognizer_100K.hdf5', dim=2)

    # Set random seeds
    # tf.set_random_seed(seed)
    # np.random.seed(seed)
    # env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or 100


    # Observation and action sizes
    ob_dim = 4
    ac_dim = 4

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': True,
        'size': size,
        'learning_rate': learning_rate,
        'pg_step': pg_step
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        q_n, adv_n = agent.estimate_return(ob_no, re_n)
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()

        if args.log:
            logz.pickle_tf_vars()

    fig, axarr = plt.subplots(1, 2)
    #normalized = tmp / np.sum(tmp, axis = None)
    img = scipy.signal.convolve2d(color.rgb2gray(io.imread("Clear.png")), env.filter)

    #img = skimage.restoration.unsupervised_wiener(env.image, normalized)[0]
    histmatched = skimage.exposure.equalize_hist(img)
    axarr[0].imshow(env.filter)
    axarr[1].imshow(img)
    plt.show()
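
Agent.estimate_return and Agent.update_parameters are not part of this listing. As a rough sketch of the reward-to-go Q estimate such an agent typically builds from re_n (our own illustration, not the repository's code):

import numpy as np

def reward_to_go_sketch(re_n, gamma):
    # re_n: list of per-path reward arrays; returns one flat array of discounted reward-to-go values.
    q_n = []
    for rewards in re_n:
        q = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running
            q[t] = running
        q_n.append(q)
    return np.concatenate(q_n)
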
Example #9
0
def train_PG(logdir, path, sim_mode=False):

    start = time.time()

    # Initialize the ROS/Sim Environment
    env = ros_env.Env(path, train_mode=True, sim_mode=sim_mode)

    # initialize the ROS agent
    agent = Agent(path, sim_mode=sim_mode)

    # Set Up Logger
    setup_logger(logdir, locals())

    # Set random seeds
    tf.set_random_seed(agent.seed)
    np.random.seed(agent.seed)

    # Maximum length for episodes
    max_path_length = agent.max_path_length

    # Observation and action sizes
    ob_dim = agent.ob_dim
    ac_dim = agent.ac_dim
    """
    Placeholders for batch observations/actions/advantages in policy gradient 
    loss function.
    See Agent.build_computation_graph for notation
    
    sy_ob_no: placeholder for observations
    sy_ac_na: placeholder for actions
    sy_adv_n: placeholder for advantages
    """

    sy_ob_no = tf.placeholder(shape=[None, agent.ob_dim],
                              name="ob",
                              dtype=tf.float32)

    sy_ac_na = tf.placeholder(shape=[None, agent.ac_dim],
                              name="ac",
                              dtype=tf.float32)

    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
    """ 
    The policy takes in an observation and produces a distribution 
    over the action space
    Constructs the symbolic operation for the policy network outputs,
    which are the parameters of the policy distribution p(a|s)
    """

    # output activations are left linear by not passing any arg
    sy_mean = build_mlp(sy_ob_no,
                        agent.ac_dim,
                        "policy-ddpg",
                        agent.n_layers,
                        agent.size,
                        activation=tf.tanh)
    #print sy_mean.name

    sy_logstd = tf.Variable(tf.zeros([1, agent.ac_dim]),
                            dtype=tf.float32,
                            name="logstd")
    """ 
    Constructs a symbolic operation for stochastically sampling from 
    the policy distribution
    
    use the reparameterization trick:
    The output from a Gaussian distribution with mean 'mu' and std 
    'sigma' is
    
    mu + sigma * z,         z ~ N(0, I)
    
    This reduces the problem to just sampling z. 
    (use tf.random_normal!)
    """

    sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(
        tf.shape(sy_mean))
    """ 
    We can also compute the logprob of the actions that were
    actually taken by the policy. This is used in the loss function.
    
    Constructs a symbolic operation for computing the log probability 
    of a set of actions that were actually taken according to the policy
    use the log probability under a multivariate gaussian.
    """

    action_normalized = (sy_ac_na - sy_mean) / tf.exp(sy_logstd)
    sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(action_normalized), axis=1)
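    # Note: only the squared, normalized error is kept; the -sum(logstd) and constant
    # -0.5*k*log(2*pi) terms of the full Gaussian log-density are omitted here.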

    #=================================================================#
    # Loss Function and Training Operation
    #=================================================================#

    loss = -tf.reduce_mean(tf.multiply(sy_logprob_n, sy_adv_n))
    update_op = tf.train.AdamOptimizer(agent.learning_rate).minimize(loss)

    #==============================================================#
    # Optional Baseline
    #
    # Define placeholders for targets, a loss function and an update op
    # for fitting a neural network baseline. These will be used to fit the
    # neural network baseline.
    #===============================================================#
    if agent.nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no,
                      1,
                      "nn_baseline",
                      n_layers=agent.n_layers,
                      size=agent.size))
        sy_target_n = tf.placeholder(shape=[None],
                                     name="sy_target_n",
                                     dtype=tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - sy_target_n)
        baseline_update_op = tf.train.AdamOptimizer(
            agent.learning_rate).minimize(baseline_loss)

    # tensorflow: config, session, variable initialization
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    #====================================================================#
    # Training Loop
    #====================================================================#

    total_timesteps = 0

    for itr in range(agent.n_iter):
        print("********** Iteration %i ************" % itr)
        itr_mesg = "Iteration started at "
        itr_mesg += time.strftime("%d-%m-%Y_%H-%M-%S")
        print(itr_mesg)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            steps = 0
            while True:
                #time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                #print sy_sampled_ac.name (add:0)
                #print sy_ob_no.name (ob:0)
                ac = ac[0]
                acs.append(ac)
                # returns obs, reward and done status
                ob, rew, done = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > agent.max_path_length:
                    break
            path = {
                "observation": np.array(obs, dtype=np.float32),
                "reward": np.array(rewards, dtype=np.float32),
                "action": np.array(acs, dtype=np.float32)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > agent.min_timesteps_per_batch:
                break

        total_timesteps += timesteps_this_batch
        # Build arrays for observation and action for the
        # policy gradient update by concatenating across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]
        """
        Monte Carlo estimation of the Q function.        
        Estimates the returns over a set of trajectories.
        
        Store the Q-values for all timesteps and all trajectories in a 
        variable 'q_n', like the 'ob_no' and 'ac_na' above. 
        """

        if agent.reward_to_go:
            q_n = []
            for path in paths:
                q = np.zeros(pathlength(path))
                q[-1] = path['reward'][-1]
                for i in reversed(range(pathlength(path) - 1)):
                    q[i] = path['reward'][i] + agent.gamma * q[i + 1]
                q_n.extend(q)
        else:
            q_n = []
            for path in paths:
                ret_tau = 0
                for i in range(pathlength(path)):
                    ret_tau += (agent.gamma**i) * path['reward'][i]
                q = np.ones(shape=[pathlength(path)]) * ret_tau
                q_n.extend(q)
        """
        Compute advantages by (possibly) subtracting a baseline from the 
        estimated Q values let sum_of_path_lengths be the sum of the 
        lengths of the paths sampled.
        """
        #===========================================================#
        #
        # Computing Baselines
        #===========================================================#
        if agent.nn_baseline:
            # If nn_baseline is True, use your neural network to predict
            # reward-to-go at each timestep for each trajectory, and save the
            # result in a variable 'b_n' like 'ob_no', 'ac_na', and 'q_n'.
            #
            # rescale the output from the nn_baseline to match the
            # statistics (mean and std) of the current batch of Q-values.

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            m1 = np.mean(b_n)
            s1 = np.std(b_n)
            m2 = np.mean(q_n)
            s2 = np.std(q_n)
            b_n = b_n - m1
            b_n = m2 + b_n * (s2 / (s1 + 1e-8))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #=========================================================#
        # Advantage Normalization
        #=========================================================#
        if agent.normalize_advantages:
            # On the next line, implement a trick which is known
            # empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean
            # zero and std=1.
            adv_n = preprocessing.scale(adv_n)
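            # preprocessing.scale standardizes to zero mean and unit variance,
            # i.e. adv_n = (adv_n - adv_n.mean()) / adv_n.std()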
        """ 
        Update the parameters of the policy and (possibly) the neural 
        network baseline, which is trained to approximate the value function
        """
        #========================================================#
        # Optimizing Neural Network Baseline
        #========================================================#
        if agent.nn_baseline:
            # If a neural network baseline is used, set up the targets and
            # the inputs for the baseline.
            #
            # Fit it to the current batch in order to use for the next
            # iteration. Use the baseline_update_op you defined earlier.
            #
            # Instead of trying to target raw Q-values directly,
            # rescale the targets to have mean zero and std=1.

            target_n = preprocessing.scale(q_n)
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_target_n: target_n,
                         sy_ob_no: ob_no
                     })

        #=================================================================#
        # Performing the Policy Update
        #=================================================================#

        # Call the update operation necessary to perform the policy
        # gradient update based on the current batch of rollouts.

        _, after_loss = sess.run([update_op, loss],
                                 feed_dict={
                                     sy_ob_no: ob_no,
                                     sy_ac_na: ac_na,
                                     sy_adv_n: adv_n
                                 })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("After-Loss", after_loss)
        logz.dump_tabular()
        logz.pickle_tf_vars()

        model_file = os.path.join(logdir, "model.ckpt")
        save_path = saver.save(sess, model_file)
        print("Model saved in file: %s" % save_path)

    env.close_env_log()
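
The loop above writes model.ckpt on every iteration. A minimal deployment sketch (our own illustration, assuming the same environment and placeholders as in train_PG above, i.e. env, sy_mean, sy_ob_no, agent and logdir exist) restores the checkpoint and steps the environment with the mean action instead of sampling:

saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, os.path.join(logdir, "model.ckpt"))
    ob = env.reset()
    for _ in range(agent.max_path_length):
        ac = sess.run(sy_mean, feed_dict={sy_ob_no: ob[None]})[0]
        ob, rew, done = env.step(ac)
        if done:
            break
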
Example #10
0
def train_PG(
    exp_name='',  # name of this parameter configuration
    env_name='CartPole-v0',
    n_iter=100,
    gamma=1.0,
    min_timesteps_per_batch=1000,
    max_path_length=None,
    learning_rate=5e-3,
    reward_to_go=True,
    animate=True,
    logdir=None,
    normalize_advantages=True,
    nn_baseline=False,
    seed=0,
    # network arguments
    n_layers=1,
    size=32,
    gae_lambda=-1.0,
    batch_epochs=1,
    model_tag='vanilla',
    # PPO parameter
    clip_ratio=0.2,
):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        scope_name = 'discrete'
        old_scope_name = 'discrete_old'
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, scope_name, n_layers, size)
        # The softmax that turns the logits into probabilities is folded into
        # sparse_softmax_cross_entropy_with_logits for efficiency, so sy_logits_na
        # is unnormalized; this does not affect sampling from the distribution.
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1),
                                   [-1])  # Hint: Use the tf.multinomial op
        # The negative sign keeps this consistent with the continuous case; the loss is negated as well.
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)

        old_logits_na = build_mlp(sy_ob_no, ac_dim, old_scope_name, n_layers,
                                  size)
        old_sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=old_logits_na)
    else:
        # YOUR_CODE_HERE
        scope_name = 'continuous'
        old_scope_name = 'continuous_old'
        sy_mean = build_mlp(sy_ob_no, ac_dim, scope_name, n_layers, size)
        # logstd should just be a trainable variable, not a network output.
        # ??? why (presumably so the std is state-independent rather than a function of the observation)
        sy_logstd = tf.get_variable('std', [ac_dim], dtype=tf.float32)
        sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean),
                                         mean=sy_mean,
                                         stddev=tf.exp(sy_logstd))  # exponentiate: sy_logstd holds the log of the std
        # Hint: Use the log probability under a multivariate gaussian.
        sy_logprob_n = tf.contrib.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na)

        old_sy_mean = build_mlp(sy_ob_no, ac_dim, old_scope_name, n_layers,
                                size)
        old_sy_logprob_n = tf.contrib.distributions.MultivariateNormalDiag(
            loc=old_sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na)

    old_network_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          old_scope_name)
    network_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope_name)
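    # Copy the current policy parameters into the "old" network, so that old_sy_logprob_n
    # evaluates log-probabilities under the pre-update policy for the PPO ratio.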
    param_assign_op = [
        tf.assign(old_value, new_value)
        for (old_value, new_value) in zip(old_network_param, network_param)
    ]

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#
    # Loss function that we'll differentiate to get the policy gradient.
    # ppo clip loss
    if model_tag == 'ppo':
        # Unlike tensorforce, applying stop_gradient here yields a zero gradient, which makes LossDelta zero.
        #old_log_prob = tf.stop_gradient(input=sy_logprob_n)
        prob_ratio = tf.exp(x=(sy_logprob_n - old_sy_logprob_n))
        # axis=1 cannot be specified here because the tensor is 1-D; the remaining dimension [?] is the batch size.
        prob_ratio = tf.reduce_mean(input_tensor=prob_ratio)
        clipped_prob_ratio = tf.clip_by_value(
            t=prob_ratio,
            clip_value_min=(1.0 - clip_ratio),
            clip_value_max=(1.0 + clip_ratio))
        loss = tf.reduce_mean(-tf.minimum(x=(prob_ratio * sy_adv_n),
                                          y=(clipped_prob_ratio * sy_adv_n)))
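        # Note: the clipping here is applied to the batch-averaged ratio; the standard PPO
        # objective clips each per-sample ratio before taking the element-wise minimum.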
    else:  #vanilla pg
        loss = tf.reduce_mean(-sy_logprob_n * sy_adv_n)

    #loss = tf.Print(loss, [loss, loss.shape], 'debug loss')
    tf.summary.scalar('loss', loss)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # ========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    # ========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        baseline_targets = tf.placeholder(shape=[None],
                                          name='baseline_targets',
                                          dtype=tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets)
        tf.summary.scalar('baseline_loss', baseline_loss)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        # Collect paths until we have enough timesteps
        # A path ends when the episode terminates or max_path_length is exceeded.
        # Each finished path is appended to paths; once the total number of timesteps in the
        # batch exceeds min_timesteps_per_batch, collection stops and training begins,
        # so every update is performed on complete trajectories.

        # The PG algorithm always samples actions from the current policy distribution;
        # there is no separate exploration scheme.
        # TODO: split observation collection and training into two processes so they do not wait on each other.
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # YOUR_CODE_HERE
        q_n = []
        reward_n = []
        for path in paths:
            reward = path['reward']
            max_step = len(reward)
            reward_n.extend(reward)
            # reward-to-go: discounted return starting from the current timestep t
            if reward_to_go:
                q = [
                    np.sum(
                        np.power(gamma, np.arange(max_step - t)) * reward[t:])
                    for t in range(max_step)
                ]
            else:  # Q estimate: use the discounted return of the whole trajectory at every timestep
                q = [
                    np.sum(np.power(gamma, np.arange(max_step)) * reward)
                    for t in range(max_step)
                ]
            q_n.extend(q)

        for epoch in range(batch_epochs):
            # ====================================================================================#
            #                           ----------SECTION 5----------
            # Computing Baselines
            # ====================================================================================#
            #print('run %d epoch' % epoch)
            if nn_baseline:
                # If nn_baseline is True, use your neural network to predict reward-to-go
                # at each timestep for each trajectory, and save the result in a variable 'b_n'
                # like 'ob_no', 'ac_na', and 'q_n'.
                #
                # Hint #bl1: rescale the output from the nn_baseline to match the statistics
                # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
                # #bl2 below.)
                b_n = sess.run(baseline_prediction,
                               feed_dict={sy_ob_no: ob_no})
                # b_n_norm = b_n - np.mean(b_n, axis=0) / (np.std(b_n, axis=0) + 1e-7)
                # Rescale b_n back to the scale of q_n, since the baseline below is trained on standardized targets.
                b_n = b_n * np.std(q_n, axis=0) + np.mean(q_n, axis=0)

                if gae_lambda > 0:
                    adv_n = lambda_advantage(reward_n, b_n, len(reward_n),
                                             gae_lambda * gamma)
                else:
                    adv_n = q_n - b_n
            else:
                adv_n = q_n.copy()

            # ====================================================================================#
            #                           ----------SECTION 4----------
            # Advantage Normalization
            # ====================================================================================#

            if normalize_advantages:
                # On the next line, implement a trick which is known empirically to reduce variance
                # in policy gradient methods: normalize adv_n to have mean zero and std=1.
                # YOUR_CODE_HERE
                adv_mean = np.mean(adv_n, axis=0)
                adv_std = np.std(adv_n, axis=0)
                adv_n = (adv_n - adv_mean) / (adv_std + 1e-7)

            # ====================================================================================#
            #                           ----------SECTION 5----------
            # Optimizing Neural Network Baseline
            # ====================================================================================#
            if nn_baseline:
                # ----------SECTION 5----------
                # If a neural network baseline is used, set up the targets and the inputs for the
                # baseline.
                #
                # Fit it to the current batch in order to use for the next iteration. Use the
                # baseline_update_op you defined earlier.
                #
                # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
                # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
                # Use the standardized q_n as the baseline's regression target.
                q_n_mean = np.mean(q_n, axis=0)
                q_n_std = np.std(q_n, axis=0)
                q_n = (q_n - q_n_mean) / (q_n_std + 1e-7)
                sess.run(baseline_update_op,
                         feed_dict={
                             sy_ob_no: ob_no,
                             baseline_targets: q_n
                         })

            # ====================================================================================#
            #                           ----------SECTION 4----------
            # Performing the Policy Update
            # ====================================================================================#

            # Call the update operation necessary to perform the policy gradient update based on
            # the current batch of rollouts.
            #
            # For debug purposes, you may wish to save the value of the loss function before
            # and after an update, and then log them below.
            # Evaluate the loss before and after the update so that the change can be logged below.
            feed_dict = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
            sess.run(param_assign_op, feed_dict)
            loss_1 = sess.run(loss, feed_dict)
            sess.run(update_op, feed_dict)
            loss_2 = sess.run(loss, feed_dict)

            # Log diagnostics
            returns = [path["reward"].sum() for path in paths]
            ep_lengths = [pathlength(path) for path in paths]
            logz.log_tabular("LossDelta", loss_1 - loss_2)
            logz.log_tabular("Time", time.time() - start)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(returns))
            logz.log_tabular("StdReturn", np.std(returns))
            logz.log_tabular("MaxReturn", np.max(returns))
            logz.log_tabular("MinReturn", np.min(returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            logz.dump_tabular()
            logz.pickle_tf_vars()
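
lambda_advantage, called in the nn_baseline branch above, is not defined in this listing. For reference, a standard generalized advantage estimation (GAE) computation for a single trajectory looks like the following (our sketch; the actual helper may differ, for instance it appears to receive the combined lambda*gamma factor as a single argument):

import numpy as np

def gae_sketch(rewards, values, gamma, lam):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), treating V after the final step as 0.
    values_next = np.append(values[1:], 0.0)
    deltas = rewards + gamma * values_next - values
    adv = np.zeros_like(deltas, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv
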
Example #11
0
    def train(self, train_db, val_db, test_db):
        ##################################################################
        ## LOG
        ##################################################################
        logz.configure_output_dir(self.cfg.model_dir)
        logz.save_config(self.cfg)

        ##################################################################
        ## Main loop
        ##################################################################
        start = time()
        min_val_loss = 100000000

        for epoch in range(self.epoch, self.cfg.n_epochs):
            ##################################################################
            ## Training
            ##################################################################
            torch.cuda.empty_cache()
            train_loss = self.train_epoch(train_db, epoch)

            ##################################################################
            ## Validation
            ##################################################################
            torch.cuda.empty_cache()
            val_loss = self.validate_epoch(val_db, epoch)
            # val_loss = train_loss

            ##################################################################
            ## Sample
            ##################################################################
            torch.cuda.empty_cache()
            self.sample_for_vis(epoch, test_db, self.cfg.n_samples)
            torch.cuda.empty_cache()
            ##################################################################
            ## Logging
            ##################################################################

            # mean validation loss for this epoch (used for checkpoint tracking below)
            current_val_loss = np.mean(val_loss)

            logz.log_tabular("Time", time() - start)
            logz.log_tabular("Iteration", epoch)
            logz.log_tabular("AverageTotalError", np.mean(train_loss[:, 0]))
            logz.log_tabular("AveragePredError", np.mean(train_loss[:, 1]))
            logz.log_tabular("AverageImageError", np.mean(train_loss[:, 2]))
            logz.log_tabular("AverageFeat0Error", np.mean(train_loss[:, 3]))
            logz.log_tabular("AverageFeat1Error", np.mean(train_loss[:, 4]))
            logz.log_tabular("AverageFeat2Error", np.mean(train_loss[:, 5]))
            logz.log_tabular("AverageFeat3Error", np.mean(train_loss[:, 6]))
            logz.log_tabular("AverageFeat4Error", np.mean(train_loss[:, 7]))
            logz.log_tabular("ValAverageTotalError", np.mean(val_loss[:, 0]))
            logz.log_tabular("ValAveragePredError", np.mean(val_loss[:, 1]))
            logz.log_tabular("ValAverageImageError", np.mean(val_loss[:, 2]))
            logz.log_tabular("ValAverageFeat0Error", np.mean(val_loss[:, 3]))
            logz.log_tabular("ValAverageFeat1Error", np.mean(val_loss[:, 4]))
            logz.log_tabular("ValAverageFeat2Error", np.mean(val_loss[:, 5]))
            logz.log_tabular("ValAverageFeat3Error", np.mean(val_loss[:, 6]))
            logz.log_tabular("ValAverageFeat4Error", np.mean(val_loss[:, 7]))
            logz.dump_tabular()

            ##################################################################
            ## Checkpoint
            ##################################################################
            if min_val_loss > current_val_loss:
                min_val_loss = current_val_loss
            self.save_checkpoint(epoch)
            torch.cuda.empty_cache()
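
In the loop above, min_val_loss is tracked but a checkpoint is written every epoch regardless. A common alternative (our sketch, assuming a trainer object exposing the same save_checkpoint method) saves only when the validation loss improves:

def maybe_save_best(trainer, epoch, current_val_loss, best_val_loss):
    # Save a checkpoint only on improvement; returns the updated best validation loss.
    if current_val_loss < best_val_loss:
        trainer.save_checkpoint(epoch)
        return current_val_loss
    return best_val_loss
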
Example #12
0
def run_experiment(exp_params, learner_params, discriminator_params):

    # Experiment parameters
    file_location = exp_params.get('expert_samples_location', 'expert_data')
    prior_file_location = exp_params.get('prior_samples_location', 'prior_data')
    env_name = exp_params.get('env_name', 'InvertedPendulum-v2')
    env_type = exp_params.get('env_type', 'expert')
    exp_name = exp_params.get('exp_name', '{}_{}'.format(env_name, env_type))
    exp_num = exp_params.get('exp_num', 0)
    epochs = exp_params.get('epochs', 100)
    test_runs_per_epoch = exp_params.get('test_runs_per_epoch', 10)
    steps_per_epoch = exp_params.get('steps_per_epoch', 1000)
    init_random_samples = exp_params.get('init_random_samples', 5000)
    training_starts = exp_params.get('training_starts', 0)
    episode_limit = exp_params.get('episode_limit', 200)
    return_threshold = exp_params.get('return_threshold', 1e4)
    visualize_collected_observations = exp_params.get('visualize_collected_observations', False)

    # Learner parameters
    l_type = learner_params.get('l_type', 'TD3')
    l_buffer_size = learner_params.get('l_buffer_size', 10000)
    l_exploration_noise = learner_params.get('l_exploration_noise', 0.2)
    l_learning_rate = learner_params.get('l_learning_rate', 1e-3)
    l_batch_size = learner_params.get('l_batch_size', 128)
    l_updates_per_step = learner_params.get('l_updates_per_step', 1)
    l_act_delay = learner_params.get('l_act_delay', 2)
    l_gamma = learner_params.get('l_gamma', 0.99)
    l_polyak = learner_params.get('l_polyak', 0.995)
    l_train_actor_noise = learner_params.get('l_train_actor_noise', 0.1)
    l_entropy_coefficient = learner_params.get('l_entropy_coefficient', 0.2)
    l_tune_entropy_coefficient = learner_params.get('l_tune_entropy_coefficient', True)
    l_target_entropy = learner_params.get('l_target_entropy', None)
    l_clip_actor_gradients = learner_params.get('l_clip_actor_gradients', False)

    # Discriminator parameters
    d_type = discriminator_params.get('d_type', 'latent')
    d_domain_constant = discriminator_params.get('d_domain_constant', 0.25)
    d_rew = discriminator_params.get('d_rew', 'mixed')
    d_rew_noise = discriminator_params.get('d_rew_noise', True)
    d_learning_rate = discriminator_params.get('d_learning_rate', 1e-3)
    d_updates_per_step = discriminator_params.get('d_updates_per_step', 1)
    d_stability_constant = discriminator_params.get('d_stability_constant', 0.0)
    d_e_batch_size = discriminator_params.get('d_e_batch_size', 64)
    d_l_batch_size = discriminator_params.get('d_l_batch_size', 64)
    d_sn_discriminator = discriminator_params.get('d_sn_discriminator', False)
    d_use_prior_data = discriminator_params.get('d_use_prior_data', False)
    d_pre_filters = discriminator_params.get('d_pre_filters', [32, 32, 1])
    d_hidden_units = discriminator_params.get('d_hidden_units', [32])
    d_pre_scale_stddev = discriminator_params.get('d_pre_scale_stddev', 1.0)
    n_expert_demos = discriminator_params.get('n_expert_demos', None)
    n_expert_prior_demos = discriminator_params.get('n_expert_prior_demos', None)
    n_agent_prior_demos = discriminator_params.get('n_agent_prior_demos', n_expert_prior_demos)

    if env_name == 'InvertedPendulum-v2':
        im_side = 32
        im_shape = [im_side, im_side]
        expert_prior_location = 'Expert' + env_name
        if env_type == 'expert':
            env = ExpertInvertedPendulumEnv()
            agent_prior_location = 'Expert' + env_name
        elif env_type == 'agent' or env_type == 'colored':
            env = AgentInvertedPendulumEnv()
            agent_prior_location = 'Agent' + env_name
        elif env_type == 'to_two':
            env = ExpertInvertedDoublePendulumEnv()
            agent_prior_location = 'ExpertInvertedDoublePendulum-v2'
        elif env_type == 'to_colored_two':
            env = AgentInvertedDoublePendulumEnv()
            agent_prior_location = 'AgentInvertedDoublePendulum-v2'
        else:
            raise NotImplementedError
    elif env_name == 'InvertedDoublePendulum-v2':
        im_side = 32
        im_shape = [im_side, im_side]
        expert_prior_location = 'ExpertInvertedDoublePendulum-v2'
        if env_type == 'expert':
            agent_prior_location = 'ExpertInvertedDoublePendulum-v2'
            env = ExpertInvertedDoublePendulumEnv()
        elif env_type == 'colored':
            env = AgentInvertedDoublePendulumEnv()
            agent_prior_location = 'AgentInvertedDoublePendulum-v2'
        elif env_type == 'to_one':
            agent_prior_location = 'ExpertInvertedPendulum-v2'
            env = ExpertInvertedPendulumEnv()
        elif env_type == 'agent' or env_type == 'to_colored_one':
            agent_prior_location = 'AgentInvertedPendulum-v2'
            env = AgentInvertedPendulumEnv()
        else:
            raise NotImplementedError
    elif env_name == 'ThreeReacherEasy-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'Expert' + env_name
        if env_type == 'expert':
            env = ThreeReacherEasyEnv()
            agent_prior_location = 'Expert' + env_name
        elif env_type == 'agent' or env_type == 'to_two':
            agent_prior_location = 'ExpertReacherEasy-v2'
            env = ReacherEasyEnv()
        elif env_type == 'tilted':
            agent_prior_location = 'AgentThreeReacherEasy-v2'
            env = Tilted3ReacherEasyEnv()
        elif env_type == 'to_tilted_two':
            env = TiltedReacherEasyEnv()
            agent_prior_location = 'AgentReacherEasy-v2'
        else:
            raise NotImplementedError
    elif env_name == 'ReacherEasy-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'ExpertReacherEasy-v2'
        if env_type == 'expert':
            env = ReacherEasyEnv()
            agent_prior_location = 'ExpertReacherEasy-v2'
        elif env_type == 'agent' or env_type == 'tilted':
            env = TiltedReacherEasyEnv()
            agent_prior_location = 'AgentReacherEasy-v2'
        elif env_type == 'to_three':
            env = ThreeReacherEasyEnv()
            agent_prior_location = 'ExpertThreeReacherEasy-v2'
        elif env_type == 'to_tilted_three':
            agent_prior_location = 'AgentThreeReacherEasy-v2'
            env = Tilted3ReacherEasyEnv()
        else:
            raise NotImplementedError
    elif env_name == 'Hopper-v2':
        im_side = 64
        im_shape = [im_side, im_side]
        expert_prior_location = 'Hopper-v2'
        if env_type == 'expert':
            env = HopperEnv()
            agent_prior_location = 'Hopper-v2'
        elif env_type == 'flexible':
            env = HopperFlexibleEnv()
            agent_prior_location = 'HopperFlexible-v2'
        else:
            raise NotImplementedError
    elif env_name == 'HalfCheetah-v2':
        im_side = 64
        im_shape = [im_side, im_side]
        expert_prior_location = 'HalfCheetah-v2'
        if env_type == 'expert':
            env = ExpertHalfCheetahEnv()
            agent_prior_location = 'HalfCheetah-v2'
        elif env_type == 'locked_legs':
            env = LockedLegsHalfCheetahEnv()
            agent_prior_location = 'LockedLegsHalfCheetah-v2'
        else:
            raise NotImplementedError
    elif env_name == 'Striker-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'Striker-v2'
        if env_type == 'expert':
            env = StrikerEnv()
            agent_prior_location = 'Striker-v2'
        elif env_type == 'to_human':
            env = StrikerHumanSimEnv()
            agent_prior_location = 'StrikerHuman-v2'
        else:
            raise NotImplementedError
    elif env_name == 'StrikerHumanSim-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'StrikerHumanSim-v2'
        if env_type == 'expert':
            env = StrikerHumanSimEnv()
            agent_prior_location = 'StrikerHumanSim-v2'
        elif env_type == 'to_robot':
            env = StrikerEnv()
            agent_prior_location = 'Striker-v2'
        else:
            raise NotImplementedError
    elif env_name == 'Pusher-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'Pusher-v2'
        if env_type == 'expert':
            env = PusherEnv()
            agent_prior_location = 'Pusher-v2'
        elif env_type == 'to_human':
            env = PusherHumanSimEnv()
            agent_prior_location = 'PusherHuman-v2'
        else:
            raise NotImplementedError
    elif env_name == 'PusherHumanSim-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'PusherHumanSim-v2'
        if env_type == 'expert':
            env = PusherHumanSimEnv()
            agent_prior_location = 'PusherHumanSim-v2'
        elif env_type == 'to_robot':
            env = PusherEnv()
            agent_prior_location = 'Pusher-v2'
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    expert_buffer = DemonstrationsReplayBuffer(
        load_expert_trajectories(env_name, file_location, visual_data=True,
                                 load_ids=True, max_demos=n_expert_demos))
    expert_visual_data_shape = expert_buffer.get_random_batch(1)['ims'][0].shape
    print('Visual data shape: {}'.format(expert_visual_data_shape))
    past_frames = expert_visual_data_shape[0]
    print('Past frames: {}'.format(past_frames))

    if d_use_prior_data:
        prior_expert_buffer = DemonstrationsReplayBuffer(load_expert_trajectories(
            agent_prior_location, prior_file_location, visual_data=True, load_ids=True,
            max_demos=n_expert_prior_demos))
        prior_agent_buffer = DemonstrationsReplayBuffer(load_expert_trajectories(
            expert_prior_location, prior_file_location, visual_data=True, load_ids=True,
            max_demos=n_agent_prior_demos))
    else:
        prior_expert_buffer, prior_agent_buffer = None, None

    if d_type == 'latent' or d_type == 'pretrained_ae':
        im_shape += [3]
    else:
        im_shape += [3 * past_frames]
    action_size = env.action_space.shape[0]

    if exp_num == -1:
        logz.configure_output_dir(None, True)
    else:
        log_dir = osp.join('experiments_data/', '{}/{}'.format(exp_name, exp_num))
        logz.configure_output_dir(log_dir, True)

    params = {
        'exp': exp_params,
        'learner': learner_params,
        'discriminator': discriminator_params,
    }
    print(params)
    logz.save_params(params)
    if l_type == 'TD3':
        def make_actor():
            actor = Actor([tf.keras.layers.Dense(400, 'relu', kernel_initializer='orthogonal'),
                           tf.keras.layers.Dense(300, 'relu', kernel_initializer='orthogonal'),
                           tf.keras.layers.Dense(action_size, 'tanh',
                                                 kernel_initializer=tf.keras.initializers.Orthogonal(0.01))])
            return actor

        def make_critic():
            critic = Critic([tf.keras.layers.Dense(400, 'relu', kernel_initializer='orthogonal'),
                             tf.keras.layers.Dense(300, 'relu', kernel_initializer='orthogonal'),
                             tf.keras.layers.Dense(1,
                                                   kernel_initializer=tf.keras.initializers.Orthogonal(0.01))])
            return critic
    elif l_type == 'SAC':
        def make_actor():
            actor = StochasticActor([tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'),
                                     tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'),
                                     tf.keras.layers.Dense(action_size * 2,
                                                           kernel_initializer=tf.keras.initializers.Orthogonal(0.01))])
            return actor

        def make_critic():
            critic = Critic([tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'),
                             tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'),
                             tf.keras.layers.Dense(1,
                                                   kernel_initializer=tf.keras.initializers.Orthogonal(0.01))])
            return critic

        if l_target_entropy is None:
            l_target_entropy = -1 * (np.prod(env.action_space.shape))
    else:
        raise NotImplementedError

    d_optimizer = tf.keras.optimizers.Adam(learning_rate=d_learning_rate)
    tfl = tf.keras.layers
    if d_type == 'latent':
        pre_layers = [tfl.Reshape(im_shape)]
    else:
        pre_layers = [tfl.Permute((2, 3, 1, 4)),
                      tfl.Reshape(im_shape)]
    if (d_type == 'latent') or (not d_sn_discriminator):
        for filters in d_pre_filters[:-1]:
            pre_layers += [tfl.Conv2D(filters, 3, activation='tanh', padding='same'),
                           tfl.MaxPooling2D(2, padding='same')]
        pre_layers += [tfl.Conv2D(d_pre_filters[-1], 3, padding='same'),
                       tfl.MaxPooling2D(2, padding='same'),
                       tfl.Reshape([-1])]
    else:
        for filters in d_pre_filters[:-1]:
            pre_layers += [SpectralNormalization(
                tfl.Conv2D(filters, 3, padding='same')),
                tfl.LeakyReLU(),
                tfl.MaxPooling2D(2, padding='same')]
        pre_layers += [SpectralNormalization(
            tfl.Conv2D(d_pre_filters[-1], 3, padding='same')),
            tfl.MaxPooling2D(2, padding='same'),
            tfl.Reshape([-1])]

    def make_disc():
        if d_sn_discriminator:
            disc_layers = [SpectralNormalization(
                tfl.Dense(units, activation='relu'))
                for units in d_hidden_units]
            disc_layers.append(SpectralNormalization(tfl.Dense(1)))
        else:
            disc_layers = [tfl.Dense(units, activation='tanh')
                           for units in d_hidden_units]
            disc_layers.append(tfl.Dense(1))
        return InvariantDiscriminator(disc_layers,
                                      d_stability_constant,
                                      d_rew)
    if d_type == 'latent':
        def make_pre():
            pre = GaussianPreprocessor(pre_layers, d_pre_scale_stddev)
            return pre
    else:
        def make_pre():
            pre = DeterministicPreprocessor(pre_layers)
            return pre

    l_optimizer = tf.keras.optimizers.Adam(l_learning_rate)
    if l_type == 'TD3':
        l_agent = DDPG(make_actor=make_actor,
                       make_critic=make_critic,
                       make_critic2=make_critic,
                       actor_optimizer=l_optimizer,
                       critic_optimizer=l_optimizer,
                       gamma=l_gamma,
                       polyak=l_polyak,
                       train_actor_noise=l_train_actor_noise,
                       clip_actor_gradients=l_clip_actor_gradients,)
    elif l_type == 'SAC':
        l_agent = SAC(make_actor=make_actor,
                      make_critic=make_critic,
                      make_critic2=make_critic,
                      actor_optimizer=l_optimizer,
                      critic_optimizer=l_optimizer,
                      gamma=l_gamma,
                      polyak=l_polyak,
                      entropy_coefficient=l_entropy_coefficient,
                      tune_entropy_coefficient=l_tune_entropy_coefficient,
                      target_entropy=l_target_entropy,
                      clip_actor_gradients=l_clip_actor_gradients,)
    else:
        raise NotImplementedError
    sampler = Sampler(env, episode_limit, init_random_samples, visual_env=True)

    gail = DomainConfusionDisentanGAIL(agent=l_agent,
                                       make_discriminator=make_disc,
                                       make_preprocessing=make_pre,
                                       expert_buffer=expert_buffer,
                                       prior_expert_buffer=prior_expert_buffer,
                                       prior_agent_buffer=prior_agent_buffer,
                                       d_optimizer=d_optimizer,
                                       d_domain_constant=d_domain_constant,
                                       stab_const=d_stability_constant,
                                       past_frames=past_frames,)
    agent_buffer = LearnerAgentReplayBuffer(gail, l_buffer_size, reward_noise=d_rew_noise)
    test_input = expert_buffer.get_random_batch(1)
    test_input['obs'] = np.expand_dims(
        (env.reset()['obs']).astype('float32'), axis=0)
    gail(test_input)
    gail.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    logz.log_tabular('Iteration', 0)
    logz.log_tabular('Steps', step_counter)
    print('Epoch {}/{} - total steps {}'.format(0, epochs, step_counter))
    out = sampler.evaluate(l_agent, test_runs_per_epoch, False)
    mean_test_returns.append(out['mean'])
    mean_test_std.append(out['std'])
    steps.append(step_counter)
    for k, v in out.items():
        logz.log_tabular(k, v)
    logz.dump_tabular()
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(l_agent, l_exploration_noise)
            agent_buffer.add(traj_data)
            n = traj_data['n']
            step_counter += traj_data['n']
            if step_counter > training_starts:
                gail.train(agent_buffer=agent_buffer,
                           l_batch_size=l_batch_size,
                           l_updates=l_updates_per_step * n,
                           l_act_delay=l_act_delay,
                           d_updates=d_updates_per_step * n,
                           d_e_batch_size=d_e_batch_size,
                           d_l_batch_size=d_l_batch_size,)
        logz.log_tabular('Iteration', e + 1)
        logz.log_tabular('Steps', step_counter)
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs, step_counter))
        traj_test = sampler.sample_test_trajectories(l_agent, 0.0, test_runs_per_epoch)
        out = log_trajectory_statistics(traj_test['ret'], False)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        for k, v in out.items():
            logz.log_tabular(k, v)
        logz.dump_tabular()
        if visualize_collected_observations:
            training_sample = traj_data['ims'][-1, 0]
            print('Visualization of latest training sample')
            plt.imshow(training_sample)
            plt.show()
            test_sample = traj_test['ims'][-1, 0]
            print('Visualization of latest test sample')
            plt.imshow(test_sample)
            plt.show()
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break

    return gail, sampler
Example #13
0
File: main.py Project: kn27/LearnToCut
    def train(self, num_iter):

        wandb.login()
        run = wandb.init(project="project-local",
                         entity="ieor-4575",
                         tags=[f"training-easy"])

        rewards_record = []

        start = time.time()
        for i in range(num_iter):
            t1 = time.time()
            self.train_step()
            t2 = time.time()
            print('total training time: ', t2 - t1)
            print('iter ', i, ' done')

            # evaluate every iteration; save policy weights every 10 iterations
            if ((i + 1) % 1 == 0):
                t3 = time.time()
                rewards = self.aggregate_rollouts(num_rollouts=5,
                                                  evaluate=True)
                t4 = time.time()
                print('total evaluation time: ', t4 - t3)
                if ((i + 1) % 10 == 0):
                    w = ray.get(
                        self.workers[0].get_weights_plus_stats.remote())
                    np.savez(self.logdir + f"/lin_policy_plus_{i}", w)

                print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("timesteps", self.timesteps)
                logz.dump_tabular()

                rewards_record.append(np.mean(rewards))
                fixedWindow = 10
                movingAverage = 0
                if len(rewards_record) >= fixedWindow:
                    # average the most recent `fixedWindow` evaluations (including the latest)
                    movingAverage = np.mean(rewards_record[-fixedWindow:])
                wandb.log({
                    "Training reward": rewards_record[-1],
                    "movingAverage": movingAverage,
                    "AverageReward": np.mean(rewards),
                    'StdRewards': np.std(rewards),
                    'MaxRewardRollout': np.max(rewards),
                    'MinRewardRollout': np.min(rewards)
                })

            t1 = time.time()
            # get statistics from all workers
            for j in range(self.num_workers):
                self.policy.observation_filter.update(
                    ray.get(self.workers[j].get_filter.remote()))
            self.policy.observation_filter.stats_increment()

            # make sure master filter buffer is clear
            self.policy.observation_filter.clear_buffer()
            # sync all workers
            filter_id = ray.put(self.policy.observation_filter)
            setting_filters_ids = [
                worker.sync_filter.remote(filter_id) for worker in self.workers
            ]
            # waiting for sync of all workers
            ray.get(setting_filters_ids)

            increment_filters_ids = [
                worker.stats_increment.remote() for worker in self.workers
            ]
            # waiting for increment of all workers
            ray.get(increment_filters_ids)
            t2 = time.time()
            print('Time to sync statistics:', t2 - t1)

        return
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        gae_lambda=0.99,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the
    # numerical values that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for
    # that axis is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy
    # gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian
    #          distribution over actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces
    #          actions.
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std
    #          'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z.
    #          (Hint: use tf.random_normal!)
    #          Should have shape [None, ac_dim]
    #
    #      p.s. these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually
    #      taken, according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na',
    #      and the policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # Get the logits from neural network output
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "pi",
                                 n_layers=n_layers,
                                 size=size)

        # Sample one action for each sample from the above probability
        # distribution, and then use [-1] to flatten from [None, 1] to [None]
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1])

        # Compute the log-likelihood of the action that was actually chosen;
        # since only a single (integer) action per step is needed, use the sparse_... op here
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        # Assume independent continuous action

        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            "pi",
                            n_layers=n_layers,
                            size=size)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros(shape=[1, ac_dim]),
                                name="ac_log_std",
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)

        sy_sampled_ac_k = tf.random_normal(tf.shape(sy_mean))
        sy_sampled_ac = sy_mean + sy_std * sy_sampled_ac_k

        # Hint: Use the log probability under a multivariate gaussian for each
        # row, using the formula. (action independent)
        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(
            (sy_ac_na - sy_mean) / sy_std),
                                            axis=1)

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # ========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    # ========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for
        # fitting a neural network baseline. These will be used to fit the
        # neural network baseline.
        # YOUR_CODE_HERE
        baseline_target = tf.placeholder(shape=[None],
                                         name="baseline_target",
                                         dtype=tf.float32)
        baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        # one batch starts
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            # single path starts
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
                # single path end
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
            # one batch ends
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update
        # by concatenating across paths
        ob_no = np.concatenate([p["observation"] for p in paths])
        ac_na = np.concatenate([p["action"] for p in paths])

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be
        # used to compute advantages (which will in turn be fed to the
        # placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag
        # 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward
        #       summed over entire trajectory (regardless of which time step
        #       the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of
        #       rewards starting from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a
        # variable 'q_n', like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # YOUR_CODE_HERE
        # Use accumulate/reduce here to calculate the reward along the paths
        q_n = []
        for p in paths:
            if reward_to_go:
                q_n += list(
                    itertools.accumulate(
                        p["reward"][::-1],
                        lambda ss_r, cur_r: cur_r + gamma * ss_r))[::-1]
            else:
                q_n += [
                    functools.reduce(lambda ss_r, cur_r: cur_r + gamma * ss_r,
                                     p["reward"][::-1])
                ] * len(p["reward"])
        q_n = np.array(q_n)
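        # Note: itertools.accumulate over the reversed rewards yields the discounted reward-to-go
        # Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'} for every timestep, while functools.reduce
        # collapses a whole path to Ret(tau), which is then repeated for each of its timesteps.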

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        # ====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict
            # reward-to-go at each timestep for each trajectory, and save the
            # result in a variable 'b_n' like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the
            # statistics (mean and std) of the current or previous batch of
            # Q-values. (Goes with Hint #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            # normalize to mean and std that are the same with q_n
            q_mean, q_std = np.mean(q_n), np.std(q_n)
            b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + 1e-9)
            b_n = q_mean + b_n * q_std

            # critics using state-dependent baselines
            # adv_n = q_n - b_n

            # Generalized advantage estimation
            adv_n, ll = [], 0
            for p in paths:
                pre_v, pre_t, adv_cur = 0, 0, []
                ll += len(p["reward"])
                for v, r in zip(b_n[ll - 1::-1], p["reward"][::-1]):
                    adv_cur.append(pre_v * gamma - v + r +
                                   pre_t * gamma * gae_lambda)
                    pre_v, pre_t = v, adv_cur[-1]
                if reward_to_go:
                    adv_n += adv_cur[::-1]
                else:
                    adv_n += [adv_cur[-1]] * len(adv_cur)
            adv_n = np.array(adv_n)
            # Recalculate the advantages for value function estimation
            q_n = adv_n + b_n
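            # The loop above follows the standard GAE recursion (a reading aid, not extra computation):
            #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * gae_lambda * A_{t+1}
            # with V(s_{T+1}) = 0 and A_{T+1} = 0 at the end of each path.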

        else:
            adv_n = q_n.copy()

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        # ====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to
            # reduce variance in policy gradient methods: normalize adv_n to
            # have mean zero and std=1.
            # YOUR_CODE_HERE
            # Without SECTION 5
            # scale from sklearn == standardization != normalize from sklearn
            adv_n = scale(adv_n)

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        # ====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the
            # inputs for the baseline.
            #
            # Fit it to the current batch in order to use for the next
            # iteration. Use the baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly,
            # rescale the targets to have mean zero and std=1. (Goes with Hint
            # #bl1 above.)
            q_n_0 = scale(q_n)
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_ob_no: ob_no,
                         baseline_target: q_n_0
                     })

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        # ====================================================================================#

        # Call the update operation necessary to perform the policy gradient
        # update based on the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss
        # function before and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_ac_na: ac_na,
                     sy_adv_n: adv_n,
                 })

        # Log diagnostics
        returns = [p["reward"].sum() for p in paths]
        ep_lengths = [pathlength(p) for p in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #15
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    # sy_adv_n = TODO


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "pi", n_layers=n_layers, size=size)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1])  # Hint: Use the tf.multinomial op
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        # sy_mean = TODO
        # sy_logstd = TODO  # logstd should just be a trainable variable, not a network output.
        # sy_sampled_ac = TODO
        # sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian.
        pass  # placeholder so this branch parses; see the hedged sketch below
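        # A minimal sketch for the continuous case (not the official solution): it mirrors the
        # completed train_PG implementation earlier in this document and assumes the same build_mlp helper.
        # sy_mean = build_mlp(sy_ob_no, ac_dim, "pi", n_layers=n_layers, size=size)
        # sy_logstd = tf.Variable(tf.zeros(shape=[1, ac_dim]), name="ac_log_std", dtype=tf.float32)
        # sy_std = tf.exp(sy_logstd)
        # sy_sampled_ac = sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean))  # reparameterization trick
        # sy_logprob_n = -0.5 * tf.reduce_sum(tf.square((sy_ac_na - sy_mean) / sy_std), axis=1)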



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
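    # A hedged sketch of one standard choice for the loss (mirroring the completed train_PG
    # shown earlier in this document); it requires the sy_adv_n placeholder marked TODO above.
    # loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)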


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_update_op = TODO
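        # A minimal sketch (mirroring the completed train_PG earlier in this document,
        # not the official solution):
        # baseline_target = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
        # baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction)
        # baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)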


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = TODO
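        # One possible sketch (the same logic as the accumulate/reduce version in the completed
        # train_PG above, written as an explicit loop; not the official solution):
        # q_n = []
        # for path in paths:
        #     rewards = path["reward"]
        #     q_path = np.zeros_like(rewards, dtype=np.float64)
        #     running = 0.0
        #     for t in reversed(range(len(rewards))):
        #         running = rewards[t] + gamma * running   # Q_t = sum_{t'>=t} gamma^(t'-t) r_{t'}
        #         q_path[t] = running
        #     if not reward_to_go:
        #         q_path[:] = q_path[0]                    # use Ret(tau) at every timestep
        #     q_n.extend(q_path)
        # q_n = np.array(q_n)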

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = TODO
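            # A hedged sketch for b_n (following Hint #bl1 and the completed train_PG above):
            # predict with the baseline network, then rescale to the mean/std of this batch's Q-values.
            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            # b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + 1e-8)
            # b_n = np.mean(q_n) + b_n * np.std(q_n)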
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            pass
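            # A one-line sketch (standardization to zero mean, unit std):
            # adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)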


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            pass
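            # A hedged sketch (mirroring the completed train_PG above); it assumes the
            # baseline_target placeholder from the baseline sketch earlier in this function:
            # q_target = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            # sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, baseline_target: q_target})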

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
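        # A minimal sketch of the update call (as in the completed train_PG above); it assumes
        # the sy_adv_n placeholder and the loss marked TODO earlier have been defined:
        # sess.run(update_op, feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})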


        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=1)
    parser.add_argument('--size', '-s', type=int, default=32)
    args = parser.parse_args()

    if not(os.path.exists('data')):
        os.makedirs('data')
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not(os.path.exists(logdir)):
        os.makedirs(logdir)

    max_path_length = args.ep_len if args.ep_len > 0 else None

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d'%seed)
        def train_func():
            train_PG(
                exp_name=args.exp_name,
                env_name=args.env_name,
                n_iter=args.n_iter,
                gamma=args.discount,
                min_timesteps_per_batch=args.batch_size,
                max_path_length=max_path_length,
                learning_rate=args.learning_rate,
                reward_to_go=args.reward_to_go,
                animate=args.render,
                logdir=os.path.join(logdir,'%d'%seed),
                normalize_advantages=not(args.dont_normalize_advantages),
                nn_baseline=args.nn_baseline, 
                seed=seed,
                n_layers=args.n_layers,
                size=args.size
                )
        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_PG in the same thread.
        p = Process(target=train_func, args=tuple())
        p.start()
        p.join()
        

if __name__ == "__main__":
    main()
Example #16
0
def main_pendulum(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None):
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]

    logz.configure_output_file(logfile)
    #vf = LinearValueFunction()
    vf = NeuralValueFunction(ob_dim)

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in these functions
    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32) # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate
    sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer
    sy_mean_n = dense(sy_h1, ac_dim, "final", weight_init=normc_initializer(0.05)) # Mean control output
    sy_logstd_n = tf.Variable(tf.zeros([ac_dim]))
    sy_std_n = tf.exp(sy_logstd_n)

    # Get probabilities from normal distribution and sample from distribution
    dist = tf.contrib.distributions.Normal(mu=tf.reshape(sy_mean_n,[-1]), sigma=sy_std_n)
    sy_logprob_n = tf.reshape(tf.log(dist.pdf(sy_ac_n)),[-1])
    sy_n = tf.shape(sy_ob_no)[0]
    sy_sampled_ac = dist.sample(sy_n) # sampled actions, used for defining the policy (NOT computing the policy gradient)

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_mean_n_old = tf.placeholder(shape=[None, ac_dim], name='old_mean', dtype=tf.float32)
    sy_std_n_old = tf.placeholder(shape=[ac_dim], name='old_std', dtype=tf.float32)

    sy_kl = tf.reduce_sum(tf.log(sy_std_n/sy_std_n_old) + (sy_std_n_old**2 + (sy_mean_n_old - sy_mean_n)**2)/(2*sy_std_n**2) - 0.5)/tf.to_float(sy_n)  # KL(old || new) for diagonal Gaussians
    sy_ent = tf.reduce_sum((1 + tf.log(2*math.pi*sy_std_n**2))*0.5)  # Gaussian differential entropy: 0.5*log(2*pi*e*sigma^2)
    # <<<<<<<<<<<<<

    sy_surr = -tf.reduce_mean(sy_adv_n*sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()
    sess.run(tf.global_variables_initializer())

    total_timesteps = 0
    obs_mean = np.zeros(ob_dim)
    obs_std = np.zeros(ob_dim)

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                acs.append(ac.flatten())
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew.flatten())
                ob = ob.flatten()
                if done:
                    break
            path = {"observation" : np.array(obs), "terminated" : terminated,
                    "reward" : np.array(rewards), "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8))
            adv_t = return_t.flatten() - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8)
        vtarg_n = np.concatenate(vtargs).flatten()
        vpred_n = np.concatenate(vpreds)
        obs_mean = np.average(ob_no,axis=0)
        obs_std = np.std(ob_no,axis=0)
        vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n)

        # Policy update
        _, mean_n, std_n = sess.run([update_op, sy_mean_n, sy_std_n], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n.flatten(), sy_adv_n:standardized_adv_n, sy_stepsize:stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_mean_n_old: mean_n, sy_std_n_old: std_n})

        desired_kl = 2e-3
        if kl > desired_kl * 2: 
            stepsize /= 1.5
            print('stepsize -> %s'%stepsize)
        elif kl < desired_kl / 2: 
            stepsize *= 1.5
            print('stepsize -> %s'%stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
Example #17
0
def logger(id_, out):
    logz.log_tabular('Iteration', id_)
    logz.log_tabular('AverageReturn', out[0])
    logz.log_tabular('StdReturn', out[1])
    logz.dump_tabular()
def main():
    DATASET_SIZE = 30000
    STEPS = 200000
    VALIDSET_SIZE = 2000
    LR = 0.0003
    BATCH_SIZE = 64

    if not (os.path.exists('data/pre_trained_model')):
        os.makedirs('data/pre_trained_model')

    home = os.path.expanduser('~')
    expdir = os.path.join(home, 'robotics_drl/reacher/data/pre_trained_model')
    logz.configure_output_dir(d=expdir)

    D = deque(maxlen=DATASET_SIZE)
    V = deque(maxlen=VALIDSET_SIZE)
    env = environment(continuous_control=True,
                      obs_lowdim=False,
                      rpa=4,
                      frames=4)
    obs = env.reset()
    home = os.path.expanduser("~")
    path = home + "/robotics_drl/reacher"
    os.chdir(path)
    torchvision.utils.save_image(obs.view(-1, 64, 64)[0, :, :],
                                 "test_inverted.png",
                                 normalize=True)
    net = network().to(device)
    net.apply(weights_init)
    optimiser = optim.Adam(net.parameters(), lr=LR)

    #pbar = tqdm(range(1, STEPS + 1), unit_scale=1, smoothing=0)

    for i in range(DATASET_SIZE):
        action = env.sample_action()
        obs, _, _ = env.step(action)
        target_pos = env.target_position()
        joint_pos = env.agent.get_joint_positions()
        #joint_pos = [cos(joint_pos[0]),sin(joint_pos[1])]
        joint_vel = env.agent.get_joint_velocities()

        D.append({
            "target_pos": to_torch(target_pos[:2]).view(1, -1),
            "joint_pos": to_torch(joint_pos).view(1, -1),
            "joint_vel": to_torch(joint_vel).view(1, -1),
            "img": to_torch(obs).unsqueeze(dim=0)
        })

        if i % 50 == 0 and i != 0:
            env.reset()

    for i in range(VALIDSET_SIZE):
        action = env.sample_action()
        obs, _, _ = env.step(action)
        target_pos = env.target_position()
        joint_pos = env.agent.get_joint_positions()
        #joint_pos = [cos(joint_pos[0]),sin(joint_pos[1])]
        joint_vel = env.agent.get_joint_velocities()

        V.append({
            "target_pos": to_torch(target_pos[:2]).view(1, -1),
            "joint_pos": to_torch(joint_pos).view(1, -1),
            "joint_vel": to_torch(joint_vel).view(1, -1),
            "img": to_torch(obs).unsqueeze(dim=0)
        })

        if i % 50 == 0 and i != 0:
            env.reset()

    for step in range(STEPS):
        if len(D) > BATCH_SIZE:
            loss = get_loss(D, BATCH_SIZE, net)
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()

            if step % 800 == 0 and step != 0:
                net.eval()
                loss_v = get_loss(V, VALIDSET_SIZE, net)
                net.train()
                logz.log_tabular('Loss training', loss.item())
                logz.log_tabular('Loss validation', loss_v.item())
                logz.dump_tabular()
                #for param in net.parameters():
                #    print(param.data.size())
                #pbar.set_description()

            if step % 20000 == 0 and step != 0:
                home = os.path.expanduser("~")
                path = home + "/robotics_drl/reacher/data/pre_trained_model"
                torch.save(net.state_dict(),
                           os.path.join(path, "model%s.pkl" % step))

    #home = os.path.expanduser("~")
    #path = home + "/robotics_drl/reacher/pre_trained_net_reacher/model.pkl"
    #net.load_state_dict(torch.load(path))
    #net.eval()
    #get_loss(V,10,net)

    env.terminate()
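
get_loss is not included in this snippet. From the way the replay deques are populated, it presumably samples a minibatch of the stored dicts and computes a supervised loss between the network's prediction from the image and the stored low-dimensional state. A rough sketch under that assumption (the prediction target, its ordering, and the use of an MSE loss are guesses, not the original code):

import random
import torch
import torch.nn.functional as F

def get_loss(buffer, batch_size, net):
    # Hypothetical sketch: regress the low-dimensional state from the image observation.
    batch = random.sample(list(buffer), min(batch_size, len(buffer)))
    imgs = torch.cat([b["img"] for b in batch], dim=0).to(device)  # device as defined above
    targets = torch.cat([torch.cat([b["target_pos"], b["joint_pos"], b["joint_vel"]], dim=1)
                         for b in batch], dim=0).to(device)
    preds = net(imgs)  # assumed to output the same dimensionality as targets
    return F.mse_loss(preds, targets)
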
Example #19
def train_ga(exp_name='',
             env_name='HalfCheetah-v2',
             logdir=None,
             prob_save=0.05,
             n_gen=100,
             gamma=0.5,
             sigma=1e-3,
             pop_size=100,
             fitness_eval_episodes=40,
             max_steps=150,
             n_elite=20,
             seed=1,
             n_layers=2,
             size=64,
             network_activation='leaky_relu',
             output_activation='tanh'):

    start = time.time()
    logz.configure_output_dir(logdir)
    args = inspect.getargspec(train_ga)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    torch.manual_seed(seed)
    np.random.seed(seed)
    env = gym.make(env_name)
    env.seed(seed)

    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    max_steps = int(max_steps or env.spec.max_episode_steps)

    input_size = env.observation_space.shape[0]
    output_size = env.action_space.n if discrete else env.action_space.shape[0]

    if network_activation == 'relu':
        activation = torch.nn.functional.relu
    elif network_activation == 'leaky_relu':
        activation = torch.nn.functional.leaky_relu
    else:
        activation = torch.nn.functional.tanh

    if output_activation == 'relu':
        output_a = torch.nn.functional.relu
    elif output_activation == 'leaky_relu':
        output_a = torch.nn.functional.leaky_relu
    elif output_activation == 'tanh':
        output_a = torch.nn.functional.tanh
    else:
        output_a = None

    center_return_all = []
    member_archive = Archive(prob_save)
    population = get_init_population(pop_size, input_size, output_size,
                                     n_layers, size, activation, output_a,
                                     discrete)

    for member in population:
        member.setScore(
            compute_fitness(env, member, member_archive, fitness_eval_episodes,
                            gamma, max_steps, discrete))

    sort_members_in_place(population, reverse=True)

    #save in archive
    for member in population:
        member_archive.save(member)

    population = population[:n_elite]
    center_return_list = []
    current_best_fitness_score = float(population[0].score)
    current_best_reward_score = float(population[0].reward_score)
    center_return_list.append(current_best_reward_score)
    for i_gen in range(n_gen):
        offsprings = []

        for i in range(int((pop_size - n_elite) / 2)):
            parent_index = random.randint(0, n_elite - 1)
            parent = population[parent_index]
            offspring1, offspring2 = perturb_member(parent, sigma, input_size,
                                                    output_size, n_layers,
                                                    size, activation, output_a,
                                                    discrete)
            offsprings.append(offspring1)
            offsprings.append(offspring2)

        for member in offsprings:
            member.setScore((compute_fitness(env, member, member_archive,
                                             fitness_eval_episodes, gamma,
                                             max_steps, discrete)))

        population = population + offsprings
        sort_members_in_place(population, reverse=True)
        for member in offsprings:
            member_archive.save(member)
        population = population[:n_elite]
        current_best_fitness_score = float(population[0].score)
        current_best_reward_score = float(population[0].reward_score)
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", i_gen)
        logz.log_tabular("AverageFitness", current_best_fitness_score)
        logz.log_tabular("stdFitness", -1)
        logz.log_tabular("AverageReturn", current_best_reward_score)
        logz.log_tabular("stdReturn", -1)
        logz.log_tabular("dontcare1", -1)
        logz.log_tabular("dontcare2", -1)
        logz.log_tabular("dontcare3", -1)
        logz.log_tabular("dontcare4", -1)
        logz.dump_tabular()
        logz.pickle_tf_vars()
        center_return_list.append(current_best_reward_score)

    center_return_all.append(center_return_list)
    env.close()
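
perturb_member and the population member class are not shown; the loop above only requires that one elite parent yields two mutated offspring. A minimal sketch of Gaussian-parameter mutation consistent with that usage follows; the mirrored noise and the member.policy attribute are assumptions, not the original code.

import copy
import torch

def perturb_member(parent, sigma, *unused_arch_args):
    # Hypothetical sketch: two offspring whose policy weights are the parent's weights
    # plus mirrored Gaussian noise of scale sigma (antithetic sampling).
    child_a, child_b = copy.deepcopy(parent), copy.deepcopy(parent)
    with torch.no_grad():
        for p_a, p_b in zip(child_a.policy.parameters(), child_b.policy.parameters()):
            noise = sigma * torch.randn_like(p_a)
            p_a.add_(noise)
            p_b.add_(-noise)
    return child_a, child_b
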
Example #20
def main_cartpole(n_iter=100,
                  gamma=1.0,
                  min_timesteps_per_batch=1000,
                  stepsize=1e-2,
                  animate=True,
                  logdir=None):
    env = gym.make("CartPole-v0")
    ob_dim = env.observation_space.shape[0]
    num_actions = env.action_space.n
    logz.configure_output_dir(logdir)
    vf = LinearValueFunction()

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in this function
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)  # batch of observations
    sy_ac_n = tf.placeholder(
        shape=[None], name="ac", dtype=tf.int32
    )  # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv",
                              dtype=tf.float32)  # advantage function estimate
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1",
                        weight_init=normc_initializer(1.0)))  # hidden layer
    sy_logits_na = dense(
        sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)
    )  # "logits", describing probability distribution of final layer
    # we use a small initialization for the last layer, so the initial policy has maximal entropy
    sy_oldlogits_na = tf.placeholder(
        shape=[None, num_actions], name='oldlogits',
        dtype=tf.float32)  # logits BEFORE update (just used for KL diagnostic)
    sy_logp_na = tf.nn.log_softmax(sy_logits_na)  # logprobability of actions
    sy_sampled_ac = categorical_sample_logits(
        sy_logits_na
    )[0]  # sampled actions, used for defining the policy (NOT computing the policy gradient)
    sy_n = tf.shape(sy_ob_no)[0]
    sy_logprob_n = fancy_slice_2d(
        sy_logp_na, tf.range(sy_n), sy_ac_n
    )  # log-prob of actions taken -- used for policy gradient calculation

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na)
    sy_oldp_na = tf.exp(sy_oldlogp_na)
    sy_kl = tf.reduce_sum(sy_oldp_na *
                          (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n)
    sy_p_na = tf.exp(sy_logp_na)
    sy_ent = tf.reduce_sum(-sy_p_na * sy_logp_na) / tf.to_float(sy_n)
    # <<<<<<<<<<<<<

    sy_surr = -tf.reduce_mean(
        sy_adv_n * sy_logprob_n
    )  # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(
        shape=[], dtype=tf.float32
    )  # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    # use single thread. on such a small problem, multithreading gives you a slowdown
    # this way, we can better use multiple cores for different experiments
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    total_timesteps = 0

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0)
                                    and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {
                "observation": np.array(obs),
                "terminated": terminated,
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldlogits_na = sess.run(
            [update_op, sy_logits_na],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_n: ac_n,
                sy_adv_n: standardized_adv_n,
                sy_stepsize: stepsize
            })
        kl, ent = sess.run([sy_kl, sy_ent],
                           feed_dict={
                               sy_ob_no: ob_no,
                               sy_oldlogits_na: oldlogits_na
                           })

        # Log diagnostics
        logz.log_tabular("EpRewMean",
                         np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean",
                         np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter",
                         explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
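
fancy_slice_2d and categorical_sample_logits come from the course starter utilities rather than from TensorFlow itself. Sketches of the usual implementations (row/column gathering, and Gumbel-max sampling from a categorical distribution) are given below for reference; they are illustrative rather than the exact originals.

import tensorflow as tf

def fancy_slice_2d(x, inds0, inds1):
    # Like x[inds0, inds1] in numpy: pick one entry per row, e.g. the log-prob of the taken action.
    inds0 = tf.cast(inds0, tf.int64)
    inds1 = tf.cast(inds1, tf.int64)
    shape = tf.cast(tf.shape(x), tf.int64)
    ncols = shape[1]
    return tf.gather(tf.reshape(x, [-1]), inds0 * ncols + inds1)

def categorical_sample_logits(logits):
    # Gumbel-max trick: argmax(logits + Gumbel noise) is a sample from softmax(logits).
    u = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(u)), axis=1)
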
Example #21
def main_pendulum(logdir,
                  seed,
                  n_iter,
                  gamma,
                  min_timesteps_per_batch,
                  initial_stepsize,
                  desired_kl,
                  vf_type,
                  vf_params,
                  animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = QuadCopter(SIM_TIME_STEP, inverted_pendulum=False)
    ob_dim = env.stateSpace
    ac_dim = env.actionSpace
    ac_lim = env.actionLimit
    print("Quadcopter created")
    print('state_dim: ', ob_dim)
    print('action_dim: ', ac_dim)
    print('action_limit: ', ac_lim)
    print('max time: ', MAX_EP_TIME)
    print('max step: ', MAX_EP_STEPS)

    hover_position = np.asarray([0, 0, 0])
    task = hover(hover_position)

    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in this function
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)  # batch of observations
    sy_ac_n = tf.placeholder(
        shape=[None, ac_dim], name="ac", dtype=tf.float32
    )  # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None, 1], name="adv",
                              dtype=tf.float32)  # advantage function estimate
    sy_h1 = tf.nn.relu(
        dense(sy_ob_no, 400, "h1",
              weight_init=normc_initializer(1.0)))  # hidden layer
    sy_h2 = tf.nn.relu(
        dense(sy_h1, 300, "h2",
              weight_init=normc_initializer(1.0)))  # hidden layer

    # mean_na = dense(sy_h1, ac_dim, "mean", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer

    mean_na = tf.tanh(
        dense(sy_h2, ac_dim, "final", weight_init=normc_initializer(
            0.1))) * ac_lim  # Mean control output
    # std_a = tf.constant(1.0, dtype=tf.float32, shape=[ac_dim])
    std_a = tf.get_variable("logstdev", [ac_dim],
                            initializer=tf.ones_initializer())

    # std_a = tf.constant(1.0,  shape=[ac_dim], dtype=tf.float32)

    sy_sampled_ac = sample_gaussian(
        ac_dim, mean_na, std_a
    )  # sampled actions, used for defining the policy (NOT computing the policy gradient)
    # sy_sampled_ac = tf.zeros([1, ac_dim])
    # Per-dimension Gaussian density of the taken actions under the current policy
    sy_prob_n = (1.0 / tf.sqrt(
        (tf.square(std_a) * 2 * np.pi))) * tf.exp(-0.5 * tf.square(
            (sy_ac_n - mean_na) / std_a))
    # sy_prob_n = (1.0/(std_a*2.5067)) * tf.exp(-0.5*tf.square((sy_ac_n - mean_na)/std_a))

    sy_logprob_n = tf.log(sy_prob_n)
    # sub = tf.subtract(sy_ac_n, mean_na)
    # mul = tf.multiply(sub, sy_h1)
    # sy_logprob_n = tf.log(tf.divide(sub, tf.square(std_a))) # log-prob of actions taken -- used for policy gradient calculation

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_n = tf.shape(sy_ob_no)[0]
    old_mean_na = tf.placeholder(
        shape=[None, ac_dim], name='old_mean_a',
        dtype=tf.float32)  # mean_a BEFORE update (just used for KL diagnostic)
    old_std_a = tf.placeholder(
        shape=[ac_dim], name='old_std_a',
        dtype=tf.float32)  # std_a BEFORE update (just used for KL diagnostic)
    # KL
    sy_kl = tf.reduce_mean(
        tf.log(std_a / old_std_a) +
        (tf.square(old_std_a) + tf.square(old_mean_na - mean_na)) /
        (2 * tf.square(std_a)) - 0.5)
    # entropy
    # Note: this reuses the discrete softmax-entropy formula on the raw action means,
    # so it is only a rough diagnostic rather than the true Gaussian policy entropy.
    sy_p_na = tf.exp(mean_na)
    sy_ent = tf.reduce_sum(-sy_p_na * mean_na) / tf.to_float(sy_n)
    # <<<<<<<<<<<<<

    sy_surr = -tf.reduce_mean(
        sy_adv_n * sy_logprob_n
    )  # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(
        shape=[], dtype=tf.float32
    )  # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize
    for i in range(n_iter):

        print("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps

        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            j = 0
            while True:
                j += 1
                ob = ob.reshape(ob.shape[0], )
                obs.append(ob)
                # print ob
                # mean = sess.run(mean_na, feed_dict={sy_ob_no : ob[None]})[0]
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})[0]
                # print ac
                ob, done, _ = env.step(ac)
                rew = task.reward(ob, done, _)
                # ac = np.asscalar(ac)
                acs.append(ac)

                rew = np.asscalar(rew)
                rewards.append(rew)
                if done or j >= MAX_EP_STEPS:
                    # print "done"
                    break
            path = {
                "observation": np.array(obs),
                "terminated": terminated,
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            # print("return_t: ", return_t.shape)
            # print("vpred_t: ", vpred_t.shape)
            # print("adv_t: ", adv_t.shape)

            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        ac_n = ac_n.reshape([-1, ac_dim])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        standardized_adv_n = standardized_adv_n.reshape([-1, 1])

        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        # print standardized_adv_n
        surr, adv, logp = sess.run(
            [sy_surr, sy_adv_n, sy_prob_n],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_n: ac_n,
                sy_adv_n: standardized_adv_n,
                sy_stepsize: stepsize
            })
        _, old_mean, old_std = sess.run(
            [update_op, mean_na, std_a],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_n: ac_n,
                sy_adv_n: standardized_adv_n,
                sy_stepsize: stepsize
            })
        kl, ent = sess.run([sy_kl, sy_ent],
                           feed_dict={
                               sy_ob_no: ob_no,
                               old_mean_na: old_mean,
                               old_std_a: old_std
                           })

        # KL
        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean",
                         np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean",
                         np.mean([pathlength(path) for path in paths]))
        # logz.log_tabular("std", old_std)
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter",
                         explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
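
For reference, the sy_kl tensor in the example above is the closed-form KL divergence between two univariate Gaussians, averaged over batch entries and action dimensions; the same quantity written in NumPy looks like this:

import numpy as np

def gaussian_kl(mu_old, std_old, mu_new, std_new):
    # KL(N(mu_old, std_old^2) || N(mu_new, std_new^2)) =
    #   log(std_new / std_old) + (std_old^2 + (mu_old - mu_new)^2) / (2 * std_new^2) - 1/2
    return np.mean(np.log(std_new / std_old)
                   + (np.square(std_old) + np.square(mu_old - mu_new)) / (2.0 * np.square(std_new))
                   - 0.5)
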
Example #22
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)

    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(paths)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Main on-policy aggregation loop: at each iteration, refit the dynamics model to the
    # current dataset, collect on-policy samples with the MPC controller, and aggregate
    # them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):

        dyn_model.fit(paths)
        new_paths = sample(env,
                           mpc_controller,
                           num_paths=num_paths_onpol,
                           horizon=env_horizon,
                           render=False,
                           verbose=False)
        costs = []
        returns = []
        for new_path in new_paths:
            cost = path_cost(cost_fn, new_path)
            costs.append(cost)
            returns.append(new_path['return'])
        costs = np.array(costs)
        returns = np.array(returns)
        paths = paths + new_paths  # Aggregation
        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
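
compute_normalization is not listed here, but the comment above pins down what it has to return: means and standard deviations of the observations, actions, and deltas o_{t+1} - o_t over the random rollouts, used to normalize dynamics-model inputs and denormalize its outputs. A sketch under that reading (the path keys and the return ordering are assumptions):

import numpy as np

def compute_normalization(paths):
    # Hypothetical sketch: stack every transition and compute per-dimension statistics.
    obs = np.concatenate([p["observations"] for p in paths])
    acs = np.concatenate([p["actions"] for p in paths])
    next_obs = np.concatenate([p["next_observations"] for p in paths])
    deltas = next_obs - obs
    eps = 1e-8  # guards against constant dimensions
    return (obs.mean(axis=0), obs.std(axis=0) + eps,
            deltas.mean(axis=0), deltas.std(axis=0) + eps,
            acs.mean(axis=0), acs.std(axis=0) + eps)
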
Example #23
def train(
    env,
    cost_fn,
    logdir=None,
    render=False,
    learning_rate=1e-3,
    onpol_iters=10,
    dynamics_iters=60,
    batch_size=512,
    num_paths_random=10,
    num_paths_onpol=10,
    num_simulated_paths=10000,
    env_horizon=1000,
    mpc_horizon=15,
    n_layers=2,
    size=500,
    activation=tf.nn.relu,
    output_activation=None,
    clip_param=0.2,
    entcoeff=0.0,
    gamma=0.99,
    lam=0.95,
    optim_epochs=10,
    optim_batchsize=64,
    schedule='linear',
    bc_lr=1e-3,
    ppo_lr=3e-4,
    timesteps_per_actorbatch=1000,
    MPC=True,
    BEHAVIORAL_CLONING=True,
    PPO=True,
):

    start = time.time()

    logz.configure_output_dir(logdir)

    print("-------- env info --------")
    print("observation_space: ", env.observation_space.shape)
    print("action_space: ", env.action_space.shape)
    print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING)
    print("PPO: ", PPO)

    print(" ")

    random_controller = RandomController(env)
    model_data_buffer = DataBuffer()

    ppo_data_buffer = DataBuffer_general(10000, 4)
    bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2)

    # random sample path

    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            model_data_buffer.add(path['observations'][n], path['actions'][n],
                                  path['next_observations'][n])

    print("model data buffer size: ", model_data_buffer.size)

    normalization = compute_normalization(model_data_buffer)

    #========================================================
    #
    # Build dynamics model and MPC controllers and Behavioral cloning network.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    policy_nn = MlpPolicy_bc(sess=sess,
                             env=env,
                             hid_size=128,
                             num_hid_layers=2,
                             clip_param=clip_param,
                             entcoeff=entcoeff)

    mpc_controller_bc_ppo = MPCcontroller_BC_PPO(
        env=env,
        dyn_model=dyn_model,
        bc_ppo_network=policy_nn,
        horizon=mpc_horizon,
        cost_fn=cost_fn,
        num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    # init or load checkpoint with saver
    saver = tf.train.Saver()

    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)

    if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("checkpoint loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old checkpoint")
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

    #========================================================
    #
    # Prepare for rollouts
    #

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    max_timesteps = num_paths_onpol * env_horizon
    bc = False
    bc_ppo_mpc = False

    for itr in range(onpol_iters):

        print("onpol_iters: ", itr)
        if MPC:
            dyn_model.fit(model_data_buffer)

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # Linear annealing of the learning-rate multiplier is disabled here:
            # cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            cur_lrmult = 1.0

        print("bc learning_rate: ", bc_lr)
        print("ppo learning_rate: ", ppo_lr)

        # saver.save(sess, CHECKPOINT_DIR)
        bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        bc_ppo_mpc = bc_return > 100

        ppo_data_buffer.clear()

        if (itr % 2 != 0 and bc_ppo_mpc) or not MPC:
            direct_mpc = False
        else:
            direct_mpc = True

        seg = traj_segment_generator(policy_nn, mpc_controller,
                                     mpc_controller_bc_ppo, bc_data_buffer,
                                     env, MPC, direct_mpc, bc_ppo_mpc,
                                     env_horizon)
        add_vtarg_and_adv(seg, gamma, lam)

        # check if seg is good
        ep_lengths = seg["ep_lens"]
        returns = seg["ep_rets"]
        bc = np.mean(returns) > 100
        print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING and bc)

        ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = seg["ob"], seg[
            "ac"], seg["mpcac"], seg["rew"], seg["nxt_ob"], seg["adv"], seg[
                "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        for n in range(len(ob)):
            if PPO:
                ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]])

            if BEHAVIORAL_CLONING and bc:
                bc_data_buffer.add([ob[n], mpcac[n]])
            if MPC:
                model_data_buffer.add(ob[n], ac[n], nxt_ob[n])

        print("ppo_data_buffer size", ppo_data_buffer.size)
        print("bc_data_buffer size", bc_data_buffer.size)
        print("model data buffer size: ", model_data_buffer.size)

        # optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"):
            policy_nn.ob_rms.update(ob)  # update running mean/std for policy
        policy_nn.assign_old_eq_new()  # set old parameter values to new parameter values

        for op_ep in range(optim_epochs):
            # losses = [] # list of tuples, each of which gives the loss for a minibatch
            # for i in range(int(timesteps_per_actorbatch/optim_batchsize)):

            if PPO:
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample(
                    optim_batchsize)
                newlosses = policy_nn.lossandupdate_ppo(
                    sample_ob_no, sample_ac_na, sample_adv_n,
                    sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult)
                # losses.append(newlosses)

            if BEHAVIORAL_CLONING and bc:
                sample_ob_no, sample_ac_na = bc_data_buffer.sample(
                    optim_batchsize)
                # print("sample_ob_no", sample_ob_no.shape)
                # print("sample_ac_na", sample_ac_na.shape)

                policy_nn.update_bc(sample_ob_no, sample_ac_na,
                                    bc_lr * cur_lrmult)

            if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc:
                print('epcho: ', op_ep)
                behavioral_cloning_eval(sess, env, policy_nn, env_horizon)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values

        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        # if np.mean(returns) > 1000:
        #     filename = "seg_data.pkl"
        #     pickle.dump(seg, open(filename, 'wb'))
        #     print("saved", filename)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
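
add_vtarg_and_adv is the standard GAE(lambda) step used in the PPO reference code: it converts per-step rewards and value predictions into advantage estimates and lambda-return targets. A sketch along those lines, assuming the segment dict also carries 'new' (episode-start flags) and 'nextvpred' as in the OpenAI Baselines trajectory generator:

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
    # adv_t   = sum_l (gamma * lam)^l * delta_{t+l};  tdlamret = adv + V(s_t)
    new = np.append(seg["new"], 0)                      # 1 marks the first step of an episode
    vpred = np.append(seg["vpred"], seg["nextvpred"])
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    lastgaelam = 0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]
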
Example #24
def train(episodes, learning_rate, batch_size, gamma, eps_start, eps_end,
          eps_decay, target_update, max_steps, buffer_size,
          random_link, random_target, repeat_actions, logdir):

    setup_logger(logdir, locals())

    env = environment()

    eval_policy = evaluation(env,logdir)

    env.reset_robot_position(random_=True)
    env.reset_target_position(random_=False)

    # resize = T.Compose([T.ToPILImage(),
    #                     T.Grayscale(num_output_channels=1),
    #                     T.Resize(64, interpolation = Image.BILINEAR),
    #                     T.ToTensor()])
    img = env.get_obs()
    img = torch.from_numpy(img.copy())
    # img_height, img_width, _ = img.shape

    policy_net = DQN_FC().to(device)
    target_net = DQN_FC().to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr = learning_rate)
    memory = Replay_Buffer(buffer_size)

    obs = env.get_obs()
    obs = torch.from_numpy((obs)).view(1,-1)

    successes = 0

    target_upd = 0
    grad_upd = 0
    steps_train = 0
    # Accumulate statistics across episodes between logging points
    # (these lists are cleared again after each logz.dump_tabular below)
    steps_all = []
    rewards_all = []
    sampling_time = 0

    for ep in range(1, episodes + 1):
        env.reset_robot_position(random_=random_link)
        env.reset_target_position(random_=random_target)  # target after link reset so vel=0
        rewards_ep = 0
        steps_ep = 0
        start_time = time.time()
        while True:
            action, eps_threshold = select_actions(obs, eps_start, eps_end,
                                                       eps_decay, steps_train, policy_net,env)

            reward, done = env.step_(action)
            reward = torch.tensor(reward,dtype=torch.float).view(-1,1)
            obs_next = env.get_obs()
            obs_next = torch.from_numpy(obs_next).view(1,-1).to(device)
            transition = {'s': obs.to(device),
                          'a': action.to(device),
                          'r': reward,
                          "s'": obs_next.to(device)
                          }
            steps_ep += 1
            steps_train += 1
            rewards_ep += reward

            memory_state = memory.push(transition)

            obs = env.get_obs()
            obs = torch.from_numpy((obs)).view(1,-1)

            if done:
                rewards_all.append(rewards_ep/steps_ep)
                steps_all.append(steps_ep)
                successes += 1
                break

            elif steps_ep == max_steps:
                rewards_all.append(rewards_ep)
                steps_all.append(steps_ep)
                break

            status = optimize_model(policy_net, target_net, optimizer, memory, gamma, batch_size)
            if status is not False:
                grad_upd += 1
                # Soft (Polyak) target update with tau = 0.005; a hard update would be
                # target_net.load_state_dict(policy_net.state_dict()) instead.
                for param, target_param in zip(policy_net.parameters(), target_net.parameters()):
                    target_param.data = 0.995 * target_param.data + (1 - 0.995) * param.data
                target_net.eval()


        end_time = time.time()
        sampling_time += end_time - start_time  # cumulative wall-clock time spent sampling

        if ep % 40 == 0:
            return_val, steps_val = eval_policy.sample_episode(policy_net,save_video=True if ep%500==0 else False, n_episodes=5)
            qvalue_eval = eval_policy.get_qvalue(policy_net)
            logz.log_tabular('Averaged Steps Training', np.around(np.average(steps_all), decimals=0))  # episodes since the last dump
            logz.log_tabular('Averaged Return Training',np.around(np.average(rewards_all),decimals=2))
            logz.log_tabular('Averaged Steps Validation',np.around(np.average(steps_val),decimals=0))
            logz.log_tabular('Averaged Return Validation',np.around(np.average(return_val),decimals=2))
            logz.log_tabular('Cumulative Successes',successes)
            logz.log_tabular('Number of episodes',ep)
            logz.log_tabular('Sampling time (s)', sampling_time)
            logz.log_tabular('Epsilon threshold', eps_threshold)
            logz.log_tabular('Gradient update', grad_upd )
            logz.log_tabular('Average q-value evaluation', qvalue_eval)
            logz.dump_tabular()
            steps_all = []
            rewards_all = []
            logz.save_pytorch_model(policy_net.state_dict())

    env.terminate()
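
select_actions is not shown above. A plausible sketch of epsilon-greedy action selection with an exponentially decaying exploration threshold follows; the decay formula and the way the action count is read off the Q-network output are assumptions, not the original implementation.

import math
import random
import torch

def select_actions(obs, eps_start, eps_end, eps_decay, steps_done, policy_net, env):
    # Hypothetical sketch: exponentially decayed epsilon-greedy over the Q-network's outputs.
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)
    with torch.no_grad():
        q_values = policy_net(obs.to(device))  # device as defined in the snippet above
    if random.random() > eps_threshold:
        action = q_values.argmax(dim=1, keepdim=True)
    else:
        action = torch.tensor([[random.randrange(q_values.shape[1])]], dtype=torch.long)
    return action, eps_threshold
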
Example #25
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir, CLEAR_LOGS)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)  # batch of observations
    sy_ac_n = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)  # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)  # advantage function estimate

    sy_h1 = lrelu(dense(sy_ob_no, 128, "h1", weight_init=normc_initializer(1.0)))
    sy_h2 = lrelu(dense(sy_h1, 128, "h2", weight_init=normc_initializer(1.0)))

    sy_mean_na = dense(sy_h2, ac_dim, 'mean_na', weight_init=normc_initializer(0.1))  # Mean control output
    sy_logstd_a = tf.get_variable('logstdev', [ac_dim], initializer=tf.zeros_initializer)  # Variance
    sy_std_a = tf.exp(sy_logstd_a)

    sy_dist = tf.contrib.distributions.Normal(mu=sy_mean_na, sigma=sy_std_a, validate_args=True)
    sy_sampled_ac = sy_dist.sample(ac_dim)[0, :, 0]
    sy_logprob_n = tf.squeeze(tf.log(sy_dist.prob(sy_ac_n)))  # log-prob of actions taken -- used for policy gradient calculation

    sy_old_mean_na = tf.placeholder(shape=[None, ac_dim], name='old_mean_na', dtype=tf.float32)
    sy_old_logstd_a = tf.placeholder(shape=[ac_dim], name='old_logstdev', dtype=tf.float32)
    sy_old_std_a = tf.exp(sy_old_logstd_a)
    sy_old_dist = tf.contrib.distributions.Normal(mu=sy_old_mean_na, sigma=sy_old_std_a, validate_args=True)

    sy_kl = tf.reduce_mean(tf.contrib.distributions.kl(sy_old_dist, sy_dist, allow_nan=False))
    sy_ent = tf.reduce_mean(sy_dist.entropy())

    sy_surr = -tf.reduce_mean(sy_adv_n * sy_logprob_n)  # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {"observation": np.array(obs), "terminated": terminated,
                    "reward": np.array(rewards), "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, old_mean_na, old_logstd_a = sess.run([update_op, sy_mean_na, sy_logstd_a], feed_dict={
            sy_ob_no: ob_no,
            sy_ac_n: ac_n,
            sy_adv_n: standardized_adv_n,
            sy_stepsize: stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={
            sy_ob_no: ob_no,
            sy_old_mean_na: old_mean_na,
            sy_old_logstd_a: old_logstd_a})

        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
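
The same KL-driven stepsize adaptation appears in several of these examples; factored out as a standalone helper it is a simple multiplicative rule keyed on the measured KL divergence between the old and new policies:

def adapt_stepsize(stepsize, kl, desired_kl, factor=1.5):
    # Shrink the stepsize when the policy moved too far (KL above twice the target),
    # grow it when the update was overly conservative (KL below half the target).
    if kl > desired_kl * 2:
        return stepsize / factor
    if kl < desired_kl / 2:
        return stepsize * factor
    return stepsize
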
Example #26
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print("OB AND ACTION DIM=============")
    print(ob_dim)
    print(ac_dim)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    # CODE
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        # Takes in the observation and returns the logits for actions as per our policy net
        sy_logits_na = build_mlp(input_placeholder=sy_ob_no,
                                 output_size=ac_dim,
                                 scope="Discrete")
        # Sample an action to be taken.
        sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), [1])
        tf.assert_rank(sy_logits_na, 2)
        tf.assert_rank(sy_sampled_ac, 1)
        # Figure out the log probability (as per our current policy) of the action that was actually
        # taken.
        action_one_hot = tf.one_hot(indices=sy_ac_na, depth=ac_dim)
        action_taken_logit = tf.reduce_sum(action_one_hot * sy_logits_na,
                                           axis=1)
        normalizer = tf.reduce_sum(tf.exp(sy_logits_na), axis=1)
        sy_logprob_n = action_taken_logit - tf.log(normalizer)
        tf.assert_rank(sy_logprob_n, 1)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(input_placeholder=sy_ob_no,
                            output_size=ac_dim,
                            scope="Continuous")
        sy_logstd = tf.Variable(
            0.0, name="sy_logstd"
        )  # logstd should just be a trainable variable, not a network output.
        # For sampling, we use a reparameterization trick. mu + sigma * z, where z ~ N(O, I)

        # Hint: Use the log probability under a multivariate gaussian.
        # For finding the probability of the action (multi-dimensional) that was actually taken, first
        # define a normal distribution with the above mean and std. Note that this defines multiple scalar
        # distributions with same variance. Equivalent of multi variate gaussian with diagonal covariance matrix
        # with same diagonal value of std (independent variables).
        # NOTE: we technically don't need the tf.exp() on the std, since we can assume that the variable is
        # representing the std directly than its log, and force > 0. However, that may introduce some numerical
        # instability and leads to some nans in loss and actions.
        dist = tf.distributions.Normal(loc=sy_mean, scale=tf.exp(sy_logstd))
        # Since we are using independent Normal vars to represent a multivariate Gaussian with independent
        # variables, to get the overall probability, we have to multiply the individual probabilities
        # obtained from the Normal.
        # P(x1, x2) = P(x1) * P(x2). Thus summing in log domain.
        sy_logprob_n = tf.reduce_sum(dist.log_prob(sy_ac_na), axis=1)
        # sy_sampled_ac = sy_mean + sy_logstd * tf.random_normal(shape=[ac_dim])
        sy_sampled_ac = dist.sample()
        tf.assert_rank(sy_sampled_ac, 2)
        tf.assert_rank(sy_logprob_n, 1)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#
    # Note the negative sign: the quantity being averaged is a reward-like objective we want
    # to maximize, whereas the optimizer minimizes a loss.
    loss = -tf.reduce_mean(
        sy_logprob_n * sy_adv_n
    )  # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        # Targets for the baseline will be provided by the paths collected from experience
        target_bn = tf.placeholder(shape=[None],
                                   name="target_bn",
                                   dtype=tf.float32)
        loss_bn = tf.losses.mean_squared_error(target_bn, baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            loss_bn)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                #print("OBS, ACTION")
                #print(ob)
                #print(ac)

                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#
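        # Worked example (illustrative, not from the original handout): with rewards [1, 2, 3] and
        # gamma = 0.5,
        #   Case 1 (reward_to_go=False): Ret(tau) = 1 + 0.5*2 + 0.25*3 = 2.75, so Q = [2.75, 2.75, 2.75]
        #   Case 2 (reward_to_go=True):  Q = [2.75, 2 + 0.5*3, 3] = [2.75, 3.5, 3.0]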

        # YOUR_CODE_HERE
        if reward_to_go is False:  # trajectory (path) based PG.
            # Get the return for each path as the discounted sum of rewards along the path.
            # In this scheme, the return Ret(tau) is the same for every timestep along the path,
            # so we replicate the path return once per timestep of that path.
            # (The builtin sum is used here; np.sum over a generator is deprecated.)
            rewards_path_repl = [
                [sum(np.power(gamma, i) * rew
                     for i, rew in enumerate(path["reward"]))] * len(path["reward"])
                for path in paths
            ]
            # Concatenate across paths, similar to ob_no and ac_na.
            q_n = np.concatenate(rewards_path_repl)
        else:
            discounted_rewards_paths = []
            for path in paths:
                # path["reward"] -> array of rewards for this path.
                discounted_sum = 0
                discounted_rewards = []
                # Walk the rewards in reverse: multiply the running sum by gamma and add the current
                # reward. This builds the reward-to-go values in reverse order, so the resulting list
                # is reversed at the end (alternatively, we could fill the list from index 0 as we go).
                for i, rew in enumerate(path['reward'][::-1]):
                    # print('i, rew: ', i, rew)
                    discounted_sum = gamma * discounted_sum + rew
                    discounted_rewards.append(discounted_sum)
                discounted_rewards_paths.append(discounted_rewards[::-1])
            q_n = np.concatenate(discounted_rewards_paths)
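            # Editor's cross-check (not in the original solution; assumes SciPy is available in the
            # environment): the same reward-to-go values can be computed per path with a linear
            # filter. q_n_check is a hypothetical variable used only for this sanity check.
            from scipy.signal import lfilter
            q_n_check = np.concatenate([
                lfilter([1.0], [1.0, -gamma], path["reward"][::-1])[::-1]
                for path in paths
            ])
            assert np.allclose(q_n, q_n_check)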

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n_orig = sess.run(baseline_prediction,
                                feed_dict={sy_ob_no: ob_no})
            # b_n_orig is expected to have zero mean and unit std, since that is what the baseline
            # network is trained to predict (see Hint #bl2 below), so rescale it with the q_n stats.
            mean_q = np.mean(q_n)
            std_q = np.std(q_n)
            # b_n now has the same mean and std as q_n.
            b_n = b_n_orig * std_q + mean_q
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            # A small epsilon guards against division by zero when the advantages are constant.
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            # Use the baseline predictions (from the previous network weights) to build the targets:
            # targets = reward[i] + gamma * b_n[i+1] (no bootstrap at the end of an episode). This is
            # a TD(0)-style target; a Monte Carlo alternative would target q_n directly, but that
            # would be noisier.
            # b_n has the same mean and std as q_n, since we rescaled it above (before advantage
            # normalization), and reward[i] comes from the same distribution as q_n.
            #
            q_values = []
            j = 0
            for path in paths:
                path_reward = path["reward"]
                path_obs = path["observation"]
                for i in range(len(path_reward)):
                    b_next = b_n[j + 1] if i < len(path_reward) - 1 else 0
                    q_values.append(path_reward[i] + gamma * b_next)
                    j = j + 1
            # Now that we have the targets, rescale them to zero mean and unit std before feeding
            # them to the baseline update op (Hint #bl2).
            q_values = np.array(q_values)

            targets_ = (q_values - np.mean(q_values)) / np.std(q_values)
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_ob_no: ob_no,
                         target_bn: targets_
                     })
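            # Editor's note (not part of the original solution): a simpler Monte Carlo variant,
            # matching Hint #bl2 more directly, is to regress on the normalized Q-values themselves,
            # e.g. targets_ = (q_n - mean_q) / (std_q + 1e-8).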

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        print("q_n shape: ", q_n.shape)
        print("ob_no shape: ", ob_no.shape)
        print("ac_na shape: ", ac_na.shape)
        update_, loss_, sy_logprob_n_ = sess.run(
            [update_op, loss, sy_logprob_n],
            feed_dict={
                sy_ob_no: ob_no,
                sy_ac_na: ac_na,
                sy_adv_n: q_n
            })
        print("sy_logprob_n [Chosen action log prob] Shape: ",
              sy_logprob_n_.shape)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("Loss", loss_)
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #27
    def train(self, train_db, val_db, test_db):
        ##################################################################
        ## LOG
        ##################################################################
        logz.configure_output_dir(self.cfg.model_dir)
        logz.save_config(self.cfg)

        ##################################################################
        ## NN table
        ##################################################################
        if self.cfg.use_hard_mining:
            self.train_tables = AllCategoriesTables(train_db)
            self.val_tables = AllCategoriesTables(val_db)
            self.train_tables.build_nntables_for_all_categories(True)
            self.val_tables.build_nntables_for_all_categories(True)

        ##################################################################
        ## Main loop
        ##################################################################
        start = time()
        min_val_loss = 100000000
        for epoch in range(self.epoch, self.cfg.n_epochs):
            ##################################################################
            ## Training
            ##################################################################
            torch.cuda.empty_cache()
            train_loss, train_accu = self.train_epoch(train_db, epoch)

            ##################################################################
            ## Validation
            ##################################################################
            torch.cuda.empty_cache()
            val_loss, val_accu = self.validate_epoch(val_db, epoch)

            ##################################################################
            ## Logging
            ##################################################################

            # update optim scheduler
            current_val_loss = np.mean(val_loss[:, 0])
            # self.optimizer.update(current_val_loss, epoch)
            logz.log_tabular("Time", time() - start)
            logz.log_tabular("Iteration", epoch)
            logz.log_tabular("AverageLoss", np.mean(train_loss[:, 0]))
            logz.log_tabular("AveragePredLoss", np.mean(train_loss[:, 1]))
            logz.log_tabular("AverageEmbedLoss", np.mean(train_loss[:, 2]))
            logz.log_tabular("AverageAttnLoss", np.mean(train_loss[:, 3]))
            logz.log_tabular("AverageObjAccu", np.mean(train_accu[:, 0]))
            logz.log_tabular("AverageCoordAccu", np.mean(train_accu[:, 1]))
            logz.log_tabular("AverageScaleAccu", np.mean(train_accu[:, 2]))
            logz.log_tabular("AverageRatioAccu", np.mean(train_accu[:, 3]))

            logz.log_tabular("ValAverageLoss", np.mean(val_loss[:, 0]))
            logz.log_tabular("ValAveragePredLoss", np.mean(val_loss[:, 1]))
            logz.log_tabular("ValAverageEmbedLoss", np.mean(val_loss[:, 2]))
            logz.log_tabular("ValAverageAttnLoss", np.mean(val_loss[:, 3]))
            logz.log_tabular("ValAverageObjAccu", np.mean(val_accu[:, 0]))
            logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 1]))
            logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 2]))
            logz.log_tabular("ValAverageRatioAccu", np.mean(val_accu[:, 3]))
            logz.dump_tabular()

            ##################################################################
            ## Checkpoint
            ##################################################################
            if self.cfg.use_hard_mining:
                if (epoch + 1) % 3 == 0:
                    torch.cuda.empty_cache()
                    t0 = time()
                    self.dump_shape_vectors(train_db)
                    torch.cuda.empty_cache()
                    self.dump_shape_vectors(val_db)
                    print("Dump shape vectors completes (time %.2fs)" %
                          (time() - t0))
                    torch.cuda.empty_cache()
                    t0 = time()
                    self.train_tables.build_nntables_for_all_categories(False)
                    self.val_tables.build_nntables_for_all_categories(False)
                    print("NN completes (time %.2fs)" % (time() - t0))
            self.save_checkpoint(epoch)
Example #28
def train_PG(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        mini_batch_size,
        max_path_length,
        learning_rate,
        num_ppo_updates,
        num_value_iters,
        animate,
        logdir,
        normalize_advantages,
        nn_critic,
        seed,
        n_layers,
        size,
        gru_size,
        history,
        num_tasks,
        l2reg,
        recurrent,
        grain_size
        ):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    envs = {'pm': PointEnv,
            'pm-obs': ObservedPointEnv,
            }
    env = envs[env_name](num_tasks, grain_size=grain_size)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    task_dim = len(env._goal) # rude, sorry

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'task_dim': task_dim,
        'size': size,
        'gru_size': gru_size,
        'learning_rate': learning_rate,
        'history': history,
        'num_value_iters': num_value_iters,
        'l2reg': l2reg,
        'recurrent': recurrent,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
        'grain_size': grain_size
    }

    estimate_return_args = {
        'gamma': gamma,
        'nn_critic': nn_critic,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()


    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    def unpack_sample(data):
        '''
        unpack a sample from the replay buffer
        '''
        ob = data["observations"]
        ac = data["actions"]
        re = data["rewards"]
        hi = data["hiddens"]
        ma = 1 - data["terminals"]
        return ob, ac, re, hi, ma

    # construct PPO replay buffer, perhaps rude to do outside the agent
    ppo_buffer = PPOReplayBuffer(agent.replay_buffer)

    total_timesteps = 0
    for itr in range(n_iter):
        # for PPO: flush the replay buffer!
        ppo_buffer.flush()

        # sample trajectories to fill agent's replay buffer
        print("********** Iteration %i ************"%itr)
        stats = []
        for _ in range(num_tasks):
            s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch)
            total_timesteps += timesteps_this_batch
            stats += s

        # compute the log probs, advantages, and returns for all data in agent's buffer
        # store in ppo buffer for use in multiple ppo updates
        # TODO: should move inside the agent probably
        data = agent.replay_buffer.all_batch()
        ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
        fixed_log_probs = agent.sess.run(agent.sy_lp_n,
            feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})
        q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks)

        ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n)
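        # fixed_log_probs are the log-probs under the pre-update policy; they are stored so that the
        # PPO updates below can presumably form the importance ratio exp(log pi_new - log pi_old)
        # against the pre-update policy, as in the standard clipped-surrogate construction.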

        # update with mini-batches sampled from ppo buffer
        for _ in range(num_ppo_updates):

            data = ppo_buffer.random_batch(mini_batch_size)

            ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
            fixed_log_probs = data["log_probs"]
            adv_n = data["advantages"]
            q_n = data["returns"]

            log_probs = agent.sess.run(agent.sy_lp_n,
                feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na})

            agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n)

        # compute validation statistics
        print('Validating...')
        val_stats = []
        for _ in range(num_tasks):
            vs, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
            val_stats += vs

        # save trajectories for viz
        #with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f:
            #pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL)
        #agent.val_replay_buffer.flush()

        # Log TRAIN diagnostics
        returns = [sum(s["rewards"]) for s in stats]
        final_rewards = [s["rewards"][-1] for s in stats]
        ep_lengths = [s['ep_len'] for s in stats]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("FinalReward", np.mean(final_rewards))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)

        # Log VAL diagnostics
        val_returns = [sum(s["rewards"]) for s in val_stats]
        val_final_rewards = [s["rewards"][-1] for s in val_stats]
        logz.log_tabular("ValAverageReturn", np.mean(val_returns))
        logz.log_tabular("ValFinalReward", np.mean(val_final_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #29
def learn(env,
          q_func,
          optimizer_spec,
          session,
          exploration=LinearSchedule(1000000, 0.1),
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10,
          double_q_learning=False):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of chosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, frame_history_len * img_c)
    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholder for current reward
    rew_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    # this value is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target, not the
    # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
    done_mask_ph = tf.placeholder(tf.float32, [None])

    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0
    # Here, you should fill in your own code to compute the Bellman error. This requires
    # evaluating the current and next Q-values and constructing the corresponding error.
    # TensorFlow will differentiate this error for you, you just need to pass it to the
    # optimizer. See assignment text for details.
    # Your code should produce one scalar-valued tensor: total_error
    # This will be passed to the optimizer in the provided code below.
    # Your code should also produce two collections of variables:
    # q_func_vars
    # target_q_func_vars
    # These should hold all of the variables of the Q-function network and target network,
    # respectively. A convenient way to get these is to make use of TF's "scope" feature.
    # For example, you can create your Q-function network with the scope "q_func" like this:
    # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
    # And then you can obtain the variables like this:
    # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
    # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"
    ######
    # YOUR CODE HERE

    ######
    q_func_network = q_func(obs_t_float,
                            num_actions,
                            scope="q_func",
                            reuse=False)
    target_q_func_network = q_func(obs_tp1_float,
                                   num_actions,
                                   scope="target_q_func",
                                   reuse=False)

    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='q_func')
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_q_func')

    selected_action_q = tf.reduce_sum(tf.one_hot(act_t_ph, depth=num_actions) *
                                      q_func_network,
                                      axis=1)

    if double_q_learning:
        double_q_func_network = q_func(obs_tp1_float,
                                       num_actions,
                                       scope="q_func",
                                       reuse=True)

        selected_target_action = tf.one_hot(tf.argmax(double_q_func_network,
                                                      axis=1),
                                            depth=num_actions)
        target_q_value = tf.reduce_sum(target_q_func_network *
                                       selected_target_action,
                                       axis=1)
        y = rew_t_ph + done_mask_ph * gamma * target_q_value
    else:
        # done_mask_ph is fed the inverted mask (1 - done), so it is 0 when the next state ends an episode
        y = rew_t_ph + done_mask_ph * gamma * tf.reduce_max(
            target_q_func_network, axis=1)
    total_error = tf.nn.l2_loss(selected_action_q - y)
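    # Editor's sketch (not part of the original assignment): many DQN implementations use a Huber
    # loss instead of a plain squared error to limit the effect of large TD errors. The tensor below
    # (total_error_huber, a hypothetical name) is defined only as an illustration and is not passed
    # to the optimizer.
    total_error_huber = tf.losses.huber_loss(labels=tf.stop_gradient(y),
                                             predictions=selected_action_q)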

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)
    train_fn = minimize_and_clip(optimizer,
                                 total_error,
                                 var_list=q_func_vars,
                                 clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)
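    # Editor's note (not part of the original assignment): an alternative to this periodic hard copy
    # is a Polyak ("soft") update, e.g. var_target.assign(tau * var + (1.0 - tau) * var_target) with
    # a small tau such as 0.005, applied after every gradient step.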

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 1000

    # Configure output directory for logging

    # Log experimental parameters
    args = inspect.getargspec(learn)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    if not (os.path.exists('data')):
        os.makedirs('data')

    logdir = ''
    value_params = {}
    for key, value in sorted(params.items()):
        try:
            float(value)
        except (ValueError, TypeError):
            pass
        else:
            value_params[key] = value
            logdir += key + str(value) + '_'
    logdir = logdir[:-1]
    iteration = 1
    while os.path.exists(os.path.join('data/',
                                      (logdir + '/' + str(iteration)))):
        iteration += 1
    logdir = os.path.join('data', logdir)
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)
    logdir = os.path.join(logdir, str(iteration))
    value_params['exp_name'] = logdir
    logz.configure_output_dir(logdir)
    logz.save_params(value_params)

    available_actions = range(num_actions)
    try:
        for t in itertools.count():
            ### 1. Check stopping criterion
            if stopping_criterion is not None and stopping_criterion(env, t):
                break

            ### 2. Step the env and store the transition
            # At this point, "last_obs" contains the latest observation that was
            # recorded from the simulator. Here, your code needs to store this
            # observation and its outcome (reward, next observation, etc.) into
            # the replay buffer while stepping the simulator forward one step.
            # At the end of this block of code, the simulator should have been
            # advanced one step, and the replay buffer should contain one more
            # transition.
            # Specifically, last_obs must point to the new latest observation.
            # Useful functions you'll need to call:
            # obs, reward, done, info = env.step(action)
            # this steps the environment forward one step
            # obs = env.reset()
            # this resets the environment if you reached an episode boundary.
            # Don't forget to call env.reset() to get a new observation if done
            # is true!!
            # Note that you cannot use "last_obs" directly as input
            # into your network, since it needs to be processed to include context
            # from previous frames. You should check out the replay buffer
            # implementation in dqn_utils.py to see what functionality the replay
            # buffer exposes. The replay buffer has a function called
            # encode_recent_observation that will take the latest observation
            # that you pushed into the buffer and compute the corresponding
            # input that should be given to a Q network by appending some
            # previous frames.
            # Don't forget to include epsilon greedy exploration!
            # And remember that the first time you enter this loop, the model
            # may not yet have been initialized (but of course, the first step
            # might as well be random, since you haven't trained your net...)

            #####

            # YOUR CODE HERE

            #####

            buffer_index = replay_buffer.store_frame(last_obs)
            encoded_obsv = replay_buffer.encode_recent_observation()
            epsilon = exploration.value(t)
            if not model_initialized or np.random.rand(1) < epsilon:
                action = np.random.choice(available_actions)
            else:
                action_values = session.run(
                    q_func_network,
                    feed_dict={obs_t_float: encoded_obsv[None]})
                action = np.argmax(action_values)

            obs, reward, done, info = env.step(action)

            replay_buffer.store_effect(buffer_index, action, reward, done)
            if (done):
                obs = env.reset()
            last_obs = obs

            # at this point, the environment should have been advanced one step (and
            # reset if done was true), and last_obs should point to the new latest
            # observation

            ### 3. Perform experience replay and train the network.
            # note that this is only done if the replay buffer contains enough samples
            # for us to learn something useful -- until then, the model will not be
            # initialized and random actions should be taken
            if (t > learning_starts and t % learning_freq == 0
                    and replay_buffer.can_sample(batch_size)):
                # Here, you should perform training. Training consists of four steps:
                # 3.a: use the replay buffer to sample a batch of transitions (see the
                # replay buffer code for function definition, each batch that you sample
                # should consist of current observations, current actions, rewards,
                # next observations, and done indicator).
                # 3.b: initialize the model if it has not been initialized yet; to do
                # that, call
                #    initialize_interdependent_variables(session, tf.global_variables(), {
                #        obs_t_ph: obs_t_batch,
                #        obs_tp1_ph: obs_tp1_batch,
                #    })
                # where obs_t_batch and obs_tp1_batch are the batches of observations at
                # the current and next time step. The boolean variable model_initialized
                # indicates whether or not the model has been initialized.
                # Remember that you have to update the target network too (see 3.d)!
                # 3.c: train the model. To do this, you'll need to use the train_fn and
                # total_error ops that were created earlier: total_error is what you
                # created to compute the total Bellman error in a batch, and train_fn
                # will actually perform a gradient step and update the network parameters
                # to reduce total_error. When calling session.run on these you'll need to
                # populate the following placeholders:
                # obs_t_ph
                # act_t_ph
                # rew_t_ph
                # obs_tp1_ph
                # done_mask_ph
                # (this is needed for computing total_error)
                # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t)
                # (this is needed by the optimizer to choose the learning rate)
                # 3.d: periodically update the target network by calling
                # session.run(update_target_fn)
                # you should update every target_update_freq steps, and you may find the
                # variable num_param_updates useful for this (it was initialized to 0)
                #####

                # YOUR CODE HERE

                #####

                obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample(
                    batch_size)
                inverted_done_mask = 1.0 - done_mask

                train_feed_dict = {
                    learning_rate: optimizer_spec.lr_schedule.value(t),
                    rew_t_ph: rew_batch,
                    obs_t_ph: obs_t_batch,
                    obs_tp1_ph: obs_tp1_batch,
                    act_t_ph: act_batch,
                    done_mask_ph: inverted_done_mask
                }

                if not model_initialized:
                    initialize_interdependent_variables(
                        session, tf.global_variables(), {
                            obs_t_ph: obs_t_batch,
                            obs_tp1_ph: obs_tp1_batch
                        })
                    model_initialized = True
                    session.run(update_target_fn)

                _, loss_value, = session.run([train_fn, total_error],
                                             feed_dict=train_feed_dict)
                num_param_updates += 1

                if num_param_updates % target_update_freq == 0 and model_initialized:
                    session.run(update_target_fn)

            ### 4. Log progress
            episode_rewards = get_wrapper_by_name(
                env, "Monitor").get_episode_rewards()
            if len(episode_rewards) > 0:
                mean_episode_reward = np.mean(episode_rewards[-100:])
            if len(episode_rewards) > 100:
                best_mean_episode_reward = max(best_mean_episode_reward,
                                               mean_episode_reward)

            if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
                #print("Timestep %d" % (t,))
                #print("mean reward (100 episodes) %f" % mean_episode_reward)
                #print("best mean reward %f" % best_mean_episode_reward)
                #print("episodes %d" % len(episode_rewards))
                #print("exploration %f" % exploration.value(t))
                #print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
                #sys.stdout.flush()
                # print(q_value[0])
                # print(rew_batch[0])
                # print(qtn[0])
                logz.log_tabular("Timestep", (t, )[0])
                logz.log_tabular("MeanReward(100ep)", mean_episode_reward)
                logz.log_tabular("BestMeanReward", best_mean_episode_reward)
                logz.log_tabular("Episodes", len(episode_rewards))
                logz.log_tabular("LearningRate",
                                 optimizer_spec.lr_schedule.value(t))
                logz.log_tabular("Epsilon", exploration.value(t))
                logz.log_tabular("Loss", np.sum(loss_value))
                #logz.log_tabular("Qval", q_value)
                logz.log_tabular(
                    "WallTime",
                    time.strftime("%d.%m.%y %H:%M:%S", time.localtime()))

                logz.dump_tabular()

                #logz.pickle_tf_vars()
    except KeyboardInterrupt:
        print("imp")
        if os.path.exists("/tmp/hw3_vid_dir2/gym"):
            shutil.move("/tmp/hw3_vid_dir2/gym", logdir)
        save_q(os.path.join(logdir, 'Q_network'), session)

    if os.path.exists("/tmp/hw3_vid_dir2/gym"):
        shutil.move("/tmp/hw3_vid_dir2/gym", logdir)
    save_q(os.path.join(logdir, 'Q_network'), session)
Example #30
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, reward_to_go, animate, logdir,
             normalize_advantages, nn_baseline, seed, n_layers, size,
             step_size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#
    ##

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or self.discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        q_n, adv_n = agent.estimate_return(ob_no, re_n)
        # step_size is the number of gradient steps taken on this batch; a single loop also covers
        # the step_size == 1 case.
        for _ in range(step_size):
            agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #31
    def train(self, num_iter):
        max_reward_ever = -1
        start = time.time()
        for i in range(num_iter):

            t1 = time.time()
            self.train_step()
            for iter_ in range(10):
                self.update_explorer_net()
            t2 = time.time()
            print('total time of one step', t2 - t1)
            print('iter ', i, ' done')
            # if i == num_iter-1:
            #     np.savez(self.logdir + "/lin_policy_plus" + str(i), w)
            # record statistics every 10 iterations
            if ((i + 1) % 20 == 0):
                rewards = self.aggregate_rollouts(num_rollouts=30,
                                                  evaluate=True)
                print("SHAPE", rewards.shape)
                if (np.mean(rewards) > max_reward_ever):
                    max_reward_ever = np.mean(rewards)
                #     np.savez(self.logdir + "/lin_policy_plus", w)

                w = ray.get(self.workers[0].get_weights_plus_stats.remote())

                np.savez(self.logdir + "/bi_policy_num_plus" + str(i), w)
                torch.save(
                    self.policy.net.state_dict(),
                    self.logdir + "/bi_policy_num_plus_torch" + str(i) + ".pt")
                torch.save(self.policy.safeQ.state_dict(),
                           self.logdir + "/safeQ_torch" + str(i) + ".pt")

                # np.savez(self.logdir + "/bi_policy_num_plus" + str(i), w)
                # torch.save(self.policy.net.state_dict(),self.logdir + "/bi_policy_num_plus_torch" + str(i)+ ".pt")
                print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("BestRewardEver", max_reward_ever)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("timesteps", self.timesteps)
                logz.dump_tabular()

            t1 = time.time()
            # get statistics from all workers
            for j in range(self.num_workers):
                self.policy.observation_filter.update(
                    ray.get(self.workers[j].get_filter.remote()))
            self.policy.observation_filter.stats_increment()

            # make sure master filter buffer is clear
            self.policy.observation_filter.clear_buffer()
            # sync all workers
            filter_id = ray.put(self.policy.observation_filter)
            setting_filters_ids = [
                worker.sync_filter.remote(filter_id) for worker in self.workers
            ]
            # waiting for sync of all workers
            ray.get(setting_filters_ids)

            increment_filters_ids = [
                worker.stats_increment.remote() for worker in self.workers
            ]
            # waiting for increment of all workers
            ray.get(increment_filters_ids)
            t2 = time.time()
            print('Time to sync statistics:', t2 - t1)

        return
Example #32
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None):
    env = gym.make("CartPole-v0")
    ob_dim = env.observation_space.shape[0]
    num_actions = env.action_space.n
    logz.configure_output_file(logfile)
    #vf = LinearValueFunction()
    vf = NeuralValueFunction(ob_dim)

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in this function
    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate
    sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer
    sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer
    # we use a small initialization for the last layer, so the initial policy has maximal entropy
    sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic)
    sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions
    sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient)
    sy_n = tf.shape(sy_ob_no)[0]
    sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na)
    sy_oldp_na = tf.exp(sy_oldlogp_na) 
    sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n)
    sy_p_na = tf.exp(sy_logp_na)
    sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n)
    # <<<<<<<<<<<<<
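    # For reference: sy_kl above is the batch-averaged KL(pi_old || pi_new) =
    # E_s[sum_a p_old(a|s) * (log p_old(a|s) - log p_new(a|s))], and sy_ent is the batch-averaged
    # policy entropy -E_s[sum_a p(a|s) log p(a|s)].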

    sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()
    sess.run(tf.global_variables_initializer())

    total_timesteps = 0
    obs_mean = np.zeros(ob_dim)
    obs_std = np.ones(ob_dim)  # start from unit std so the first-iteration normalization is well-behaved

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break                    
            path = {"observation" : np.array(obs), "terminated" : terminated,
                    "reward" : np.array(rewards), "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8))
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        obs_mean = np.average(ob_no,axis=0)
        obs_std = np.std(ob_no,axis=0)
        vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n)

        # Policy update
        _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na})

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
def train_AC(
        exp_name,
        env_name,
        n_iter, 
        gamma, 
        min_timesteps_per_batch, 
        max_path_length,
        learning_rate,
        num_target_updates,
        num_grad_steps_per_target_update,
        animate, 
        logdir, 
        normalize_advantages,
        seed,
        n_layers,
        size,
        ########################################################################
        # Exploration args
        bonus_coeff,
        kl_weight,
        density_lr,
        density_train_iters,
        density_batch_size,
        density_hiddim,
        dm,
        replay_size,
        sigma,
        ########################################################################
        ):
    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    ########################################################################
    # Exploration
    if env_name == 'PointMass-v0':
        from pointmass import PointMass
        env = PointMass()
    else:
        env = gym.make(env_name)
    dirname = logz.G.output_dir
    ########################################################################

    # Set random seeds
    # [Mehran Shakeriava] change begin
    import random
    random.seed(seed, version=2)
    # tf.set_random_seed(seed)
    # np.random.seed(seed)
    # env.seed(seed)
    tf.set_random_seed(random.randint(0, 2**32 - 1))
    np.random.seed(random.randint(0, 2**32 - 1))
    env.seed(random.randint(0, 2**32 - 1))
    # [Mehran Shakeriava] change end

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args

    # build computation graph
    agent.build_computation_graph()

    ########################################################################
    # Initialize exploration density model
    if dm != 'none':
        if env_name == 'PointMass-v0' and dm == 'hist':
            density_model = Histogram(
                nbins=env.grid_size, 
                preprocessor=env.preprocess)
            exploration = DiscreteExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff)
        elif dm == 'rbf':
            density_model = RBF(sigma=sigma)
            exploration = RBFExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff,
                replay_size=int(replay_size))
        elif dm == 'ex2':
            density_model = Exemplar(
                ob_dim=ob_dim, 
                hid_dim=density_hiddim,
                learning_rate=density_lr, 
                kl_weight=kl_weight)
            exploration = ExemplarExploration(
                density_model=density_model, 
                bonus_coeff=bonus_coeff, 
                train_iters=density_train_iters, 
                bsize=density_batch_size,
                replay_size=int(replay_size))
            exploration.density_model.build_computation_graph()
        else:
            raise NotImplementedError

    ########################################################################

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    ########################################################################
    if dm != 'none':
        exploration.receive_tf_sess(agent.sess)
    ########################################################################

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        ########################################################################
        # Modify the reward to include exploration bonus
        """
            1. Fit density model
                if dm == 'ex2':
                    the call to exploration.fit_density_model should return ll, kl, elbo
                else:
                    the call to exploration.fit_density_model should return nothing
            2. Modify the re_n with the reward bonus by calling exploration.modify_reward
        """
        old_re_n = re_n
        if dm == 'none':
            pass
        else:
            # 1. Fit density model
            if dm == 'ex2':
                ### PROBLEM 3
                ### YOUR CODE HERE
                ll, kl, elbo = exploration.fit_density_model(next_ob_no)
            elif dm == 'hist' or dm == 'rbf':
                ### PROBLEM 1
                ### YOUR CODE HERE ###
                exploration.fit_density_model(next_ob_no)
                ######################
            else:
                assert False

            # 2. Modify the reward
            ### PROBLEM 1
            ### YOUR CODE HERE ###
            re_n = exploration.modify_reward(re_n, next_ob_no)
            ######################

            print('average state', np.mean(ob_no, axis=0))
            print('average action', np.mean(ac_na, axis=0))

            # Logging stuff.
            # Only works for point mass.
            if env_name == 'PointMass-v0':
                np.save(os.path.join(dirname, '{}'.format(itr)), ob_no)
        ########################################################################
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        if n_iter - itr < 10:
            max_reward_path_idx = np.argmax(np.array([path["reward"].sum() for path in paths]))
            print(paths[max_reward_path_idx]['reward'])

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        ########################################################################
        logz.log_tabular("Unmodified Rewards Mean", np.mean(old_re_n))
        logz.log_tabular("Unmodified Rewards Std", np.mean(old_re_n))
        logz.log_tabular("Modified Rewards Mean", np.mean(re_n))
        logz.log_tabular("Modified Rewards Std", np.mean(re_n))
        if dm == 'ex2':
            logz.log_tabular("Log Likelihood Mean", np.mean(ll))
            logz.log_tabular("Log Likelihood Std", np.std(ll))
            logz.log_tabular("KL Divergence Mean", np.mean(kl))
            logz.log_tabular("KL Divergence Std", np.std(kl))
            logz.log_tabular("Negative ELBo", -elbo)
        ########################################################################
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #34
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             network_activation='tanh'
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    
    #activation function for the network
    if network_activation=='relu':
        activation=torch.nn.functional.relu
    elif network_activation=='leaky_relu':
        activation=torch.nn.functional.leaky_relu
    else:
        activation=torch.nn.functional.tanh
    #todo: create policy
    actor=build_mlp(ob_dim, ac_dim, "actor",\
                             n_layers=n_layers, size=size, activation=activation, discrete=discrete)
    actor_loss=reinforce_loss
    actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate)
    
    #todo: initialize Agent:
    
    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#
    if nn_baseline:
        critic=build_mlp(ob_dim,1,"nn_baseline",\
                                    n_layers=n_layers,size=size, discrete=discrete)
        critic_loss=nn.MSELoss()
        critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate)
        

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, log_probs = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                ob = torch.from_numpy(ob).float().unsqueeze(0)
                obs.append(ob)
                ac, log_prob = actor.run(ob)
                acs.append(ac)
                log_probs.append(log_prob)
                #format the action from policy
                if discrete:
                    ac = int(ac)
                else:
                    ac = ac.squeeze(0).numpy()
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : torch.cat(obs, 0),
                    "reward" : torch.Tensor(rewards),
                    "action" : torch.cat(acs, 0),
                    "log_prob" : torch.cat(log_probs, 0)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        ob_no = torch.cat([path["observation"] for path in paths], 0)
        ac_na = torch.cat([path["action"] for path in paths], 0)
                                   
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        q_n = []
        for path in paths:
            rewards = path['reward']
            num_steps = pathlength(path)
            R=[]
            if reward_to_go:
                for t in range(num_steps):
                    R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1))
                q_n.append(torch.cat(R))
            else:
                q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1))
        q_n = torch.cat(q_n, 0)
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#
        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = critic(ob_no)
            q_n_std = q_n.std()
            q_n_mean = q_n.mean()
            b_n_scaled = b_n * q_n_std + q_n_mean
            adv_n = (q_n - b_n_scaled).detach()
        else:
            adv_n = q_n
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item())
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item())
            critic_optimizer.zero_grad()
            c_loss = critic_loss(b_n, target)
            c_loss.backward()
            critic_optimizer.step()
            
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        log_probs = torch.cat([path["log_prob"] for path in paths], 0)
        actor_optimizer.zero_grad()
        loss = actor_loss(log_probs, adv_n, len(paths))
        print(loss)
        loss.backward()
        actor_optimizer.step()

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #35
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer
    sy_h2 = lrelu(dense(sy_h1, 16, "h2", weight_init=normc_initializer(1.0))) # hidden layer
    # Gaussian distribution (mean, stdev) for each action dimension for the
    # batch.
    sy_mean_na = dense(sy_h2, ac_dim, "mean", weight_init=normc_initializer(0.05))
    # Use the same stdev for all inputs.
    sy_logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer()) # log standard deviation (not variance), shared across the batch

    sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # batch of actions taken by the policy, used for policy gradient computation

    # Now, need to compute the logprob for each action taken.
    action_dist = tf.contrib.distributions.Normal(loc=sy_mean_na, scale=tf.exp(sy_logstd_a), validate_args=True)
    # sy_logprob_n is in [batch_size, ac_dim] shape.
    sy_logprob_n = action_dist.log_prob(sy_ac_na)

    # Now, sample an action based on the input. This should be a 1-D vector
    # with ac_dim floats in it.
    sy_sampled_ac = action_dist.sample()[0]

    # old mean/stdev before updating the policy. This is purely used for
    # computing KL
    sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], name='oldmean', dtype=tf.float32)
    sy_oldlogstd_a = tf.placeholder(shape=[ac_dim], name='oldlogstdev', dtype=tf.float32)
    old_action_dist = tf.contrib.distributions.Normal(loc=sy_oldmean_na, scale=tf.exp(sy_oldlogstd_a), validate_args=True)
    sy_kl = tf.reduce_mean(tf.contrib.distributions.kl_divergence(action_dist, old_action_dist))
    # Compute entropy
    sy_ent = tf.reduce_mean(action_dist.entropy())

    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)  # advantage function estimate

    # We apply tf.reduce_mean to sy_logprob_n here because its shape is [batch_size,
    # ac_dim]. Handling a general ac_dim would need more thought, but Pendulum's
    # ac_dim is 1, so using reduce_mean here is fine.
    sy_surr = - tf.reduce_mean(sy_adv_n * tf.reduce_mean(sy_logprob_n, 1)) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101

    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, session=sess, **vf_params)

    initial_ob = env.reset()

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {"observation" : np.array(obs), "terminated" : terminated,
                    "reward" : np.array(rewards), "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldmean_na, oldlogstd_a = sess.run([update_op, sy_mean_na, sy_logstd_a], feed_dict={sy_ob_no:ob_no, sy_ac_na:ac_na, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldmean_na: oldmean_na, sy_oldlogstd_a: oldlogstd_a})

        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s'%stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s'%stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
Example #36
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #todo: create Agent
    
    #todo: initialize Agent:
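    # A hedged sketch of one way to fill in the two TODOs above, mirroring the pattern used in
    # the earlier PyTorch example in this document (build_mlp, reinforce_loss and finish_episode
    # are assumed to be defined elsewhere in this file; this is illustrative, not the author's
    # reference solution).
    actor = build_mlp(ob_dim, ac_dim, "actor", n_layers=n_layers, size=size, discrete=discrete)
    actor_optimizer = torch.optim.Adam(actor.parameters(), lr=learning_rate)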

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                print("need to type-check action here:(two lines)")
                print(ac)
                print(ac.size())
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            #One episode finishes; perform update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None, )
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch



        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #37
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = TODO
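    # A minimal sketch of one common choice, mirroring the Pendulum example earlier in this
    # document: the advantage estimate is a 1-D float placeholder with one entry per timestep.
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)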


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = TODO
        sy_sampled_ac = TODO # Hint: Use the tf.multinomial op
        sy_logprob_n = TODO

    else:
        # YOUR_CODE_HERE
        sy_mean = TODO
        sy_logstd = TODO # logstd should just be a trainable variable, not a network output.
        sy_sampled_ac = TODO
        sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian. 
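    # A hedged sketch of one possible completion of the ops above (illustrative only, not the
    # assignment's reference solution). The scope name "policy" is a placeholder; build_mlp is
    # the helper referenced in the hint and is assumed to return the raw network output.
    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        # tf.multinomial samples one action index per row of logits; squeeze to shape [None]
        sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
        # log pi(a|s) for the actions actually taken
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
    else:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        sy_logstd = tf.get_variable("logstd", [ac_dim], initializer=tf.zeros_initializer())
        # reparameterization trick: mu + sigma * z, with z ~ N(0, I)
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean))
        # log density of a diagonal Gaussian, dropping the constant 0.5*ac_dim*log(2*pi) term
        sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd)
        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) - tf.reduce_sum(sy_logstd)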



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO # Loss function that we'll differentiate to get the policy gradient.
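    # Hedged sketch: the standard REINFORCE surrogate, as in the Pendulum example above
    # (illustrative completion of the TODO on the previous line).
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)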
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_update_op = TODO
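        # A hedged sketch of one possible baseline setup. The placeholder name sy_target_n is
        # introduced here purely for illustration; it is not part of the original assignment.
        sy_target_n = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
        baseline_loss = tf.reduce_mean(tf.square(baseline_prediction - sy_target_n))
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)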


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = TODO
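        # A hedged sketch of one possible way to fill in q_n (illustrative, not necessarily the
        # assignment's reference solution). Assumes each path["reward"] is a 1-D numpy array.
        q_n_list = []
        for path in paths:
            rews = path["reward"]
            T = len(rews)
            if reward_to_go:
                q = np.zeros(T)
                running = 0.0
                for t in reversed(range(T)):
                    running = rews[t] + gamma * running   # Q_t = sum_{t'>=t} gamma^(t'-t) r_{t'}
                    q[t] = running
            else:
                ret = np.sum((gamma ** np.arange(T)) * rews)  # Ret(tau), repeated for every timestep
                q = np.full(T, ret)
            q_n_list.append(q)
        q_n = np.concatenate(q_n_list)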

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = TODO
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
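        # A hedged sketch of the baseline branch above (Hint #bl1): predict with the network,
        # rescale the prediction to the Q-value statistics, and subtract to form advantages.
        # Illustrative completion of the TODO above.
        if nn_baseline:
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = b_n * (q_n.std() + 1e-8) + q_n.mean()
            adv_n = q_n - b_n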

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            pass
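            # Hedged sketch (the same standardization trick used in the earlier PyTorch example):
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)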


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            pass
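            # A hedged sketch, assuming the sy_target_n placeholder and baseline_update_op from
            # the baseline sketch above: fit the baseline to mean-zero / unit-std targets (Hint #bl2).
            target_n = (q_n - q_n.mean()) / (q_n.std() + 1e-8)
            sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})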

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
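        # A hedged sketch of the policy update, mirroring the Pendulum example earlier in this
        # document (illustrative only):
        sess.run(update_op, feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})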


        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()