Example #1
        )
        path_baseline = baseline.predict(path)
        advantages = []
        returns = []
        return_so_far = 0
        for t in range(len(rewards) - 1, -1, -1):
            return_so_far = rewards[t] + discount * return_so_far
            returns.append(return_so_far)
            advantage = return_so_far - path_baseline[t]
            advantages.append(advantage)
        # The advantages are stored backwards in time, so we need to reverse them
        advantages = np.array(advantages[::-1])
        # And we need to do the same thing for the list of returns
        returns = np.array(returns[::-1])

        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

        path["advantages"] = advantages
        path["returns"] = returns

        paths.append(path)

    baseline.fit(paths)

    observations = np.concatenate([p["observations"] for p in paths])
    actions = np.concatenate([p["actions"] for p in paths])
    advantages = np.concatenate([p["advantages"] for p in paths])

    f_train(observations, actions, advantages)
    print('Average Return:', np.mean([sum(p["rewards"]) for p in paths]))

# Imports used by run_task below
import numpy as np
import theano
import theano.tensor as TT
from lasagne.updates import adam

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.misc import logger
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy


def run_task(*_):
    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # Initialize a neural network policy with two hidden layers of 64 units each
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
    # Initialize a linear baseline estimator using default hand-crafted features
    baseline = LinearFeatureBaseline(env.spec)

    # Number of trajectories to collect per iteration
    N = 3
    # Maximum number of time steps per trajectory
    T = 400
    # Number of iterations
    n_itr = 1000
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.001

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1)
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)
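    # Minimizing `surr` corresponds to the usual REINFORCE policy-gradient estimate:
    #   grad_theta J(theta) ~= mean_t[ grad_theta log pi_theta(a_t | s_t) * A_t ],
    # where A_t are the baseline-subtracted (and later normalized) advantages.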

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True)

    for epoch in range(n_itr):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")
        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second is a dictionary whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])
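            # For example, rewards [1., 1., 1.] with discount 0.9 give returns
            # [1 + 0.9 * (1 + 0.9 * 1), 1 + 0.9 * 1, 1] = [2.71, 1.9, 1.0].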

            advantages = (advantages -
                          np.mean(advantages)) / (np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        f_train(observations, actions, advantages)
        returns_to_check = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(returns_to_check))

        logger.log("Training finished")
        logger.record_tabular('Epoch', epoch)
        logger.record_tabular('Steps', epoch * N * T)
        logger.record_tabular('AverageReturn', np.mean(returns_to_check))
        logger.record_tabular('StdReturn', np.std(returns_to_check))
        logger.record_tabular('MaxReturn', np.max(returns_to_check))
        logger.record_tabular('MinReturn', np.min(returns_to_check))
        logger.save_itr_params(epoch, params)
        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
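
The `run_task(*_)` signature above is the hook that rllab's experiment launcher calls. A minimal launch sketch, assuming the standard `rllab.misc.instrument.run_experiment_lite` entry point (the parameter values shown are illustrative, not taken from the example):

from rllab.misc.instrument import run_experiment_lite

run_experiment_lite(
    run_task,              # training function defined above
    n_parallel=1,          # number of sampler workers (illustrative)
    snapshot_mode="last",  # keep only the most recent saved parameters
    seed=1,
)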
Example #3
def doit(mode):
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie
    # within the range [-1, 1] (only works for environments with continuous actions)
    env = normalize(CartpoleEnv())
    # Initialize a neural network policy with a single hidden layer of 8 hidden units
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))
    # Initialize a linear baseline estimator using default hand-crafted features
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # use a zero baseline but subtract the mean of the discounted returns (see below)
        baseline = ZeroBaseline(env.spec)

    if "_ztrans" in mode:
        print('z transform advantages')
    else:
        print('no z transform')


    # Number of trajectories to collect per iteration
    N = 50
    # Maximum number of time steps per trajectory
    T = 50
    # Number of iterations
    n_itr = 50
    # Set the discount factor for the problem
    discount = 0.99
    # Learning rate for the gradient update
    learning_rate = 0.1

    # Construct the computation graph

    # Create a Theano variable for storing the observations
    # We could have simply written `observations_var = TT.matrix('observations')` instead for this example. However,
    # doing it in a slightly more abstract way allows us to delegate to the environment for handling the correct data
    # type for the variable. For instance, for an environment with discrete observations, we might want to use integer
    # types if the observations are represented as one-hot vectors.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # It should have 1 extra dimension since we want to represent a list of observations
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
    # distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under rllab.distributions. It contains many utilities for computing
    # distribution-related quantities, given the computed dist_info_vars. Below we use dist.log_likelihood_sym to compute
    # the symbolic log-likelihood. For this example, the corresponding distribution is an instance of the class
    # rllab.distributions.DiagonalGaussian
    dist = policy.distribution

    # Note that we negate the objective, since most optimizers assume a
    # minimization problem
    surr = - TT.mean(dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Get the list of trainable parameters.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair of values. The second is a dictionary whose values contain
                # sufficient statistics for the action distribution. It should at least contain entries that would be
                # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
                # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
                # not needed.
                action, _ = policy.get_action(observation)
                # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
                # case it is not needed.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached
                    break

            # We need to compute the empirical return for each time step along the
            # trajectory
            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantage = return_so_far - path_baseline[t]
                advantages.append(advantage)
            # The advantages are stored backwards in time, so we need to reverse them
            advantages = np.array(advantages[::-1])
            # And we need to do the same thing for the list of returns
            returns = np.array(returns[::-1])

            if "_ztrans" in mode:
                advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
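                # Standardizing to zero mean and unit variance keeps the gradient scale
                # comparable across iterations; the 1e-8 term guards against division by
                # zero when all advantages in a trajectory are identical.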


            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])


        if mode == 'batchavg':
            # in this case `advantages` up to here are just our good old returns, without baseline or z transformation.
            # now we subtract their mean across all episodes.
            advantages = advantages - np.mean(advantages)
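            # Subtracting a constant (the batch mean) is itself a valid baseline: it leaves
            # the policy-gradient estimator unbiased while reducing its variance.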


        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        print('Average Return:', avgr)
        results.append(avgr)
    return results
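
The `mode` string combines a baseline choice with an optional `_ztrans` suffix, exactly as parsed at the top of `doit()`. Some illustrative calls:

results = doit("linbaseline_ztrans")  # linear feature baseline, standardized advantages
results = doit("vanilla")             # zero baseline, raw discounted returns as advantages
results = doit("batchavg")            # zero baseline, returns centered by their batch mean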
Example #4
class LowSampler(Sampler):
    def __init__(self):
        """Sample processor for a pre-trained low-level policy loaded from the snapshot below."""
        env_low = normalize(AntEnv(ego_obs=True))
        # baseline_low = LinearFeatureBaseline(env_spec=env_low.spec)
        # low_policy = env.low_policy
        pkl_path = '/home/lsy/Desktop/rllab/data/local/Ant-snn1000/Ant-snn_10MI_5grid_6latCat_bil_0040/params.pkl'
        data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
        low_policy = data['policy']


        self.baseline = LinearFeatureBaseline(env_spec=env_low.spec)
        self.discount = 0.99
        self.gae_lambda = 1.0
        self.center_adv = True
        self.positive_adv = False
        self.policy = low_policy

    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if hasattr(self.baseline, "predict_n"):
            all_path_baselines = self.baseline.predict_n(paths)
        else:
            all_path_baselines = [self.baseline.predict(path) for path in paths]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
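            # Generalized advantage estimation (GAE):
            #   delta_t = r_t + discount * V(s_{t+1}) - V(s_t)
            #   A_t     = sum_l (discount * gae_lambda)^l * delta_{t+l}
            # With gae_lambda = 1.0 (set in __init__), A_t reduces to the empirical
            # discounted return minus the baseline prediction V(s_t).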
            path["returns"] = special.discount_cumsum(path["rewards"], self.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )
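        # Explained variance ~ 1 - Var(returns - baseline) / Var(returns): values near 1
        # mean the baseline tracks the empirical returns well; near 0, it is no better
        # than a constant predictor.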

        # Non-recurrent policy assumed: per-path tensors can simply be concatenated over time.
        observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

        if self.center_adv:
            advantages = util.center_advantages(advantages)

        if self.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])

        undiscounted_returns = [sum(path["rewards"]) for path in paths]

        ent = np.mean(self.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )


        logger.log("fitting baseline...")
        if hasattr(self.baseline, 'fit_with_samples'):
            self.baseline.fit_with_samples(paths, samples_data)
        else:
            self.baseline.fit(paths)
        logger.log("fitted")

        with logger.tabular_prefix('Low_'):
            logger.record_tabular('Iteration', itr)
            logger.record_tabular('AverageDiscountedReturn',
                                  average_discounted_return)
            logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
            logger.record_tabular('ExplainedVariance', ev)
            logger.record_tabular('NumTrajs', len(paths))
            logger.record_tabular('Entropy', ent)
            logger.record_tabular('Perplexity', np.exp(ent))
            logger.record_tabular('StdReturn', np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
            logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
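
`process_samples()` only relies on the per-path keys read above. A hypothetical minimal input path (the sizes below are illustrative, and `agent_infos` must hold whatever the policy's distribution expects, e.g. `mean` and `log_std` for a diagonal Gaussian):

T, obs_dim, act_dim = 100, 27, 8   # illustrative sizes, not taken from the example
path = dict(
    observations=np.zeros((T, obs_dim)),
    actions=np.zeros((T, act_dim)),
    rewards=np.zeros(T),
    env_infos={},
    agent_infos=dict(mean=np.zeros((T, act_dim)),
                     log_std=np.zeros((T, act_dim))),
)
# A list of such dicts is what process_samples(itr, paths) consumes.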
Example #5
                za.append(
                    t *
                    (y -
                     (path_baseline[yk] - discount * path_baseline[yk + 1])))
                t *= discount
            return_so_far = 0
            for t in range(len(x) - 1, -1, -1):
                return_so_far = x[t] + discount * return_so_far
                z_rew.append(return_so_far)
            z_rew = np.array(z_rew[::-1])
            temp.append(np.array(z))
            tempa.append(np.array(za))
            path["advantages"] = za
            path["returns"] = z_rew
            p_4b.append(path)
        baseline.fit(p_4b)
        d_rewards = tempa
        s_g = f_train(observations[0], actions[0], d_rewards[0])
        s_g_fv = [unpack(s_g)]
        for ob, ac, rw in zip(observations[1:], actions[1:], d_rewards[1:]):
            i_g = f_train(ob, ac, rw)
            s_g_fv.append(unpack(i_g))
            s_g = [sum(x) for x in zip(s_g, i_g)]
        s_g = [x / len(paths) for x in s_g]
        stp_snp = f_update[0](s_g[0], s_g[1], s_g[2], s_g[3], s_g[4], s_g[5],
                              s_g[6], s_g[7], s_g[8])

        #        print("step snapshot:", stp_snp)
        rewards_snapshot.append(np.array([sum(p["rewards"]) for p in paths]))
        avg_return[j - N:j] = np.repeat(
            np.mean([sum(p["rewards"]) for p in paths]), N)