Example #1
class GESRL:
    def __init__(self):

        self.seed = args.seed
        np.random.seed(self.seed)

        # Init gym env and set the env seed
        self.env = gym.make(args.env)
        self.env.seed(self.seed)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])

        # Init parameters
        self._init_parameters()

        # Init the filter, which normalizes input states using their running mean and std
        self.filter = Filter(self.state_dim)
        # Init the policy; a linear policy is used here
        self.policy = Policy(self.state_dim, self.action_dim, args)
        # Init the noise generator, guided by the DDPG surrogate gradients (sketched after this class)
        self.noise = Noise(self.policy.w_policy.size, args)
        # Init the DDPG agent that provides the surrogate gradients
        self.ddpg = DDPG(self.state_dim, self.action_dim, self.max_action,
                         args)

    def _init_parameters(self):

        self.log_dir = args.dir_path
        # The max steps per episode
        self.max_ep_len = args.max_ep_len
        self.epochs = args.epochs
        self.save_freq = args.save_freq
        self.start_epoch = args.start_epoch
        self.rl_train_steps = args.rl_train_steps
        self.pop_size = args.pop_size
        self.elite_size = args.elite_size
        # subspace dimension
        self.k = args.k

    def evaluate(self, eval=False):
        state, done, ep_reward, ep_len = self.env.reset(), False, 0.0, 0
        while not done and ep_len < self.max_ep_len:
            self.filter.push(state)
            state = self.filter(state)
            action = self.policy(state)
            next_state, reward, done, _ = self.env.step(action)
            if not eval:
                # Treat a time-limit cutoff as non-terminal so that DDPG can
                # still bootstrap across it, then store the transition.
                done = False if ep_len + 1 == self.max_ep_len else done
                self.ddpg.replay_buffer.store(
                    (state, next_state, action, reward, done))
            ep_reward += reward
            ep_len += 1
            state = next_state
        return ep_reward, ep_len

    def train(self):

        for epoch in range(self.epochs):
            surr_grads = []
            ddpg_grads = 0
            if epoch >= self.start_epoch:
                # Sync the DDPG actor and its target network with the current
                # linear policy weights before the off-policy training steps.
                self.ddpg.actor.set_params(self.policy.w_policy)
                self.ddpg.actor_t.set_params(self.policy.w_policy)

                for step in range(self.rl_train_steps):
                    grad = self.ddpg.train()
                    ddpg_grads += grad
                    # Keep the last k gradients as the surrogate gradients that
                    # span the guiding subspace of the noise generator.
                    if step >= self.rl_train_steps - self.k:
                        surr_grads.append(grad.flatten())

                # Apply the averaged DDPG gradient to the policy weights.
                self.policy.update_by_ddpg(ddpg_grads / self.rl_train_steps)
                # if epoch % 50 == 0:
                #     self.ddpg.replay_buffer.buffer_flush()
                # self.policy.w_policy = self.ddpg.actor.get_params()

                # Update the noise distribution with the surrogate-gradient
                # subspace (a policy_size x k matrix, one gradient per column).
                self.noise.update(np.array(surr_grads).T)

            # One flattened perturbation of size policy_size per population member.
            epsilons = self.noise.sample(self.pop_size)

            pos_rewards, neg_rewards = [], []
            policy_weights = self.policy.w_policy  # action_dim x state_dim
            # Generate 2 * pop_size perturbed policies (antithetic pairs) and roll them out.
            for epsilon in epsilons:
                self.policy.w_policy = policy_weights + epsilon.reshape(
                    self.policy.w_policy.shape)
                pos_reward, pos_len = self.evaluate()
                pos_rewards.append(pos_reward)

                self.policy.w_policy = policy_weights - epsilon.reshape(
                    self.policy.w_policy.shape)
                neg_reward, neg_len = self.evaluate()
                neg_rewards.append(neg_reward)
            self.policy.w_policy = policy_weights

            std_rewards = np.array(pos_rewards + neg_rewards).std()

            if self.elite_size != 0:
                # Rank directions by the better of their two rollouts and keep
                # only the top elite_size perturbations for the update.
                scores = {
                    k: max(pos_reward, neg_reward)
                    for k, (pos_reward, neg_reward) in enumerate(
                        zip(pos_rewards, neg_rewards))
                }
                elite_idx = sorted(scores, key=scores.get,
                                   reverse=True)[:self.elite_size]
                elite_pos_rewards = [pos_rewards[k] for k in elite_idx]
                elite_neg_rewards = [neg_rewards[k] for k in elite_idx]
                elite_epsilons = [epsilons[k] for k in elite_idx]
                self.policy.update_by_ges(elite_pos_rewards, elite_neg_rewards,
                                          elite_epsilons, std_rewards)
            else:
                self.policy.update_by_ges(pos_rewards, neg_rewards, epsilons,
                                          std_rewards)

            # Save the policy and log training / evaluation statistics.
            if epoch % self.save_freq == 0:
                train_rewards = np.array(pos_rewards + neg_rewards)
                test_rewards = []
                for _ in range(10):
                    reward, _ = self.evaluate()
                    test_rewards.append(reward)
                test_rewards = np.array(test_rewards)

                np.savez(self.log_dir + '/policy_weights',
                         self.policy.w_policy)
                logz.log_tabular("Epoch", epoch)
                logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
                logz.log_tabular("StdTrainRewards", np.std(train_rewards))
                logz.log_tabular("MaxTrainRewardRollout",
                                 np.max(train_rewards))
                logz.log_tabular("MinTrainRewardRollout",
                                 np.min(train_rewards))
                logz.log_tabular("AverageTestReward", np.mean(test_rewards))
                logz.log_tabular("StdTestRewards", np.std(test_rewards))
                logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
                logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
                logz.dump_tabular()
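The Noise class used by GESRL is not included in this excerpt. Based only on how it is called above (update() receives a policy_size x k matrix of recent DDPG gradients, sample(pop_size) must return one flattened perturbation per population member), a minimal guided-ES style sketch could look like the following. The class name, the sigma and alpha hyperparameters, and the QR-based subspace construction are illustrative assumptions, not the repository's actual implementation.

import numpy as np

class GuidedNoiseSketch:
    """Hypothetical guided-ES noise generator (not the repo's Noise class).

    Perturbations are drawn from a Gaussian whose covariance mixes an
    isotropic full-space term with a low-dimensional subspace spanned by
    the most recent surrogate (DDPG) gradients:
        Cov = sigma^2 * (alpha / n * I + (1 - alpha) / k * U U^T)
    """

    def __init__(self, policy_size, sigma=0.02, alpha=0.5):
        self.n = policy_size
        self.sigma = sigma
        self.alpha = alpha
        self.U = None  # orthonormal basis of the guiding subspace, shape (n, k)

    def update(self, surr_grads):
        # surr_grads: (policy_size, k) matrix, one recent gradient per column.
        # A reduced QR factorization yields an orthonormal basis of their span.
        self.U, _ = np.linalg.qr(surr_grads)

    def sample(self, pop_size):
        # Isotropic full-space component.
        eps = np.sqrt(self.alpha / self.n) * np.random.randn(pop_size, self.n)
        if self.U is not None:
            # Low-dimensional component along the guiding subspace.
            coeff = np.random.randn(pop_size, self.U.shape[1])
            eps += np.sqrt((1.0 - self.alpha) / self.U.shape[1]) * coeff @ self.U.T
        return self.sigma * eps  # shape: (pop_size, policy_size)

Iterating over the returned array then yields exactly the pop_size flattened perturbations consumed by the antithetic rollout loop in train() above.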
Example #2
File: ars.py  Project: marsXyr/GESRL
class ARS:
    def __init__(self):

        self.seed = args.seed
        np.random.seed(self.seed)

        # Init gym env and set the env seed
        self.env = gym.make(args.env)
        self.env.seed(self.seed)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Init parameters
        self._init_parameters()

        # Init the filter, which normalizes input states using their running mean and std (sketched after this class)
        self.filter = Filter(self.state_dim)
        # Init the policy; a linear policy is used here
        self.policy = Policy(self.state_dim, self.action_dim, args)
        # Init the noise generator
        self.noise = Noise(self.policy.w_policy.shape)

    def _init_parameters(self):

        self.log_dir = args.dir_path
        # The max steps per episode
        self.max_ep_len = args.max_ep_len
        self.epochs = args.epochs
        self.save_freq = args.save_freq
        self.pop_size = args.pop_size
        self.elite_size = args.elite_size
        self.noise_std = args.noise_std

    def evaluate(self):
        state, done, ep_reward, ep_len = self.env.reset(), False, 0.0, 0
        while not done and ep_len < self.max_ep_len:
            self.filter.push(state)
            state = self.filter(state)
            action = self.policy(state)
            state, reward, done, _ = self.env.step(action)
            ep_reward += reward
            ep_len += 1
        return ep_reward, ep_len

    def train(self):

        for epoch in range(self.epochs):
            # Sample noises from the noise generator.
            epsilons = self.noise.sample(self.pop_size)

            pos_rewards, neg_rewards = [], []
            policy_weights = self.policy.w_policy
            # Generate 2 * pop_size policies and rollouts.
            for epsilon in epsilons:
                self.policy.w_policy = policy_weights + self.noise_std * epsilon
                pos_reward, pos_len = self.evaluate()
                pos_rewards.append(pos_reward)

                self.policy.w_policy = policy_weights - self.noise_std * epsilon
                neg_reward, neg_len = self.evaluate()
                neg_rewards.append(neg_reward)
            self.policy.w_policy = policy_weights

            std_rewards = np.array(pos_rewards + neg_rewards).std()

            # ARS update
            if self.elite_size != 0:
                # Rank directions by the better of their two rollouts and keep
                # only the top elite_size perturbations for the update.
                scores = {
                    k: max(pos_reward, neg_reward)
                    for k, (pos_reward, neg_reward) in enumerate(
                        zip(pos_rewards, neg_rewards))
                }
                elite_idx = sorted(scores, key=scores.get,
                                   reverse=True)[:self.elite_size]
                elite_pos_rewards = [pos_rewards[k] for k in elite_idx]
                elite_neg_rewards = [neg_rewards[k] for k in elite_idx]
                elite_epsilons = [epsilons[k] for k in elite_idx]
                self.policy.update(elite_pos_rewards, elite_neg_rewards,
                                   elite_epsilons, std_rewards)
            else:
                self.policy.update(pos_rewards, neg_rewards, epsilons,
                                   std_rewards)

            # Save policy and log the information
            if epoch % self.save_freq == 0:
                train_rewards = np.array(pos_rewards + neg_rewards)
                test_rewards = []
                for _ in range(10):
                    reward, _ = self.evaluate()
                    test_rewards.append(reward)
                test_rewards = np.array(test_rewards)

                np.savez(self.log_dir + '/policy_weights',
                         self.policy.w_policy)
                logz.log_tabular("Epoch", epoch)
                logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
                logz.log_tabular("StdTrainRewards", np.std(train_rewards))
                logz.log_tabular("MaxTrainRewardRollout",
                                 np.max(train_rewards))
                logz.log_tabular("MinTrainRewardRollout",
                                 np.min(train_rewards))
                logz.log_tabular("AverageTestReward", np.mean(test_rewards))
                logz.log_tabular("StdTestRewards", np.std(test_rewards))
                logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
                logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
                logz.dump_tabular()
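The Filter and Policy classes are likewise not shown in these excerpts. Assuming Filter keeps running mean/std statistics of visited states (as the initialization comment suggests) and that Policy.update implements the standard ARS step, a minimal sketch might look like this; the class names, the learning rate, and the small epsilon added to the denominators are assumptions for illustration, not the repository's code.

import numpy as np

class FilterSketch:
    """Hypothetical running-statistics state filter (not the repo's Filter)."""

    def __init__(self, state_dim):
        self.count = 0
        self.mean = np.zeros(state_dim)
        self.m2 = np.zeros(state_dim)  # sum of squared deviations (Welford)

    def push(self, state):
        # Update the running mean and variance with one observation.
        self.count += 1
        delta = state - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (state - self.mean)

    def __call__(self, state):
        # Normalize a state with the statistics gathered so far.
        std = np.sqrt(self.m2 / max(self.count - 1, 1))
        return (state - self.mean) / (std + 1e-8)


class LinearPolicySketch:
    """Hypothetical linear policy with an ARS-style update."""

    def __init__(self, state_dim, action_dim, lr=0.02):
        self.w_policy = np.zeros((action_dim, state_dim))
        self.lr = lr

    def __call__(self, state):
        return self.w_policy @ state

    def update(self, pos_rewards, neg_rewards, epsilons, std_rewards):
        # ARS step: move along the perturbations, weighted by the difference
        # of their antithetic rollout returns and scaled by the reward std.
        step = np.zeros_like(self.w_policy)
        for r_pos, r_neg, eps in zip(pos_rewards, neg_rewards, epsilons):
            step += (r_pos - r_neg) * eps
        n_dirs = len(pos_rewards)
        self.w_policy += self.lr / (n_dirs * std_rewards + 1e-8) * step

With this shape convention (one epsilon per direction, matching w_policy), the rollout loop in ARS.train() above restores the unperturbed weights and then applies a single aggregated step per epoch.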