class GESRL:
    """Guided ES agent: a linear policy trained with antithetic random search,
    where the search distribution is guided by surrogate gradients from DDPG."""

    def __init__(self):
        self.seed = args.seed
        np.random.seed(self.seed)
        # Init gym env and set the env seed
        self.env = gym.make(args.env)
        self.env.seed(self.seed)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = int(self.env.action_space.high[0])
        # Init parameters
        self._init_parameters()
        self.filter = Filter(self.state_dim)
        self.policy = Policy(self.state_dim, self.action_dim, args)
        self.noise = Noise(self.policy.w_policy.size, args)
        self.ddpg = DDPG(self.state_dim, self.action_dim, self.max_action, args)

    def _init_parameters(self):
        self.log_dir = args.dir_path
        # The max steps per episode
        self.max_ep_len = args.max_ep_len
        self.epochs = args.epochs
        self.save_freq = args.save_freq
        self.start_epoch = args.start_epoch
        self.rl_train_steps = args.rl_train_steps
        self.pop_size = args.pop_size
        self.elite_size = args.elite_size
        # Subspace dimension
        self.k = args.k

    def evaluate(self, eval=False):
        state, done, ep_reward, ep_len = self.env.reset(), False, 0.0, 0
        while not done and ep_len < self.max_ep_len:
            self.filter.push(state)
            state = self.filter(state)
            action = self.policy(state)
            next_state, reward, done, _ = self.env.step(action)
            if not eval:
                # Do not treat a time-limit cutoff as a true terminal state
                done = False if ep_len + 1 == self.max_ep_len else done
                self.ddpg.replay_buffer.store(
                    (state, next_state, action, reward, done))
            ep_reward += reward
            ep_len += 1
            state = next_state
        return ep_reward, ep_len

    def train(self):
        for epoch in range(self.epochs):
            surr_grads = []
            ddpg_grads = 0
            if epoch >= self.start_epoch:
                self.ddpg.actor.set_params(self.policy.w_policy)
                self.ddpg.actor_t.set_params(self.policy.w_policy)
                for step in range(self.rl_train_steps):
                    grad = self.ddpg.train()
                    ddpg_grads += grad
                    # Keep the last k gradients as surrogate gradients
                    if step >= self.rl_train_steps - self.k:
                        surr_grads.append(grad.flatten())
                self.policy.update_by_ddpg(ddpg_grads / self.rl_train_steps)
                # if epoch % 50 == 0:
                #     self.ddpg.replay_buffer.buffer_flush()
                # self.policy.w_policy = self.ddpg.actor.get_params()
                self.noise.update(np.array(surr_grads).T)

            # Sample noise from the guided noise generator
            epsilons = self.noise.sample(self.pop_size)  # pop_size x policy_size
            pos_rewards, neg_rewards = [], []
            policy_weights = self.policy.w_policy  # action_dim x state_dim
            # Antithetic rollouts: evaluate each perturbation in both directions
            for epsilon in epsilons:
                self.policy.w_policy = policy_weights + epsilon.reshape(
                    self.policy.w_policy.shape)
                pos_reward, pos_len = self.evaluate()
                pos_rewards.append(pos_reward)
                self.policy.w_policy = policy_weights - epsilon.reshape(
                    self.policy.w_policy.shape)
                neg_reward, neg_len = self.evaluate()
                neg_rewards.append(neg_reward)
            self.policy.w_policy = policy_weights
            std_rewards = np.array(pos_rewards + neg_rewards).std()

            # Guided ES update, optionally restricted to the elite perturbations
            if self.elite_size != 0:
                scores = {
                    k: max(pos_reward, neg_reward)
                    for k, (pos_reward, neg_reward) in enumerate(
                        zip(pos_rewards, neg_rewards))
                }
                sorted_scores = sorted(scores.keys(),
                                       key=lambda x: scores[x],
                                       reverse=True)[:self.elite_size]
                elite_pos_rewards = [pos_rewards[k] for k in sorted_scores]
                elite_neg_rewards = [neg_rewards[k] for k in sorted_scores]
                elite_epsilons = [epsilons[k] for k in sorted_scores]
                self.policy.update_by_ges(elite_pos_rewards, elite_neg_rewards,
                                          elite_epsilons, std_rewards)
            else:
                self.policy.update_by_ges(pos_rewards, neg_rewards, epsilons,
                                          std_rewards)

            # Save policy and log the information
            if epoch % self.save_freq == 0:
                train_rewards = np.array(pos_rewards + neg_rewards)
                test_rewards = []
                for _ in range(10):
                    reward, _ = self.evaluate()
                    test_rewards.append(reward)
                test_rewards = np.array(test_rewards)
                np.savez(self.log_dir + '/policy_weights', self.policy.w_policy)
                logz.log_tabular("Epoch", epoch)
                logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
                logz.log_tabular("StdTrainRewards", np.std(train_rewards))
                logz.log_tabular("MaxTrainRewardRollout", np.max(train_rewards))
                logz.log_tabular("MinTrainRewardRollout", np.min(train_rewards))
                logz.log_tabular("AverageTestReward", np.mean(test_rewards))
                logz.log_tabular("StdTestRewards", np.std(test_rewards))
                logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
                logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
                logz.dump_tabular()
class ARS:
    """Augmented Random Search: antithetic random perturbations of a linear
    policy, with optional elite selection (top-performing directions only)."""

    def __init__(self):
        self.seed = args.seed
        np.random.seed(self.seed)
        # Init gym env and set the env seed
        self.env = gym.make(args.env)
        self.env.seed(self.seed)
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        # Init parameters
        self._init_parameters()
        # Init filter; it normalizes input states by tracking their running mean and std
        self.filter = Filter(self.state_dim)
        # Init policy; we use a linear policy here
        self.policy = Policy(self.state_dim, self.action_dim, args)
        # Init the noise generator
        self.noise = Noise(self.policy.w_policy.shape)

    def _init_parameters(self):
        self.log_dir = args.dir_path
        # The max steps per episode
        self.max_ep_len = args.max_ep_len
        self.epochs = args.epochs
        self.save_freq = args.save_freq
        self.pop_size = args.pop_size
        self.elite_size = args.elite_size
        self.noise_std = args.noise_std

    def evaluate(self):
        state, done, ep_reward, ep_len = self.env.reset(), False, 0.0, 0
        while not done and ep_len < self.max_ep_len:
            self.filter.push(state)
            state = self.filter(state)
            action = self.policy(state)
            state, reward, done, _ = self.env.step(action)
            ep_reward += reward
            ep_len += 1
        return ep_reward, ep_len

    def train(self):
        for epoch in range(self.epochs):
            # Sample noise from the noise generator
            epsilons = self.noise.sample(self.pop_size)
            pos_rewards, neg_rewards = [], []
            policy_weights = self.policy.w_policy
            # Generate 2 * pop_size perturbed policies and roll them out
            for epsilon in epsilons:
                self.policy.w_policy = policy_weights + self.noise_std * epsilon
                pos_reward, pos_len = self.evaluate()
                pos_rewards.append(pos_reward)
                self.policy.w_policy = policy_weights - self.noise_std * epsilon
                neg_reward, neg_len = self.evaluate()
                neg_rewards.append(neg_reward)
            self.policy.w_policy = policy_weights
            std_rewards = np.array(pos_rewards + neg_rewards).std()

            # ARS update, optionally restricted to the elite perturbations
            if self.elite_size != 0:
                scores = {
                    k: max(pos_reward, neg_reward)
                    for k, (pos_reward, neg_reward) in enumerate(
                        zip(pos_rewards, neg_rewards))
                }
                sorted_scores = sorted(scores.keys(),
                                       key=lambda x: scores[x],
                                       reverse=True)[:self.elite_size]
                elite_pos_rewards = [pos_rewards[k] for k in sorted_scores]
                elite_neg_rewards = [neg_rewards[k] for k in sorted_scores]
                elite_epsilons = [epsilons[k] for k in sorted_scores]
                self.policy.update(elite_pos_rewards, elite_neg_rewards,
                                   elite_epsilons, std_rewards)
            else:
                self.policy.update(pos_rewards, neg_rewards, epsilons,
                                   std_rewards)

            # Save policy and log the information
            if epoch % self.save_freq == 0:
                train_rewards = np.array(pos_rewards + neg_rewards)
                test_rewards = []
                for _ in range(10):
                    reward, _ = self.evaluate()
                    test_rewards.append(reward)
                test_rewards = np.array(test_rewards)
                np.savez(self.log_dir + '/policy_weights', self.policy.w_policy)
                logz.log_tabular("Epoch", epoch)
                logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
                logz.log_tabular("StdTrainRewards", np.std(train_rewards))
                logz.log_tabular("MaxTrainRewardRollout", np.max(train_rewards))
                logz.log_tabular("MinTrainRewardRollout", np.min(train_rewards))
                logz.log_tabular("AverageTestReward", np.mean(test_rewards))
                logz.log_tabular("StdTestRewards", np.std(test_rewards))
                logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
                logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
                logz.dump_tabular()
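In both classes the actual parameter step is delegated to Policy.update / Policy.update_by_ges, which is not shown in this section. For orientation, the standard ARS step moves the weights along the average of the perturbation directions weighted by the reward difference of their antithetic rollouts, normalized by the reward standard deviation. The sketch below illustrates that rule; the function name ars_step and the step size lr are assumptions for illustration, not the actual Policy API.

import numpy as np

def ars_step(weights, pos_rewards, neg_rewards, epsilons, std_rewards, lr=0.02):
    """Illustrative ARS-style update (hypothetical, not the real Policy.update).

    weights:          current policy weights, shape action_dim x state_dim
    pos/neg_rewards:  returns of the +epsilon / -epsilon rollouts
    epsilons:         the perturbations used for those rollouts
    std_rewards:      std of all collected returns, used to scale the step
    """
    diffs = np.asarray(pos_rewards) - np.asarray(neg_rewards)
    eps = np.asarray([np.ravel(e) for e in epsilons])   # num_dirs x policy_size
    step = (diffs[:, None] * eps).sum(axis=0) / (len(diffs) * std_rewards)
    return weights + lr * step.reshape(weights.shape)

When elite_size is nonzero, the same rule is simply applied to the top directions ranked by max(pos_reward, neg_reward), which is what the elite-selection branches above compute before calling the update.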