def train_ea(n_episodes=1, debug=False, gen_index=0, render=False):
    """ Train EA process """

    batch_steps = 0
    actor = Actor(nb_states, nb_actions)
    actors_params = ea.ask()
    fitness = []

    # evaluate all actors
    for actor_params in actors_params:

        actor.set_params(actor_params)
        f, steps = evaluate(actor, n_episodes=n_episodes,
                            noise=False, render=render, training=False)
        batch_steps += steps
        fitness.append(f)

        # print scores
        if debug:
            prLightPurple('Generation#{}: EA actor fitness:{}'.format(
                gen_index, f))

    # update ea
    ea.tell(fitness)

    return batch_steps
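# Note: evaluate() is called above and in test() below but is not defined in
# this section. The sketch that follows only illustrates what it is assumed to
# do given the call sites: run `n_episodes` rollouts of `actor` in the global
# `env` and return the mean episode return together with the number of
# environment steps. The rollout details are assumptions, not the original
# implementation; noise injection and replay-memory filling (the `noise` and
# `training` flags) are omitted here.
def evaluate_sketch(actor, n_episodes=1, noise=False, render=False, training=False):
    returns, steps = [], 0
    for _ in range(n_episodes):
        obs = env.reset()
        done, ep_return = False, 0.
        while not done:
            # deterministic action from the actor network
            action = to_numpy(actor(to_tensor(np.array([obs])))).squeeze(0)
            action = np.clip(action, -1., 1.)
            obs, reward, done, _ = env.step(action)
            ep_return += reward
            steps += 1
            if render:
                env.render()
        returns.append(ep_return)
    return np.mean(returns), steps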
def test(n_test, filename, debug=False, render=False):
    """ Test an agent """

    # load weights
    actor = Actor(nb_states, nb_actions)
    actor.load_model(filename)

    # evaluate
    f, _ = evaluate(actor, n_episodes=n_test, noise=False, render=render)

    # print scores
    if debug:
        prLightPurple('Average fitness:{}'.format(f))
np.random.seed(args.seed)
env.seed(args.seed)

# replay buffer
memory = Memory(args.mem_size)

# DDPG agent
agent = DDPG(nb_states, nb_actions, memory, args)

# EA process
ea = GA(agent.get_actor_size(), pop_size=args.pop_size, mut_amp=args.mut_amp,
        mut_rate=args.mut_rate, elite_frac=args.elite_frac,
        generator=lambda: Actor(nb_states, nb_actions).get_params())

# Trying ES type algorithms, but without much success
# ea = OpenES(agent.get_actor_size(), pop_size=args.pop_size, mut_amp=args.mut_amp,
#             generator=lambda: Actor(nb_states, nb_actions).get_params())

if args.mode == 'train':
    train(n_gen=args.n_gen, n_episodes=args.n_episodes, omega=args.omega,
          output=args.output, debug=args.debug, render=args.render)

elif args.mode == 'test':
    test(args.n_test, args.filename, render=args.render, debug=args.debug)
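# The `args` namespace used throughout this section is assumed to come from
# argparse. The sketch below lists only the flags actually referenced here;
# the default values are placeholders, not the original hyper-parameters.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--mode', default='train', choices=['train', 'test'])
parser.add_argument('--seed', type=int, default=-1)
parser.add_argument('--mem_size', type=int, default=1000000)  # replay buffer size
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--actor_lr', type=float, default=1e-4)
parser.add_argument('--critic_lr', type=float, default=1e-3)
parser.add_argument('--discount', type=float, default=0.99)
parser.add_argument('--tau', type=float, default=1e-3)        # soft target update rate
parser.add_argument('--ou_theta', type=float, default=0.15)   # Ornstein-Uhlenbeck noise
parser.add_argument('--ou_mu', type=float, default=0.0)
parser.add_argument('--ou_sigma', type=float, default=0.2)
parser.add_argument('--pop_size', type=int, default=10)       # EA population size
parser.add_argument('--elite_frac', type=float, default=0.1)
parser.add_argument('--mut_rate', type=float, default=0.9)
parser.add_argument('--mut_amp', type=float, default=0.1)
parser.add_argument('--n_gen', type=int, default=100)
parser.add_argument('--n_episodes', type=int, default=1)
parser.add_argument('--n_test', type=int, default=1)
parser.add_argument('--omega', type=float, default=0.5)
parser.add_argument('--output', default='output')
parser.add_argument('--filename', default=None)
parser.add_argument('--debug', action='store_true')
parser.add_argument('--render', action='store_true')
args = parser.parse_args()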
class DDPG(object):

    def __init__(self, nb_states, nb_actions, memory, args):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        # Create Actor and Critic Network
        self.actor = Actor(self.nb_states, self.nb_actions)
        self.actor_target = Actor(self.nb_states, self.nb_actions)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions)
        self.critic_target = Critic(self.nb_states, self.nb_actions)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        # scale the actor parameters
        self.actor.scale_params(0.1)

        # Make sure the targets start with the same weights
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = memory
        self.random_process = OrnsteinUhlenbeckProcess(
            nb_actions, theta=args.ou_theta, mu=args.ou_mu, sigma=args.ou_sigma)

        # hyper-parameters
        self.reward_scale = 1.
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # exploration state used by select_action
        # (not shown in the original snippet; default values assumed)
        self.epsilon = 1.0
        self.is_training = True

        if USE_CUDA:
            self.cuda()

    def train(self):

        # Sample batch
        batch = self.memory.sample(self.batch_size)
        state_batch = to_tensor(batch.states).view(-1, self.nb_states)
        action_batch = to_tensor(batch.actions).view(-1, self.nb_actions)
        reward_batch = to_tensor(batch.rewards).view(-1, 1)
        next_state_batch = to_tensor(batch.next_states).view(-1, self.nb_states)
        done_batch = to_tensor(batch.dones).view(-1, 1)

        # Prepare for the target q batch
        next_q_values = self.critic_target(
            [next_state_batch, self.actor_target(next_state_batch)]).detach()
        target_q_batch = self.reward_scale * reward_batch + \
            self.discount * (1. - done_batch) * next_q_values

        # Critic update
        # (criterion is assumed to be an MSE loss defined at module level)
        self.critic_optim.zero_grad()
        q_batch = self.critic([state_batch, action_batch])
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()
        policy_loss = -1. * self.critic([state_batch, self.actor(state_batch)])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def get_actor_size(self):
        return np.shape(self.get_actor_params())[0]

    def get_actor(self):
        return deepcopy(self.actor)

    def set_actor(self, actor):
        self.actor = actor

    def get_critic(self):
        return deepcopy(self.critic)

    def get_actor_params(self):
        return self.actor.get_params()

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        return action

    def select_action(self, s_t, noise=True):
        """ Returns action after seeing state """
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        if noise:
            action += self.is_training * \
                max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)
        return action

    def reset(self):
        self.random_process.reset_states()

    def load_model(self, filename):
        self.actor.load_model(filename)
        self.critic.load_model(filename)

    def save_model(self, output):
        self.actor.save_model(output)
        self.critic.save_model(output)

    def seed(self, s):
        torch.manual_seed(s)
        if USE_CUDA:
            torch.cuda.manual_seed(s)
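# hard_update, soft_update, to_tensor and to_numpy are used by the DDPG class
# but not defined in this section. A minimal sketch, assuming the usual
# Polyak-averaging and tensor-conversion helpers found in most PyTorch DDPG
# implementations; they are not necessarily the original definitions.
import numpy as np
import torch


def hard_update(target, source):
    # copy source weights into the target network verbatim
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau)


def to_tensor(ndarray, requires_grad=False):
    # numpy -> float32 torch tensor, moved to GPU when USE_CUDA is set
    t = torch.from_numpy(np.asarray(ndarray, dtype=np.float32))
    t.requires_grad_(requires_grad)
    return t.cuda() if USE_CUDA else t


def to_numpy(var):
    # torch tensor -> numpy array on the CPU
    return var.cpu().data.numpy() if USE_CUDA else var.data.numpy()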