def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
    'critic_threshold': 17.5,
    'critic_suffices_required': 1,
    'critic_steps_start': 200,
    'critic_steps_end': 200,
    'actor_steps_start': 1000,
    'actor_steps_end': 1000,
    'batch_size': 256,
    'seed': 123456,
    'replay_fill_threshold': 1.,
    'random_exploration': True,
    'test_iterations': 30,
    'validation_epoch_mod': 3,
}

# configuring the environment
environment = gym.make('Humanoid-v3')
# environment._max_episode_steps = 600

# setting up the training components
agent = AWRAgent
actor = Actor()
critic = Critic()

# training and testing
Training.train((actor, critic), agent, environment, hyper_ps, save=True, debug_type=DebugType.NONE)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        if done:
            reward = self.eval_episode(reward)
        self.add_score(reward)

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def add_score(self, reward):
        self.score += reward
        if self.best_score < self.score:
            self.best_score = self.score

    def reset_score(self):
        self.score = 0

    def acceptable_episode(self):
        # print(self.task.sim.pose[:3] - self.task.target_pos)
        print(np.linalg.norm(self.task.sim.pose[:3] - self.task.target_pos))

    def eval_episode(self, episode_reward):
        x = self.task.sim.pose[0]
        y = self.task.sim.pose[1]
        z = self.task.sim.pose[2]

        if z <= 0:
            episode_reward -= 35
        elif z >= 145:
            episode_reward -= 25

        if (90 <= z <= 110) and (-20 <= x <= 20) and (-20 <= y <= 20):
            episode_reward += 40
        elif (65 <= z < 130) and (-50 <= x <= 50) and (-50 <= y <= 50):
            episode_reward += 50

        return episode_reward

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
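# The OUNoise process used above (mu=0, theta=0.15, sigma=0.2) is not part of this snippet.
# Below is a minimal sketch of a compatible Ornstein-Uhlenbeck process, assuming the constructor
# signature OUNoise(size, mu, theta, sigma) and the reset()/sample() calls seen above; the actual
# implementation in the original project may differ in detail.
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated noise for exploration."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the internal state by one OU step and return it."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state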
class DDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        np.random.seed(random_seed)  # set the numpy seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

        # add OU noise for exploration
        self.noise = OUNoise(action_size, scale=1.0, sigma=.1)

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward (for each agent)
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and only every LEARN_STEPS steps
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):  # sample N_UPDATES batches and perform N_UPDATES updates
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, states, epsilon=0.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # add noise (scaled by epsilon) for exploration
            actions += self.noise.noise() * epsilon
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        self.__update_critic_local(actions, dones, gamma, next_states, rewards, states)
        self.__update_actor_local(states)

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def __update_critic_local(self, actions, dones, gamma, next_states, rewards, states):
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

    def __update_actor_local(self, states):
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def network_summary(self):
        print('- Actor Summary (both local and target): ')
        self.actor_local.to(device).summary()
        print('- Critic Summary (both local and target): ')
        self.critic_local.to(device).summary()

    def save(self, checkpoint_actor_name='checkpoint_actor', checkpoint_critic_name='checkpoint_critic'):
        """Save the actor and critic network weights"""
        torch.save(self.actor_local.state_dict(), path_result_folder(f'{checkpoint_actor_name}.pth'))
        torch.save(self.critic_local.state_dict(), path_result_folder(f'{checkpoint_critic_name}.pth'))

    @staticmethod
    def load(env: UnityEnvironment, random_seed=0,
             checkpoint_actor_name='checkpoint_actor', checkpoint_critic_name='checkpoint_critic'):
        """Load the actor and critic network weights"""
        # get the default brain
        brain_name = env.brain_names[0]
        brain = env.brains[brain_name]
        env_info = env.reset(train_mode=True)[brain_name]
        state_size = len(env_info.vector_observations[0])
        action_size = brain.vector_action_space_size

        loaded_agent = DDPGAgent(state_size, action_size, random_seed)
        loaded_agent.actor_local.load_state_dict(torch.load(path_result_folder(f'{checkpoint_actor_name}.pth')))
        loaded_agent.critic_local.load_state_dict(torch.load(path_result_folder(f'{checkpoint_critic_name}.pth')))
        return loaded_agent
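# A hedged sketch of how this agent might be driven against the Unity ML-Agents environment
# implied by DDPGAgent.load (old BrainInfo-style API). The function name, episode count and
# epsilon schedule below are illustrative assumptions, not part of the original code.
import numpy as np


def run_training(env, agent, n_episodes=200, max_t=1000, epsilon=1.0, epsilon_decay=0.999):
    brain_name = env.brain_names[0]
    for episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        scores = np.zeros(len(env_info.agents))
        for t in range(max_t):
            actions = agent.act(states, epsilon=epsilon)                  # noisy actions from the local actor
            env_info = env.step(actions)[brain_name]                      # advance the simulation
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            agent.step(states, actions, rewards, next_states, dones, t)  # store and (possibly) learn
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        epsilon *= epsilon_decay
        print(f'Episode {episode}\tscore: {np.mean(scores):.2f}')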
def main():
    env = DialogEnvironment()
    experiment_name = args.logdir.split('/')[1]  # model name

    torch.manual_seed(args.seed)  # TODO

    actor = Actor(hidden_size=args.hidden_size, num_layers=args.num_layers, device='cuda',
                  input_size=args.input_size, output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size, num_layers=args.num_layers,
                    input_size=args.input_size, seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size, num_layers=args.num_layers,
                            input_size=args.input_size, seq_len=args.seq_len)

    actor.to(device), critic.to(device), discrim.to(device)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations
    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:  # TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        similarity_scores = []

        while steps < args.total_sample_size:
            scores = []
            similarity_scores = []
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0

            state = state[:args.seq_len, :]
            expert_action = expert_action[:args.seq_len, :]
            state = state.to(device)
            expert_action = expert_action.to(device)

            for _ in range(10000):
                steps += 1

                mu, std = actor(state.view(1, args.seq_len, args.input_size))
                action = get_action(mu.cpu(), std.cpu())[0]
                for i in range(5):
                    emb_sum = expert_action[i, :].sum().cpu().item()
                    if emb_sum == 0:
                        action[i:, :] = 0  # manual padding
                        break

                done = env.step(action)
                irl_reward = get_reward(discrim, state, action, args)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, torch.from_numpy(action).to(device), irl_reward, mask, expert_action])
                score += irl_reward
                similarity_score += get_cosine_sim(expert=expert_action, action=action.squeeze(), seq_len=5)

                if done:
                    break

            episodes += 1
            scores.append(score)
            similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
        similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()

        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter)
            writer.add_scalar('log/learner_acc', float(learner_acc), iter)
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc) / 2, iter)
            if args.suspend_accu_exp is not None:  # only check when a threshold is given
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False

        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        writer.add_scalar('log/score', float(score_avg), iter)
        writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0], iter)
        raw_action = get_raw_action(action)  # TODO
        writer.add_text('log/raw_action', raw_action, iter)
        writer.add_text('log/raw_expert_action', raw_expert_action, iter)

        if iter % 100 == 0:
            score_avg = int(score_avg)

            # Append the latest sample to the experiment log file
            with open(experiment_name + '.txt', 'a') as file_object:
                result_str = str(iter) + '|' + raw_state[0] + '|' + raw_action + '|' + raw_expert_action + '\n'
                file_object.write(result_str)

            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, experiment_name + '_ckpt_' + str(score_avg) + '.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)
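# get_action above is not shown in this snippet. A minimal sketch of the usual choice for a
# Gaussian policy head: sample from N(mu, std) and return the sample as a numpy array (the caller
# indexes [0] to drop the batch dimension). This is an assumption about the helper, not the
# original implementation.
import torch


def get_action(mu, std):
    action = torch.normal(mu, std)  # stochastic action for rollout collection
    return action.data.numpy()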
def train_ppo(actor, env, epoch_nb, rollout_per_epoch, rollout_len, train_step_per_epoch,
              init_log_std, model_save_interval, adr_test_prob, tensorboard_path):
    mpi_role = nodes.mpi_role
    proc_num = nodes.proc_num
    pnid = nodes.pnid

    import os
    import time

    from ppo import PPO
    from models.critic import Critic

    USE_ADR = hasattr(env, 'adr') and adr_test_prob > 1e-7

    if mpi_role == 'main':
        os.makedirs(tensorboard_path)

        critic = Critic(env)
        trainer = PPO(env, actor, critic, tensorboard_path, init_log_std=init_log_std)
        trainer.model_save_interval = model_save_interval

        start_time = time.time()

        for n in range(epoch_nb):
            # send the network weights
            # and get the latest rollouts
            msg = {
                pnid + "weights": warehouse.Entry(action="set", value=trainer.get_weights()),
                pnid + "adr": warehouse.Entry(action="get", value=None),
                pnid + "s": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "a": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "r": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "neglog": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "mask": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                "dumped": warehouse.Entry(action="get", value=None),
            }
            data = warehouse.send(msg)

            all_s = np.concatenate(data[pnid + "s"].value, axis=0)
            all_a = np.concatenate(data[pnid + "a"].value, axis=0)
            all_r = np.concatenate(data[pnid + "r"].value, axis=0)
            all_neglog = np.concatenate(data[pnid + "neglog"].value, axis=0)
            all_masks = np.concatenate(data[pnid + "mask"].value, axis=0)
            dumped_rollout_nb = data["dumped"].value

            if USE_ADR:
                env.adr.update(data[pnid + "adr"].value)
                env.adr.log()

            # update the network weights
            all_last_values, all_gae, all_new_value = trainer.calc_gae(all_s, all_r, all_masks)
            trainer.train_networks(n, all_s, all_a, all_r, all_neglog, all_masks,
                                   train_step_per_epoch, all_last_values, all_gae, all_new_value)

            # debug
            n_rollouts = all_s.shape[0]
            cur_rollout_len = all_s.shape[1]
            print("Epoch {} :".format(n), flush=True)
            # dumped_rollout_nb = "?"
            print("Loaded {} rollouts for training while dumping {}.".format(n_rollouts, dumped_rollout_nb), flush=True)

            dt = time.time() - start_time
            start_time = time.time()
            if dt > 0:
                print("fps : {}".format(n_rollouts * cur_rollout_len / dt), flush=True)
            print("mean_rew : {}".format(np.sum(all_r * all_masks) / np.sum(all_masks)), flush=True)

            if USE_ADR:
                env.adr.save()

    elif mpi_role == 'worker':
        trainer = PPO(env, actor, Critic(env), init_log_std=init_log_std)

        msg = {
            pnid + "weights": warehouse.Entry(action="get", value=None),
            pnid + "adr": warehouse.Entry(action="set", value={}),
            "proc_num": warehouse.Entry(action="get", value=None),
        }
        data = warehouse.send(msg)

        while proc_num >= data["proc_num"].value and not warehouse.is_work_done:
            test_adr = USE_ADR and np.random.random() < adr_test_prob
            env.test_adr = test_adr

            trainer.set_weights(data[pnid + "weights"].value)

            if test_adr:
                # simulate rollout
                all_s, all_a, all_r, all_neglog, all_mask = trainer.get_rollout(env.adr_rollout_len)
                msg = {
                    pnid + "adr": warehouse.Entry(action="update", value=env.adr.get_msg()),
                    pnid + "weights": warehouse.Entry(action="get", value=None),
                    "proc_num": warehouse.Entry(action="get", value=None),
                }
            else:
                # simulate rollout
                all_s, all_a, all_r, all_neglog, all_mask = trainer.get_rollout(rollout_len)

                # send rollout back to warehouse
                # and get network weights to update actor
                msg = {
                    pnid + "s": warehouse.Entry(action="add", value=all_s),
                    pnid + "a": warehouse.Entry(action="add", value=all_a),
                    pnid + "r": warehouse.Entry(action="add", value=all_r),
                    pnid + "neglog": warehouse.Entry(action="add", value=all_neglog),
                    pnid + "mask": warehouse.Entry(action="add", value=all_mask),
                    pnid + "weights": warehouse.Entry(action="get", value=None),
                    pnid + "adr": warehouse.Entry(action="get", value=None),
                    "proc_num": warehouse.Entry(action="get", value=None),
                }

            data = warehouse.send(msg)

            if USE_ADR:
                env.adr.update(data[pnid + "adr"].value)
class Agent():
    """ Interacts with and learns from the environment """

    def __init__(self, state_size, action_size, random_seed, actor_layers, critic_layers):
        """ Initialize an Agent object.

        Params
        ======
            state_size (int): size of the environment state
            action_size (int): size of the environment action
            random_seed (int): seed for the random number generators
            actor_layers (array[int]): array containing the size of each layer of the actor network
            critic_layers (array[int]): array containing the size of each layer of the critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        np.random.seed(random_seed)

        # Actor
        print(f'Agent running on {DEVICE}')
        self.actor_local = Actor(self.state_size, self.action_size, self.random_seed, *actor_layers).to(DEVICE)
        self.actor_target = Actor(self.state_size, self.action_size, self.random_seed, *actor_layers).to(DEVICE)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size, self.random_seed, *critic_layers).to(DEVICE)
        self.critic_target = Critic(self.state_size, self.action_size, self.random_seed, *critic_layers).to(DEVICE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise
        self.noise = OrsnteinUhlenbeck(self.action_size, self.random_seed)

        # Replay Buffer
        self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE, self.random_seed)

    def step(self, states, actions, rewards, next_states, dones, time_step):
        """ Save experience in replay memory, and use random sample from buffer to learn """
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn only if there are enough samples in memory
        if len(self.memory) > BATCH_SIZE and time_step % LEARN_STEPS == 0:
            for _ in range(N_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True, epsilon=1.0):
        """ Returns actions for given state as per current policy """
        state = torch.from_numpy(state).float().to(DEVICE)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # actions += self.noise.sample()
            actions += np.random.normal(0, .3) * epsilon
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update policy and value parameters using given batch of experience tuples

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Critic update
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # Actor update
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target weights
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)
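# The ReplayBuffer used by the PyTorch agents above is not shown. This is a minimal sketch
# matching the constructor ReplayBuffer(action_size, buffer_size, batch_size, seed) and the
# add()/sample()/len() calls seen in the snippets; DEVICE is assumed to be the same torch device
# constant used above, and the exact tensor layout may differ in the original project.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples uniform random minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(DEVICE)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(DEVICE)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(DEVICE)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(DEVICE)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(DEVICE)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)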
class DDPGAgent():
    def __init__(self, env, hp):
        self.env = env
        self.hp = hp
        self.critic = Critic(env.observation_space.shape[0], env.action_space.shape[0], hp)
        self.target_critic = Critic(env.observation_space.shape[0], env.action_space.shape[0], hp)
        self.actor = Actor(env.observation_space.shape[0], env.action_space.shape[0],
                           env.action_space.high[0], hp)
        self.target_actor = Actor(env.observation_space.shape[0], env.action_space.shape[0],
                                  env.action_space.high[0], hp)
        self.dataset = ReplayBuffer(self.hp['batch_size'], self.hp['max_buffer_size'])
        self.noise = OrnsteinUhlenbeckProcess(env.action_space.shape[0], sigma=self.hp['noise_sigma'])
        self.noise.reset_states()

    def take_action(self, state, greedy=False):
        state = Variable(torch.from_numpy(state)).float()
        action = self.actor.predict(state)
        if greedy:
            return action.detach().numpy()
        return action.detach().numpy() + (self.noise.sample() * self.env.action_space.high[0])

    def collect(self, n_episodes, max_episodes):
        state = self.env.reset()
        reward_list = []
        for _ in range(n_episodes):
            reward = 0
            for step in range(max_episodes):
                action = self.take_action(state, greedy=True)
                s_next, r, done, _ = self.env.step(action)
                state = s_next
                reward += r
                if done:
                    break
            reward_list.append(reward)
            state = self.env.reset()
        return np.mean(reward_list)

    def buffer_update(self, sample):
        self.dataset.add_sample(sample)

    def _critic_update(self, batch):
        s = batch[0]
        a = batch[1]
        r = batch[2]
        s_next = batch[3]
        done = batch[4]

        target_actions = self.target_actor.predict(s_next)
        Q_val = self.target_critic.predict(s_next, target_actions)
        y_target = r + done * (self.hp['gamma'] * Q_val)
        # y_target2 = r + self.hp['gamma'] * Q_val
        # print(y_target != y_target2, done)
        y_pred = self.critic.predict(s, a)
        self.critic.train(y_pred, y_target)

    def _actor_update(self, batch):
        s = batch[0]
        pred_a = self.actor.predict(s)
        loss = torch.mean(-self.critic.predict(s, pred_a))
        self.actor.train(loss)

    def update(self):
        if self.dataset.length < self.hp['batch_size']:
            return
        batch = self.dataset.get_batch()
        self._critic_update(batch)
        self._actor_update(batch)
        self._target_update(self.hp['tau'], self.target_critic, self.critic)
        self._target_update(self.hp['tau'], self.target_actor, self.actor)

    def _target_update(self, tau, target_network, network):
        for target_param, param in zip(target_network.parameters(), network.parameters()):
            target_param.data.copy_(tau * param.data + target_param.data * (1.0 - tau))

    def save_models(self, episode):
        torch.save(self.target_actor.state_dict(), './trained_models/' + str(episode) + 'actor.pt')
        torch.save(self.target_critic.state_dict(), './trained_models/' + str(episode) + 'critic.pt')
        print('Models Saved!')

    def load_models(self, path):
        self.actor.load_state_dict(torch.load(path + 'actor.pt'))
        self.critic.load_state_dict(torch.load(path + 'critic.pt'))
        self._target_update(1, self.target_actor, self.actor)
        self._target_update(1, self.target_critic, self.critic)
        print('Models Loaded!')
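# A hedged sketch of the interaction loop this last agent appears to expect: act, store a
# transition via buffer_update, call update() every step, and use collect() as a greedy
# evaluation pass. The (s, a, r, s_next, flag) tuple layout and the convention that the stored
# flag multiplies the bootstrap term (i.e. 0 at terminal states) are assumptions inferred from
# _critic_update, not facts about the original code.
def run_ddpg(agent, env, n_episodes=100, max_steps=1000, eval_every=10):
    for episode in range(n_episodes):
        state = env.reset()
        agent.noise.reset_states()
        for step in range(max_steps):
            action = agent.take_action(state)
            next_state, reward, done, _ = env.step(action)
            not_done = 0.0 if done else 1.0                      # multiplies the bootstrap term
            agent.buffer_update((state, action, reward, next_state, not_done))
            agent.update()                                       # no-op until the buffer holds a full batch
            state = next_state
            if done:
                break
        if episode % eval_every == 0:
            print(f'episode {episode}: greedy return {agent.collect(3, max_steps):.2f}')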