def __init__(self, *args, agent=None, target_agent=None, **kwargs):
    self.agent = agent
    self.target_agent = target_agent
    # hard update: start the target network from the online network's weights
    self.hard_update(self.target_agent, self.agent)
    self.replay_buffer = ReplayBuffer(buffer_size=int(kwargs['buffer_size']),
                                      minibatch_size=kwargs['minibatch_size'],
                                      seed=kwargs['seed'],
                                      device=kwargs['device'])
    self.__minibatch = kwargs['minibatch_size']
    self.actor_optim = torch.optim.Adam(self.agent.get_actor_parameters(),
                                        lr=kwargs['learning_rate'])
    self.critic_optim = torch.optim.Adam(self.agent.get_critic_parameters(),
                                         lr=kwargs['learning_rate'])
    self.__discount = kwargs['discount']
    self.__epsilon = kwargs['epsilon']
    self.__tau = kwargs['tau']
def __init__(self, env, args):
    self.env = env
    self.memory_buffer = ReplayBuffer(args.buffer_size)
    self.learning_rate_actor = args.lr_actor
    self.learning_rate_critic = args.lr_critic
    self.tau = args.TAU
    self.batch_size = args.batch_size
    self.discount = args.discount

    # placeholders shared by the main and target networks
    self.states_ph = tf.placeholder(tf.float32, shape=(None, 1))
    self.actions_ph = tf.placeholder(tf.float32, shape=((None,) + self.env.action_space.shape))
    self.is_training_ph = tf.placeholder_with_default(True, shape=None)

    # main and target actor/critic networks
    self.Actor = ActorNetwork(env=self.env, states=self.states_ph,
                              LR=self.learning_rate_actor, TAU=self.tau,
                              discount=self.discount, scope="actor_main",
                              batch_size=self.batch_size,
                              is_training=self.is_training_ph)
    self.Critic = CriticNetwork(env=self.env, states=self.states_ph,
                                actions=self.actions_ph,
                                LR=self.learning_rate_critic, TAU=self.tau,
                                discount=self.discount, scope="critic_main",
                                batch_size=self.batch_size,
                                is_training=self.is_training_ph)
    self.Actor_target = ActorNetwork(env=self.env, states=self.states_ph,
                                     LR=self.learning_rate_actor, TAU=self.tau,
                                     discount=self.discount, scope="actor_target",
                                     batch_size=self.batch_size,
                                     is_training=self.is_training_ph)
    self.Critic_target = CriticNetwork(env=self.env, states=self.states_ph,
                                       actions=self.actions_ph,
                                       LR=self.learning_rate_critic, TAU=self.tau,
                                       discount=self.discount, scope="critic_target",
                                       batch_size=self.batch_size,
                                       is_training=self.is_training_ph)
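# Hedged sketch (not part of the original file): with main/target networks built under the
# variable scopes used above ("actor_main"/"actor_target", "critic_main"/"critic_target"),
# TF1-style DDPG code typically builds the target soft-update ops along these lines. The
# function name and the pairing-by-scope assumption are illustrative, not the project's API.
def make_soft_update_ops(main_scope, target_scope, tau):
    main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=main_scope)
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=target_scope)
    # y <- tau * x + (1 - tau) * y for each (main x, target y) pair,
    # assuming both collections enumerate the matching variables in the same order
    return [tf.assign(t, tau * m + (1.0 - tau) * t)
            for m, t in zip(main_vars, target_vars)]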
class DQN:
    # per-channel weights for the grayscale conversion of raw Atari frames
    grayscale_coeffs = np.asarray([0.11, 0.59, 0.3], dtype=np.float32)

    @staticmethod
    def preprocess_frame(frame: np.ndarray) -> np.ndarray:
        # downsample by 2 in both dimensions and convert to 8-bit grayscale
        return (frame[::2, ::2].astype(np.float32) * DQN.grayscale_coeffs).sum(-1).astype(np.uint8)

    @staticmethod
    def frame_to_nn(frame: torch.Tensor) -> torch.Tensor:
        # scale stored uint8 frames to [0, 1] floats for the network
        return frame.float() / 255.0

    def __init__(self, env: str):
        self.device = torch.device("cuda")
        self.env = DeepmindHackWrapper(gym.make(env), NOOP_MAX)
        self.noop_index = self.env.unwrapped.get_action_meanings().index("NOOP")
        self.n_actions = self.env.action_space.n
        self.img_shape = self.preprocess_frame(self.env.reset()).shape

        self.memory = ReplayBuffer(self.img_shape, REPLY_BUFFER_SIZE, discount_factor=gamma)
        self.game_steps = 0
        self.rand_fill()
        self.loader = torch.utils.data.DataLoader(self.memory, batch_size=BATCH_SIZE,
                                                  pin_memory=True, num_workers=0)

        self.net = AtariNet([4, *self.img_shape], self.n_actions).to(self.device)
        self.loss = torch.nn.SmoothL1Loss().to(self.device)
        # self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=0.00025, eps=0.01, alpha=0.95, centered=True)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.00025)

        self.loss_sum = 0
        self.loss_cnt = 0
        self.last_test_time = time.time()
        self.last_video_time = 0
        self.copy_network()

        # background thread that keeps one prefetched, device-resident batch ready
        self.prefetch_queue = Queue(maxsize=1)
        self.loader_thread = threading.Thread(target=self.loader_thread)
        self.loader_thread.start()

    def loader_thread(self):
        while True:
            for d in self.loader:
                self.prefetch_queue.put({k: v.to(self.device) for k, v in d.items()})

    def play(self, get_action: Callable[[int, List[np.ndarray]], int], train: bool,
             step_hook=lambda: None, maxlen=MAXLEN):
        total_reward = 0
        all_frames = []
        while True:
            observation = self.preprocess_frame(self.env.reset())
            all_frames += [observation] * 4
            for t in range(maxlen):
                action = get_action(t, all_frames[-4:])
                new_frame, reward, done, info = self.env.step(action)
                if train:
                    self.memory.add(observation, action, reward, done)
                    self.game_steps += 1
                observation = self.preprocess_frame(new_frame)
                all_frames.append(observation)
                total_reward += reward
                step_hook()
                if done:
                    break
            if train or self.env.was_done:
                break
        return total_reward, all_frames

    def render_video(self, all_frames: List[np.ndarray]) -> np.ndarray:
        return np.stack(all_frames, axis=0)[:, np.newaxis]

    def rand_fill(self):
        print("Filling the replay buffer with random data")
        while self.memory.count < PREFILL:
            print("Starting new episode. Data so far:", self.memory.count)
            _, frames = self.play(lambda i, observation: self.env.action_space.sample(), True)
        self.game_steps = 0
        print("Prefill completed.")

    def log_loss(self, loss: float):
        self.loss_sum += loss
        self.loss_cnt += 1
        if self.loss_cnt == 100:
            wandb.log({"loss": self.loss_sum / self.loss_cnt}, step=self.game_steps)
            self.loss_sum = 0
            self.loss_cnt = 0

    def copy_network(self):
        # refresh the frozen target network used for bootstrapping
        self.target_init_step = self.game_steps
        self.predictor = deepcopy(self.net)
        self.predictor.eval()

    def train_step(self):
        data = self.prefetch_queue.get()
        action = data["action"].long()
        frames = self.frame_to_nn(data["frames"])

        pred = self.net(frames[:, :-1])
        pred = pred.gather(index=action, dim=1)
        with torch.no_grad():
            # bootstrap from the frozen target copy maintained by copy_network(),
            # not from the online network
            next_value, _ = self.predictor(frames[:, 1:]).max(-1, keepdim=True)
            target = gamma * next_value * (1.0 - data["is_done"].float()) + data["reward"]

        l = self.loss(pred, target)
        self.optimizer.zero_grad()
        l.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 1.0)
        self.optimizer.step()
        self.log_loss(l.item())

        if self.game_steps - self.target_init_step > TARGET_SWITCH * STEPS_PER_TRAIN:
            self.copy_network()

    def get_epsilon(self) -> float:
        # linear annealing from 1.0 to 0.1 over the first million game steps
        e_start = 1.0
        e_end = 0.1
        n = 1000000.0
        return max(e_start - (e_start - e_end) / n * self.game_steps, e_end)

    def get_action(self, iteration: int, observations: List[np.ndarray], train: bool = True) -> int:
        if train and (np.random.random() < self.get_epsilon()):
            return self.env.action_space.sample()
        else:
            with torch.no_grad():
                observation = np.stack(observations, axis=0)
                input = self.frame_to_nn(torch.tensor(observation, device=self.device).unsqueeze(0))
                pred = self.net(input)
                _, amax = pred[0].max(-1)
                return amax.item()

    def train(self):
        while True:
            def do_train():
                if self.game_steps % STEPS_PER_TRAIN == 0:
                    self.train_step()

            log = {}
            log["epsilon"] = self.get_epsilon()
            log["train_reward"], frames = self.play(self.get_action, train=True, step_hook=do_train)
            print(f"Step {self.game_steps}: Episode completed in {len(frames)} steps. "
                  f"Reward: {log['train_reward']}. Epsilon: {log['epsilon']}")
            frames = None

            now = time.time()
            if now - self.last_test_time > 60:
                log["test_reward"], frames = self.play(
                    lambda i, observation: self.get_action(i, observation, train=False),
                    train=False)
                self.last_test_time = now
                print(f"--> TEST: Step {self.game_steps}: Episode completed in {len(frames)} steps. "
                      f"Reward: {log['test_reward']}")
                if now - self.last_video_time > 10 * 60:
                    log["video"] = wandb.Video(self.render_video(frames[-300:]), fps=10)
                    self.last_video_time = now
                frames = None

            wandb.log(log, step=self.game_steps)
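# Hedged usage sketch (not part of the original file). The DQN class above reads several
# module-level constants (NOOP_MAX, REPLY_BUFFER_SIZE, PREFILL, BATCH_SIZE, MAXLEN,
# STEPS_PER_TRAIN, TARGET_SWITCH, gamma) and expects an active wandb run. The values and the
# environment id below are illustrative assumptions, not the original configuration.
NOOP_MAX = 30
REPLY_BUFFER_SIZE = 1_000_000   # spelling kept to match the name the class uses
PREFILL = 50_000
BATCH_SIZE = 32
MAXLEN = 100_000
STEPS_PER_TRAIN = 4
TARGET_SWITCH = 10_000
gamma = 0.99

if __name__ == "__main__":
    wandb.init(project="dqn-atari")          # assumes wandb is installed and configured
    DQN("BreakoutNoFrameskip-v4").train()    # environment id is an assumption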
class DDPG:
    def __init__(self, *args, agent=None, target_agent=None, **kwargs):
        self.agent = agent
        self.target_agent = target_agent
        # hard update: start the target network from the online network's weights
        self.hard_update(self.target_agent, self.agent)
        self.replay_buffer = ReplayBuffer(buffer_size=int(kwargs['buffer_size']),
                                          minibatch_size=kwargs['minibatch_size'],
                                          seed=kwargs['seed'],
                                          device=kwargs['device'])
        self.__minibatch = kwargs['minibatch_size']
        self.actor_optim = torch.optim.Adam(self.agent.get_actor_parameters(),
                                            lr=kwargs['actor_lr'])
        self.critic_optim = torch.optim.Adam(self.agent.get_critic_parameters(),
                                             lr=kwargs['critic_lr'])
        self.__discount = kwargs['discount']
        self.__tau = kwargs['tau']

    def soft_update(self, target, source, tau):
        """
        Copies the parameters from the source network (x) to the target network (y)
        using the update y = tau*x + (1 - tau)*y.
        :param target: target network (PyTorch)
        :param source: source network (PyTorch)
        :param tau: interpolation factor (float)
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def hard_update(self, target, source):
        """
        Copies the parameters from the source network to the target network.
        :param target: target network (PyTorch)
        :param source: source network (PyTorch)
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def train(self, env, num_episodes):
        """
        Train the agent to solve the environment.
        :param env: environment object (ReacherEnvironment)
        :param num_episodes: number of episodes (int)
        :return scores: list of scores for each episode (list)
        """
        noise_gen = OrnsteinUhlenbeckActionNoise(env.get_action_dim())
        noise_gen.reset()
        scores = []
        for episode in range(num_episodes):
            state = env.reset(train_mode=True)
            # roll out one episode
            score = 0
            while True:
                # select an action and add exploration noise
                action = self.agent.act(torch.Tensor(state)).detach().cpu().numpy()
                noise = [noise_gen.sample() for _ in range(env.get_num_agents())]
                noised_action = np.clip(action + noise, -1., 1.)
                next_state, reward, done = env.step(noised_action.squeeze())
                score += np.mean(reward)

                # add one experience per agent to the replay buffer
                for i in range(action.shape[0]):
                    self.replay_buffer.add(state[i], action[i], reward[i],
                                           next_state[i], done[i])
                state = next_state
                if self.replay_buffer.size() < self.__minibatch:
                    continue

                # sample a minibatch
                states, actions, rewards, next_states, dones = self.replay_buffer.sample()

                # compute the critic loss; the TD target is treated as a constant,
                # so it is detached from the target networks' graph
                target_actions = self.target_agent.act(next_states)
                target_Q = (rewards + self.__discount *
                            self.target_agent.Q(next_states, target_actions) * (1 - dones)).detach()
                Q = self.agent.Q(states, actions)
                critic_loss = (Q - target_Q).pow(2).mean()

                # update the critic
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # compute the actor objective: maximize Q under the current policy
                actor_actions = self.agent.act(states)
                actor_objective = -self.agent.Q(states, actor_actions).mean()

                # update the actor
                self.actor_optim.zero_grad()
                actor_objective.backward()
                self.actor_optim.step()

                # soft update of the target agent
                self.soft_update(self.target_agent, self.agent, self.__tau)

                if np.any(done):
                    break
            print("episode: {:d} | score: {:.4f}".format(episode, score))
            scores.append(score)
        return scores
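# Hedged usage sketch (not part of the original file): one way the DDPG trainer above might
# be driven. Only the keyword names match what __init__ reads from **kwargs; the values are
# assumptions, and `env`, `agent` and `target_agent` stand for objects built elsewhere (an
# environment wrapper plus two identically constructed actor-critic wrappers exposing act(),
# Q(), get_actor_parameters() and get_critic_parameters()).
hyperparams = {
    'buffer_size': 1e6,
    'minibatch_size': 128,
    'seed': 0,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'actor_lr': 1e-4,
    'critic_lr': 1e-3,
    'discount': 0.99,
    'tau': 1e-3,
}
ddpg_trainer = DDPG(agent=agent, target_agent=target_agent, **hyperparams)
scores = ddpg_trainer.train(env, num_episodes=200)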
# Noise processes (one Ornstein-Uhlenbeck process per agent)
noise_process1 = OUNoise(action_size, random_seed, mu=0., theta=0.15, sigma=0.1)
noise_process2 = OUNoise(action_size, random_seed, mu=0., theta=0.15, sigma=0.1)
noise_processes = [noise_process1, noise_process2]

# Replay buffer shared by both agents
replay_buffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

####################################################################################################
# train the Agent
max_episodes = 2000
target_score = 0.5

model_directory = 'resources/models/'
actor_checkpoint_file = model_directory + "checkpoint_actor.pth"
critic_checkpoint_file = model_directory + "checkpoint_critic.pth"
actor_model_file = model_directory + "model_actor.pt"
critic_model_file = model_directory + "model_critic.pt"

agent = SelfPlayAgent(actor_local, actor_target, critic_local, critic_target,
                      noise_processes, replay_buffer)
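# Hedged reference sketch (not the project's OUNoise implementation): the two noise processes
# above are Ornstein-Uhlenbeck processes with mu=0, theta=0.15 and sigma=0.1. A minimal,
# self-contained version of such a process looks like this; the class name and structure are
# illustrative only.
class SimpleOUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.1):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state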
class DDPG:
    def __init__(
        self,
        env,
        actor: Actor,
        critic: Critic,
        actor_target: Actor,
        critic_target: Critic,
        gamma: float,
        minibatch_size: int,
        device: torch.device,
        max_episodes: int,
        tau: float,
        actor_lr: float,
        critic_lr: float,
        weight_decay: float,
        replay_buffer_size: int,
        models_path: str,
        runs_path: Optional[str],
    ):
        self.env = env
        self.actor = actor
        self.actor_target = actor_target
        self.critic = critic
        self.critic_target = critic_target
        self.gamma = gamma
        self.minibatch_size = minibatch_size
        self.device = device
        self.max_episodes = max_episodes
        self.tau = tau
        self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = Adam(
            self.critic.parameters(),
            lr=critic_lr,
            weight_decay=weight_decay,
        )
        self.critic_loss_fn = nn.MSELoss()
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.models_path = models_path
        self.min_act_value = env.action_space.low[0]
        self.max_act_value = env.action_space.high[0]
        self.writer = SummaryWriter(log_dir=runs_path)
        self.actor_target.eval()
        self.critic_target.eval()
        self.episode_i = 0

    def compute_expected_return_target(self, rewards, next_states, dones):
        """
        Compute the expected return obtained by evaluating the critic_target
        on the actor_target's policy.
        """
        with torch.no_grad():
            target_expectation = self.critic_target(
                next_states, self.actor_target(next_states)
            )
            expected_return_target = (
                rewards + (1 - dones) * self.gamma * target_expectation
            )
        return expected_return_target

    def update_critic(self, states, actions, next_states, rewards, dones):
        """Update the critic network by minimizing the difference with the target critic."""
        self.critic_optimizer.zero_grad()
        expected_return_target = self.compute_expected_return_target(rewards, next_states, dones)
        expected_return_pred = self.critic(states, actions)
        critic_loss = self.critic_loss_fn(expected_return_pred, expected_return_target)
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss

    def update_actor(self, states):
        """Update the actor by maximizing the expected return."""
        self.actor_optimizer.zero_grad()
        self.critic.eval()
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.critic.train()
        actor_loss.backward()
        self.actor_optimizer.step()
        return actor_loss

    def update_target_networks(self):
        """Soft update the target networks."""
        with torch.no_grad():
            for p_a, p_a_target in zip(self.actor.parameters(), self.actor_target.parameters()):
                p_a_target.data.mul_(1.0 - self.tau)
                p_a_target.data.add_(self.tau * p_a.data)
            for p_c, p_c_target in zip(self.critic.parameters(), self.critic_target.parameters()):
                p_c_target.data.mul_(1.0 - self.tau)
                p_c_target.data.add_(self.tau * p_c.data)

    def get_minibatch(self):
        """Return `minibatch_size` (state, action, next_state, reward, done) tuples as tensors."""
        minibatch = self.replay_buffer.get(self.minibatch_size)
        states = torch.stack([mb.state for mb in minibatch])[:, -1, :]
        actions = [mb.action for mb in minibatch]
        actions = torch.tensor(actions, device=self.device)[:, -1, :]
        next_states = [mb.next_state for mb in minibatch]
        next_states = torch.stack(next_states)[:, -1, :]
        rewards = [[mb.reward] for mb in minibatch]
        rewards = torch.tensor(rewards, device=self.device)
        dones = [[int(mb.done)] for mb in minibatch]
        dones = torch.tensor(dones, device=self.device)
        return states, actions, next_states, rewards, dones

    def save_models(self):
        """Save the current models to disk."""
        torch.save(self.critic, self.models_path + "critic")
        torch.save(self.actor, self.models_path + "actor")
        torch.save(self.critic_target, self.models_path + "critic_target")
        torch.save(self.actor_target, self.models_path + "actor_target")

    def select_action(self, state, explore=True):
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).to("cpu").data.numpy()
        self.actor.train()
        if explore:
            action += noise(self.env.action_space.shape)
        action = action.clip(self.min_act_value, self.max_act_value)
        return action

    def log(self, sum_of_actor_losses, sum_of_critic_losses, reward):
        if self.episode_i % 20 == 0:
            self.save_models()
        self.writer.add_scalar("ActorLoss/train", sum_of_actor_losses, self.episode_i)
        self.writer.add_scalar("CriticLoss/train", sum_of_critic_losses, self.episode_i)
        self.writer.add_scalar("Reward/train", reward, self.episode_i)

    def run_episode(self, explore=True):
        """Run a single episode in either exploration (explore=True) or exploitation (explore=False) mode."""
        self.episode_i += 1
        state = to_tensor_variable([self.env.reset()])
        t = 0
        done = False
        sum_of_actor_losses = 0
        sum_of_critic_losses = 0
        while not done:
            t += 1
            with torch.no_grad():
                action = self.select_action(state, explore)
            next_state, reward, done, _ = self.env.step(action[0])
            next_state = to_tensor_variable([next_state])
            self.replay_buffer.store(Transition(state, action, next_state, reward, done))
            state = next_state
            if explore and self.replay_buffer.occupied > self.minibatch_size:
                states, actions, next_states, rewards, dones = self.get_minibatch()
                sum_of_critic_losses += self.update_critic(states, actions, next_states, rewards, dones)
                self.critic.eval()
                sum_of_actor_losses += self.update_actor(states)
                self.critic.train()
                self.update_target_networks()
            if done:
                self.log(sum_of_actor_losses, sum_of_critic_losses, reward)
        return reward

    def run_random_episodes(self, n_episodes):
        for _ in range(n_episodes):
            state = to_tensor_variable([self.env.reset()])
            done = False
            while not done:
                action = np.array([self.env.action_space.sample()])
                next_state, reward, done, _ = self.env.step(action[0])
                next_state = to_tensor_variable([next_state])
                self.replay_buffer.store(Transition(state, action, next_state, reward, done))

    def exploit(self, n_episodes):
        """Exploit for n_episodes."""
        rewards = [self.run_episode(explore=False) for _ in range(n_episodes)]
        return rewards

    def explore(self, n_episodes):
        """Explore for n_episodes."""
        for _ in range(n_episodes):
            self.run_episode(explore=True)

    def train(self):
        """Train the four networks."""
        self.run_random_episodes(100)
        count_exploit = 0
        count_explore = 0
        while self.episode_i < self.max_episodes:
            self.explore(50)
            count_explore += 50
            rewards = self.exploit(10)
            count_exploit += 10
            s = f"Average final reward after 10 exploitations and "
            s += f"{count_explore} explorations: {sum(rewards) / len(rewards)}\n"
            print(s)
            if sum(rewards) / len(rewards) > 180:
                break
        self.env.close()
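# Hedged usage sketch (not part of the original file): one way to wire the trainer above to a
# Gym continuous-control task. The environment id and hyperparameter values are assumptions;
# the Actor/Critic constructor signatures are not shown in this file, so building the four
# networks is left as a placeholder.
env = gym.make("LunarLanderContinuous-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# actor, critic, actor_target, critic_target = ...  (build the four networks and move them to device)
ddpg = DDPG(
    env=env,
    actor=actor, critic=critic,
    actor_target=actor_target, critic_target=critic_target,
    gamma=0.99, minibatch_size=64, device=device,
    max_episodes=1000, tau=1e-3,
    actor_lr=1e-4, critic_lr=1e-3, weight_decay=1e-2,
    replay_buffer_size=int(1e6),
    models_path="models/", runs_path=None,
)
ddpg.train()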
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Actor Network (w/ Target Network)
actor_local = Actor(state_size, action_size, random_seed).to(device)
actor_target = Actor(state_size, action_size, random_seed).to(device)

# Critic Network (w/ Target Network)
critic_local = Critic(state_size + action_size, 1, random_seed).to(device)
critic_target = Critic(state_size + action_size, 1, random_seed).to(device)

# Noise process
noise_process = OUNoise(action_size, random_seed)

# Replay memory
memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed, device)

####################################################################################################
# train the Agent
max_episodes = 300
max_timesteps = 1000


def ddpg(n_episodes=max_episodes, n_timesteps=max_timesteps):
    """Deep Deterministic Policy Gradient.

    Args:
        n_episodes (int): maximum number of training episodes
        n_timesteps (int): maximum number of timesteps per episode
    """
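# Hedged sketch (not the project's code): the body of the ddpg() driver above is not shown in
# this excerpt. The helper below only illustrates the usual shape of such an episode loop,
# i.e. reset, act, step and score bookkeeping with a rolling 100-episode window. It uses a
# random policy so it runs on its own with the classic Gym API (4-tuple step); the real driver
# would query the actor/critic machinery defined above instead of env.action_space.sample().
from collections import deque

def episode_loop_sketch(env, n_episodes=max_episodes, n_timesteps=max_timesteps):
    scores = []
    scores_window = deque(maxlen=100)  # rolling window of the most recent scores
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0.0
        for t in range(n_timesteps):
            action = env.action_space.sample()  # placeholder for the learned policy
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                break
        scores.append(score)
        scores_window.append(score)
        print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_window)))
    return scores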
class Agent():
    """Agent that interacts with and learns from the environment."""

    def __init__(self, id, state_size, action_size, config=Config()):
        """Initialize an Agent object.

        Params
        ======
            id (int): id used to identify the agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (Config): the agent's configuration
        """
        self.state_size = state_size
        self.action_size = action_size
        self.id = id
        self.t_step = 0
        self.config = config
        random.seed(config.random_seed)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Actor & Target Network
        self.actor_local = Actor(state_size, action_size, config.random_seed,
                                 config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_target = Actor(state_size, action_size, config.random_seed,
                                  config.actor_hidden_units, config.use_bn).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config.lr_actor)

        # Critic & Target Network
        self.critic_local = Critic(state_size, action_size, config.random_seed,
                                   config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_target = Critic(state_size, action_size, config.random_seed,
                                    config.critic_hidden_units, config.use_bn).to(self.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, config.random_seed, config.noise_mu,
                             config.noise_theta, config.noise_sigma)

        # Replay memory
        if config.use_per:
            self.memory = NaivePrioritizedReplayBuffer(action_size, config.buffer_size,
                                                       config.batch_size, config.random_seed,
                                                       config.per_alpha, config.per_epsilon)
        else:
            self.memory = ReplayBuffer(action_size, config.buffer_size,
                                       config.batch_size, config.random_seed)

    def step(self, state, action, reward, next_state, done, beta=None):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every n time steps
        self.t_step = (self.t_step + 1) % self.config.update_n_step
        if self.t_step != 0:
            return

        # If enough samples are available in memory, get a random subset and learn
        if len(self.memory) > self.config.batch_size:
            if self.config.use_per:
                assert beta is not None
                experiences, weights = self.memory.sample(beta)
                states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device)
                actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
                rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
                next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device)
                dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
                weights = torch.from_numpy(np.vstack(weights)).float().to(self.device)
                experiences = (states, actions, rewards, next_states, dones)
                self.learn(experiences, self.config.gamma, weights)
            else:
                experiences = self.memory.sample()
                states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(self.device)
                actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
                rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
                next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(self.device)
                dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
                experiences = (states, actions, rewards, next_states, dones)
                self.learn(experiences, self.config.gamma)

    def act(self, state):
        """Returns actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if self.config.add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, weights=None):
        """Update the actor and critic using a batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            weights (array_like): weights compensating for the non-uniform sampling
                (used only with prioritized experience replay)
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        if self.config.use_per:
            td_error = Q_expected - Q_targets
            critic_loss = (td_error ** 2) * weights
            critic_loss = critic_loss.mean()
            self.memory.update_priorities(np.hstack(td_error.detach().cpu().numpy()))
        else:
            critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss; gradients are clipped after backward() so the clipping
        # acts on the freshly computed gradients
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------------------- update target networks ------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def getId(self):
        """Return the ID of the agent."""
        return self.id

    def summary(self):
        """Return a brief summary of the agent."""
        s = 'DDPG Agent {}:\n'.format(self.id)
        s += self.config.__str__()
        s += self.actor_local.__str__()
        s += self.critic_local.__str__()
        return s
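# Hedged usage sketch (not part of the original file): driving the Agent above on a small Gym
# continuous-control task. "Pendulum-v1", the episode count and the reliance on Config()
# defaults are assumptions; only Agent's public methods (act, step, reset) are taken from the
# class above, and the classic Gym 4-tuple step API is assumed.
env = gym.make("Pendulum-v1")
agent = Agent(id=0,
              state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0])
for episode in range(10):
    state = env.reset()
    agent.reset()
    done = False
    while not done:
        action = agent.act(np.asarray(state, dtype=np.float32))
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state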
#######################################
# Set up simulation
#######################################
env = get_environment(env_input)
explorer = EpsGreedy(num_actions=env_input['NUM_ACTIONS'],
                     eps=config.EPS_START,
                     eps_min=config.EPS_MIN,
                     decay=config.DECAY)
agent = TabularQFunction(state_size=env_input['STATE_SIZE'][0],
                         num_actions=env_input['NUM_ACTIONS'],
                         mu_init=config.Q_INIT,
                         std_init=config.Q_STD)
replay = ReplayBuffer()

for ep in range(config.NUM_EPISODES):
    s = env.reset()
    for t in range(config.NUM_STEPS):
        if (ep % config.RENDER_FREQUENCY == 0) and config.RENDER:
            env.render()
        a = agent.act(s)
        a = explorer.explore(a)
        ss, r, done, info = env.step(a)
        replay.add((s, a, r, ss, done))
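# Hedged sketch (not the project's TabularQFunction API): the transitions pushed into the
# replay buffer above would typically feed a tabular Q-learning update of the form
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
# The helper below is a self-contained illustration with its own names (q_table, alpha,
# gamma_); it is not the update actually used by the agent class above.
def q_learning_update(q_table, transition, alpha=0.1, gamma_=0.99):
    s, a, r, ss, done = transition
    # bootstrap only if the episode did not terminate at this step
    target = r if done else r + gamma_ * np.max(q_table[ss])
    q_table[s, a] += alpha * (target - q_table[s, a])
    return q_table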
#######################################
if "EXPLORER" in agent_input and agent_input['EXPLORER'] == 'EPS_GREEDY':
    explorer = EpsGreedy(num_actions=env_input['NUM_ACTIONS'],
                         eps=config.EPS_START,
                         eps_min=config.EPS_MIN,
                         decay=config.DECAY)
else:
    explorer = None

learner_agent = get_agent(agent_input, env_input)
print(learner_agent.q)
parameter_server = ParameterServer(learner_agent)
replay = ReplayBuffer(max_size=config.REPLAY_SIZE)

case_id = str(np.random.randint(10000000))
print("Starting experiment {}".format(case_id))

for i in range(args.n_agents):
    id_ = str(i)
    env = get_environment(env_input)
    actor_agent = get_agent(agent_input, env_input)
    writer = EpisodeWriter(config.resultsDir,
                           env_name=env_input['ENV_NAME'],
                           agent_name=agent_input["TYPE"] + case_id,
                           id_=id_)