def train(env, args, writer):
    # Independent online/target networks for each of the two players
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    # One replay buffer per player
    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Deques used to assemble n-step (multi_step) transitions
    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)

        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        # args.negative applies a -1 per-step penalty to the stored rewards
        if args.negative:
            p1_reward_deque.append(reward[0] - 1)
        else:
            p1_reward_deque.append(reward[0])
        p1_action_deque.append(p1_action)
        if args.negative:
            p2_reward_deque.append(reward[1] - 1)
        else:
            p2_reward_deque.append(reward[1])
        p2_action_deque.append(p2_action)

        # Push the oldest transition with its n-step return once the window is full or the episode ends
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += reward[0]
        p2_episode_reward += reward[1]
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()

            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)

            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)

            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        # Train both players once the buffers hold enough transitions
        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)

            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_loss_list.clear(), p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
@dataclass  # field defaults and __post_init__ rely on this class being a dataclass
class Agent:
    state: int
    actions: int
    history: int = 4
    atoms: int = 5  # typically 51 in Rainbow (C51)
    Vmin: float = -10
    Vmax: float = 10
    lr: float = 1e-5
    batch_size: int = 32
    discount: float = 0.99
    norm_clip: float = 10.

    def __post_init__(self):
        # Fixed support of the categorical value distribution
        self.support = torch.linspace(self.Vmin, self.Vmax, self.atoms)
        self.delta_z = (self.Vmax - self.Vmin) / (self.atoms - 1)

        self.online_net = DQN(self.state, self.actions, self.history, self.atoms)
        self.online_net.train()

        self.target_net = DQN(self.state, self.actions, self.history, self.atoms)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=self.lr)

    def act(self, state):
        # Greedy action w.r.t. the expected Q-values (distribution weighted by the support)
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            return (self.online_net(state) * self.support).sum(2).argmax(1).item()

    def act_e_greedy(self, state, epsilon=0.001):
        return random.randrange(self.actions) if random.random() < epsilon else self.act(state)

    def learn(self, buffer):
        state, action, reward, next_state, terminal, weights, idx = buffer.sample(self.batch_size)
        state = torch.FloatTensor(state)
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        next_state = torch.FloatTensor(next_state)
        terminal = torch.FloatTensor(terminal)
        weights = torch.FloatTensor(weights)

        log_ps = self.online_net(state, log=True)
        log_ps_a = log_ps[range(self.batch_size), action]

        with torch.no_grad():
            # Calculate nth next state probabilities
            pns = self.online_net(next_state)
            dns = self.support.expand_as(pns) * pns
            argmax_indices_ns = dns.sum(2).argmax(1)
            self.target_net.sample_noise()
            pns = self.target_net(next_state)
            pns_a = pns[range(self.batch_size), argmax_indices_ns]

            # Compute Bellman operator T applied to z
            Tz = reward.unsqueeze(1) + (1 - terminal).unsqueeze(1) * self.discount * self.support.unsqueeze(0)  # -10 ... 10 + reward
            Tz.clamp_(min=self.Vmin, max=self.Vmax)
            # Compute L2 projection of Tz onto fixed support z
            b = (Tz - self.Vmin) / self.delta_z  # 0 ... 4
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            # Fix disappearing probability mass when l = b = u (b is int)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.atoms - 1)) * (l == u)] += 1

            # Distribute probability of Tz
            m = state.new_zeros(self.batch_size, self.atoms)
            offset = torch.linspace(0, ((self.batch_size - 1) * self.atoms), self.batch_size).unsqueeze(1).expand(self.batch_size, self.atoms).to(action)
            m.view(-1).index_add_(0, (l + offset).view(-1), (pns_a * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1), (pns_a * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)

        loss = -torch.sum(m * log_ps_a, 1)  # Cross-entropy loss (minimises DKL(m||p(s_t, a_t)))
        loss = weights * loss

        # q_values = self.online_net(state)
        # q_value = q_values[range(self.batch_size), action]
        # next_q_values = self.target_net(next_state)
        # next_q_value = next_q_values.max(1)[0]
        # expected_q_value = reward + self.discount * next_q_value * (1 - terminal)
        # loss = weights * (q_value - expected_q_value).pow(2)

        self.optimiser.zero_grad()
        loss.mean().backward()
        # Clip gradients before the optimiser step so the clipping actually takes effect
        nn.utils.clip_grad_norm_(self.online_net.parameters(), self.norm_clip)
        self.optimiser.step()

        buffer.update_priorities(idx, loss.tolist())

    def update_target_net(self):
        self.target_net.load_state_dict(self.online_net.state_dict())

    def sample_noise(self):
        self.online_net.sample_noise()

    def save(self, path):
        torch.save(self.online_net.state_dict(), path)

    # Evaluates Q-value based on single state (no batch)
    def evaluate_q(self, state):
        with torch.no_grad():
            # Expected Q-values: weight the predicted distribution by the support
            return (self.online_net(state.unsqueeze(0)) * self.support).sum(2).max(1)[0].item()

    def train(self):
        self.online_net.train()

    def eval(self):
        self.online_net.eval()
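# A hedged usage sketch for the Agent above, not part of the original file. The
# environment factory (make_env) and the prioritized replay buffer (with the
# push()/sample()/update_priorities() interface that learn() expects) are assumptions.

env = make_env()                                   # hypothetical environment
buffer = PrioritizedReplayBuffer(capacity=100000)  # hypothetical buffer implementation
agent = Agent(state=env.observation_space.shape[0], actions=env.action_space.n)

state = env.reset()
for step in range(1, 1_000_000 + 1):
    agent.sample_noise()                 # resample NoisyNet noise before acting
    action = agent.act_e_greedy(state)
    next_state, reward, done, _ = env.step(action)
    buffer.push(state, action, reward, next_state, done)
    state = env.reset() if done else next_state

    if step > 10_000 and step % 4 == 0:  # warm-up period, then learn every few steps
        agent.learn(buffer)              # one prioritized, distributional update
    if step % 10_000 == 0:
        agent.update_target_net()        # hard target-network sync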
def train(env, args):
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
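# epsilon_scheduler and beta_scheduler are referenced by every training loop in this
# excerpt but defined elsewhere in the repository. A plausible minimal sketch, assuming
# exponential epsilon decay and a linear anneal of the prioritized-replay beta to 1.0:

import math

def epsilon_scheduler(eps_start, eps_final, eps_decay):
    def epsilon_by_frame(frame_idx):
        # Exponential decay from eps_start toward eps_final
        return eps_final + (eps_start - eps_final) * math.exp(-1. * frame_idx / eps_decay)
    return epsilon_by_frame


def beta_scheduler(beta_start, beta_frames):
    def beta_by_frame(frame_idx):
        # Linear anneal of the importance-sampling exponent to 1.0
        return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
    return beta_by_frame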
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    # state_buffer keeps the most recent observations (see get_initial_state/recent_state);
    # the per-agent deques assemble n-step transitions for each agent separately
    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action, save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append(state[agent_index])
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]) == args.multi_step or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index], args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward, next_state[agent_index],
                                   np.float32(done[agent_index]))

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r]:
                state_buffer, states_deque, actions_deque, rewards_deque = del_record(
                    r_index, state_buffer, states_deque, actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            for d in range(len(states_deque)):
                states_deque[d].clear()
                rewards_deque[d].clear()
                actions_deque[d].clear()
            # Rebuild the per-agent deques so every agent has one again for the new episode
            states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            episode += 1

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
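# print_log is the console-side counterpart of the TensorBoard/W&B logging above. It is
# defined elsewhere in the repository; a hedged sketch consistent with how it is called
# here (some files also pass args as a final argument) might look like:

def print_log(frame, prev_frame, prev_time, reward_list, length_list, loss_list, args=None):
    fps = (frame - prev_frame) / (time.time() - prev_time)
    avg_reward = np.mean(reward_list) if reward_list else 0.
    avg_length = np.mean(length_list) if length_list else 0.
    avg_loss = np.mean(loss_list) if loss_list else 0.
    print("Frame: {:<8} FPS: {:.2f} Avg. Reward: {:.2f} Avg. Length: {:.2f} Avg. Loss: {:.4f}".format(
        frame, fps, avg_reward, avg_length, avg_loss))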
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)
    # Target network is frozen; it is only refreshed via update_target
    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
        # target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    # Sync the target network with the online network at the start of training
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)
    print_args(args)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(), lr=args.lr,
                               eps=args.adam_eps, betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(), lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        # Optionally clip rewards for training while logging the raw episode reward
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        # Force extremely long episodes to terminate by acting randomly
        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                # No episode finished yet; postpone logging to the next interval
                evaluation_interval += args.evaluation_interval

        # Periodic checkpoints, tagged with the optimizer (and adam_eps when non-default)
        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model, args, name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model, args, name="{}{:.2e}_{}".format(args.optim, args.adam_eps, frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()
    save_model(current_model, args)
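# compute_td_loss is called in every training loop above but is not part of this excerpt.
# A minimal sketch, assuming a prioritized buffer whose sample() returns importance-sampling
# weights and indices, an n-step reward already stored in each transition (as pushed above),
# and an args.batch_size field; the double-DQN target here is an assumption, not necessarily
# the repository's exact loss:

def compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta):
    state, action, reward, next_state, done, weights, indices = replay_buffer.sample(args.batch_size, beta)

    state = torch.FloatTensor(np.float32(state)).to(args.device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(args.device)
    action = torch.LongTensor(action).to(args.device)
    reward = torch.FloatTensor(reward).to(args.device)
    done = torch.FloatTensor(done).to(args.device)
    weights = torch.FloatTensor(weights).to(args.device)

    q_values = current_model(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        # Double DQN target: action chosen by the online net, evaluated by the target net
        next_action = current_model(next_state).argmax(1)
        next_q_value = target_model(next_state).gather(1, next_action.unsqueeze(1)).squeeze(1)
        # The stored reward is already an n-step return, so discount by gamma^multi_step
        expected_q_value = reward + (args.gamma ** args.multi_step) * next_q_value * (1 - done)

    td_error = q_value - expected_q_value
    loss = (weights * td_error.pow(2)).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Refresh priorities with the absolute TD errors
    replay_buffer.update_priorities(indices, td_error.abs().detach().cpu().numpy() + 1e-6)
    return loss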