import os
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import Adam

# Config, CnnDQN, ReplayBuffer, PrioritizedReplayBuffer, LinearSchedule and
# get_class_attr_val are provided by the surrounding project.


class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        if self.config.prioritized_replay:
            experience = self.buffer.sample(self.config.batch_size,
                                            beta=self.beta_schedule.value(fr))
            (s0, a, r, s1, done, weights, batch_idxes) = experience
        else:
            s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)
            weights, batch_idxes = np.ones_like(r), None

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)
        weights = torch.tensor(weights, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()
            weights = weights.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        # Q(s0, a) for the actions actually taken
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        # Double DQN target: the online net picks the action, the target net evaluates it
        next_q_value = next_q_state_values.gather(
            1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # TD error of the current estimate against the target (used for priorities)
        td_errors = q_value - expected_q_value

        # Notice that we detach expected_q_value so no gradient flows through the target
        loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none')
        loss = (loss * weights).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if self.config.prioritized_replay:
            new_priorities = np.abs(td_errors.detach().cpu().numpy()) \
                + self.config.prioritized_replay_eps
            self.buffer.update_priorities(batch_idxes, new_priorities)

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
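# A minimal usage sketch (not part of the original code): how this agent might be driven by a
# Gym-style loop, assuming a Config exposing the fields the class already reads (frames,
# batch_size, epsilon_min). The linear epsilon decay and the buffer storage method name
# ("add"; some buffers in this document call it "push") are assumptions.
def train_agent(env, config: Config):
    agent = CnnDDQNAgent(config)
    state = env.reset()
    episode_reward = 0.0

    for fr in range(1, config.frames + 1):
        # Assumed schedule: decay epsilon linearly down to config.epsilon_min
        epsilon = max(config.epsilon_min, 1.0 - fr / config.frames)
        action = agent.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        agent.buffer.add(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            episode_reward = 0.0

        # Start learning once enough transitions have been collected
        if fr > config.batch_size:
            agent.learning(fr)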
import numpy as np
import torch


class Agent:
    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn \
            else DQN(env.observation_space.shape[0], env.action_space.n)
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters())
        self.dqn_loss = torch.nn.MSELoss()

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        # Q(s, a) for the actions taken in the batch
        curr_Q = self.dqn.forward(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bootstrap target; detach so no gradient flows through the target values
        next_Q = self.dqn.forward(next_states).detach()
        max_next_Q = torch.max(next_Q, 1)[0]
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q

        self.dqn_optimizer.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.dqn_optimizer.step()
        return loss

    def max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        qvals = self.dqn.forward(state)
        action = np.argmax(qvals.detach().numpy())
        return action

    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        for episodes in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break
                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    # self.adjust_temperature(loss)
        return episode_rewards, loss

    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    break
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn.state_dict(), PATH)
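# A hedged sketch (not from the original class): max_action is purely greedy and train() uses
# it as the behavior policy, so an epsilon-greedy wrapper is often layered on top for
# exploration. The function name, epsilon bounds, and decay constant below are assumptions.
import numpy as np


def get_action(agent, state, step, eps_start=1.0, eps_end=0.05, eps_decay=5000):
    # Exponential decay from eps_start towards eps_end over roughly eps_decay steps
    epsilon = eps_end + (eps_start - eps_end) * np.exp(-step / eps_decay)
    if np.random.rand() < epsilon:
        return agent.env.action_space.sample()  # explore
    return agent.max_action(state)              # exploit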
import os
import random

import torch


class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = RolloutStorage(config)

        if self.config.Dueling_DQN:
            self.model = Dueling_DQN(self.config.state_shape, self.config.action_dim)
            self.target_model = Dueling_DQN(self.config.state_shape, self.config.action_dim)
        else:
            self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
            self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = torch.optim.Adam(self.model.parameters(),
                                            lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float) / 255.0
            if self.config.use_cuda:
                state = state.to(self.config.device)
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, s1, a, r, done = self.buffer.sample(self.config.batch_size)

        if self.config.use_cuda:
            s0 = s0.float().to(self.config.device) / 255.0
            s1 = s1.float().to(self.config.device) / 255.0
            a = a.to(self.config.device)
            r = r.to(self.config.device)
            done = done.to(self.config.device)

        # Q(s0, a_i) for all actions: shape (batch_size, action_dim)
        q_s0_values = self.model(s0)
        # Q(s0, a) for the actions actually taken (torch.gather picks per-row entries)
        q_s0_a = torch.gather(q_s0_values, 1, a)

        if self.config.DQN:
            # Vanilla DQN target: max over the target network's Q-values
            q_target_s1_values = self.target_model(s1).detach()
            q_target_s1_a_prime = q_target_s1_values.max(1)[0].unsqueeze(1)
            # If the current state ends the episode, there is no next Q-value
            q_target_s1_a_prime = torch.mul(q_target_s1_a_prime, (1 - done))
            y = r + self.config.gamma * q_target_s1_a_prime
        elif self.config.Double_DQN:
            # Double DQN target: the online network selects the action,
            # the target network evaluates it
            q_s1_values = self.model(s1).detach()
            s1_a_prime = q_s1_values.max(1)[1].unsqueeze(1)
            q_target_s1_values = self.target_model(s1).detach()
            q_target_s1_a_prime = torch.gather(q_target_s1_values, 1, s1_a_prime)
            q_target_s1_a_prime = torch.mul(q_target_s1_a_prime, (1 - done))
            y = r + self.config.gamma * q_target_s1_a_prime
        else:
            raise ValueError('config must enable either DQN or Double_DQN')

        mse_loss = torch.nn.MSELoss()
        loss = mse_loss(q_s0_a, y)

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.to(self.config.device)
        self.target_model.to(self.config.device)

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
        elif info:
            state = env.reset()

    print('Reward:{}, action certification rate {:.4f}'.format(
        episode_reward, certified / total))
    return certified / total


if __name__ == '__main__':
    args = parser.parse_args()
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    model = CnnDQN(env.observation_space.shape[0], env.action_space)

    if args.gpu_id >= 0:
        weights = torch.load(
            args.load_path,
            map_location=torch.device('cuda:{}'.format(args.gpu_id)))
        model.load_state_dict(weights)
        with torch.cuda.device(args.gpu_id):
            model.cuda()
    else:
        weights = torch.load(args.load_path, map_location=torch.device('cpu'))
        model.load_state_dict(weights)

    model.eval()
    save_name = (args.load_path.split('/')[-1]).split('.')[0]
import os
import random

import torch
from torch.optim import Adam


class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        # Q(s0, a) for the actions actually taken
        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        # Double DQN target: the online net selects the action, the target net evaluates it
        next_q_value = next_q_state_values.gather(
            1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        # Notice that we detach expected_q_value so no gradient flows through the target
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
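# Hedged sketch of the replay-buffer interface the agents above rely on
# (push, sample(batch_size) -> (s0, a, r, s1, done), __len__). The project's own
# ReplayBuffer may differ; this deque-backed version assumes NumPy-array states.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # Store one transition; old transitions are evicted once capacity is reached
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.stack(states), np.array(actions),
                np.array(rewards, dtype=np.float32),
                np.stack(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)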
parser.set_defaults(robust=False)

if __name__ == '__main__':
    args = parser.parse_args()
    if args.seed:
        torch.manual_seed(args.seed)
        if args.gpu_id >= 0:
            torch.cuda.manual_seed(args.seed)
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    curr_model = CnnDQN(env.observation_space.shape[0], env.action_space)

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    if not os.path.exists(args.save_model_dir):
        os.mkdir(args.save_model_dir)

    if args.load_path:
        saved_state = torch.load(
            args.load_path, map_location=lambda storage, loc: storage)
        curr_model.load_state_dict(saved_state)

    target_model = CnnDQN(env.observation_space.shape[0], env.action_space)
    target_model.load_state_dict(curr_model.state_dict())

    if args.gpu_id >= 0:
        with torch.cuda.device(args.gpu_id):
import numpy as np
import torch


class Agent:
    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99,
                 buffer_size=10000, tau=1e-2):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn_a = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn \
            else DQN(env.observation_space.shape[0], env.action_space.n)
        self.dqn_b = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn \
            else DQN(env.observation_space.shape[0], env.action_space.n)
        self.optimizer_a = torch.optim.Adam(self.dqn_a.parameters())
        self.optimizer_b = torch.optim.Adam(self.dqn_b.parameters())
        self.dqn_loss = torch.nn.MSELoss()

        # Start dqn_b as an exact copy of dqn_a
        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data)

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        # Q_a(s, a) for the actions taken
        curr_Q = self.dqn_a.forward(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        print("curr_Q: " + str(curr_Q))

        # Double DQN: dqn_a picks the greedy next action, dqn_b evaluates it
        next_Q = self.dqn_a.forward(next_states)
        best_actions = torch.max(next_Q, 1)[1]
        # print("next_Q" + str(next_Q))
        print("best actions: " + str(best_actions))

        dqn_b_Q = self.dqn_b.forward(next_states)
        # Detach so the target does not propagate gradients into dqn_b
        max_next_Q = dqn_b_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1).detach()
        print("max_next_Q: " + str(max_next_Q))
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q
        # print(expected_Q)

        self.optimizer_a.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.optimizer_a.step()

        # Soft (Polyak) update of dqn_b towards dqn_a
        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data * self.tau + param_b.data * (1.0 - self.tau))

        # update dqn_a by chance
        """
        if(np.random.uniform() < 0.5):
            # curr_Q = self.dqn_a.forward(states)
            curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
            next_Q = self.dqn_a.forward(next_states)
            best_actions = torch.max(next_Q, 1)[1]
            print("next_Q" + str(next_Q))
            print("best actions: " + str(best_actions))
            dqn_b_Q = self.dqn_b.forward(next_states)
            max_next_Q = dqn_b_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            print("max_next_Q: " + str(max_next_Q))
            expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q
            self.optimizer_a.zero_grad()
            loss = self.dqn_loss(curr_Q, expected_Q)
            loss.backward()
            self.optimizer_a.step()
        """
        # update dqn_b
        """
        else:
            curr_Q = self.dqn_b.forward(states)
            curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
            next_Q = self.dqn_b.forward(next_states)
            best_actions = torch.max(next_Q, 1)[1].detach()
            #print("next_Q" + str(next_Q))
            #print("best actions: " + str(best_actions))
            dqn_a_Q = self.dqn_a.forward(next_states)
            max_next_Q = dqn_a_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q
            self.optimizer_b.zero_grad()
            loss = self.dqn_loss(curr_Q, expected_Q)
            loss.backward()
            self.optimizer_b.step()
        """
        return loss

    def max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        qvals = self.dqn_a.forward(state)
        action = np.argmax(qvals.detach().numpy())
        # if np.random.uniform() < 0.2:
        #     return self.env.action_space.sample()
        return action

    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        for episodes in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break
                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    # self.adjust_temperature(loss)
        return episode_rewards, loss

    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    break
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn_a.state_dict(), PATH)
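# Hedged sketch: the Polyak averaging done inline in Agent.update_model, factored into a
# standalone helper. The name soft_update and the torch.no_grad() wrapper are choices made
# here, not part of the original class.
import torch


def soft_update(target_net, source_net, tau):
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
            # target <- tau * source + (1 - tau) * target
            target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)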
import os

import gym
import numpy as np
import torch
import torch.optim as optim


def train():
    if conf.env_module == "img":
        env = make_atari(conf.env_name)
        env = bench.Monitor(env, os.path.join(conf.path_game_scan, conf.env_name))
        env = wrap_deepmind(env, episode_life=True, clip_rewards=True,
                            frame_stack=False, scale=True)
        env = WrapPyTorch(env)
        model = CnnDQN(env, device)
        target_model = CnnDQN(env, device)
    else:
        env = gym.make(conf.env_name)
        # Instantiate the fully connected networks
        model = DQN(env, device)
        target_model = DQN(env, device)

    target_model.load_state_dict(model.state_dict())
    model, target_model = model.to(device), target_model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=conf.lr)
    replay_buffer = ReplayBuffer(conf.buffer_size)

    # Compute the TD loss on a sampled minibatch and take one optimizer step
    def cal_td_loss(model, batch_size):
        s, a, r, s_, d = replay_buffer.sample(batch_size)

        s = torch.tensor(np.float32(s), dtype=torch.float).to(device)
        s_ = torch.tensor(np.float32(s_), dtype=torch.float).to(device)
        a = torch.tensor(a, dtype=torch.long).to(device)
        r = torch.tensor(r, dtype=torch.float).to(device)
        d = torch.tensor(d, dtype=torch.float).to(device)

        q_value = model(s).gather(1, a.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_value = target_model(s_).max(1)[0]
            expected_q_value = r + conf.gamma * next_q_value * (1 - d)
        expected_q_value = expected_q_value.to(device)

        loss = (q_value - expected_q_value).pow(2).mean()

        optimizer.zero_grad()
        loss.backward()
        # Clip gradients element-wise to [-1, 1]
        for param in model.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()
        return loss

    episode_reward = 0
    losses = []
    all_rewards = []
    state = env.reset()  # (1, 84, 84)

    for frame_idx in range(1, conf.num_frames + 1):
        epsilon = conf.epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > conf.batch_size:
            loss = cal_td_loss(model, conf.batch_size)
            losses.append(loss.item())

        if frame_idx % conf.target_upfreq == 0:
            target_model.load_state_dict(model.state_dict())

        if frame_idx % conf.log_freq == 0:
            print("frame: {}, loss: {}, reward: {}.".format(
                frame_idx, loss, episode_reward))
            if conf.save_curve:
                curve_name = "res_" + conf.exp_name + ".png"
                curve_path = os.path.join(conf.path_plot, curve_name)
                curve_plot(curve_path, frame_idx, all_rewards, losses)
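# Hedged sketch of an epsilon schedule matching the conf.epsilon_by_frame(frame_idx) call in
# train(), which is referenced but not defined above. The start/final/decay constants are
# assumptions; the actual config may use different values or a different decay shape.
import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000


def epsilon_by_frame(frame_idx):
    # Exponential decay from epsilon_start towards epsilon_final
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)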