class Agent:
    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn \
            else DQN(env.observation_space.shape[0], env.action_space.n)
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters(), lr=learning_rate)
        self.dqn_loss = torch.nn.MSELoss()

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards).view(-1)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones).view(-1)

        # Q(s, a) for the actions actually taken
        curr_Q = self.dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # bootstrap target: no gradient flows through it and terminal states are masked out
        with torch.no_grad():
            max_next_Q = self.dqn(next_states).max(1)[0]
            expected_Q = rewards + self.gamma * max_next_Q * (1 - dones)

        self.dqn_optimizer.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.dqn_optimizer.step()
        return loss.item()

    def max_action(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0)
            qvals = self.dqn(state)
        return int(np.argmax(qvals.numpy()))

    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        for episode in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break
                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    # self.adjust_temperature(loss)
        return episode_rewards, loss

    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episode in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)  # was env.step: use the agent's env
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    break
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn.state_dict(), PATH)
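# The Agent above assumes a ReplayBuffer with push/sample/__len__ that is not shown here.
# Below is a minimal sketch of such a buffer; the class name and method signatures simply
# mirror how the Agent calls it, so treat this as an illustrative assumption rather than
# the project's actual buffer.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        # store one transition; the oldest transitions are evicted once capacity is reached
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # dones are returned as floats so the (1 - dones) mask above works directly
        return (np.stack(states), np.array(actions), np.array(rewards, dtype=np.float32),
                np.stack(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)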
q_values = []
values = []
for step in range(args.num_steps):
    state = torch.FloatTensor(state).unsqueeze(0).cuda()
    policy, q_value, value = model(state)
    # print(policy, q_value)

    # sample an action from the current policy
    action = policy.multinomial(1)
    next_state, reward, done, _ = env.step(action.item())
    step_count += 1

    reward = torch.FloatTensor([reward]).unsqueeze(1).cuda()
    mask = torch.FloatTensor(1 - np.float32([done])).unsqueeze(1).cuda()
    replay_buffer.push(state.detach(), action, reward, policy.detach(), mask, done)

    policies.append(policy)
    actions.append(action)
    rewards.append(reward)
    masks.append(mask)
    q_values.append(q_value)
    values.append(value)

    state = next_state
    if done:
        state = env.reset()
        episode_count += 1

# bootstrap from the final state; only the value head is needed for the retrace target
next_state = torch.FloatTensor(state).unsqueeze(0).cuda()
_, _, retrace = model(next_state)
done = False
rewards = []
while not done:
    if episode % 100 < 5:
        env.render()

    # action selection needs no gradients, but experience_replay() below does,
    # so only the forward pass is wrapped in no_grad
    with torch.no_grad():
        action = get_action(state)
    next_state, reward, done, _ = env.step(action)
    rewards.append(reward)

    next_state = torch.tensor(next_state).float()
    mem.push((state, action, reward, next_state))
    state = next_state

    if mem.is_full:
        experience_replay()

epsilon_history.append(epsilon)
if epsilon > min_epsilon:
    epsilon *= 0.99

# print(f'Episode {episode}: {sum(rewards)}')
sum_rewards.append(sum(rewards))
if episode % 10 == 9:
    fig, (ax0, ax1) = plt.subplots(2)
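# get_action() used above is not defined in this excerpt. Below is a minimal epsilon-greedy
# sketch; `policy_net`, `env`, and the global `epsilon` are assumed names based on how the
# surrounding loop reads, not the project's actual definitions.
import numpy as np
import torch


def get_action(state):
    # explore with probability epsilon, otherwise act greedily w.r.t. the Q-network
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        q_values = policy_net(state)
    return int(q_values.argmax(dim=1).item())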
def train(args, env):
    model = CategoricalDQN(env.observation_space.shape[0], env.action_space.n, args)
    model_target = CategoricalDQN(env.observation_space.shape[0], env.action_space.n, args)
    update_target(model, model_target)

    replay_buffer = ReplayBuffer(args.memory_capacity)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    def project_dist(next_state, rewards, dones):
        # project the Bellman-updated support back onto the fixed atoms
        delta_z = float(args.vmax - args.vmin) / (args.atom - 1)
        support = torch.linspace(args.vmin, args.vmax, args.atom)

        next_dist = model_target(next_state).data.cpu() * support
        next_action = next_dist.sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.atom)
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        Tz = rewards + (1 - dones) * args.discount * support
        Tz = Tz.clamp(min=args.vmin, max=args.vmax)
        b = (Tz - args.vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()

        offset = torch.linspace(0, (args.batch_size - 1) * args.atom, args.batch_size).long()\
            .unsqueeze(1).expand(args.batch_size, args.atom)

        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                      (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                      (next_dist * (b - l.float())).view(-1))
        return proj_dist

    def compute_td_loss():
        s0, a, r, s1, done = replay_buffer.sample(args.batch_size)
        s0 = torch.FloatTensor(s0)
        a = torch.LongTensor(a)
        r = torch.FloatTensor(r)

        # the projected target distribution carries no gradient
        with torch.no_grad():
            s1 = torch.FloatTensor(s1)
            done = torch.FloatTensor(np.float32(done))
            proj_dist = project_dist(s1, r, done)

        dist = model(s0)
        action = a.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.atom)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)
        loss = -(proj_dist * dist.log()).sum(1).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()

    losses = []
    all_rewards = []
    episode_reward = 0
    state = env.reset()

    for i in range(args.max_episode_length):
        action = model.act(state)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > args.batch_size:
            loss = compute_td_loss()
            losses.append(loss)

        if i > 0 and i % args.learn_start == 0:
            print(np.mean(all_rewards[-10:]), losses[-1])

        if i % args.target_update == 0:
            update_target(model, model_target)
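# train() above calls update_target(model, model_target), but the helper itself is not shown.
# A hard parameter copy, as sketched below, is the usual choice for a periodic target update;
# this is an assumption about its behaviour, not the project's actual implementation.
def update_target(model, model_target):
    # overwrite the target network with the online network's current weights
    model_target.load_state_dict(model.state_dict())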
def main():
    parser = argparse.ArgumentParser(description='PlaNet for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=4)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=1000)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('-H', '--horizon', type=int, default=12)
    parser.add_argument('-I', '--N-iterations', type=int, default=10)
    parser.add_argument('-J', '--N-candidates', type=int, default=1000)
    parser.add_argument('-K', '--N-top-candidates', type=int, default=100)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # prepare logging
    log_dir = os.path.join(args.log_dir, args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness remains, e.g. cuDNN's nondeterminism)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name, args.task_name, task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env, render_kwargs={'height': 64, 'width': 64, 'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim,
                                    env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim, args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    all_params = (list(encoder.parameters()) +
                  list(rssm.parameters()) +
                  list(obs_model.parameters()) +
                  list(reward_model.parameters()))
    optimizer = Adam(all_params, lr=args.lr, eps=args.eps)

    # collect initial experience with random actions
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # collect experiences
        start = time.time()
        cem_agent = CEMAgent(encoder, rssm, reward_model,
                             args.horizon, args.N_iterations,
                             args.N_candidates, args.N_top_candidates)

        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = cem_agent(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward

        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f'
              % (episode + 1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update model parameters
        start = time.time()
        for update_step in range(args.collect_interval):
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensors for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(observations.reshape(
                -1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare tensors to hold the state sequence and rnn hidden state sequence
            states = torch.zeros(args.chunk_length, args.batch_size,
                                 args.state_dim, device=device)
            rnn_hiddens = torch.zeros(args.chunk_length, args.batch_size,
                                      args.rnn_hidden_dim, device=device)

            # initialize state and rnn hidden state with zero vectors
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size, args.rnn_hidden_dim, device=device)

            # compute state and rnn hidden sequences and the KL loss
            kl_loss = 0
            for l in range(args.chunk_length - 1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l + 1])
                state = next_state_posterior.rsample()
                states[l + 1] = state
                rnn_hiddens[l + 1] = rnn_hidden
                kl = kl_divergence(next_state_prior, next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length, args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length, args.batch_size, 1)

            # compute losses for observation and reward
            obs_loss = 0.5 * mse_loss(recon_observations[1:], observations[1:],
                                      reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards[1:], rewards[:-1])

            # add all losses and update model parameters with gradient descent
            loss = kl_loss + obs_loss + reward_loss
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(all_params, args.clip_grad_norm)
            optimizer.step()

            # print losses and add to tensorboard
            print('update_step: %3d loss: %.5f, kl_loss: %.5f, obs_loss: %.5f, reward_loss: %.5f'
                  % (update_step + 1, loss.item(), kl_loss.item(),
                     obs_loss.item(), reward_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('overall loss', loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(), total_update_step)
        print('elapsed time for update: %.2fs' % (time.time() - start))

        # test to get a score without exploration noise
        if (episode + 1) % args.test_interval == 0:
            start = time.time()
            cem_agent = CEMAgent(encoder, rssm, reward_model,
                                 args.horizon, args.N_iterations,
                                 args.N_candidates, args.N_top_candidates)
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = cem_agent(obs)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f'
                  % (episode + 1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(), os.path.join(log_dir, 'reward_model.pth'))
    writer.close()
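# The update loop above calls preprocess_obs() before feeding pixels to the encoder, but its
# definition is not included here. A common choice for pixel-based world models is to rescale
# uint8 images to roughly [-0.5, 0.5] and dequantize with a little uniform noise, as sketched
# below; treat this as an assumed stand-in, not the repository's actual function.
import numpy as np


def preprocess_obs(obs, bit_depth=5):
    # reduce bit depth, center around zero, and add uniform dequantization noise
    obs = obs.astype(np.float32)
    obs = np.floor(obs / 2 ** (8 - bit_depth)) / 2 ** bit_depth - 0.5
    obs += np.random.uniform(0.0, 1.0 / 2 ** bit_depth, size=obs.shape).astype(np.float32)
    return obs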
class Agent:
    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99,
                 buffer_size=10000, tau=1e-2):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(buffer_size)

        self.dqn_a = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn \
            else DQN(env.observation_space.shape[0], env.action_space.n)
        self.dqn_b = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn \
            else DQN(env.observation_space.shape[0], env.action_space.n)
        self.optimizer_a = torch.optim.Adam(self.dqn_a.parameters())
        self.optimizer_b = torch.optim.Adam(self.dqn_b.parameters())
        self.dqn_loss = torch.nn.MSELoss()

        # start the target network (dqn_b) as an exact copy of the online network (dqn_a)
        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data)

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards).view(-1)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones).view(-1)

        # Q(s, a) from the online network
        curr_Q = self.dqn_a(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Double DQN target: select the greedy action with the online network,
        # evaluate it with the target network; no gradient through the target
        with torch.no_grad():
            best_actions = self.dqn_a(next_states).max(1)[1]
            max_next_Q = self.dqn_b(next_states).gather(1, best_actions.unsqueeze(1)).squeeze(1)
            expected_Q = rewards + self.gamma * max_next_Q * (1 - dones)

        self.optimizer_a.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.optimizer_a.step()

        # Polyak (soft) update of the target network
        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data * self.tau + param_b.data * (1.0 - self.tau))

        # (The original file also kept a commented-out variant that skipped the soft update
        # and instead updated dqn_a or dqn_b at random with probability 0.5 each, always
        # using the other network to evaluate the greedy action.)
        return loss.item()

    def max_action(self, state):
        with torch.no_grad():
            state = torch.from_numpy(state).float().unsqueeze(0)
            qvals = self.dqn_a(state)
        # if np.random.uniform() < 0.2:
        #     return self.env.action_space.sample()
        return int(np.argmax(qvals.numpy()))

    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        for episode in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break
                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    # self.adjust_temperature(loss)
        return episode_rewards, loss

    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episode in range(max_episodes):
            state = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)  # was env.step: use the agent's env
                state = next_state
                episode_reward += reward
                if done:
                    episode_rewards.append(episode_reward)
                    break
        return episode_rewards

    def save_model(self, PATH):
        # the online network is dqn_a; this class has no self.dqn attribute
        torch.save(self.dqn_a.state_dict(), PATH)
optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate)

n_episode = 1
episode_return = 0
best_return = 0
returns = []
state = env.reset()

for i in count():
    # env.render()
    eps = get_epsilon(i)
    action = select_action(state, current_net, eps, number_action=number_actions)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    episode_return += reward
    state = next_state

    # perform one optimization step on the policy network once learning has started
    if i > learning_starts:
        memory_batch = replay_buffer.sample(batch_size)
        loss = optimize_model(optimizer, current_net, target_net, memory_batch)
    else:
        loss = 0

    # episode has ended
    if done:
        returns.append(episode_return)
        print('episode {}, frame {}, return {}, loss {:.6f}, eps {:.6f}'.format(
            n_episode, i, episode_return, loss, eps))
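# get_epsilon(i) above is not defined in this excerpt. A simple linearly decaying schedule is
# sketched below; the constants (start, end, decay horizon) and the default values are
# illustrative assumptions, not the project's actual settings.
def get_epsilon(step, eps_start=1.0, eps_end=0.05, decay_steps=100000):
    # anneal epsilon linearly from eps_start to eps_end over decay_steps frames
    fraction = min(1.0, step / decay_steps)
    return eps_start + fraction * (eps_end - eps_start)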
def main():
    parser = argparse.ArgumentParser(description='Dreamer for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('-H', '--imagination-horizon', type=int, default=15)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lambda_', type=float, default=0.95)
    parser.add_argument('--model_lr', type=float, default=6e-4)
    parser.add_argument('--value_lr', type=float, default=8e-5)
    parser.add_argument('--action_lr', type=float, default=8e-5)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=100)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # prepare logging
    log_dir = os.path.join(args.log_dir, args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness remains, e.g. cuDNN's nondeterminism)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name, args.task_name, task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env, render_kwargs={'height': 64, 'width': 64, 'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define world model and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim,
                                    env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim, args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    model_params = (list(encoder.parameters()) +
                    list(rssm.parameters()) +
                    list(obs_model.parameters()) +
                    list(reward_model.parameters()))
    model_optimizer = Adam(model_params, lr=args.model_lr, eps=args.eps)

    # define value model, action model, and their optimizers
    value_model = ValueModel(args.state_dim, args.rnn_hidden_dim).to(device)
    action_model = ActionModel(args.state_dim, args.rnn_hidden_dim,
                               env.action_space.shape[0]).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=args.value_lr, eps=args.eps)
    action_optimizer = Adam(action_model.parameters(), lr=args.action_lr, eps=args.eps)

    # collect seed episodes with random actions
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # -----------------------------
        # collect experiences
        # -----------------------------
        start = time.time()
        policy = Agent(encoder, rssm, action_model)

        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = policy(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward

        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f'
              % (episode + 1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update parameters of the world model, value model, and action model
        start = time.time()
        for update_step in range(args.collect_interval):
            # ---------------------------------------------------------------
            # update world model (encoder, rssm, obs_model, reward_model)
            # ---------------------------------------------------------------
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensors for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(
                observations.reshape(-1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare tensors to hold the state sequence and rnn hidden state sequence
            states = torch.zeros(args.chunk_length, args.batch_size,
                                 args.state_dim, device=device)
            rnn_hiddens = torch.zeros(args.chunk_length, args.batch_size,
                                      args.rnn_hidden_dim, device=device)

            # initialize state and rnn hidden state with zero vectors
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size, args.rnn_hidden_dim, device=device)

            # compute state and rnn hidden sequences and the KL loss
            kl_loss = 0
            for l in range(args.chunk_length - 1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l + 1])
                state = next_state_posterior.rsample()
                states[l + 1] = state
                rnn_hiddens[l + 1] = rnn_hidden
                kl = kl_divergence(next_state_prior, next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # states[0] and rnn_hiddens[0] are always 0 and carry no information
            states = states[1:]
            rnn_hiddens = rnn_hiddens[1:]

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length - 1, args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length - 1, args.batch_size, 1)

            # compute losses for observation and reward
            obs_loss = 0.5 * mse_loss(recon_observations, observations[1:],
                                      reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards, rewards[:-1])

            # add all losses and update world model parameters with gradient descent
            model_loss = kl_loss + obs_loss + reward_loss
            model_optimizer.zero_grad()
            model_loss.backward()
            clip_grad_norm_(model_params, args.clip_grad_norm)
            model_optimizer.step()

            # ----------------------------------------------
            # update value_model and action_model
            # ----------------------------------------------
            # detach gradients because Dreamer doesn't update the world model with the actor-critic loss
            flatten_states = flatten_states.detach()
            flatten_rnn_hiddens = flatten_rnn_hiddens.detach()

            # prepare tensors to hold the imagined trajectory's states and rnn hidden states
            imaginated_states = torch.zeros(args.imagination_horizon + 1,
                                            *flatten_states.shape,
                                            device=flatten_states.device)
            imaginated_rnn_hiddens = torch.zeros(args.imagination_horizon + 1,
                                                 *flatten_rnn_hiddens.shape,
                                                 device=flatten_rnn_hiddens.device)
            imaginated_states[0] = flatten_states
            imaginated_rnn_hiddens[0] = flatten_rnn_hiddens

            # compute the imagined trajectory using actions from action_model
            for h in range(1, args.imagination_horizon + 1):
                actions = action_model(flatten_states, flatten_rnn_hiddens)
                flatten_states_prior, flatten_rnn_hiddens = rssm.prior(flatten_states,
                                                                       actions,
                                                                       flatten_rnn_hiddens)
                flatten_states = flatten_states_prior.rsample()
                imaginated_states[h] = flatten_states
                imaginated_rnn_hiddens[h] = flatten_rnn_hiddens

            # compute rewards and values for the imagined states and rnn hidden states
            flatten_imaginated_states = imaginated_states.view(-1, args.state_dim)
            flatten_imaginated_rnn_hiddens = imaginated_rnn_hiddens.view(-1, args.rnn_hidden_dim)
            imaginated_rewards = \
                reward_model(flatten_imaginated_states,
                             flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)
            imaginated_values = \
                value_model(flatten_imaginated_states,
                            flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)

            # compute lambda target
            lambda_target_values = lambda_target(imaginated_rewards, imaginated_values,
                                                 args.gamma, args.lambda_)

            # update value model
            value_loss = 0.5 * mse_loss(imaginated_values, lambda_target_values.detach())
            value_optimizer.zero_grad()
            value_loss.backward(retain_graph=True)
            clip_grad_norm_(value_model.parameters(), args.clip_grad_norm)
            value_optimizer.step()

            # update action model (multiply by -1 for gradient ascent)
            action_loss = -1 * (lambda_target_values.mean())
            action_optimizer.zero_grad()
            action_loss.backward()
            clip_grad_norm_(action_model.parameters(), args.clip_grad_norm)
            action_optimizer.step()

            # print losses and add to tensorboard
            print('update_step: %3d model loss: %.5f, kl_loss: %.5f, '
                  'obs_loss: %.5f, reward_loss: %.5f, '
                  'value_loss: %.5f action_loss: %.5f'
                  % (update_step + 1, model_loss.item(), kl_loss.item(),
                     obs_loss.item(), reward_loss.item(),
                     value_loss.item(), action_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('model loss', model_loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(), total_update_step)
            writer.add_scalar('value loss', value_loss.item(), total_update_step)
            writer.add_scalar('action loss', action_loss.item(), total_update_step)
        print('elapsed time for update: %.2fs' % (time.time() - start))

        # ----------------------------------------------
        # evaluation without exploration noise
        # ----------------------------------------------
        if (episode + 1) % args.test_interval == 0:
            policy = Agent(encoder, rssm, action_model)
            start = time.time()
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = policy(obs, training=False)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f'
                  % (episode + 1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(), os.path.join(log_dir, 'reward_model.pth'))
    torch.save(value_model.state_dict(), os.path.join(log_dir, 'value_model.pth'))
    torch.save(action_model.state_dict(), os.path.join(log_dir, 'action_model.pth'))
    writer.close()
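# lambda_target() used in the actor-critic update above is not shown. Below is a sketch of the
# standard lambda-return recursion under the assumption that both inputs have shape
# (imagination_horizon + 1, batch); the exact formulation in the repository may differ slightly.
import torch


def lambda_target(rewards, values, gamma, lambda_):
    # rewards, values: tensors of shape (horizon + 1, batch)
    horizon = rewards.shape[0] - 1
    returns = torch.zeros_like(values)
    returns[-1] = values[-1]  # bootstrap from the last imagined value
    for t in reversed(range(horizon)):
        # blend the one-step bootstrap with the recursively computed lambda-return
        returns[t] = rewards[t] + gamma * ((1 - lambda_) * values[t + 1]
                                           + lambda_ * returns[t + 1])
    return returns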
def train():
    if conf.env_module == "img":
        env = make_atari(conf.env_name)
        env = bench.Monitor(env, os.path.join(conf.path_game_scan, conf.env_name))
        env = wrap_deepmind(env, episode_life=True, clip_rewards=True,
                            frame_stack=False, scale=True)
        env = WrapPyTorch(env)
        model = CnnDQN(env, device)
        target_model = CnnDQN(env, device)
    else:
        env = gym.make(conf.env_name)
        # instantiate the fully connected networks
        model = DQN(env, device)
        target_model = DQN(env, device)

    target_model.load_state_dict(model.state_dict())
    model, target_model = model.to(device), target_model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=conf.lr)
    replay_buffer = ReplayBuffer(conf.buffer_size)

    # compute the TD loss and take one optimization step
    def cal_td_loss(model, batch_size):
        s, a, r, s_, d = replay_buffer.sample(batch_size)
        s = torch.tensor(np.float32(s), dtype=torch.float).to(device)
        s_ = torch.tensor(np.float32(s_), dtype=torch.float).to(device)
        a = torch.tensor(a, dtype=torch.long).to(device)
        r = torch.tensor(r, dtype=torch.float).to(device)
        d = torch.tensor(d, dtype=torch.float).to(device)

        q_value = model(s).gather(1, a.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_value = target_model(s_).max(1)[0]
            expected_q_value = r + conf.gamma * next_q_value * (1 - d)

        loss = (q_value - expected_q_value).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        for param in model.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()
        return loss

    episode_reward = 0
    losses = []
    all_rewards = []
    state = env.reset()  # (1, 84, 84) for the Atari branch

    for frame_idx in range(1, conf.num_frames + 1):
        epsilon = conf.epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > conf.batch_size:
            loss = cal_td_loss(model, conf.batch_size)
            losses.append(loss.item())

        if frame_idx % conf.target_upfreq == 0:
            target_model.load_state_dict(model.state_dict())

        if frame_idx % conf.log_freq == 0:
            print("frame: {}, loss: {}, reward: {}.".format(frame_idx, loss, episode_reward))
            if conf.save_curve:
                curve_name = "res_" + conf.exp_name + ".png"
                curve_path = os.path.join(conf.path_plot, curve_name)
                curve_plot(curve_path, frame_idx, all_rewards, losses)
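# conf.epsilon_by_frame above is part of a config object that is not included here. It is
# typically an exponentially decaying exploration schedule like the sketch below; the function
# name, defaults, and constants are illustrative assumptions, not the project's actual config.
import math


def epsilon_by_frame(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=30000):
    # decay epsilon exponentially from eps_start toward eps_final
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)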