class Agent:

    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters(), lr=self.learning_rate)
        self.dqn_loss = torch.nn.MSELoss()

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)
        
        curr_Q = self.dqn(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        # detach the bootstrap target so gradients do not flow through it
        next_Q = self.dqn(next_states).detach()
        max_next_Q = torch.max(next_Q, 1)[0]
        # flatten to (batch,) and zero out the bootstrap term for terminal transitions
        expected_Q = rewards.view(-1) + self.gamma * max_next_Q * (1 - dones.view(-1))

        self.dqn_optimizer.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.dqn_optimizer.step()
        
        return loss.item()

    def max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.dqn(state)
        action = int(torch.argmax(qvals, dim=1).item())
  
        return action
      
    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break

                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    #self.adjust_temperature(loss)
                
        # return episode_rewards, loss
                  
    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward

                if done:
                    episode_rewards.append(episode_reward)
                    break
                  
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn.state_dict(), PATH)
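
The Agent above assumes a ReplayBuffer class (not shown here) that exposes push, sample, and __len__. A minimal sketch compatible with that interface, not the original implementation, might look like this:

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Minimal FIFO replay buffer matching the push/sample/__len__ calls used above."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.stack(states), np.array(actions), np.array(rewards, dtype=np.float32),
                np.stack(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)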
Example #2
    q_values = []
    values = []

    for step in range(args.num_steps):

        state = torch.FloatTensor(state).unsqueeze(0).cuda()
        policy, q_value, value = model(state)
        # print(policy, q_value)

        action = policy.multinomial(1)
        next_state, reward, done, _ = env.step(action.item())
        step_count += 1

        reward = torch.FloatTensor([reward]).unsqueeze(1).cuda()
        mask = torch.FloatTensor(1 - np.float32([done])).unsqueeze(1).cuda()
        replay_buffer.push(state.detach(), action, reward, policy.detach(),
                           mask, done)

        policies.append(policy)
        actions.append(action)
        rewards.append(reward)
        masks.append(mask)
        q_values.append(q_value)
        values.append(value)

        state = next_state
        if done:
            state = env.reset()
            episode_count += 1

    next_state = torch.FloatTensor(state).unsqueeze(0).cuda()
    _, _, retrace = model(next_state)
Example #3
    done = False

    rewards = []

    with torch.no_grad():
        while not done:
            if episode % 100 < 5:
                env.render()

            action = get_action(state)
            next_state, reward, done, _ = env.step(action)

            rewards.append(reward)
            next_state = torch.tensor(next_state).float()

            mem.push((state, action, reward, next_state))

            state = next_state

    if mem.is_full:
        experience_replay()

    epsilon_history.append(epsilon)
    if epsilon > min_epsilon:
        epsilon *= 0.99

    # print(f'Episode {episode}: {sum(rewards)}')
    sum_rewards.append(sum(rewards))

    if episode % 10 == 9:
        fig, (ax0, ax1) = plt.subplots(2)
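
This fragment relies on a get_action helper and an epsilon value defined elsewhere in the file. A plausible epsilon-greedy sketch consistent with the get_action(state) call above (policy_net and n_actions are assumptions, not part of the original snippet) is:

import random

import torch

def get_action(state, n_actions=2):
    # Epsilon-greedy over the module-level policy_net / epsilon used by the surrounding loop;
    # policy_net and n_actions are assumptions, not part of the original snippet.
    if random.random() < epsilon:
        return random.randrange(n_actions)
    with torch.no_grad():
        q_values = policy_net(state.unsqueeze(0))
    return int(q_values.argmax(dim=1).item())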
Example #4
def train(args, env):
	model = CategoricalDQN(env.observation_space.shape[0], env.action_space.n, args)
	model_target = CategoricalDQN(env.observation_space.shape[0], env.action_space.n, args)
	update_target(model, model_target)

	replay_buffer = ReplayBuffer(args.memory_capacity)

	optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

	def project_dist(next_state, rewards, dones):
		delta_z = float(args.vmax - args.vmin) / (args.atom - 1)
		support = torch.linspace(args.vmin, args.vmax, args.atom)

		next_dist = model_target(next_state).data.cpu() * support
		next_action = next_dist.sum(2).max(1)[1]
		next_action = next_action.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.atom)
		next_dist = next_dist.gather(1, next_action).squeeze(1)

		rewards = rewards.unsqueeze(1).expand_as(next_dist)
		dones = dones.unsqueeze(1).expand_as(next_dist)
		support = support.unsqueeze(0).expand_as(next_dist)

		Tz = rewards + (1 - dones) * args.discount * support
		Tz = Tz.clamp(min=args.vmin, max=args.vmax)
		b = (Tz - args.vmin) / delta_z
		l = b.floor().long()
		u = b.ceil().long()

		offset = torch.linspace(0, (args.batch_size - 1) * args.atom, args.batch_size).long()\
					.unsqueeze(1).expand(args.batch_size, args.atom)

		proj_dist = torch.zeros(next_dist.size())
		proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
		proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))

		return proj_dist

	def compute_td_loss():
		s0, a, r, s1, done = replay_buffer.sample(args.batch_size)

		s0 = torch.FloatTensor(s0)
		a = torch.LongTensor(a)
		r = torch.FloatTensor(r)
		with torch.no_grad():
			s1 = torch.FloatTensor(s1)
		done = torch.FloatTensor(np.float32(done))

		proj_dist = project_dist(s1, r, done)

		dist = model(s0)
		action = a.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.atom)
		dist = dist.gather(1, action).squeeze(1)
		dist.data.clamp_(0.01, 0.99)
		loss = -(proj_dist * dist.log()).sum(1).mean()
		
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		return loss.item()

	losses = []
	all_rewards = []
	episode_reward = 0

	state = env.reset()

	for i in range(args.max_episode_length):
		action = model.act(state)

		next_state, reward, done, _ = env.step(action)

		replay_buffer.push(state, action, reward, next_state, done)
		
		state = next_state
		episode_reward += reward

		if done:
			state = env.reset()
			all_rewards.append(episode_reward)
			episode_reward = 0

		if len(replay_buffer) > args.batch_size:
			loss = compute_td_loss()
			losses.append(loss)
		
		if i > 0 and i % args.learn_start == 0:
			print(np.mean(all_rewards[-10:]), losses[-1])
		
		if i % args.target_update == 0:
			update_target(model, model_target)
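
A hypothetical way to invoke train(); the environment name and hyperparameter values below are illustrative assumptions, chosen only to match the args.* fields referenced above:

import gym
from argparse import Namespace

args = Namespace(memory_capacity=10000, learning_rate=1e-3, batch_size=32,
                 vmin=-10.0, vmax=10.0, atom=51, discount=0.99,
                 max_episode_length=100000, learn_start=1000, target_update=100)
env = gym.make('CartPole-v0')
train(args, env)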
示例#5
0
def main():
    parser = argparse.ArgumentParser(description='PlaNet for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=4)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=1000)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('-H', '--horizon', type=int, default=12)
    parser.add_argument('-I', '--N-iterations', type=int, default=10)
    parser.add_argument('-J', '--N-candidates', type=int, default=1000)
    parser.add_argument('-K', '--N-top-candidates', type=int, default=100)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # Prepare logging
    log_dir = os.path.join(args.log_dir,
                           args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness still remains, e.g. from cuDNN)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name,
                     args.task_name,
                     task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env,
                         render_kwargs={
                             'height': 64,
                             'width': 64,
                             'camera_id': 0
                         })
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim, env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim,
                                 args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    all_params = (list(encoder.parameters()) + list(rssm.parameters()) +
                  list(obs_model.parameters()) +
                  list(reward_model.parameters()))
    optimizer = Adam(all_params, lr=args.lr, eps=args.eps)

    # collect initial experience with random action
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # collect experiences
        start = time.time()
        cem_agent = CEMAgent(encoder, rssm, reward_model, args.horizon,
                             args.N_iterations, args.N_candidates,
                             args.N_top_candidates)

        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = cem_agent(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward

        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f' %
              (episode + 1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update model parameters
        start = time.time()
        for update_step in range(args.collect_interval):
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensor for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(observations.reshape(
                -1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare Tensor to maintain states sequence and rnn hidden states sequence
            states = torch.zeros(args.chunk_length,
                                 args.batch_size,
                                 args.state_dim,
                                 device=device)
            rnn_hiddens = torch.zeros(args.chunk_length,
                                      args.batch_size,
                                      args.rnn_hidden_dim,
                                      device=device)

            # initialize state and rnn hidden state with 0 vector
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size,
                                     args.rnn_hidden_dim,
                                     device=device)

            # compute state and rnn hidden sequences and kl loss
            kl_loss = 0
            for l in range(args.chunk_length - 1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l+1])
                state = next_state_posterior.rsample()
                states[l + 1] = state
                rnn_hiddens[l + 1] = rnn_hidden
                kl = kl_divergence(next_state_prior,
                                   next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states,
                                           flatten_rnn_hiddens).view(
                                               args.chunk_length,
                                               args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states,
                                             flatten_rnn_hiddens).view(
                                                 args.chunk_length,
                                                 args.batch_size, 1)

            # compute loss for observation and reward
            obs_loss = 0.5 * mse_loss(recon_observations[1:],
                                      observations[1:],
                                      reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards[1:], rewards[:-1])

            # add all losses and update model parameters with gradient descent
            loss = kl_loss + obs_loss + reward_loss
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(all_params, args.clip_grad_norm)
            optimizer.step()

            # print losses and add tensorboard
            print(
                'update_step: %3d loss: %.5f, kl_loss: %.5f, obs_loss: %.5f, reward_loss: % .5f'
                % (update_step + 1, loss.item(), kl_loss.item(),
                   obs_loss.item(), reward_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('overall loss', loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(),
                              total_update_step)

        print('elapsed time for update: %.2fs' % (time.time() - start))

        # test to get score without exploration noise
        if (episode + 1) % args.test_interval == 0:
            start = time.time()
            cem_agent = CEMAgent(encoder, rssm, reward_model, args.horizon,
                                 args.N_iterations, args.N_candidates,
                                 args.N_top_candidates)
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = cem_agent(obs)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f' %
                  (episode + 1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(),
               os.path.join(log_dir, 'reward_model.pth'))
    writer.close()
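
preprocess_obs is used above but not defined in this snippet. A plausible sketch that rescales uint8 pixel observations into a zero-centered float range (the exact normalization used by the original code is an assumption) is:

import numpy as np

def preprocess_obs(obs):
    # Map uint8 pixels from [0, 255] to [-0.5, 0.5]; the target range is an assumption.
    return obs.astype(np.float32) / 255.0 - 0.5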
Example #6
class Agent:

    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000, tau=1e-2):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(buffer_size)
        
        self.dqn_a = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.dqn_b = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.optimizer_a = torch.optim.Adam(self.dqn_a.parameters(), lr=self.learning_rate)
        self.optimizer_b = torch.optim.Adam(self.dqn_b.parameters(), lr=self.learning_rate)
        self.dqn_loss = torch.nn.MSELoss()

        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data)
     
    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_Q = self.dqn_a(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        # Double DQN: dqn_a selects the greedy next action, dqn_b evaluates it
        next_Q = self.dqn_a(next_states).detach()
        best_actions = torch.max(next_Q, 1)[1]
        dqn_b_Q = self.dqn_b(next_states).detach()
        max_next_Q = dqn_b_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)

        # flatten to (batch,) and zero out the bootstrap term for terminal transitions
        expected_Q = rewards.view(-1) + self.gamma * max_next_Q * (1 - dones.view(-1))
        #print(expected_Q)
        self.optimizer_a.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.optimizer_a.step()

        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data * self.tau + param_b.data * (1.0 - self.tau))

        return loss.item()


        #update dqn_a by chance
        """
        if(np.random.uniform() < 0.5): # 
            curr_Q = self.dqn_a.forward(states)
            curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
            next_Q = self.dqn_a.forward(next_states)
            best_actions = torch.max(next_Q, 1)[1]
            print("next_Q" + str(next_Q))
            print("best actions: " + str(best_actions))
            dqn_b_Q = self.dqn_b.forward(next_states)
            max_next_Q = dqn_b_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            print("max_next_Q: " + str(max_next_Q))
            expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q

            self.optimizer_a.zero_grad()
            loss = self.dqn_loss(curr_Q, expected_Q)
            loss.backward()
            self.optimizer_a.step()
        """
        # update dqn_b
        """
        else: 
            curr_Q = self.dqn_b.forward(states)
            curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
            next_Q = self.dqn_b.forward(next_states)
            best_actions = torch.max(next_Q, 1)[1].detach()
            #print("next_Q" + str(next_Q))
            #print("best actions: " + str(best_actions))
            dqn_a_Q = self.dqn_a.forward(next_states)
            max_next_Q = dqn_a_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q

            self.optimizer_b.zero_grad()
            loss = self.dqn_loss(curr_Q, expected_Q)
            loss.backward()
            self.optimizer_b.step()
        """    
    
    def max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            qvals = self.dqn_a(state)
        action = int(torch.argmax(qvals, dim=1).item())

        # if(np.random.uniform() < 0.2):
        #     return self.env.action_space.sample()
  
        return action
      
    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break

                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    #self.adjust_temperature(loss)
                
        # return episode_rewards, loss
                  
    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward

                if done:
                    episode_rewards.append(episode_reward)
                    break
                  
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn_a.state_dict(), PATH)
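
A hypothetical way to exercise this double-DQN Agent; the CartPole environment and the hyperparameter values are illustrative assumptions, not from the original:

import gym

env = gym.make('CartPole-v0')
agent = Agent(env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000, tau=1e-2)
agent.train(max_episodes=200, max_steps=500, batch_size=32)
agent.save_model('double_dqn_cartpole.pth')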
Example #7
optimizer = opt_algorithm(current_net.parameters(), lr=learning_rate)

n_episode = 1
episode_return = 0
best_return = 0
returns = []
state = env.reset()
for i in count():
    # env.render()
    eps = get_epsilon(i)
    action = select_action(state,
                           current_net,
                           eps,
                           number_action=number_actions)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)
    episode_return += reward
    state = next_state

    # Perform one step optimization (on policy network)
    if i > learning_starts:
        memory_batch = replay_buffer.sample(batch_size)
        loss = optimize_model(optimizer, current_net, target_net, memory_batch)
    else:
        loss = 0

    # This episode has ended
    if done:
        returns.append(episode_return)
        print(
            'episode {}, frame {}, return {}, loss {:.6f}, eps {:.6f}'.format(
                n_episode, i, episode_return, loss, eps))
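
The loop above calls a get_epsilon schedule that is not included in the snippet. A plausible exponential-decay sketch (the start, end, and decay constants are assumptions) is:

import math

def get_epsilon(frame_idx, eps_start=1.0, eps_end=0.01, eps_decay=30000):
    # Decay epsilon exponentially from eps_start toward eps_end over roughly eps_decay frames.
    return eps_end + (eps_start - eps_end) * math.exp(-frame_idx / eps_decay)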
Example #8
def main():
    parser = argparse.ArgumentParser(description='Dreamer for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('-H', '--imagination-horizon', type=int, default=15)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lambda_', type=float, default=0.95)
    parser.add_argument('--model_lr', type=float, default=6e-4)
    parser.add_argument('--value_lr', type=float, default=8e-5)
    parser.add_argument('--action_lr', type=float, default=8e-5)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=100)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # Prepare logging
    log_dir = os.path.join(args.log_dir, args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness still remains, e.g. from cuDNN)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name, args.task_name, task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env, render_kwargs={'height': 64,
                                             'width': 64,
                                             'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim,
                                    env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim, args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    model_params = (list(encoder.parameters()) +
                    list(rssm.parameters()) +
                    list(obs_model.parameters()) +
                    list(reward_model.parameters()))
    model_optimizer = Adam(model_params, lr=args.model_lr, eps=args.eps)

    # define value model and action model and optimizer
    value_model = ValueModel(args.state_dim, args.rnn_hidden_dim).to(device)
    action_model = ActionModel(args.state_dim, args.rnn_hidden_dim,
                               env.action_space.shape[0]).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=args.value_lr, eps=args.eps)
    action_optimizer = Adam(action_model.parameters(), lr=args.action_lr, eps=args.eps)

    # collect seed episodes with random action
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # -----------------------------
        #      collect experiences
        # -----------------------------
        start = time.time()
        policy = Agent(encoder, rssm, action_model)

        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = policy(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward

        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f' %
              (episode+1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update parameters of model, value model, action model
        start = time.time()
        for update_step in range(args.collect_interval):
            # ---------------------------------------------------------------
            #      update model (encoder, rssm, obs_model, reward_model)
            # ---------------------------------------------------------------
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensor for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(
                observations.reshape(-1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare Tensor to maintain states sequence and rnn hidden states sequence
            states = torch.zeros(
                args.chunk_length, args.batch_size, args.state_dim, device=device)
            rnn_hiddens = torch.zeros(
                args.chunk_length, args.batch_size, args.rnn_hidden_dim, device=device)

            # initialize state and rnn hidden state with 0 vector
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size, args.rnn_hidden_dim, device=device)

            # compute state and rnn hidden sequences and kl loss
            kl_loss = 0
            for l in range(args.chunk_length-1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l+1])
                state = next_state_posterior.rsample()
                states[l+1] = state
                rnn_hiddens[l+1] = rnn_hidden
                kl = kl_divergence(next_state_prior, next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # states[0] and rnn_hiddens[0] are always 0 and have no information
            states = states[1:]
            rnn_hiddens = rnn_hiddens[1:]

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length-1, args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length-1, args.batch_size, 1)

            # compute loss for observation and reward
            obs_loss = 0.5 * mse_loss(
                recon_observations, observations[1:], reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards, rewards[:-1])

            # add all losses and update model parameters with gradient descent
            model_loss = kl_loss + obs_loss + reward_loss
            model_optimizer.zero_grad()
            model_loss.backward()
            clip_grad_norm_(model_params, args.clip_grad_norm)
            model_optimizer.step()

            # ----------------------------------------------
            #      update value_model and action_model
            # ----------------------------------------------
            # detach gradient because Dreamer doesn't update model with actor-critic loss
            flatten_states = flatten_states.detach()
            flatten_rnn_hiddens = flatten_rnn_hiddens.detach()

            # prepare tensor to maintain imaginated trajectory's states and rnn_hiddens
            imaginated_states = torch.zeros(args.imagination_horizon + 1,
                                            *flatten_states.shape,
                                            device=flatten_states.device)
            imaginated_rnn_hiddens = torch.zeros(args.imagination_horizon + 1,
                                                 *flatten_rnn_hiddens.shape,
                                                 device=flatten_rnn_hiddens.device)
            imaginated_states[0] = flatten_states
            imaginated_rnn_hiddens[0] = flatten_rnn_hiddens

            # compute imaginated trajectory using action from action_model
            for h in range(1, args.imagination_horizon + 1):
                actions = action_model(flatten_states, flatten_rnn_hiddens)
                flatten_states_prior, flatten_rnn_hiddens = rssm.prior(flatten_states,
                                                                       actions,
                                                                       flatten_rnn_hiddens)
                flatten_states = flatten_states_prior.rsample()
                imaginated_states[h] = flatten_states
                imaginated_rnn_hiddens[h] = flatten_rnn_hiddens

            # compute rewards and values corresponding to imaginated states and rnn_hiddens
            flatten_imaginated_states = imaginated_states.view(-1, args.state_dim)
            flatten_imaginated_rnn_hiddens = imaginated_rnn_hiddens.view(-1, args.rnn_hidden_dim)
            imaginated_rewards = \
                reward_model(flatten_imaginated_states,
                             flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)
            imaginated_values = \
                value_model(flatten_imaginated_states,
                            flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)
            # compute lambda target
            lambda_target_values = lambda_target(imaginated_rewards, imaginated_values,
                                                 args.gamma, args.lambda_)

            # update value model
            value_loss = 0.5 * mse_loss(imaginated_values, lambda_target_values.detach())
            value_optimizer.zero_grad()
            value_loss.backward(retain_graph=True)
            clip_grad_norm_(value_model.parameters(), args.clip_grad_norm)
            value_optimizer.step()

            # update action model (multiply -1 for gradient ascent)
            action_loss = -1 * (lambda_target_values.mean())
            action_optimizer.zero_grad()
            action_loss.backward()
            clip_grad_norm_(action_model.parameters(), args.clip_grad_norm)
            action_optimizer.step()

            # print losses and add to tensorboard
            print('update_step: %3d model loss: %.5f, kl_loss: %.5f, '
                  'obs_loss: %.5f, reward_loss: %.5f, '
                  'value_loss: %.5f action_loss: %.5f'
                  % (update_step + 1, model_loss.item(), kl_loss.item(),
                     obs_loss.item(), reward_loss.item(),
                     value_loss.item(), action_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('model loss', model_loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(), total_update_step)
            writer.add_scalar('value loss', value_loss.item(), total_update_step)
            writer.add_scalar('action loss', action_loss.item(), total_update_step)

        print('elapsed time for update: %.2fs' % (time.time() - start))

        # ----------------------------------------------
        #      evaluation without exploration noise
        # ----------------------------------------------
        if (episode + 1) % args.test_interval == 0:
            policy = Agent(encoder, rssm, action_model)
            start = time.time()
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = policy(obs, training=False)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f' %
                  (episode+1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(), os.path.join(log_dir, 'reward_model.pth'))
    torch.save(value_model.state_dict(), os.path.join(log_dir, 'value_model.pth'))
    torch.save(action_model.state_dict(), os.path.join(log_dir, 'action_model.pth'))
    writer.close()
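
lambda_target is called above but not defined in this snippet. A sketch of the backward-recursive lambda-return used in Dreamer-style training (this exact formulation is an assumption about the missing helper) is:

import torch

def lambda_target(rewards, values, gamma, lambda_):
    # rewards and values are (horizon + 1, batch) tensors, as produced above.
    # V_lambda[t] = r[t] + gamma * ((1 - lambda) * v[t+1] + lambda * V_lambda[t+1]),
    # computed backwards and bootstrapped from the final predicted value.
    horizon_plus_one = rewards.shape[0]
    targets = torch.zeros_like(rewards)
    targets[-1] = values[-1]
    for t in reversed(range(horizon_plus_one - 1)):
        targets[t] = rewards[t] + gamma * ((1 - lambda_) * values[t + 1] + lambda_ * targets[t + 1])
    return targets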
Example #9
def train():
    if conf.env_module == "img":
        env = make_atari(conf.env_name)
        env = bench.Monitor(env,
                            os.path.join(conf.path_game_scan, conf.env_name))
        env = wrap_deepmind(env,
                            episode_life=True,
                            clip_rewards=True,
                            frame_stack=False,
                            scale=True)
        env = WrapPyTorch(env)
        model = CnnDQN(env, device)
        target_model = CnnDQN(env, device)
    else:
        env = gym.make(conf.env_name)
        # Instantiate
        model = DQN(env, device)
        target_model = DQN(env, device)

    target_model.load_state_dict(model.state_dict())
    model, target_model = model.to(device), target_model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=conf.lr)
    replay_buffer = ReplayBuffer(conf.buffer_size)

    # compute TD loss
    def cal_td_loss(model, batch_size):
        s, a, r, s_, d = replay_buffer.sample(batch_size)
        s = torch.tensor(np.float32(s), dtype=torch.float).to(device)
        s_ = torch.tensor(np.float32(s_), dtype=torch.float).to(device)
        a = torch.tensor(a, dtype=torch.long).to(device)
        r = torch.tensor(r, dtype=torch.float).to(device)
        d = torch.tensor(d, dtype=torch.float).to(device)

        q_value = model(s).gather(1, a.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_value = target_model(s_).max(1)[0]
            expected_q_value = r + conf.gamma * next_q_value * (1 - d)
            expected_q_value.to(device)

        loss = (q_value - expected_q_value).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        for param in model.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

        return loss

    episode_reward = 0
    losses = []
    all_rewards = []
    state = env.reset()  # (1, 84, 84)
    for frame_idx in range(1, conf.num_frames + 1):
        epsilon = conf.epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > conf.batch_size:
            loss = cal_td_loss(model, conf.batch_size)
            losses.append(loss.item())

        if frame_idx % conf.target_upfreq == 0:
            target_model.load_state_dict(model.state_dict())

        if frame_idx % conf.log_freq == 0:
            print("frame: {}, loss: {}, reward: {}.".format(
                frame_idx, loss, episode_reward))

    if conf.save_curve:
        curve_name = "res_" + conf.exp_name + ".png"
        curve_path = os.path.join(conf.path_plot, curve_name)
        curve_plot(curve_path, frame_idx, all_rewards, losses)
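
curve_plot is referenced here but not defined in the snippet. A minimal sketch that writes the reward and loss curves to the given path (the figure layout is an assumption) is:

import matplotlib.pyplot as plt

def curve_plot(path, frame_idx, rewards, losses):
    # Plot episode rewards and TD losses side by side and save the figure to disk.
    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(12, 4))
    ax0.set_title('rewards (up to frame %d)' % frame_idx)
    ax0.plot(rewards)
    ax1.set_title('td loss')
    ax1.plot(losses)
    fig.savefig(path)
    plt.close(fig)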