Example #1
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
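
The LinearSchedule used above to anneal the prioritized-replay beta is not shown in these examples. A minimal sketch, assuming the common baselines-style interface in which value(t) interpolates linearly from initial_p to final_p over a fixed number of steps and then holds final_p:

class LinearSchedule:
    """Linear interpolation from initial_p to final_p over schedule_timesteps,
    then hold final_p (sketch matching the call sites above, not the original)."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)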
Example #2
    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n)
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters())
        self.dqn_loss = torch.nn.MSELoss()
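
Every agent in these examples depends on a ReplayBuffer with a push / sample / __len__ interface that is not shown. A minimal sketch under that assumption; note that the array shapes it returns (e.g. whether rewards come back as (batch,) or (batch, 1)) vary between the original examples, so treat this as illustrative rather than a drop-in:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Fixed-capacity FIFO buffer of (s, a, r, s', done) transitions (sketch)."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.stack(states), np.array(actions), np.array(rewards),
                np.stack(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)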
Example #3
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
Example #4
    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000, tau=1e-2):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(buffer_size)
        
        self.dqn_a = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.dqn_b = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.optimizer_a = torch.optim.Adam(self.dqn_a.parameters())
        self.optimizer_b = torch.optim.Adam(self.dqn_b.parameters())
        self.dqn_loss = torch.nn.MSELoss()

        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data)
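
Example #4 hard-copies the weights of dqn_a into dqn_b at construction time; with the tau parameter it stores, the target network is then usually tracked with a Polyak (soft) update during training, as Example #12 later does inline. A small helper expressing the same rule, shown here only as a sketch:

def soft_update(target_net, online_net, tau):
    """theta_target <- tau * theta_online + (1 - tau) * theta_target (sketch)."""
    for t_param, o_param in zip(target_net.parameters(), online_net.parameters()):
        t_param.data.copy_(tau * o_param.data + (1.0 - tau) * t_param.data)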
Example #5
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = RolloutStorage(config)
        if self.config.Dueling_DQN:
            self.model = Dueling_DQN(self.config.state_shape,
                                     self.config.action_dim)
            self.target_model = Dueling_DQN(self.config.state_shape,
                                            self.config.action_dim)
        else:
            self.model = CnnDQN(self.config.state_shape,
                                self.config.action_dim)
            self.target_model = CnnDQN(self.config.state_shape,
                                       self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = torch.optim.Adam(self.model.parameters(),
                                            lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
Example #6
class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):

        if self.config.prioritized_replay:
            experience = self.buffer.sample(self.config.batch_size,
                                            beta=self.beta_schedule.value(fr))
            (s0, a, r, s1, done, weights, batch_idxes) = experience
        else:
            s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)
            weights, batch_idxes = np.ones_like(r), None

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)
        weights = torch.tensor(weights, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()
            weights = weights.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        td_errors = q_value - expected_q_value
        # Detach expected_q_value so no gradient flows through the target
        loss = F.smooth_l1_loss(q_value,
                                expected_q_value.detach(),
                                reduction='none')
        loss = (loss * weights).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if self.config.prioritized_replay:
            new_priorities = np.abs(td_errors.detach().cpu().numpy()
                                    ) + self.config.prioritized_replay_eps
            self.buffer.update_priorities(batch_idxes, new_priorities)

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
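
The CnnDQN network used throughout these agents is not included in the snippets. A Nature-DQN-style convolutional Q-network is a common choice for (channels, 84, 84) Atari inputs; the sketch below assumes that architecture and may differ from the original implementation:

import torch
import torch.nn as nn


class CnnDQN(nn.Module):
    """Convolutional Q-network mapping stacked frames to one Q-value per action (sketch)."""

    def __init__(self, input_shape, num_actions):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        # Infer the flattened feature size from a dummy forward pass
        with torch.no_grad():
            feat_dim = self.features(torch.zeros(1, *input_shape)).view(1, -1).size(1)
        self.fc = nn.Sequential(
            nn.Linear(feat_dim, 512), nn.ReLU(),
            nn.Linear(512, num_actions),
        )

    def forward(self, x):
        x = self.features(x)
        return self.fc(x.view(x.size(0), -1))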
Example #7
class Agent:

    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.dqn = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.dqn_optimizer = torch.optim.Adam(self.dqn.parameters())
        self.dqn_loss = torch.nn.MSELoss()

    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)
        
        curr_Q = self.dqn.forward(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        next_Q = self.dqn.forward(next_states)
        # Detach the bootstrap target and zero it out for terminal transitions
        max_next_Q = torch.max(next_Q, 1)[0].detach()
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - dones.view(-1))

        self.dqn_optimizer.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.dqn_optimizer.step()
        
        return loss.item()

    def max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        qvals = self.dqn.forward(state)
        action = np.argmax(qvals.detach().numpy())
  
        return action
      
    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break

                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    # self.adjust_temperature(loss)
                
        # return episode_rewards, loss
                  
    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward
                  
                if done:
                    episode_rewards.append(episode_reward)
                    break
                  
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn.state_dict(), PATH)
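
Note that max_action in Example #7 is purely greedy, so during training the agent never explores. A common remedy is epsilon-greedy action selection; the wrapper below is a sketch (the epsilon argument and its schedule are assumptions, not part of the original code):

import numpy as np


def epsilon_greedy_action(agent, state, epsilon):
    """With probability epsilon sample a random action, otherwise act greedily (sketch)."""
    if np.random.uniform() < epsilon:
        return agent.env.action_space.sample()
    return agent.max_action(state)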
Example #8
class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = RolloutStorage(config)
        if self.config.Dueling_DQN:
            self.model = Dueling_DQN(self.config.state_shape,
                                     self.config.action_dim)
            self.target_model = Dueling_DQN(self.config.state_shape,
                                            self.config.action_dim)
        else:
            self.model = CnnDQN(self.config.state_shape,
                                self.config.action_dim)
            self.target_model = CnnDQN(self.config.state_shape,
                                       self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = torch.optim.Adam(self.model.parameters(),
                                            lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float) / 255.0
            if self.config.use_cuda:
                state = state.to(self.config.device)
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, s1, a, r, done = self.buffer.sample(self.config.batch_size)
        if self.config.use_cuda:
            s0 = s0.float().to(self.config.device) / 255.0
            s1 = s1.float().to(self.config.device) / 255.0
            a = a.to(self.config.device)
            r = r.to(self.config.device)
            done = done.to(self.config.device)

        # Q(s0, a) for all actions: q_s0_values has shape (batch_size, action_dim),
        # where column i holds Q(s0, a_i)
        q_s0_values = self.model(s0)

        # Q(s0, a) for the actions actually taken (a has shape (batch_size, 1))
        q_s0_a = torch.gather(q_s0_values, 1, a)

        # Build the TD target y with either vanilla DQN or Double DQN
        if self.config.DQN:
            q_target_s1_values = self.target_model(s1).detach()
            q_target_s1_a_prime = q_target_s1_values.max(1)[0].unsqueeze(1)
            # if this transition ended the episode, there is no next-state Q value
            q_target_s1_a_prime = torch.mul(q_target_s1_a_prime, (1 - done))
            y = r + self.config.gamma * q_target_s1_a_prime
        elif self.config.Double_DQN:
            q_s1_values = self.model(s1).detach()
            s1_a_prime = q_s1_values.max(1)[1].unsqueeze(1)
            q_target_s1_values = self.target_model(s1).detach()
            q_target_s1_a_prime = torch.gather(q_target_s1_values, 1,
                                               s1_a_prime)
            q_target_s1_a_prime = torch.mul(q_target_s1_a_prime, (1 - done))
            y = r + self.config.gamma * q_target_s1_a_prime
        else:
            raise ValueError("config must enable either DQN or Double_DQN")
        mse_loss = torch.nn.MSELoss()
        loss = mse_loss(q_s0_a, y)
        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())
        return loss.item()

    def cuda(self):
        self.model.to(self.config.device)
        self.target_model.to(self.config.device)

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
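
Example #8 can switch to a Dueling_DQN, whose definition is not shown. The usual dueling head splits the network into a state-value stream V(s) and an advantage stream A(s, a) and recombines them as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a). A sketch under that assumption, reusing a Nature-DQN-style convolutional trunk:

import torch
import torch.nn as nn


class Dueling_DQN(nn.Module):
    """Dueling convolutional Q-network: Q(s,a) = V(s) + A(s,a) - mean_a A(s,a) (sketch)."""

    def __init__(self, input_shape, num_actions):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        with torch.no_grad():
            feat_dim = self.features(torch.zeros(1, *input_shape)).view(1, -1).size(1)
        self.value = nn.Sequential(
            nn.Linear(feat_dim, 512), nn.ReLU(), nn.Linear(512, 1))
        self.advantage = nn.Sequential(
            nn.Linear(feat_dim, 512), nn.ReLU(), nn.Linear(512, num_actions))

    def forward(self, x):
        x = self.features(x).view(x.size(0), -1)
        value = self.value(x)
        advantage = self.advantage(x)
        return value + advantage - advantage.mean(dim=1, keepdim=True)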
Example #9
            elif info:
                state = env.reset()
                print('Reward:{}, action certification rate {:.4f}'.format(
                    episode_reward, certified / total))
                return certified / total


if __name__ == '__main__':
    args = parser.parse_args()
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    model = CnnDQN(env.observation_space.shape[0], env.action_space)

    if args.gpu_id >= 0:
        weights = torch.load(args.load_path,
                             map_location=torch.device('cuda:{}'.format(
                                 args.gpu_id)))
        model.load_state_dict(weights)
        with torch.cuda.device(args.gpu_id):
            model.cuda()
    else:
        weights = torch.load(args.load_path, map_location=torch.device('cpu'))
        model.load_state_dict(weights)

    model.eval()

    save_name = (args.load_path.split('/')[-1]).split('.')[0]
Example #10
class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # Detach expected_q_value so no gradient flows through the target
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
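
The Config object consumed by these agents is also not shown. The sketch below simply lists the attributes that the examples above read, with placeholder values (names taken from the call sites; none of the numbers are the original settings):

class Config:
    """Plain attribute container for the agents above (sketch; values are placeholders)."""
    state_shape = (4, 84, 84)
    action_dim = 6
    max_buff = 100000
    batch_size = 32
    learning_rate = 1e-4
    gamma = 0.99
    epsilon_min = 0.01
    update_tar_interval = 1000
    frames = 1000000
    use_cuda = False
    device = "cpu"
    # Flags and fields used by Examples #1, #5, #6 and #8
    prioritized_replay = False
    prioritized_replay_alpha = 0.6
    prioritized_replay_beta0 = 0.4
    prioritized_replay_beta_iters = None
    prioritized_replay_eps = 1e-6
    DQN = True
    Double_DQN = False
    Dueling_DQN = False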
Example #11
parser.set_defaults(robust=False)

if __name__ == '__main__':
    args = parser.parse_args()
    if args.seed:
        torch.manual_seed(args.seed)
        if args.gpu_id >= 0:
            torch.cuda.manual_seed(args.seed)
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    curr_model = CnnDQN(env.observation_space.shape[0], env.action_space)

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    if not os.path.exists(args.save_model_dir):
        os.mkdir(args.save_model_dir)

    if args.load_path:
        saved_state = torch.load(args.load_path,
                                 map_location=lambda storage, loc: storage)
        curr_model.load_state_dict(saved_state)

    target_model = CnnDQN(env.observation_space.shape[0], env.action_space)
    target_model.load_state_dict(curr_model.state_dict())
    if args.gpu_id >= 0:
        with torch.cuda.device(args.gpu_id):
Example #12
class Agent:

    def __init__(self, env, use_cnn=False, learning_rate=3e-4, gamma=0.99, buffer_size=10000, tau=1e-2):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(buffer_size)
        
        self.dqn_a = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.dqn_b = CnnDQN(env.observation_space.shape, env.action_space.n) if use_cnn else DQN(env.observation_space.shape[0], env.action_space.n) 
        self.optimizer_a = torch.optim.Adam(self.dqn_a.parameters())
        self.optimizer_b = torch.optim.Adam(self.dqn_b.parameters())
        self.dqn_loss = torch.nn.MSELoss()

        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data)
     
    def update_model(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(dones)

        curr_Q = self.dqn_a.forward(states)
        curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
        print("curr_Q: " + str(curr_Q))
        next_Q = self.dqn_a.forward(next_states)
        best_actions = torch.max(next_Q, 1)[1]
        #print("next_Q" + str(next_Q))
        print("best actions: " + str(best_actions))
        dqn_b_Q = self.dqn_b.forward(next_states)
        # Detach the target-network estimate so loss.backward() only touches dqn_a
        max_next_Q = dqn_b_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1).detach()

        print("max_next_Q: " + str(max_next_Q))
        expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q * (1 - dones.view(-1))
        #print(expected_Q)
        self.optimizer_a.zero_grad()
        loss = self.dqn_loss(curr_Q, expected_Q)
        loss.backward()
        self.optimizer_a.step()

        for param_b, param_a in zip(self.dqn_b.parameters(), self.dqn_a.parameters()):
            param_b.data.copy_(param_a.data * self.tau + param_b.data * (1.0 - self.tau))
     


        #update dqn_a by chance
        """
        if(np.random.uniform() < 0.5): # 
            curr_Q = self.dqn_a.forward(states)
            curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
            next_Q = self.dqn_a.forward(next_states)
            best_actions = torch.max(next_Q, 1)[1]
            print("next_Q" + str(next_Q))
            print("best actions: " + str(best_actions))
            dqn_b_Q = self.dqn_b.forward(next_states)
            max_next_Q = dqn_b_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            print("max_next_Q: " + str(max_next_Q))
            expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q

            self.optimizer_a.zero_grad()
            loss = self.dqn_loss(curr_Q, expected_Q)
            loss.backward()
            self.optimizer_a.step()
        """
        # update dqn_b
        """
        else: 
            curr_Q = self.dqn_b.forward(states)
            curr_Q = curr_Q.gather(1, actions.unsqueeze(1)).squeeze(1)
            next_Q = self.dqn_b.forward(next_states)
            best_actions = torch.max(next_Q, 1)[1].detach()
            #print("next_Q" + str(next_Q))
            #print("best actions: " + str(best_actions))
            dqn_a_Q = self.dqn_a.forward(next_states)
            max_next_Q = dqn_a_Q.gather(1, best_actions.unsqueeze(1)).squeeze(1)
            expected_Q = rewards.squeeze(1) + self.gamma * max_next_Q

            self.optimizer_b.zero_grad()
            loss = self.dqn_loss(curr_Q, expected_Q)
            loss.backward()
            self.optimizer_b.step()
        """    
    
    def max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        qvals = self.dqn_a.forward(state)
        action = np.argmax(qvals.detach().numpy())

        # if(np.random.uniform() < 0.2):
        #     return self.env.action_space.sample()
  
        return action
      
    def train(self, max_episodes, max_steps, batch_size):
        episode_rewards = []
        loss = []
        
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.replay_buffer.push(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                
                if done:
                    episode_rewards.append(episode_reward)
                    print(episode_reward)
                    break

                if len(self.replay_buffer) > batch_size:
                    step_loss = self.update_model(batch_size)
                    loss.append(step_loss)
                    # self.adjust_temperature(loss)
                
        # return episode_rewards, loss
                  
    def run(self, max_episodes, max_steps):
        episode_rewards = []
        for episodes in range(max_episodes):
            state = self.env.reset()  
            episode_reward = 0
            for steps in range(max_steps):
                action = self.max_action(state)
                next_state, reward, done, _ = self.env.step(action)
                state = next_state
                episode_reward += reward
                  
                if done:
                    episode_rewards.append(episode_reward)
                    break
                  
        return episode_rewards

    def save_model(self, PATH):
        torch.save(self.dqn_a.state_dict(), PATH)
Example #13
def train():
    if conf.env_module == "img":
        env = make_atari(conf.env_name)
        env = bench.Monitor(env,
                            os.path.join(conf.path_game_scan, conf.env_name))
        env = wrap_deepmind(env,
                            episode_life=True,
                            clip_rewards=True,
                            frame_stack=False,
                            scale=True)
        env = WrapPyTorch(env)
        model = CnnDQN(env, device)
        target_model = CnnDQN(env, device)
    else:
        env = gym.make(conf.env_name)
        # Instantiate
        model = DQN(env, device)
        target_model = DQN(env, device)

    target_model.load_state_dict(model.state_dict())
    model, target_model = model.to(device), target_model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=conf.lr)
    replay_buffer = ReplayBuffer(conf.buffer_size)

    # compute the TD loss on a sampled minibatch and take one optimizer step
    def cal_td_loss(model, batch_size):
        s, a, r, s_, d = replay_buffer.sample(batch_size)
        s = torch.tensor(np.float32(s), dtype=torch.float).to(device)
        s_ = torch.tensor(np.float32(s_), dtype=torch.float).to(device)
        a = torch.tensor(a, dtype=torch.long).to(device)
        r = torch.tensor(r, dtype=torch.float).to(device)
        d = torch.tensor(d, dtype=torch.float).to(device)

        q_value = model(s).gather(1, a.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_value = target_model(s_).max(1)[0]
            expected_q_value = r + conf.gamma * next_q_value * (1 - d)

        loss = (q_value - expected_q_value).pow(2).mean()
        optimizer.zero_grad()
        loss.backward()
        for param in model.parameters():
            param.grad.data.clamp_(-1, 1)
        optimizer.step()

        return loss

    episode_reward = 0
    losses = []
    all_rewards = []
    state = env.reset()  # (1, 84, 84)
    for frame_idx in range(1, conf.num_frames + 1):
        epsilon = conf.epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)

        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)

        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0

        if len(replay_buffer) > conf.batch_size:
            loss = cal_td_loss(model, conf.batch_size)
            losses.append(loss.item())

        if frame_idx % conf.target_upfreq == 0:
            target_model.load_state_dict(model.state_dict())

        if frame_idx % conf.log_freq == 0 and losses:
            print("frame: {}, loss: {}, reward: {}.".format(
                frame_idx, losses[-1], episode_reward))

    if conf.save_curve:
        curve_name = "res_" + conf.exp_name + ".png"
        curve_path = os.path.join(conf.path_plot, curve_name)
        curve_plot(curve_path, frame_idx, all_rewards, losses)
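
Example #13 calls conf.epsilon_by_frame(frame_idx) without showing it. A common choice is an exponential decay from a starting epsilon towards a small final value; the sketch below assumes that form, and the constants are placeholders rather than the original configuration:

import math


def epsilon_by_frame(frame_idx, epsilon_start=1.0, epsilon_final=0.01, epsilon_decay=30000):
    """Exponentially decay the exploration rate with the frame index (sketch)."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)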