Example #1
import numpy as np
import torch
import torch.nn.functional as F

# ConvDQN, DQN, and BasicBuffer are assumed to be defined elsewhere in the project.
class DoubleDQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 tau=0.01,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = BasicBuffer(max_size=buffer_size)

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.use_conv = use_conv
        if self.use_conv:
            self.model1 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
            self.model2 = ConvDQN(env.observation_space.shape,
                                  env.action_space.n).to(self.device)
        else:
            self.model1 = DQN(env.observation_space.shape,
                              env.action_space.n).to(self.device)
            self.model2 = DQN(env.observation_space.shape,
                              env.action_space.n).to(self.device)

        self.optimizer1 = torch.optim.Adam(self.model1.parameters(),
                                           lr=self.learning_rate)
        self.optimizer2 = torch.optim.Adam(self.model2.parameters(),
                                           lr=self.learning_rate)

    def get_action(self, state, eps=0.20):
        # epsilon-greedy exploration
        if np.random.rand() < eps:
            return self.env.action_space.sample()

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model1.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        # reshape tensors to (batch_size, 1) so they broadcast correctly
        actions = actions.view(actions.size(0), 1)
        rewards = rewards.view(rewards.size(0), 1)
        dones = dones.view(dones.size(0), 1)

        # compute loss
        curr_Q1 = self.model1.forward(states).gather(1, actions)
        curr_Q2 = self.model2.forward(states).gather(1, actions)

        # clipped double-Q target: element-wise minimum of the two networks'
        # maximum next-state values
        next_Q1 = self.model1.forward(next_states)
        next_Q2 = self.model2.forward(next_states)
        next_Q = torch.min(torch.max(next_Q1, 1)[0],
                           torch.max(next_Q2, 1)[0])
        next_Q = next_Q.view(next_Q.size(0), 1)
        expected_Q = rewards + (1 - dones) * self.gamma * next_Q

        loss1 = F.mse_loss(curr_Q1, expected_Q.detach())
        loss2 = F.mse_loss(curr_Q2, expected_Q.detach())

        return loss1, loss2

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss1, loss2 = self.compute_loss(batch)

        self.optimizer1.zero_grad()
        loss1.backward()
        self.optimizer1.step()

        self.optimizer2.zero_grad()
        loss2.backward()
        self.optimizer2.step()
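
The class above only defines the agent; the following is a minimal sketch of how it might be driven by a training loop. It is an illustrative addition, not part of the original example: it assumes a classic Gym environment (4-tuple return from env.step) and that BasicBuffer exposes push(state, action, reward, next_state, done) and __len__ (a compatible buffer sketch is given after example #3). The episode/step counts and batch size are arbitrary.

import gym

# Hypothetical training loop for DoubleDQNAgent (not part of the original code).
env = gym.make("CartPole-v1")
agent = DoubleDQNAgent(env, use_conv=False)

max_episodes = 300
max_steps = 500
batch_size = 32

for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0.0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)  # classic Gym step API
        # push() and __len__() are assumed BasicBuffer methods
        agent.replay_buffer.push(state, action, reward, next_state, done)
        episode_reward += reward

        # learn once the buffer holds at least one full batch
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)

        if done:
            break
        state = next_state

    print(f"Episode {episode}: reward {episode_reward}")
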
Example #2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# DQN and ReplayBuffer are assumed to be defined elsewhere in the project.
class DQNAgent(object):
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=0.9999,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn',
                 device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions,
                          input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # put on correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # Choose an action
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
                                self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        # Q(s, a) for the actions actually taken
        q_pred = self.policy.forward(states)[indices, actions]
        # max_a' Q_target(s', a') from the target network
        q_next = self.target.forward(states_).max(dim=1)[0]

        # no bootstrapping from terminal states
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_pred, q_target)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
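
The ReplayBuffer this agent depends on is not shown in the example. Below is a minimal sketch of a buffer that matches the calls made above (the constructor signature, store_transition, sample_buffer, and the mem_cntr attribute). It is an assumed-compatible implementation, not the original class, and it expects input_shape to be a tuple.

import numpy as np

# Sketch of a replay buffer compatible with the DQNAgent above (assumed, not
# the original implementation).
class ReplayBuffer:
    def __init__(self, max_size, input_shape, n_actions):
        # n_actions is accepted only for signature compatibility
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((max_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(max_size, dtype=np.int64)
        self.reward_memory = np.zeros(max_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size  # overwrite oldest entries first
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])
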
Example #3
import numpy as np
import torch
import torch.nn as nn

# ConvDQN, DQN, and BasicBuffer are assumed to be defined elsewhere in the project.
class DQNAgent:
    def __init__(self,
                 env,
                 use_conv=True,
                 learning_rate=3e-4,
                 gamma=0.99,
                 buffer_size=10000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.replay_buffer = BasicBuffer(max_size=buffer_size)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.use_conv = use_conv
        if self.use_conv:
            self.model = ConvDQN(env.observation_space.shape,
                                 env.action_space.n).to(self.device)
        else:
            self.model = DQN(env.observation_space.shape,
                             env.action_space.n).to(self.device)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.learning_rate)
        self.MSE_loss = nn.MSELoss()

    def get_action(self, state, eps=0.20):
        # epsilon-greedy exploration
        if np.random.rand() < eps:
            return self.env.action_space.sample()

        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        qvals = self.model.forward(state)
        action = np.argmax(qvals.cpu().detach().numpy())

        return action

    def compute_loss(self, batch):
        states, actions, rewards, next_states, dones = batch
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.LongTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)

        curr_Q = self.model.forward(states).gather(1, actions.unsqueeze(1))
        curr_Q = curr_Q.squeeze(1)
        next_Q = self.model.forward(next_states)
        max_next_Q = torch.max(next_Q, 1)[0]
        # zero out the bootstrapped value for terminal transitions
        expected_Q = rewards.squeeze(1) + (1 - dones) * self.gamma * max_next_Q

        loss = self.MSE_loss(curr_Q, expected_Q.detach())
        return loss

    def update(self, batch_size):
        batch = self.replay_buffer.sample(batch_size)
        loss = self.compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
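
Examples #1 and #3 both rely on a BasicBuffer that is not shown. The sketch below is an assumed-compatible implementation based on how the agents use it: constructed with max_size, with sample(batch_size) returning five batched arrays. The push and __len__ methods are conventional additions used in the training-loop sketch after example #1, not confirmed by the original code.

import random
from collections import deque
import numpy as np

# Sketch of an experience-replay buffer matching the interface used in
# examples #1 and #3 (assumed, not the original BasicBuffer).
class BasicBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, state, action, reward, next_state, done):
        # rewards are stored as one-element lists so the sampled reward batch
        # has shape (batch_size, 1), matching the squeeze(1) in example #3
        self.buffer.append((state, action, [reward], next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones, dtype=np.float32))

    def __len__(self):
        return len(self.buffer)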