Example #1
class Agent():
    """
    Initialize Agent, including:
        DQN Hyperparameters
        Local and Target State-Action Policy Networks
        Replay Memory Buffer from the ReplayBuffer Class (defined below)
    """
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):
        """
        DQN Agent Parameters
        ====== 
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            dqn_type (string): can be either 'DQN' for vanilla DQN learning (default) or 'DDQN' for double DQN.
            replay_memory_size (int): size of the replay memory buffer (typically 5e4 to 5e6)
            batch_size (int): size of the memory batch used for model updates (typically 32, 64 or 128)
            gamma (float): parameter for setting the discounted value of future rewards (typically .95 to .995)
            learning_rate (float): specifies the rate of model learning (typically 1e-4 to 1e-3)
            target_tau (float): soft-update interpolation rate for the target network
            update_rate (int): number of steps between learning updates
            seed (int): random seed for initializing training point.
        """
        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed:
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate,
                                    betas=BETAS)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    ########################################################
    # STEP() method
    #
    def step(self, state, action, reward, next_state, done, update=True):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                if update:
                    self.learn(experiences, self.gamma)

########################################################
# ACT() method
#

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

########################################################
# LEARN() method
# Update value parameters using given batch of experience tuples.

    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN
            # ************************
            # Select the greedy next actions with the online network,
            # then evaluate those actions with the target network.
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)

        else:
            #Regular (Vanilla) DQN
            #************************
            # Get max Q values for (s',a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    ########################################################
    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """

    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_the_model(self, iteration, f_name):
        if not os.path.exists('./save/dqn/'):
            os.makedirs('./save/dqn/')
        f_name = 'dqn_param_' + str(iteration) + '_' + f_name + '_model.pth'
        torch.save(self.network.state_dict(), './save/dqn/' + f_name)
        print('DQN Model Saved')

    def load_the_model(self, iteration, f_name):
        f_path = './save/dqn/dqn_param_' + str(
            iteration) + '_' + f_name + '_model.pth'
        self.network.load_state_dict(torch.load(f_path))
        print('DQN Model Loaded')
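A minimal training-loop sketch for driving the Agent above, assuming a Gym-style environment object (env) and that QNetwork, ReplayBuffer, and the device global are already defined; the train helper and its epsilon schedule are illustrative, not part of the original example.

def train(agent, env, n_episodes=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)                       # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)       # advance the environment
            agent.step(state, action, reward, next_state, done)  # store and (periodically) learn
            state = next_state
            score += reward
        eps = max(eps_end, eps_decay * eps)                      # anneal exploration
        scores.append(score)
    return scores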
Example #2
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    # Update target network
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x:state
        x = torch.FloatTensor(x)
        # print(x.shape)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value, tau = self.pred_net(
                x)  # (N_ENVS, N_ACTIONS, N_QUANT)
            action_value = action_value.mean(dim=2)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
            # print(action)
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(
            ), b_s_.cuda(), b_d.cuda()

        # action value distribution prediction
        q_eval, q_eval_tau = self.pred_net(
            b_s)  # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        mb_size = q_eval.size(0)
        # squeeze removes the leftover singleton dimension
        # torch.stack stacks the per-sample tensors along a new dimension (dim=0 by default)
        # index_select picks the quantile row of the action that was actually taken
        q_eval = torch.stack([
            q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)
        ]).squeeze(1)
        # (m, N_QUANT)
        # add an extra dimension after the second axis of q_eval
        q_eval = q_eval.unsqueeze(2)  # (m, N_QUANT, 1)
        # note that dim 1 is for present quantile, dim 2 is for next quantile

        # get next state value
        q_next, q_next_tau = self.target_net(
            b_s_)  # (m, N_ACTIONS, N_QUANT), (N_QUANT, 1)
        best_actions = q_next.mean(dim=2).argmax(dim=1)  # (m)
        q_next = torch.stack([
            q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)
        ]).squeeze(1)
        # q_next: (m, N_QUANT)
        # q_target = R + gamma * (1 - terminate) * q_next
        q_target = b_r.unsqueeze(1) + GAMMA * (1. - b_d.unsqueeze(1)) * q_next
        # q_target: (m, N_QUANT)
        # detach so no gradients flow back through the target
        q_target = q_target.unsqueeze(1).detach()  # (m , 1, N_QUANT)

        # quantile Huber loss
        u = q_target.detach() - q_eval  # (m, N_QUANT, N_QUANT)
        tau = q_eval_tau.unsqueeze(0)  # (1, N_QUANT, 1)
        # note that tau is for present quantile
        # w = |tau - delta(u<0)|
        weight = torch.abs(tau - u.le(0.).float())  # (m, N_QUANT, N_QUANT)
        loss = F.smooth_l1_loss(q_eval, q_target.detach(), reduction='none')
        # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)

        # calculate importance weighted loss
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
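The core of learn() above is the pairwise quantile Huber loss between current and target quantiles. A standalone sketch of that computation follows, with shapes taken from the comments in learn(); quantile_huber_loss is an illustrative helper name, not part of the original code.

import torch
import torch.nn.functional as F

def quantile_huber_loss(q_eval, q_target, tau):
    # q_eval:   (m, N_QUANT, 1)  current quantile estimates
    # q_target: (m, 1, N_QUANT)  target quantile estimates
    # tau:      (1, N_QUANT, 1)  quantile fractions of the current estimates
    q_target = q_target.detach()
    u = q_target - q_eval                                    # pairwise TD errors, (m, N_QUANT, N_QUANT)
    weight = torch.abs(tau - u.le(0.).float())               # |tau - 1{u < 0}|
    huber = F.smooth_l1_loss(q_eval.expand_as(u), q_target.expand_as(u), reduction='none')
    return (weight * huber).mean(dim=1).mean(dim=1)          # per-sample loss, shape (m,)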
Example #3
class Agent:
    def __init__(self, state_size, action_size, num_agents):
        self.policy = PolicyNetwork(state_size, action_size).to(device)
        self.old_policy = PolicyNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)

        self.episodes = [Episode() for _ in range(num_agents)]
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.t_step = 0

    def reset(self):
        self.finished = [False] * len(self.episodes)


    # Decide on an action to take in the environment

    def act(self, state, eps=None):
        self.policy.eval()
        with torch.no_grad():
            output = self.policy(torch.from_numpy(state).float().unsqueeze(0).to(device))
            return Categorical(output).sample().item()


    # Record the results of the agent's action and update the model

    def step(self, handle, state, action, next_state, agent_done, episode_done, collision):
        if not self.finished[handle]:
            if agent_done:
                reward = 1
            elif collision:
                reward = -.5
            else:
                reward = 0

            # Push experience into Episode memory
            self.episodes[handle].push(state, action, reward, next_state, agent_done or episode_done)

            # When we finish the episode, discount rewards and push the experience into replay memory
            if agent_done or episode_done:
                self.episodes[handle].discount_rewards(GAMMA)
                self.memory.push_episode(self.episodes[handle])
                self.episodes[handle].reset()
                self.finished[handle] = True

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 4:
            self.train(*self.memory.sample(BATCH_SIZE, device))

    def train(self, states, actions, rewards, next_state, done):
        self.policy.train()

        responsible_outputs = torch.gather(self.policy(states), 1, actions)
        old_responsible_outputs = torch.gather(self.old_policy(states), 1, actions).detach()

        # rewards = rewards - rewards.mean()
        ratio = responsible_outputs / (old_responsible_outputs + 1e-5)
        clamped_ratio = torch.clamp(ratio, 1. - CLIP_FACTOR, 1. + CLIP_FACTOR)
        loss = -torch.min(ratio * rewards, clamped_ratio * rewards).mean()

        # Compute loss and perform a gradient step
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


    # Checkpointing methods

    def save(self, path, *data):
        torch.save(self.policy.state_dict(), path / 'ppo/model_checkpoint.policy')
        torch.save(self.optimizer.state_dict(), path / 'ppo/model_checkpoint.optimizer')
        with open(path / 'ppo/model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.policy.load_state_dict(torch.load(path / 'ppo/model_checkpoint.policy'))
            self.optimizer.load_state_dict(torch.load(path / 'ppo/model_checkpoint.optimizer'))
            with open(path / 'ppo/model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except:
            print("No checkpoint file was found")
            return defaults
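train() above implements a PPO-style clipped surrogate objective, using the discounted episode rewards stored in the replay buffer as the advantage signal. A minimal standalone sketch of that objective; ppo_clip_loss is an illustrative name, and clip_factor plays the role of CLIP_FACTOR above.

import torch

def ppo_clip_loss(new_probs, old_probs, advantages, clip_factor=0.2, eps=1e-5):
    # new_probs / old_probs: probabilities of the taken actions under the
    # current policy and the frozen old policy, respectively.
    ratio = new_probs / (old_probs.detach() + eps)                     # importance ratio
    clipped = torch.clamp(ratio, 1.0 - clip_factor, 1.0 + clip_factor)
    return -torch.min(ratio * advantages, clipped * advantages).mean()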
Example #4
File: dqn.py, Project: xuezzee/-
class Agent:
    def __init__(self, state_size, action_size, num_agents, double_dqn=False):
        self.action_size = action_size
        self.double_dqn = double_dqn

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size).to(device)
        self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
        self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=4000, gamma=0.98, last_epoch=-1)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.num_agents = num_agents
        self.t_step = 0

    def reset(self):
        self.finished = [False] * self.num_agents


    # Decide on an action to take in the environment

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)

        # Epsilon-greedy action selection
        if random.random() > eps:
            return torch.argmax(action_values).item()
        else:
            return torch.randint(self.action_size, ()).item()


    # Record the results of the agent's action and update the model

    def step(self, handle, state, action, reward, next_state, agent_done):
        if not self.finished[handle]:
            # Save experience in replay memory
            self.memory.push(state, action, reward, next_state, agent_done)
            self.finished[handle] = agent_done

        # Perform a gradient update every UPDATE_EVERY time steps
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE * 1: # 320
            self.learn(*self.memory.sample(BATCH_SIZE, device))


    def learn(self, states, actions, rewards, next_states, dones):
        self.qnetwork_local.train()

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        if self.double_dqn:
            # Double DQN: select actions with the local network, evaluate them with the target network
            Q_best_action = self.qnetwork_local(next_states).argmax(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(1, Q_best_action.unsqueeze(-1))
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)

        # Compute Q targets for current states
        Q_targets = rewards + GAMMA * Q_targets_next * (1 - dones)

        # Compute loss and perform a gradient step
        self.optimizer.zero_grad()
        loss = F.mse_loss(Q_expected, Q_targets)
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network parameters to `tau * local.parameters() + (1 - tau) * target.parameters()`
        for target_param, local_param in zip(self.qnetwork_target.parameters(), self.qnetwork_local.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1.0 - TAU) * target_param.data)


    # Checkpointing methods

    def save(self, path, *data):
        torch.save(self.qnetwork_local.state_dict(), path / 'model_checkpoint.local')
        torch.save(self.qnetwork_target.state_dict(), path / 'model_checkpoint.target')
        torch.save(self.optimizer.state_dict(), path / 'model_checkpoint.optimizer')
        with open(path / 'model_checkpoint.meta', 'wb') as file:
            pickle.dump(data, file)

    def load(self, path, *defaults):
        try:
            print("Loading model from checkpoint...")
            self.qnetwork_local.load_state_dict(torch.load(path / 'model_checkpoint.local'))
            self.qnetwork_target.load_state_dict(torch.load(path / 'model_checkpoint.target'))
            self.optimizer.load_state_dict(torch.load(path / 'model_checkpoint.optimizer'))
            with open(path / 'model_checkpoint.meta', 'rb') as file:
                return pickle.load(file)
        except:
            print("No checkpoint file was found")
            return defaults
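The branch inside learn() above is the only place where the vanilla and double-DQN variants differ. The same two target computations, isolated in a standalone helper (td_targets is an illustrative name, not part of the original code):

import torch

def td_targets(qnet_local, qnet_target, rewards, next_states, dones, gamma, double_dqn):
    # rewards and dones are column vectors of shape (batch, 1), as in learn() above.
    with torch.no_grad():
        if double_dqn:
            # select greedy actions with the online network, evaluate them with the target network
            best_actions = qnet_local(next_states).argmax(1, keepdim=True)
            q_next = qnet_target(next_states).gather(1, best_actions)
        else:
            # take the greedy value directly from the target network
            q_next = qnet_target(next_states).max(1)[0].unsqueeze(-1)
    return rewards + gamma * q_next * (1 - dones)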
Example #5
class Agent():
    def __init__(self):
        self.name = "expected_sarsa_agent"

    def agent_init(self, agent_config):
        self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                          agent_config['minibatch_sz'],
                                          agent_config.get("seed"))
        self.network = ActionValueNetwork(agent_config['network_config'])
        self.optimizer = Adam(self.network.layer_sizes,
                              agent_config["optimizer_config"])
        self.num_actions = agent_config['network_config']['num_actions']
        self.num_replay = agent_config['num_replay_updates_per_step']
        self.discount = agent_config['gamma']
        self.tau = agent_config['tau']

        self.rand_generator = np.random.RandomState(agent_config.get("seed"))

        self.last_state = None
        self.last_action = None

        self.sum_rewards = 0
        self.episode_steps = 0

    def policy(self, state):
        action_values = self.network.get_action_values(state)
        probs_batch = softmax(action_values, self.tau)
        action = self.rand_generator.choice(self.num_actions,
                                            p=probs_batch.squeeze())
        return action

    def agent_start(self, state):
        self.sum_rewards = 0
        self.episode_steps = 0
        self.last_state = np.array([state])
        self.last_action = self.policy(self.last_state)
        return self.last_action

    def agent_step(self, reward, state):
        self.sum_rewards += reward
        self.episode_steps += 1

        state = np.array([state])

        action = self.policy(state)

        self.replay_buffer.append(self.last_state, self.last_action, reward, 0,
                                  state)

        # Perform replay steps:
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            current_q = deepcopy(self.network)
            for _ in range(self.num_replay):

                # Get sample experiences from the replay buffer
                experiences = self.replay_buffer.sample()

                # Call optimize_network to update the weights of the network
                optimize_network(experiences, self.discount, self.optimizer,
                                 self.network, current_q, self.tau)

        # Update the last state and last action.
        self.last_state = state
        self.last_action = action

        return action

    # Final update of the weights using optimize_network when the episode terminates
    def agent_end(self, reward):
        self.sum_rewards += reward
        self.episode_steps += 1

        # Set terminal state to an array of zeros
        state = np.zeros_like(self.last_state)

        # Append new experience to replay buffer
        self.replay_buffer.append(self.last_state, self.last_action, reward, 1,
                                  state)

        # Perform replay steps:
        if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
            current_q = deepcopy(self.network)
            for _ in range(self.num_replay):

                # Get sample experiences from the replay buffer
                experiences = self.replay_buffer.sample()

                # Call optimize_network to update the weights of the network
                optimize_network(experiences, self.discount, self.optimizer,
                                 self.network, current_q, self.tau)

    def agent_message(self, message):
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")
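policy() above relies on a softmax(action_values, tau) helper that is not shown in this example. A numerically stable sketch of what such a helper typically looks like; the original project's implementation may differ.

import numpy as np

def softmax(action_values, tau=1.0):
    # action_values: (batch, num_actions); tau: temperature of the softmax policy
    preferences = action_values / tau
    preferences = preferences - preferences.max(axis=1, keepdims=True)  # shift for numerical stability
    exp_prefs = np.exp(preferences)
    return exp_prefs / exp_prefs.sum(axis=1, keepdims=True)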
Example #6
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()

        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0

        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

        # discrete values
        self.value_range = torch.FloatTensor(V_RANGE)  # (N_ATOM)
        if USE_GPU:
            self.value_range = self.value_range.cuda()

    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value_dist = self.pred_net(x)  # (N_ENVS, N_ACTIONS, N_ATOM)
            action_value = torch.sum(action_value_dist *
                                     self.value_range.view(1, 1, -1),
                                     dim=2)  # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_s_ = torch.FloatTensor(b_s_)

        if USE_GPU:
            b_s, b_a, b_s_ = b_s.cuda(), b_a.cuda(), b_s_.cuda()

        # action value distribution prediction
        q_eval = self.pred_net(b_s)  # (m, N_ACTIONS, N_ATOM)
        mb_size = q_eval.size(0)
        q_eval = torch.stack([
            q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)
        ]).squeeze(1)
        # (m, N_ATOM)

        # target distribution
        q_target = np.zeros((mb_size, N_ATOM))  # (m, N_ATOM)

        # get next state value
        q_next = self.target_net(b_s_).detach()  # (m, N_ACTIONS, N_ATOM)
        # next value mean
        q_next_mean = torch.sum(q_next * self.value_range.view(1, 1, -1),
                                dim=2)  # (m, N_ACTIONS)
        best_actions = q_next_mean.argmax(dim=1)  # (m)
        q_next = torch.stack([
            q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)
        ]).squeeze(1)
        q_next = q_next.data.cpu().numpy()  # (m, N_ATOM)

        # categorical projection
        '''
        next_v_range : (z_j) i.e. values of possible return, shape : (m, N_ATOM)
        next_v_pos : relative position when offset of value is V_MIN, shape : (m, N_ATOM)
        '''
        # we vectorized the computation of support and position
        next_v_range = np.expand_dims(b_r, 1) + GAMMA * np.expand_dims((1. - b_d),1) \
        * np.expand_dims(self.value_range.data.cpu().numpy(),0)
        next_v_pos = np.zeros_like(next_v_range)
        # clip for categorical distribution
        next_v_range = np.clip(next_v_range, V_MIN, V_MAX)
        # calc relative position of possible value
        next_v_pos = (next_v_range - V_MIN) / V_STEP
        # get lower/upper bound of relative position
        lb = np.floor(next_v_pos).astype(int)
        ub = np.ceil(next_v_pos).astype(int)
        # we didn't vectorize the computation of target assignment.
        for i in range(mb_size):
            for j in range(N_ATOM):
                # calc prob mass of relative position weighted with distance
                q_target[i, lb[i, j]] += (q_next * (ub - next_v_pos))[i, j]
                q_target[i, ub[i, j]] += (q_next * (next_v_pos - lb))[i, j]

        q_target = torch.FloatTensor(q_target)
        if USE_GPU:
            q_target = q_target.cuda()

        # cross-entropy loss between the projected target distribution and the predicted distribution
        loss = q_target * (-torch.log(q_eval + 1e-8))  # (m , N_ATOM)
        loss = torch.mean(loss)

        # calc importance weighted loss
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        loss = torch.mean(b_w * loss)

        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
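The categorical (C51-style) agent above depends on a fixed value support defined by the globals V_MIN, V_MAX, N_ATOM, V_RANGE and V_STEP, which are not shown in this snippet. A sketch of how such a support is typically constructed; the concrete numbers are illustrative only.

import numpy as np

V_MIN, V_MAX, N_ATOM = -10.0, 10.0, 51          # illustrative bounds and atom count
V_RANGE = np.linspace(V_MIN, V_MAX, N_ATOM)     # atom values z_j used to weight the distribution
V_STEP = (V_MAX - V_MIN) / (N_ATOM - 1)         # spacing used by the categorical projection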
Example #7
class Smoothing_DQN(object):
    def __init__(self):
        self.pred_net_Q1, self.target_net_Q1 = ConvNet(), ConvNet()
        self.pred_net_Q2, self.target_net_Q2 = ConvNet(), ConvNet()
        # sync eval target
        self.target_deque1 = deque(maxlen=n)
        self.target_deque2 = deque(maxlen=n)
        self.update_target(self.target_net_Q1, self.pred_net_Q1, 1.0)
        self.update_target(self.target_net_Q2, self.pred_net_Q2, 1.0)

        self.target_deque1.append(self.target_net_Q1)
        # use gpu
        if USE_GPU:
            self.pred_net_Q1.cuda()
            self.target_net_Q1.cuda()
            self.pred_net_Q2.cuda()
            self.target_net_Q2.cuda()
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net_Q1.parameters(), lr=LR)

        self.optimizer1 = torch.optim.Adam(self.pred_net_Q2.parameters(),
                                           lr=LR)

    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(),
                                            pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)

    def save_model(self):
        # save prediction network and target network
        self.pred_net_Q1.save(PRED_PATH)
        self.target_net_Q1.save(TARGET_PATH)
        self.pred_net_Q2.save(PRED_PATH1)
        self.target_net_Q2.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net_Q1.load(PRED_PATH)
        self.target_net_Q1.load(TARGET_PATH)
        self.pred_net_Q2.load(PRED_PATH)
        self.target_net_Q2.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x:state
        x = torch.FloatTensor(x)
        # print(x.shape)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy policy
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net_Q1(x)
            action_value += self.pred_net_Q2(x)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def save_history(self):
        if self.memory_counter % dealy_interval == 0:
            self.target_deque1.append(self.pred_net_Q1)
        # stagger the second snapshot by 100 steps relative to the first
        if (self.memory_counter + 100) % dealy_interval == 0:
            self.target_deque2.append(self.pred_net_Q2)

    # def update_target(self):
    #     # weight=np.array([0.9,0.])
    #     if len(self.target_deque)<n:
    #         for target_param, pred_param in zip(self.target_net.parameters(), self.pred_net.parameters()):
    #           target_param.data.copy_((1.0 - 1e-2) \
    #                                   * target_param.data + 1e-2 * pred_param.data)
    #         return
    #     for i,net in enumerate(self.target_deque):
    #         for target_param, queue_net in zip(self.target_net.parameters(),net.parameters()):
    #           target_param.data.copy_( self.weight[i] * queue_net.data)
    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net_Q1, self.pred_net_Q1, 1e-2)
            self.update_target(self.target_net_Q2, self.pred_net_Q2, 1e-2)

        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        # b_w, b_idxes = np.ones_like(b_r), None

        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(
            ), b_s_.cuda(), b_d.cuda()

        # action value for current state
        q_eval1 = self.pred_net_Q1(b_s)
        mb_size = q_eval1.size(0)
        q_eval1 = torch.stack([q_eval1[i][b_a[i]] for i in range(mb_size)])

        q_eval2 = self.pred_net_Q2(b_s)
        mb_size = q_eval2.size(0)
        q_eval2 = torch.stack([q_eval2[i][b_a[i]] for i in range(mb_size)])
        # optimal action value for current state

        alpha = np.random.uniform(0, 1, len(self.target_deque1) + 1)
        alpha = alpha / alpha.sum()
        # print("alpha:",alpha,alpha.sum())
        q_next1 = self.target_net_Q1(b_s_)
        q_next1 = alpha[-1] * torch.max(q_next1, -1)[0]
        for i, target in enumerate(self.target_deque1):
            q_next_history = target(b_s_)
            q_next1 += alpha[i] * torch.max(q_next_history, -1)[0]

        alpha = np.random.uniform(0, 1, len(self.target_deque2) + 1)
        alpha = alpha / alpha.sum()
        # print("alpha:",alpha,alpha.sum())
        q_next2 = self.target_net_Q2(b_s_)
        q_next2 = alpha[-1] * torch.max(q_next2, -1)[0]
        for i, target in enumerate(self.target_deque2):
            q_next_history = target(b_s_)
            q_next2 += alpha[i] * torch.max(q_next_history, -1)[0]
        # print("q next:",q_next.shape)
        # best_actions = q_next.argmax(dim=1)
        # q_next = torch.stack([q_next[i][best_actions[i]] for i in range(mb_size)])

        # print("shape:",q_next.shape)

        q_target1 = b_r + GAMMA * (1. - b_d) * q_next1
        q_target1 = q_target1.detach()

        q_target2 = b_r + GAMMA * (1. - b_d) * q_next2
        q_target2 = q_target2.detach()

        # loss
        loss = self.loss_function(q_eval1, q_target2)
        logger.store(loss=loss)
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        loss = self.loss_function(q_eval2, q_target1)
        self.optimizer1.zero_grad()
        loss.backward()
        self.optimizer1.step()
        return loss
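The distinctive part of learn() above is that each bootstrap value is a random convex combination of the current target network and the historical snapshots kept in a deque. That mixing step is isolated in the sketch below; smoothed_max_q is an illustrative helper name.

import numpy as np
import torch

def smoothed_max_q(current_target, history, next_states):
    # Random convex combination of max-Q estimates from the current target
    # network and the historical snapshots, as in learn() above.
    alpha = np.random.uniform(0, 1, len(history) + 1)
    alpha = alpha / alpha.sum()                                   # mixture weights summing to 1
    q = alpha[-1] * torch.max(current_target(next_states), -1)[0]
    for i, snapshot in enumerate(history):
        q = q + alpha[i] * torch.max(snapshot(next_states), -1)[0]
    return q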
Example #8
class DQNAgent():
    def __init__(self, input_shape, action_size, buffer_size, batch_size,
                 gamma, lr, tau, update_every, device):
        """Initialize an Agent object.
        
        Params
        ======
            input_shape (tuple): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): replay buffer size
            batch_size (int):  minibatch size
            gamma (float): discount factor
            lr (float): learning rate 
            tau (float): Soft-parameter update
            update_every (int): how often to update the network
            device (string): use GPU or CPU
        """
        self.input_shape = input_shape
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.update_every = update_every
        self.tau = tau
        self.device = device

        # Q-Network
        self.policy_net = DQNLinear(input_shape, action_size).to(self.device)
        self.target_net = DQNLinear(input_shape, action_size).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   self.device)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.01):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(state)
        self.policy_net.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # Get expected Q values from policy model
        Q_expected_current = self.policy_net(states)
        Q_expected = Q_expected_current.gather(1,
                                               actions.unsqueeze(1)).squeeze(1)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_net(next_states).detach().max(1)[0]

        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.policy_net, self.target_net, self.tau)

    # θ'=θ×τ+θ'×(1−τ)
    def soft_update(self, policy_model, target_model, tau):
        for target_param, policy_param in zip(target_model.parameters(),
                                              policy_model.parameters()):
            target_param.data.copy_(tau * policy_param.data +
                                    (1.0 - tau) * target_param.data)

    def load_model(self, path):

        checkpoint = torch.load(path)
        self.policy_net.load_state_dict(checkpoint['state_dict'])
        self.target_net.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        scores = checkpoint['scores']

        return scores

    def save_model(self, path, scores):
        model = {
            "state_dict": self.policy_net.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "scores": scores
        }
        torch.save(model, path)
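save_model/load_model above keep the policy weights, optimizer state, and score history in a single checkpoint dictionary. A hypothetical usage sketch; the constructor arguments and the 'checkpoint.pth' path are illustrative, and DQNLinear/ReplayBuffer are assumed to be defined elsewhere.

agent = DQNAgent(input_shape=(4,), action_size=2, buffer_size=int(1e5),
                 batch_size=64, gamma=0.99, lr=1e-3, tau=1e-3,
                 update_every=4, device='cpu')
agent.save_model('checkpoint.pth', scores=[])    # writes weights, optimizer state and scores
scores = agent.load_model('checkpoint.pth')      # restores both networks and the optimizer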
Example #9
class DuelingDQAgent():
    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon=1.0,
                 replace=1000,
                 epsilon_min=0.1,
                 checkpoint_dir='temp/dqn/duelingdqn'):

        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon

        self.mem_counter = 0
        self.copy_counter = 0
        self.replace_target_cnt = replace
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DuelingQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DuelingQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)

    def decrement_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def store_memory(self, obs, action, reward, new_obs, done):
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)

        states = T.tensor(states).to(self.target_network.device)
        actions = T.tensor(actions).to(self.target_network.device)
        rewards = T.tensor(rewards).to(self.target_network.device)
        new_states = T.tensor(new_states).to(self.target_network.device)
        dones = T.tensor(dones).to(self.target_network.device)

        # print(f'---States shape: {states.size()}')
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        if np.random.random() < self.epsilon:
            action = np.random.choice(len(self.action_space), 1)[0]
        else:
            # obs = np.array([obs])
            state = T.tensor([obs],
                             dtype=T.float).to(self.learning_network.device)

            returns_for_actions = self.target_network.forward(state)
            action = T.argmax(returns_for_actions).cpu().detach().numpy()
        return action

    def learn(self):
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()

        # print(f'---Actions shape: {actions.size()}')
        # print(f'---Actions: {actions}')

        indices = np.arange(self.mini_batchsize)
        q_pred = self.learning_network.forward(states)[indices, actions]

        q_next = self.learning_network.forward(new_states)
        actions_selected = T.argmax(
            q_next, dim=1)  # Action selection based on online weights

        q_eval = self.target_network.forward(new_states)
        q_eval[dones] = 0.0  # zero out bootstrap values for terminal states

        q_target = rewards + self.gamma * q_eval[indices, actions_selected]
        cost = self.learning_network.loss(q_target, q_pred)
        cost.backward()
        self.learning_network.optimizer.step()

        self.decrement_epsilon()

        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1

    def copy_target_network(self):
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        self.learning_network.load()
        self.target_network.load()
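DuelingQNetwork is referenced above but not defined in this example. A minimal sketch of the dueling aggregation such a network typically applies on top of its feature extractor; the DuelingHead class and feature_dim argument are illustrative, not the project's actual network.

import torch.nn as nn

class DuelingHead(nn.Module):
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
    def __init__(self, feature_dim, num_actions):
        super().__init__()
        self.value = nn.Linear(feature_dim, 1)                 # state-value stream V(s)
        self.advantage = nn.Linear(feature_dim, num_actions)   # advantage stream A(s, a)

    def forward(self, features):
        v = self.value(features)
        a = self.advantage(features)
        return v + a - a.mean(dim=1, keepdim=True)             # mean-subtraction keeps V and A identifiable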
Example #10
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 dqn_type='DQN',
                 replay_memory_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 learning_rate=1e-3,
                 target_tau=2e-3,
                 update_rate=4,
                 seed=0):

        self.dqn_type = dqn_type
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        """
        # DQN Agent Q-Network
        # For DQN training, two neural network models are employed;
        # (a) A network that is updated every (step % update_rate == 0)
        # (b) A target network, with weights updated to equal the network at a slower (target_tau) rate.
        # The slower modulation of the target network weights operates to stabilize learning.
        """
        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, actions, rewards, next_state, dones):
        # Save experience in replay memory
        for i in range(len(actions)):
            # print("Step ACTIONS", actions, actions[i], state[i])
            self.memory.add(state[i], actions[i], rewards[i], next_state[i],
                            dones[i])

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        num_agents = len(action_values[0])

        # print("AGENT ACT VALUES", action_values,  np.argmax(action_values.cpu().data.numpy()[0], 1),  np.array([random.choice(np.arange(self.action_size)) for i in range(num_agents)]))

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()[0], 1)
        else:
            return np.array([
                random.choice(np.arange(self.action_size))
                for i in range(num_agents)
            ])

    # Update value parameters using given batch of experience tuples.
    def learn(self, experiences, gamma, DQN=True):
        """
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """

        states, actions, rewards, next_states, dones = experiences

        # Get Q values from current observations (s, a) using the model network
        Qsa = self.network(states).gather(1, actions)

        if self.dqn_type == 'DDQN':
            # Double DQN
            # ************************
            # Select the greedy next actions with the online network,
            # then evaluate those actions with the target network.
            Qsa_prime_actions = self.network(next_states).detach().max(
                1)[1].unsqueeze(1)
            Qsa_prime_targets = self.target_network(
                next_states).detach().gather(1, Qsa_prime_actions)

        else:
            #Regular (Vanilla) DQN
            #************************
            # Get max Q values for (s',a') from target model
            Qsa_prime_target_values = self.target_network(next_states).detach()
            Qsa_prime_targets = Qsa_prime_target_values.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Qsa_targets = rewards + (gamma * Qsa_prime_targets * (1 - dones))

        # Compute loss (error)
        loss = F.mse_loss(Qsa, Qsa_targets)

        # print(Qsa, Qsa_targets)
        # print(loss)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.network, self.target_network, self.tau)

    """
    Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target
    """

    def soft_update(self, local_model, target_model, tau):
        """
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
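Unlike Example #1, this Agent's act() and step() handle one state, action, reward and done flag per agent. A hypothetical driving loop, assuming a vectorized, Gym-like environment that returns one entry per agent; run_multi_agent_episode and its arguments are illustrative.

import numpy as np

def run_multi_agent_episode(agent, env, eps=0.1):
    states = env.reset()                                   # (num_agents, state_size)
    dones = np.zeros(len(states), dtype=bool)
    while not dones.all():
        actions = agent.act(states, eps)                   # one epsilon-greedy action per agent
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        dones = np.asarray(dones)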
Example #11
class DeepQAgent():
    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon_min=0.1,
                 checkpoint_dir='temp/dqn'):

        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = 1.0

        self.mem_counter = 0
        self.copy_counter = 0
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DeepQNetwork(lr=self.lr,
                                           num_actions=self.num_actions,
                                           input_dims=self.obs_dims,
                                           name=env_name + '_' + algo_name +
                                           '_target',
                                           checkpoint_dir=self.checkpoint_dir)

    def decrement_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def store_memory(self, obs, action, reward, new_obs, done):
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)

        states = T.tensor(states).to(self.target_network.device)
        actions = T.tensor(actions).to(self.target_network.device)
        rewards = T.tensor(rewards).to(self.target_network.device)
        new_states = T.tensor(new_states).to(self.target_network.device)
        dones = T.tensor(dones).to(self.target_network.device)

        # print(f'---States shape: {states.size()}')
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        if np.random.random() < self.epsilon:
            action = np.random.choice(len(self.action_space), 1)[0]
        else:
            # obs = np.array([obs])
            state = T.tensor([obs],
                             dtype=T.float).to(self.learning_network.device)

            returns_for_actions = self.target_network.forward(state)
            action = T.argmax(returns_for_actions).cpu().detach().numpy()
        return action

    def learn(self):
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        self.copy_target_network()
        states, actions, rewards, new_states, dones = self.sample_memory()

        # print(f'---Actions shape: {actions.size()}')
        # print(f'---Actions: {actions}')
        indices = np.arange(self.mini_batchsize)
        # q_pred = self.learning_network.forward(states)[:, actions]
        q_pred = self.learning_network.forward(states)[indices, actions]
        q_next = self.target_network.forward(new_states).max(dim=1)[0]
        # dim=1 specifies take max along actions and [0] specifies taking the values instead of indices

        # print(f'---q_pred shape: {q_pred.size()}---')
        # print(f'---q_next shape: {q_next.size()}---')

        q_next[dones] = 0.0
        targets = rewards + self.gamma * q_next
        cost = self.learning_network.loss(targets, q_pred)
        cost.backward()
        self.learning_network.optimizer.step()

        self.decrement_epsilon()

        if self.copy_counter % 4 == 0:
            self.copy_target_network()
        self.copy_counter += 1

    def copy_target_network(self):
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        self.learning_network.load()
        self.target_network.load()
Example #12
class QR_DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync eval target
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
            
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
        
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)
            
    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        x = torch.FloatTensor(x)
        if USE_GPU:
            x = x.cuda()

        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net(x).mean(dim=2) # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)
    
        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        b_w, b_idxes = np.ones_like(b_r), None
            
        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value distribution prediction
        q_eval = self.pred_net(b_s) # (m, N_ACTIONS, N_QUANT)
        mb_size = q_eval.size(0)
        q_eval = torch.stack([q_eval[i].index_select(0, b_a[i]) for i in range(mb_size)]).squeeze(1) 
        # (m, N_QUANT)
        q_eval = q_eval.unsqueeze(2) # (m, N_QUANT, 1)
        # note that dim 1 is for present quantile, dim 2 is for next quantile
        
        # get next state value
        q_next = self.target_net(b_s_).detach() # (m, N_ACTIONS, N_QUANT)
        best_actions = q_next.mean(dim=2).argmax(dim=1) # (m)
        q_next = torch.stack([q_next[i].index_select(0, best_actions[i]) for i in range(mb_size)]).squeeze(1)
        # (m, N_QUANT)
        q_target = b_r.unsqueeze(1) + GAMMA * (1. - b_d.unsqueeze(1)) * q_next
        # (m, N_QUANT)
        q_target = q_target.unsqueeze(1) # (m , 1, N_QUANT)

        # quantile Huber loss
        u = q_target.detach() - q_eval # (m, N_QUANT, N_QUANT)
        tau = torch.FloatTensor(QUANTS_TARGET).view(1, -1, 1) # (1, N_QUANT, 1)
        # note that tau is for present quantile
        if USE_GPU:
            tau = tau.cuda()
        weight = torch.abs(tau - u.le(0.).float()) # (m, N_QUANT, N_QUANT)
        loss = F.smooth_l1_loss(q_eval, q_target.detach(), reduction='none')
        # (m, N_QUANT, N_QUANT)
        loss = torch.mean(weight * loss, dim=1).mean(dim=1)
        # print('1', loss.shape)
        
        # calc importance weighted loss
        b_w = torch.Tensor(b_w)
        if USE_GPU:
            b_w = b_w.cuda()
        # print('2', (b_w * loss).shape)
        loss = torch.mean(b_w * loss)
        
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(self.pred_net.parameters(),0.1)
        self.optimizer.step()
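For reference, the pairwise quantile Huber loss that learn() assembles inline can be written as a standalone helper. This is only a sketch, not an API of the original code: it assumes kappa = 1 (so smooth_l1 plays the Huber role), midpoint fractions tau_hat_j = (j + 0.5)/N in place of QUANTS_TARGET, and the canonical reduction (sum over current quantiles, mean over target quantiles; the code above averages over both, which only rescales the gradient).

import torch
import torch.nn.functional as F

def quantile_huber_loss(q_eval, q_target):
    """q_eval, q_target: (m, N_QUANT) current / target quantile estimates."""
    n_quant = q_eval.shape[1]
    tau = (torch.arange(n_quant, dtype=torch.float32, device=q_eval.device) + 0.5) / n_quant
    tau = tau.view(1, -1, 1)                        # (1, N_QUANT, 1): indexes the current-quantile dim
    q_eval = q_eval.unsqueeze(2)                    # (m, N_QUANT, 1)
    q_target = q_target.detach().unsqueeze(1)       # (m, 1, N_QUANT)
    u = q_target - q_eval                           # pairwise TD errors (m, N_QUANT, N_QUANT)
    weight = torch.abs(tau - u.le(0.).float())      # asymmetric quantile weight |tau - 1{u <= 0}|
    huber = F.smooth_l1_loss(q_eval.expand_as(u), q_target.expand_as(u), reduction='none')
    return (weight * huber).mean(dim=2).sum(dim=1).mean()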
Example #13
class Agent():
    '''Interact with and learn from environment.'''
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # counter for activating learning every few steps
        self.running_c_loss = 0
        self.running_a_loss = 0
        self.training_cnt = 0

        # Actor network (w/ target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network (w/ target network)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def act(self, state, mode):
        '''Returns actions for the given state as per current policy.

        Params
        ======
            state (array): current state
            mode (string): 'train' adds OU noise to the action for exploration; 'test' does not
        '''
        state = torch.from_numpy(state).unsqueeze(0).float().to(
            device)  # shape of state (1, state_size)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        if mode == 'test':
            return np.clip(action, -1, 1)

        elif mode == 'train':  # if train, then add OUNoise in action
            action += self.noise.sample()
            return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # activate learning every few steps
        self.t_step = self.t_step + 1
        if self.t_step % LEARN_EVERY_STEP == 0:
            # Learn, if enough samples are available in memory
            if len(self.memory) > BATCH_SIZE:
                for _ in range(10):  # update 10 times per learning
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.running_c_loss += float(critic_loss.cpu().data.numpy())
        self.training_cnt += 1
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.running_a_loss += float(actor_loss.cpu().data.numpy())
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1)  # clip gradient to max 1
        self.actor_optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
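A minimal sketch of how this actor-critic (DDPG-style) agent might be driven. None of this is from the original example: it assumes a classic Gym continuous-control environment whose step() returns (obs, reward, done, info), an action range of [-1, 1], and that the constants used by the class (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, LEARN_EVERY_STEP) are defined elsewhere.

import gym

env = gym.make('MountainCarContinuous-v0')     # hypothetical environment choice
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              seed=0)

for episode in range(200):
    state = env.reset()
    agent.reset()                              # re-initialise the OU noise each episode
    episode_return = 0.0
    for t in range(1000):
        action = agent.act(state, mode='train')            # shape (1, action_size), noise added
        next_state, reward, done, _ = env.step(action.reshape(-1))
        agent.step(state, action.reshape(-1), reward, next_state, done)
        state = next_state
        episode_return += reward
        if done:
            break
    print(f'episode {episode}: return {episode_return:.1f}')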
Example #14
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 behavior_name,
                 index_player,
                 replay_memory_size=1e4,
                 batch_size=512,
                 gamma=0.99,
                 learning_rate=1e-4,
                 target_tau=1e-3,
                 update_rate=100,
                 seed=0):  #affect your agent vs other agents
        self.state_size = state_size
        self.current_state = []
        self.action_size = action_size
        self.buffer_size = int(replay_memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.learn_rate = learning_rate
        self.tau = target_tau
        self.update_rate = update_rate
        self.seed = random.seed(seed)
        self.behavior_name = behavior_name
        self.index_player = index_player
        self.close_ball_reward = 0
        self.touch_ball_reward = 0
        """
        Now we define two models: 
        (a) one netwoek will be updated every (step % update_rate == 0),
        (b) A target network, with weights updated to equal to equal to the network (a) at a slower (target_tau) rate.
        """

        self.network = QNetwork(state_size, action_size, seed).to(device)
        self.target_network = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learn_rate)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def load_model(self, path_model, path_target=None):
        self.network.load_state_dict(torch.load(path_model))
        if path_target is not None:
            self.target_network.load_state_dict(torch.load(path_target))

    def model_step(self, state, action, reward, next_state):
        # save experience in replay memory
        self.memory.add(state, action, reward, next_state)

        # learn every UPDATE_EVERY time steps
        self.t_step = self.t_step + 1
        if self.t_step % self.update_rate == 0:

            # if enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma, self.t_step)

    def choose_action(self, state, eps=0.0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.network.eval()
        with torch.no_grad():
            action_values = self.network(state)
        self.network.train()

        # epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()
                             )  # greedy: action index in [0, action_size)
        else:
            return random.choice(np.arange(
                self.action_size))  # random action index in [0, action_size)

    def learn(self, experiences, gamma, stp):
        states, actions, rewards, next_states = experiences

        # Get Q values from current observations (s,a) using model network
        # get max Q values for (s', a') from target model
        self.network.train()
        Q_sa = self.network(states).gather(1, actions)
        #print(Q_sa)
        Q_sa_prime_target_values = self.target_network(next_states).max(
            1)[0].to(device).float().detach()
        #Q_sa_prime_targets = Q_sa_prime_target_values.max(1)[0].unsqueeze(1)
        #print(Q_sa_prime_target_values)

        # compute Q targets for current states
        #print(rewards)

        Q_sa_targets = rewards + gamma * Q_sa_prime_target_values.unsqueeze(1)
        #print(Q_sa_targets)
        #input('train')

        #Q_sa_targets = Q_sa_targets.unsqueeze(1)

        # Compute loss (error)
        criterion = torch.nn.MSELoss(reduction='sum')
        loss = criterion(
            Q_sa.to(device),
            Q_sa_targets.to(device))  #F.mse_loss(Q_sa, Q_sa_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network
        if stp % 100 == 0:
            print('Updating Model')
            self.soft_update(self.network, self.target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def Read(self):
        decision_steps, terminal_steps = env.get_steps(self.behavior_name)
        try:
            signal_front = np.array(
                sensor_front_sig(
                    decision_steps.obs[0][self.index_player, :]))  # 3 x 11 x 8
            signal_back = np.array(
                sensor_back_sig(
                    decision_steps.obs[1][self.index_player, :]))  # 3 x 3 x 8
            #pre_state = []
            signal_front = np.array(signal_front)
            #print(signal_front.shape)
            #print(signal_back.shape)
            r = np.concatenate((signal_front, signal_back), axis=1)
            #print(r.shape)
            #input('ff')
            #pre_state.extend(list(np.array(signal_front).flatten()))
            #pre_state.extend(list(np.array(signal_back).flatten()))
            #state = np.array(pre_state)
            self.current_state = r
            count_close_to_ball = 0
            count_touch_ball = 0
            count_back_touch = 0
            count_back_close = 0
            self.rew_d_to_our_post = 0
            self.rew_for_ball_dist = -0.1
            # Front Observation
            for i in range(len(signal_front[0])):
                if signal_front[0][i][0] == 1.0:
                    count_close_to_ball += 1
                    self.rew_for_ball_dist = max(
                        0.3 * (1 - signal_front[0][i][7]),
                        self.rew_for_ball_dist)

                    # Kicked the ball at the front
                    if signal_front[0][i][7] <= 0.03:
                        count_touch_ball += 1

                if signal_front[0][i][1] == 1.0:
                    self.rew_d_to_our_post = -0.1
                if signal_front[0][i][2] == 1.0:
                    self.rew_d_to_our_post = 0.1

            # Back observation
            for i in range(len(signal_back[0])):
                if signal_back[0][i][0] == 1.0:
                    count_back_close += 0.2

                    # Touches the ball at the back
                    if signal_back[0][i][7] <= 0.03:
                        count_back_touch += 0.3

            self.back_touch = 1 if count_back_touch > 0 else 0.2
            self.back_close = 1 if count_back_close > 0 else 0.1

            # add reward if kick the ball
            self.touch_ball_reward = 1 if count_touch_ball > 0 else -0.15
            # Penalize for back touching the ball
            if count_back_touch > 0:
                self.touch_ball_reward = -0.25

            # Penalize if the ball is not in view
            self.close_ball_reward = 0.25 if count_close_to_ball > 0 else -0.05
            # Penalize if the ball is behind the agent
            if count_back_close > 0:
                self.close_ball_reward = -0.1

            return self.current_state
        except:
            self.touch_ball_reward = 0
            self.close_ball_reward = 0

        return self.current_state

    def upd_after_goal(self, n_upds):
        self.memory.upd_goal(n_upds)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def we_goll(self):
        self.memory.we_goll()
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)

    def us_goll(self):
        self.memory.us_goll()
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma, self.t_step)
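choose_action() above is epsilon-greedy, so the caller is expected to anneal eps over training. A self-contained sketch of one common multiplicative schedule (the start/end/decay values are illustrative, not taken from the original code):

def epsilon_schedule(n_episodes, eps_start=1.0, eps_end=0.05, eps_decay=0.995):
    """Yield one epsilon value per episode, decaying multiplicatively toward eps_end."""
    eps = eps_start
    for _ in range(n_episodes):
        yield eps
        eps = max(eps_end, eps_decay * eps)

for episode, eps in enumerate(epsilon_schedule(5)):
    print(episode, round(eps, 4))   # 0 1.0, 1 0.995, 2 0.99, 3 0.9851, 4 0.9801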
Example #15
class DQN(object):
    def __init__(self):
        if USE_CNN:
            if USE_GPU:
                self.eval_net, self.target_net = ConvNet().cuda(), ConvNet(
                ).cuda()
            else:
                self.eval_net, self.target_net = ConvNet(), ConvNet()
        else:
            if USE_GPU:
                self.eval_net, self.target_net = Net().cuda(), Net().cuda()
            else:
                self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0

        # Create the replay buffer
        if MEMORY_MODE == 'PER':
            self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY,
                                                         alpha=PER_ALPHA)
        else:
            self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)

    def choose_action(self, x, EPSILON):
        if USE_GPU:
            x = Variable(torch.FloatTensor(x)).cuda()
        else:
            x = Variable(torch.FloatTensor(x))

        # input only one sample
        if np.random.uniform() < EPSILON:  # greedy
            actions_value = self.eval_net.forward(x.unsqueeze(0))
            if USE_GPU:
                action = torch.argmax(
                    actions_value).data.cpu().numpy()  # return the argmax
            else:
                action = torch.argmax(
                    actions_value).data.numpy()  # return the argmax;
        else:  # random
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self, beta):
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if MEMORY_MODE == 'PER':
            experience = self.replay_buffer.sample(BATCH_SIZE, beta=beta)
            (b_state_memory, b_action_memory, b_reward_memory,
             b_next_state_memory, b_done, b_weights, b_idxes) = experience
        else:
            b_state_memory, b_action_memory, b_reward_memory, b_next_state_memory, b_done = self.replay_buffer.sample(
                BATCH_SIZE)
            b_weights, b_idxes = np.ones_like(b_reward_memory), None

        if USE_GPU:
            b_s = Variable(torch.FloatTensor(b_state_memory)).cuda()
            b_a = Variable(torch.LongTensor(b_action_memory)).cuda()
            b_r = Variable(torch.FloatTensor(b_reward_memory)).cuda()
            b_s_ = Variable(torch.FloatTensor(b_next_state_memory)).cuda()
            b_d = Variable(torch.FloatTensor(b_done)).cuda()
        else:
            b_s = Variable(torch.FloatTensor(b_state_memory))
            b_a = Variable(torch.LongTensor(b_action_memory))
            b_r = Variable(torch.FloatTensor(b_reward_memory))
            b_s_ = Variable(torch.FloatTensor(b_next_state_memory))
            b_d = Variable(torch.FloatTensor(b_done))

        # q_eval w.r.t the action in experience
        q_eval = self.eval_net(b_s).gather(1, b_a.unsqueeze(1)).view(
            -1)  # shape (batch,)

        if DOUBLE:
            _, best_actions = self.eval_net.forward(b_s_).detach().max(1)
            q_next = self.target_net(
                b_s_).detach()  # detach from graph, don't backpropagate
            q_target = b_r + GAMMA * (1. - b_d) * q_next.gather(
                1, best_actions.unsqueeze(1)).squeeze(1)  # shape (batch,)
        else:
            q_next = self.target_net(
                b_s_).detach()  # detach from graph, don't backpropagate
            q_target = b_r + GAMMA * (
                1. - b_d) * q_next.max(1)[0]  # shape (batch,)

        loss = F.smooth_l1_loss(q_eval, q_target, reduction='none')
        b_weights = torch.FloatTensor(b_weights)
        if USE_GPU:
            b_weights = b_weights.cuda()
        loss = torch.mean(b_weights * loss)
        td_error = (q_target - q_eval).data.cpu().numpy()

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.eval_net.parameters(), 10.)
        self.optimizer.step()

        if MEMORY_MODE == 'PER':
            new_priorities = np.abs(td_error) + PER_EPSILON
            self.replay_buffer.update_priorities(b_idxes, new_priorities)

    def save_model(self):
        # save evaluation network and target network simultaneously
        self.eval_net.save(EVAL_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load evaluation network and target network simultaneously
        self.eval_net.load(EVAL_PATH)
        self.target_net.load(TARGET_PATH)
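A minimal sketch contrasting the two target computations switched by the DOUBLE flag above, on dummy tensors (batch of 2, 3 actions; the numbers are made up). The only difference is who picks the next action: the target network itself (vanilla), or the online network with the target network evaluating the pick (double DQN).

import torch

GAMMA = 0.99
b_r = torch.tensor([1.0, 0.0])
b_d = torch.tensor([0.0, 1.0])                   # second transition is terminal
q_next_online = torch.tensor([[1.0, 5.0, 2.0],   # eval_net(s'): used only to select the action
                              [0.5, 0.2, 0.9]])
q_next_target = torch.tensor([[3.0, 1.0, 4.0],   # target_net(s'): used to evaluate it
                              [1.0, 2.0, 0.3]])

# vanilla DQN: max over the target network's own estimates
q_target_vanilla = b_r + GAMMA * (1. - b_d) * q_next_target.max(1)[0]

# double DQN: argmax from the online network, value from the target network
best_actions = q_next_online.argmax(dim=1)
q_target_double = b_r + GAMMA * (1. - b_d) * \
    q_next_target.gather(1, best_actions.unsqueeze(1)).squeeze(1)

print(q_target_vanilla)   # tensor([4.9600, 0.0000])
print(q_target_double)    # tensor([1.9900, 0.0000])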
Example #16
class DQN(object):
    def __init__(self):
        self.pred_net, self.target_net = ConvNet(), ConvNet()
        # sync the target network with the prediction network
        self.update_target(self.target_net, self.pred_net, 1.0)
        # use gpu
        if USE_GPU:
            self.pred_net.cuda()
            self.target_net.cuda()
            
        # simulator step counter
        self.memory_counter = 0
        # target network step counter
        self.learn_step_counter = 0
        # loss function
        self.loss_function = nn.MSELoss()
        # create the replay buffer
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
        
        # define optimizer
        self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
        
    def update_target(self, target, pred, update_rate):
        # update target network parameters using the prediction network
        for target_param, pred_param in zip(target.parameters(), pred.parameters()):
            target_param.data.copy_((1.0 - update_rate) \
                                    * target_param.data + update_rate*pred_param.data)
    
    def save_model(self):
        # save prediction network and target network
        self.pred_net.save(PRED_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load prediction network and target network
        self.pred_net.load(PRED_PATH)
        self.target_net.load(TARGET_PATH)

    def choose_action(self, x, EPSILON):
        # x: state
        x = torch.FloatTensor(x)
        # print(x.shape)
        if USE_GPU:
            x = x.cuda()

        # epsilon-greedy action selection
        if np.random.uniform() >= EPSILON:
            # greedy case
            action_value = self.pred_net(x)     # (N_ENVS, N_ACTIONS)
            action = torch.argmax(action_value, dim=1).data.cpu().numpy()
        else:
            # random exploration case
            action = np.random.randint(0, N_ACTIONS, (x.size(0)))
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self):
        self.learn_step_counter += 1
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.update_target(self.target_net, self.pred_net, 1e-2)
    
        b_s, b_a, b_r, b_s_, b_d = self.replay_buffer.sample(BATCH_SIZE)
        # b_w, b_idxes = np.ones_like(b_r), None
            
        b_s = torch.FloatTensor(b_s)
        b_a = torch.LongTensor(b_a)
        b_r = torch.FloatTensor(b_r)
        b_s_ = torch.FloatTensor(b_s_)
        b_d = torch.FloatTensor(b_d)

        if USE_GPU:
            b_s, b_a, b_r, b_s_, b_d = b_s.cuda(), b_a.cuda(), b_r.cuda(), b_s_.cuda(), b_d.cuda()

        # action value for current state 
        q_eval = self.pred_net(b_s) 	
        mb_size = q_eval.size(0)
        q_eval = torch.stack([q_eval[i][b_a[i]] for i in range(mb_size)])

        # optimal action value for current state 
        q_next = self.target_net(b_s_) 				
        # best_actions = q_next.argmax(dim=1) 		
        # q_next = torch.stack([q_next[i][best_actions[i]] for i in range(mb_size)])
        q_next = torch.max(q_next, -1)[0]
        q_target = b_r + GAMMA * (1. - b_d) * q_next
        q_target = q_target.detach()

        # loss
        loss = self.loss_function(q_eval, q_target)
        logger.store(loss=loss)
        # backprop loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss
class DoubleDQAgent():
    def __init__(self,
                 lr: float,
                 gamma: float,
                 obs_dims,
                 num_actions: int,
                 mem_size,
                 mini_batchsize,
                 epsilon_dec,
                 env_name,
                 algo_name,
                 epsilon=1.0,
                 replace=1000,
                 epsilon_min=0.1,
                 checkpoint_dir='results\\doubledqn'):

        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon
        self.replace_target_cnt = replace

        self.mem_counter = 0
        self.copy_counter = 0
        self.checkpoint_dir = checkpoint_dir
        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DeepQNetwork(
            lr=self.lr,
            num_actions=self.num_actions,
            input_dims=self.obs_dims,
            name=algo_name + '_' + env_name + '_' + 'learning',
            checkpoint_dir=self.checkpoint_dir)

        self.target_network = DeepQNetwork(lr=self.lr,
                                           num_actions=self.num_actions,
                                           input_dims=self.obs_dims,
                                           name=env_name + '_' + algo_name +
                                           '_target',
                                           checkpoint_dir=self.checkpoint_dir)

        self.loss_value = 0
        self.writer = SummaryWriter(os.path.join(self.checkpoint_dir, 'logs'))

    def decrement_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def store_memory(self, obs, action, reward, new_obs, done):
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)

        states = T.tensor(states).to(self.target_network.device)
        actions = T.tensor(actions).to(self.target_network.device)
        rewards = T.tensor(rewards).to(self.target_network.device)
        new_states = T.tensor(new_states).to(self.target_network.device)
        dones = T.tensor(dones).to(self.target_network.device)

        # print(f'---States shape: {states.size()}')
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        if np.random.random() < self.epsilon:
            action = np.random.choice(len(self.action_space), 1)[0]
        else:
            state = T.tensor([obs],
                             dtype=T.float).to(self.learning_network.device)

            returns_for_actions = self.target_network.forward(state)
            action = T.argmax(returns_for_actions).cpu().detach().numpy()
        return action

    def learn(self):
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.mini_batchsize)
        q_pred = self.learning_network.forward(states)[indices, actions]

        q_next = self.learning_network.forward(new_states)
        actions_selected = T.argmax(
            q_next, dim=1)  # Action selection based on online weights

        q_eval = self.target_network.forward(new_states).detach()
        q_eval[dones] = 0.0  # terminal next-states contribute no bootstrapped value

        q_target = rewards + self.gamma * q_eval[indices, actions_selected]
        cost = self.learning_network.loss(q_target, q_pred)
        cost.backward()
        self.learning_network.optimizer.step()

        self.decrement_epsilon()

        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1

        self.loss_value = cost

    def log(self, num_episode):
        diff = 0
        for p_learning, p_target in zip(self.learning_network.parameters(),
                                        self.target_network.parameters()):
            p_learning = p_learning.data.cpu()
            p_target = p_target.data.cpu()
            diff += T.sum(p_learning - p_target)

        self.writer.add_scalar("td_error", self.loss_value, num_episode)
        self.writer.add_scalar("learning_target_diff", diff, num_episode)

        return diff

    def copy_target_network(self):
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        self.learning_network.load()
        self.target_network.load()
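Finally, a minimal training-loop sketch for DoubleDQAgent. It is not from the original repository: it assumes a Gym environment with a discrete action space and the classic (obs, reward, done, info) step API, and the hyperparameter values are illustrative only.

import gym

env = gym.make('CartPole-v1')
agent = DoubleDQAgent(lr=1e-4, gamma=0.99,
                      obs_dims=env.observation_space.shape,
                      num_actions=env.action_space.n,
                      mem_size=50_000, mini_batchsize=64,
                      epsilon_dec=1e-4,
                      env_name='CartPole-v1', algo_name='DoubleDQN')

for episode in range(500):
    obs, done, score = env.reset(), False, 0.0
    while not done:
        action = int(agent.get_action(obs))
        new_obs, reward, done, _ = env.step(action)
        agent.store_memory(obs, action, reward, new_obs, done)
        agent.learn()                    # returns immediately until a full mini-batch is stored
        obs, score = new_obs, score + reward
    agent.log(episode)                   # TensorBoard scalars: td_error and learning/target weight gap
    if episode % 50 == 0:
        agent.save_models()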