Example #1
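The examples below omit their import preambles. A plausible shared preamble, under the assumption that the snippets use the usual aliases (optim and opt for torch.optim, F for torch.nn.functional) and that Example #3's module-level device is the standard CUDA fallback; project-local pieces such as hyp (Example #5's hyperparameter module) and the network/helper classes are sketched after the examples that use them:

import copy
import math
import random
from collections import deque, namedtuple
from typing import List

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim as opt  # Example #5 refers to torch.optim as `opt`
from scipy.optimize import minimize
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")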
class DQNAgent(object):
    def __init__(self, device):
        self.device = device
        self.env = None

        self.model = QModule().to(self.device)
        self.target = copy.deepcopy(self.model)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=3e-4)

        self.BATCH_SIZE = 10
        self.REPLAY_MEMORY_SIZE = 1_000_000
        self.replay_memory = ReplayMemory(self.device,
                                          maximum_size=self.REPLAY_MEMORY_SIZE)

        self.EPS_MIN = 0.01
        self.EPS_EP = 100
        self.GAMMA = 0.99
        self.TAU = 0.005

        self.explore = False

    def set(self, env):
        self.env = env

    def memorize(self, current_state, action, reward, next_state, done):
        self.replay_memory.push(current_state, action, reward, next_state,
                                done)

    def update_target(self):
        with torch.no_grad():
            for model_kernel, target_kernel in zip(self.model.parameters(),
                                                   self.target.parameters()):
                target_kernel.copy_((1 - self.TAU) * target_kernel +
                                    self.TAU * model_kernel)

    def train(self):
        if len(self.replay_memory) < self.BATCH_SIZE:
            return
        samples = self.replay_memory.sample(self.BATCH_SIZE)
        current_states, actions, rewards, next_states, dones = \
            samples["state"], samples["action"], samples["reward"], samples["next_state"], samples["done"]

        with torch.no_grad():
            next_action = self.model(next_states).argmax(dim=-1, keepdim=True)
            next_q = self.target(next_states).gather(1, next_action).squeeze()

        y = rewards + (1.0 - dones) * self.GAMMA * next_q

        actions = torch.LongTensor(get_actions_number(actions)).to(self.device)
        # pick out the Q-value of the action actually taken in each transition;
        # the index must be shaped (BATCH_SIZE, 1) for gather along dim 1
        current_q = self.model(current_states).gather(
            1, actions.view(self.BATCH_SIZE, 1)).squeeze(1)

        self.optimizer.zero_grad()
        loss = F.mse_loss(current_q, y.detach())
        loss.backward()
        self.optimizer.step()

        self.update_target()

    def act(self, state, episode):
        if self.explore:
            exploration = max((self.EPS_MIN - 1) / self.EPS_EP * episode + 1,
                              self.EPS_MIN)
            if random.random() < exploration:
                return get_action(random.randint(0, 6))

        state = np.array([state])
        state = torch.FloatTensor(state).to(self.device)
        with torch.no_grad():
            output = self.model(state).detach().cpu().numpy()
        action = get_action(np.argmax(output))
        return action
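DQNAgent's ReplayMemory is constructed with a device and returns a sampled batch as a dict of stacked tensors; the class itself is not shown. A minimal sketch matching the interface inferred from the calls above (the dict keys, and leaving actions raw because train() converts them with get_actions_number, are assumptions):

class ReplayMemory:
    def __init__(self, device, maximum_size=1_000_000):
        self.device = device
        self.buffer = deque(maxlen=maximum_size)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        def as_tensor(xs):
            return torch.as_tensor(np.array(xs),
                                   dtype=torch.float32,
                                   device=self.device)

        return {
            "state": as_tensor(states),
            "action": actions,  # left raw; train() maps them to indices
            "reward": as_tensor(rewards),
            "next_state": as_tensor(next_states),
            "done": as_tensor(dones),
        }

    def __len__(self):
        return len(self.buffer)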
Example #2
class MultiAgentPlanner:
    def __init__(self, index, reward_threshold, collision_threshold,
                 world_size, states, num_agents, collision_distance):
        self.index = index
        self.name = 'multi safe q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        kernel = RBF(length_scale=world_size, length_scale_bounds=[(1e1, 1e5), (1e1, 1e5), (1e1, 1e5)]) \
            + WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=kernel)
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance
        self.trajs = [[] for _ in range(num_agents)]
        self.my_states = []
        self.action_traj = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1

        self.dimensions = [3, 50, 50, 7]
        self.dqn = MLP(self.dimensions).double()
        self.dqn_l = MLP(self.dimensions).double()
        self.dqn_u = MLP(self.dimensions).double()
        self.lr = 1e-3
        self.optimizer = optim.RMSprop(self.dqn.parameters(), lr=self.lr)
        self.optimizer_l = optim.RMSprop(self.dqn_l.parameters(), lr=self.lr)
        self.optimizer_u = optim.RMSprop(self.dqn_u.parameters(), lr=self.lr)

        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_l = MLP(self.dimensions).double()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.target_u = MLP(self.dimensions).double()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]
        self.num_collisions = 0
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore=False):
        possible_actions = copy.copy(Action.SET)
        action_next_states = []
        reward_ls = []
        reward_uncertainty = []
        no_collision_ls = [1.0 for _ in Action.SET]
        best_action = Action.STAY

        # best_action = np.argmax(self.target(torch.tensor(self.states[self.index])).tolist())
        #
        # if explore or np.random.binomial(1, self.eps) == 1:
        #     best_action = possible_actions[np.random.choice(len(possible_actions))]

        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            action_next_states += [(a, next_state)]
            reward_ls += [reward - self.beta * std]
            reward_uncertainty += [std]

        for action, next_state in action_next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]

                for agent_action in Action.SET:
                    possible_next_agent_state = cur_agent_state + get_movement(
                        agent_action)
                    # only moves that would actually collide should reduce
                    # the no-collision probability; skip the safe ones
                    if np.linalg.norm(possible_next_agent_state -
                                      next_state) >= self.collision_distance:
                        continue

                    a_prob = self._get_policy(agent, agent_action)
                    no_collision_ls[action] *= (1 - a_prob)

        # for i, l in enumerate(reward_ls):
        #     if l <= self.reward_threshold or not self._returnable(action_next_states[i][1]):
        #         possible_actions.remove(i)

        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)

        possible_actions = list(possible_actions)
        if explore or np.random.binomial(1, self.eps) == 1:
            # most_uncertain_action = Action.STAY
            # largest_uncertainty = -math.inf
            # for action in possible_actions:
            #     if reward_uncertainty[action] > largest_uncertainty:
            #         most_uncertain_action = action
            #         largest_uncertainty = reward_uncertainty[action]
            #
            # best_action = most_uncertain_action
            if len(possible_actions) > 0:
                best_action = possible_actions[np.random.choice(
                    len(possible_actions))]
        else:
            best_q_action = Action.STAY
            best_q = -math.inf
            q_values = self.target(
                torch.tensor(self.states[self.index],
                             dtype=torch.double)).tolist()
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_q_action = action
                    best_q = q_values[action]
            best_action = best_q_action

        if len(possible_actions) == 0:
            # joint_prob = np.array(reward_ls) * np.array(no_collision_ls)
            # best_action = np.argmax(joint_prob)
            best_action = np.argmax(no_collision_ls)

        self.action_traj += [best_action]
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))

        if len(self.rewards) > 50:
            self.rewards.pop(0)
            self.my_states.pop(0)

        self.rewards += [reward]
        self.states = states

        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break

        if reward < self.reward_threshold:
            self.num_unsafe += 1

        for i in range(self.num_agents):
            self.trajs[i] += [states[i]]
            if i == self.index:
                self.my_states += [states[i]]

        self.cum_rewards += reward

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.trajs = [[] for _ in range(self.num_agents)]
        self.action_traj = []
        self.states = states
        # self.epsilons = [0. for _ in range(self.num_agents)]
        # self.tau_exploits = [1. for _ in range(self.num_agents)]
        # self.tau_explores = [1. for _ in range(self.num_agents)]
        # self.rewards = []
        # self.cum_rewards = 0

    def learn_from_buffer(self):
        self.reward_gp.fit(self.my_states, self.rewards)
        self._value_func_estimate()
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            self._optimize_parameters(agent)

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        reward_batch = torch.cat(batch.reward)
        reward_l_batch = []
        reward_u_batch = []
        for state in state_batch:
            cur_state = state.tolist()
            reward, std = self.reward_gp.predict(np.array([cur_state]),
                                                 return_std=True)
            reward_l_batch.append(
                torch.tensor([reward[0] - self.beta * std[0]],
                             dtype=torch.double))
            reward_u_batch.append(
                torch.tensor([reward[0] + self.beta * std[0]],
                             dtype=torch.double))

        reward_l_batch = torch.cat(reward_l_batch)
        reward_u_batch = torch.cat(reward_u_batch)
        action_batch = torch.cat(batch.action)
        next_state_batch = torch.cat(batch.next_state)

        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        state_action_values = self.dqn_u(state_batch).gather(1, action_batch)
        next_state_values = self.target_u(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_u_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer_u.zero_grad()
        loss.backward()
        for param in self.dqn_u.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_u.step()

        state_action_values = self.dqn_l(state_batch).gather(1, action_batch)
        next_state_values = self.target_l(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_l_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer_l.zero_grad()
        loss.backward()
        for param in self.dqn_l.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_l.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()
            self.target_u.load_state_dict(self.dqn_u.state_dict())
            self.target_u.eval()
            self.target_l.load_state_dict(self.dqn_l.state_dict())
            self.target_l.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state

    def _get_policy(self, agent, action):
        epsilon = self.epsilons[agent]
        tau_explore = self.tau_explores[agent]
        tau_exploit = self.tau_exploits[agent]
        return self._compute_policy_upperbound(epsilon, tau_explore,
                                               tau_exploit, agent, action)

    def _returnable(self, state):
        for a in Action.SET:
            next_state = self._move_coordinate(state, a)
            reward, std = self.reward_gp.predict(np.array([next_state]),
                                                 return_std=True)
            reward = reward[0]
            std = std[0]
            if reward - self.beta * std >= self.reward_threshold:
                return True

        return False

    def _optimize_parameters(self, agent):
        traj = self.trajs[agent]
        if len(traj) > 20:
            traj = traj[len(traj) - 20:]

        def _compute_log_likelihood(parameters):
            epsilon, tau_explore, tau_exploit = parameters

            # accumulate in log space so long trajectories do not underflow
            log_likelihood = 0.0
            for step in range(1, len(traj)):
                prev_state = traj[step - 1]
                cur_state = traj[step]

                movement = np.rint(cur_state - prev_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue

                log_likelihood += np.log(
                    self._compute_policy_upperbound(epsilon, tau_explore,
                                                    tau_exploit, agent,
                                                    action))

            return -log_likelihood

        res = minimize(_compute_log_likelihood,
                       np.array([0.5, 1.0, 1.0]),
                       method='L-BFGS-B',
                       bounds=np.array([(1e-6, 1.0), (0.1, 10.0),
                                        (0.1, 10.0)]))

        if not np.all(np.equal(res.x, np.array([0.5, 1.0, 1.0]))):
            self.epsilons[agent] = res.x[0]
            self.tau_explores[agent] = res.x[1]
            self.tau_exploits[agent] = res.x[2]

    def _compute_policy_upperbound(self, epsilon, tau_explore, tau_exploit,
                                   agent, action):
        q = self.dqn(torch.tensor(self.states[agent],
                                  dtype=torch.double)).detach().numpy()
        q_u = self.dqn_u(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()
        q_l = self.dqn_l(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()

        ofu_denom = copy.copy(q)
        ofu_denom[action] = q_u[action]

        boltz_denom = copy.copy(q_l)
        boltz_denom[action] = q[action]

        explore_mean_q = np.mean(q_u / tau_explore)
        prob_ofu = np.exp(q_u[action] / tau_explore - explore_mean_q) / np.sum(
            np.exp(ofu_denom / tau_explore - explore_mean_q))

        exploit_mean_q = np.mean(q / tau_exploit)
        prob_boltz = np.exp(q[action] / tau_exploit - exploit_mean_q) / np.sum(
            np.exp(boltz_denom / tau_exploit - exploit_mean_q))

        return epsilon * prob_ofu + (1 - epsilon) * prob_boltz
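Example #2 and the remaining agents share a different ReplayMemory from Example #1's: sample() returns a list of Transition namedtuples that the callers re-zip with Transition(*zip(*transitions)). A minimal sketch in the style of the PyTorch DQN tutorial, with the field order taken from the push() calls above:

Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state'))


class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)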
Example #3
class DeepExpIDSAgent(Agent):
    def __init__(self,
                 feed_units: List[int],
                 agent_name: str,
                 ensemble_size: int = 100,
                 prior_variance: float = 1.0,
                 model_dims: List[int] = [20],
                 lr: float = 1e-3,
                 batch_size: int = 128,
                 noise_variance=0):
        self.feed_units = copy.deepcopy(feed_units)
        #         self.available_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name

        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.num_features: int = len(feed_units) + 1
        self.noise_variance = noise_variance

        self.ensemble_size: int = ensemble_size
        self.training_data = ReplayMemory(100000)
        #         self.training_datas = []
        #         for i in range(self.ensemble_size):
        #             self.training_datas.append(ReplayMemory(100000))

        self.latest_feature = None
        self.latest_action = None

        self.prior_variance = prior_variance

        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        priors = []

        for i in range(self.ensemble_size):
            priors.append(MLP(self.model_dims))
            priors[i].initialize()
            priors[i].double()
            priors[i].eval()
            priors[i].to(device)

        self.models: List[DQNWithPrior] = []
        for i in range(self.ensemble_size):
            self.models.append(
                DQNWithPrior(self.model_dims,
                             priors[i],
                             scale=np.sqrt(self.prior_variance)))
            self.models[i].initialize()
            self.models[i].double()
            self.models[i].to(device)

        self.target_nets: List[DQNWithPrior] = []
        for i in range(self.ensemble_size):
            self.target_nets.append(
                DQNWithPrior(self.model_dims,
                             priors[i],
                             scale=np.sqrt(self.prior_variance)))
            self.target_nets[i].load_state_dict(self.models[i].state_dict())
            self.target_nets[i].double()
            self.target_nets[i].eval()
            self.target_nets[i].to(device)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        self.optimizers = []
        for i in range(self.ensemble_size):
            self.optimizers.append(
                optim.Adam(self.models[i].parameters(), lr=lr))

        self.cur_net = self.target_nets[np.random.choice(self.ensemble_size)]
        self.batch_size = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.cum_reward_history: List[float] = []
        self.current_feed = 0

    def choose_action(self):
        available_actions = [0, 1]

        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.

        with torch.no_grad():
            all_outcomes = [
                self.target_nets[model_index](torch.tensor(features,
                                                           dtype=torch.double))
                for model_index in range(self.ensemble_size)
            ]

            mean_immediate_regret = self.mean_immediate_regret(all_outcomes)
            var_immediate_regret = self.var_immediate_regret(
                all_outcomes, len(available_actions))
            best_index = self.best_ids_action(mean_immediate_regret,
                                              var_immediate_regret)

            best_action = [available_actions[best_index]]
            self.latest_feature = features
            self.latest_action = best_action
            if best_action[0] == 1:
                self.history_unit_indices.append(self.current_feed)

            self.current_feed += 1

            # print('action: {}'.format(best_action[0]))
            return best_action[0]

    def mean_immediate_regret(self, all_outcomes):
        sum_immediate_regret = None
        for model_index in range(self.ensemble_size):
            outcomes = all_outcomes[model_index]
            max_outcome, _ = torch.max(outcomes, 0)
            if sum_immediate_regret is None:
                sum_immediate_regret = max_outcome - outcomes
            else:
                sum_immediate_regret += max_outcome - outcomes

        return sum_immediate_regret / self.ensemble_size

    def var_immediate_regret(self, all_outcomes, num_actions):
        count_best_outcome = [0 for _ in range(num_actions)]
        sum_out_best = {}
        sum_out_all = None

        for model_index in range(self.ensemble_size):
            outcomes = all_outcomes[model_index]
            max_outcome, best_index = torch.max(outcomes, 0)
            # .item() matters here: 0-dim tensors hash by object identity,
            # so using the tensor as a dict key would never hit an entry
            best_index = best_index.item()
            count_best_outcome[best_index] += 1
            if best_index in sum_out_best:
                sum_out_best[best_index] += outcomes
            else:
                # clone so the in-place += never aliases all_outcomes
                sum_out_best[best_index] = outcomes.clone()

            if sum_out_all is None:
                sum_out_all = outcomes.clone()
            else:
                sum_out_all += outcomes

        var = torch.tensor([0. for _ in range(num_actions)]).double()
        for a in range(num_actions):
            if a not in sum_out_best:
                sum_out_best[a] = torch.tensor(
                    [0. for _ in range(num_actions)]).double()

            coeff = count_best_outcome[a] / self.ensemble_size
            if coeff == 0:
                continue

            # deviation of the model-conditional mean of y_a from the overall
            # ensemble mean (sum_out_all sums over ensemble members)
            sum_err = (1 / count_best_outcome[a] * sum_out_best[a][a] -
                       1 / self.ensemble_size * sum_out_all[a])**2
            var[a] = coeff * sum_err.item()

        return var

    def best_ids_action(self, mean_immediate_regret, var_immediate_regret):
        regret_sq = mean_immediate_regret**2
        info_gain = torch.log(1 + var_immediate_regret) + 1e-5
        return torch.argmin(regret_sq / info_gain)

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        # print('reward: {}'.format(reward))
        self.cum_rewards += reward
        if not scroll:
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double),
                torch.tensor([self.latest_action], dtype=torch.long),
                torch.tensor([reward], dtype=torch.double),
                None,
            )
            return

        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.

        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double),
            torch.tensor([self.latest_action], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([features], dtype=torch.double),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.0
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, batch.next_state)),
                                          device=device,
                                          dtype=torch.bool)
            non_final_next_states = [
                s for s in batch.next_state if s is not None
            ]

            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            for i in range(self.ensemble_size):
                state_action_values = self.models[i](state_batch).gather(
                    1, action_batch)

                next_state_values = torch.zeros(self.batch_size,
                                                device=device,
                                                dtype=torch.double)
                # guard against a batch where every transition is terminal
                if non_final_next_states:
                    next_state_values[non_final_mask] = self.target_nets[i](
                        torch.cat(non_final_next_states)).max(1)[0].detach()

                expected_state_action_values = self.gamma * next_state_values + reward_batch

                loss = self.loss_fn(state_action_values,
                                    expected_state_action_values.unsqueeze(1))
                loss_ensemble += loss.item()

                self.optimizers[i].zero_grad()
                loss.backward()
                self.optimizers[i].step()

            # track the ensemble loss of this round of updates
            self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
            loss_ensemble = 0.0

    def reset(self):
        self.available_units = copy.deepcopy(self.feed_units)
        # record the episode's return before zeroing it
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None

        for i in range(self.ensemble_size):
            self.target_nets[i].load_state_dict(self.models[i].state_dict())
            self.target_nets[i].double()
            self.target_nets[i].eval()
            self.target_nets[i].to(device)

        self.cur_net = self.target_nets[np.random.choice(self.ensemble_size)]
        self.history_unit_indices = []
        self.current_feed = 0
        self.current_loc = [0, 0]
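MLP and DQNWithPrior are likewise assumed rather than shown. A plausible minimal version: MLP(dims) is a fully connected ReLU network over the given layer widths, and DQNWithPrior follows the randomized prior functions of Osband et al. (2018), adding a frozen, randomly initialized prior network to a trainable one:

class MLP(nn.Module):
    def __init__(self, dims):
        super().__init__()
        layers = []
        for i in range(len(dims) - 1):
            layers.append(nn.Linear(dims[i], dims[i + 1]))
            if i < len(dims) - 2:
                layers.append(nn.ReLU())
        self.net = nn.Sequential(*layers)

    def initialize(self):
        for module in self.net:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, x):
        return self.net(x)


class DQNWithPrior(nn.Module):
    def __init__(self, dims, prior, scale=1.0):
        super().__init__()
        self.net = MLP(dims)
        self.prior = prior  # fixed network; never trained
        self.scale = scale
        for param in self.prior.parameters():
            param.requires_grad_(False)

    def initialize(self):
        self.net.initialize()

    def forward(self, x):
        with torch.no_grad():
            prior_out = self.prior(x)
        return self.net(x) + self.scale * prior_out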
Example #4
class QLearningAgent:
    def __init__(self, index, world_size, states, num_agents,
                 collision_distance):
        self.index = index
        self.name = 'q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance

        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.lr = 1e-4
        self.optimizer = optim.RMSprop(self.dqn.parameters(), lr=self.lr)
        self.eps = 0.1

        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        best_action = np.argmax(
            self.target(
                torch.tensor(self.states[self.index],
                             dtype=torch.double)).tolist())

        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]

        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states

        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break

        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)

        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
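The grid-world helpers (Action, get_movement, get_action, bound_action) are not shown either. A minimal 2-D sketch consistent with the [2, 5, 5, 4] network of this example and Example #6; Example #2 would need a 3-D, 7-action variant that adds Action.STAY and the two vertical moves, and the real get_action presumably uses world_size to undo wrap-around, which this sketch ignores:

class Action:
    UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3
    SET = [UP, DOWN, LEFT, RIGHT]


MOVEMENTS = {
    Action.UP: np.array([0.0, 1.0]),
    Action.DOWN: np.array([0.0, -1.0]),
    Action.LEFT: np.array([-1.0, 0.0]),
    Action.RIGHT: np.array([1.0, 0.0]),
}


def get_movement(action):
    return MOVEMENTS[action]


def get_action(movement, world_size):
    # inverse of get_movement: -1 when the displacement is not a unit step
    for action, delta in MOVEMENTS.items():
        if np.array_equal(np.rint(movement), delta):
            return action
    return -1


def bound_action(state, width, height):
    # clip a coordinate so it stays inside the world rectangle
    return np.clip(state, [0.0, 0.0], [width - 1.0, height - 1.0])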
Example #5
class SoftActorCritic(object):
    def __init__(self, observation_space, action_space):
        self.s_dim = observation_space.shape[0]
        self.a_dim = action_space.shape[0]
        self.alpha = hyp.ALPHA

        # create component networks
        self.q_network_1 = QNetwork(self.s_dim, self.a_dim,
                                    hyp.H_DIM).to(hyp.device)
        self.q_network_2 = QNetwork(self.s_dim, self.a_dim,
                                    hyp.H_DIM).to(hyp.device)
        self.target_q_network_1 = QNetwork(self.s_dim, self.a_dim,
                                           hyp.H_DIM).to(hyp.device)
        self.target_q_network_2 = QNetwork(self.s_dim, self.a_dim,
                                           hyp.H_DIM).to(hyp.device)
        self.policy_network = PolicyNetwork(self.s_dim, self.a_dim, hyp.H_DIM,
                                            action_space).to(hyp.device)

        # copy weights from q networks to target networks
        copy_params(self.target_q_network_1, self.q_network_1)
        copy_params(self.target_q_network_2, self.q_network_2)

        # optimizers
        self.q_network_1_opt = opt.Adam(self.q_network_1.parameters(), hyp.LR)
        self.q_network_2_opt = opt.Adam(self.q_network_2.parameters(), hyp.LR)
        self.policy_network_opt = opt.Adam(self.policy_network.parameters(),
                                           hyp.LR)

        # automatic entropy tuning
        if hyp.ENTROPY_TUNING:
            self.target_entropy = -torch.prod(
                torch.Tensor(action_space.shape).to(hyp.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=hyp.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=hyp.LR)

        self.replay_memory = ReplayMemory(hyp.REPLAY_MEMORY_SIZE)

    def get_action(self, s):
        state = torch.FloatTensor(s).to(hyp.device).unsqueeze(0)
        action, _, _ = self.policy_network.sample_action(state)
        return action.detach().cpu().numpy()[0]

    def update_params(self):
        states, actions, rewards, next_states, ndones = self.replay_memory.sample(
            hyp.BATCH_SIZE)

        # make sure all are torch tensors
        states = torch.FloatTensor(states).to(hyp.device)
        actions = torch.FloatTensor(actions).to(hyp.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(hyp.device)
        next_states = torch.FloatTensor(next_states).to(hyp.device)
        ndones = torch.FloatTensor(np.float32(ndones)).unsqueeze(1).to(
            hyp.device)

        # compute targets
        with torch.no_grad():
            next_action, next_log_pi, _ = self.policy_network.sample_action(
                next_states)
            next_target_q1 = self.target_q_network_1(next_states, next_action)
            next_target_q2 = self.target_q_network_2(next_states, next_action)
            next_target_q = torch.min(
                next_target_q1, next_target_q2) - self.alpha * next_log_pi
            next_q = rewards + hyp.GAMMA * ndones * next_target_q

        # compute losses
        q1 = self.q_network_1(states, actions)
        q2 = self.q_network_2(states, actions)

        q1_loss = F.mse_loss(q1, next_q)
        q2_loss = F.mse_loss(q2, next_q)

        pi, log_pi, _ = self.policy_network.sample_action(states)
        q1_pi = self.q_network_1(states, pi)
        q2_pi = self.q_network_2(states, pi)
        min_q_pi = torch.min(q1_pi, q2_pi)

        policy_loss = ((self.alpha * log_pi) - min_q_pi).mean()

        # gradient descent
        self.q_network_1_opt.zero_grad()
        q1_loss.backward()
        self.q_network_1_opt.step()

        self.q_network_2_opt.zero_grad()
        q2_loss.backward()
        self.q_network_2_opt.step()

        self.policy_network_opt.zero_grad()
        policy_loss.backward()
        self.policy_network_opt.step()

        # alpha loss
        if hyp.ENTROPY_TUNING:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.tensor(0.).to(hyp.device)

        # update target network params
        soft_update(self.target_q_network_1, self.q_network_1)
        soft_update(self.target_q_network_2, self.q_network_2)

        return (q1_loss.item(), q2_loss.item(), policy_loss.item(),
                alpha_loss.item())
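copy_params and soft_update are the standard target-network helpers this class relies on; minimal versions, with the Polyak coefficient as an assumed default since the hyp module is not shown:

def copy_params(target, source):
    # hard update: target <- source
    target.load_state_dict(source.state_dict())


def soft_update(target, source, tau=0.005):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(),
                                    source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)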
Example #6
class NaiveSafeQLearningAgent:
    def __init__(self, index, world_size, states, num_agents,
                 collision_distance, collision_threshold, reward_threshold):
        self.index = index
        self.name = 'naive q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.collision_threshold = collision_threshold
        self.reward_threshold = reward_threshold

        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.lr = 1e-4
        self.optimizer = optim.RMSprop(self.dqn.parameters(), lr=self.lr)
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        no_collision_ls = [1.0 for _ in Action.SET]

        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0],
                                      self.world_size[1])

            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]
                if np.linalg.norm(cur_agent_state -
                                  next_state) > 1.0 + self.collision_distance:
                    continue

                movement = np.rint(next_state - cur_agent_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue

                no_collision_ls[a] *= 0.75

        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)

        q = self.target(
            torch.tensor(self.states[self.index],
                         dtype=torch.double)).tolist()

        if len(possible_actions) == 0:
            best_action = np.argmax(no_collision_ls)
            self.action_traj.append(best_action)
            return best_action

        best_q = -math.inf
        best_action = Action.UP
        for action in possible_actions:
            if q[action] > best_q:
                best_q = q[action]
                best_action = action

        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(
                len(possible_actions))]

        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states

        if reward < self.reward_threshold:
            self.num_unsafe += 1

        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break

        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)

        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
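The grid agents in Examples #2, #4, and #6 share the same choose_action / update_buffer / learn_from_buffer interface. A hypothetical driver showing how they fit together; the environment API (reset() returning one state per agent, step(actions) returning per-agent rewards plus the new states) is an assumption, not part of the original code:

def run_episode(agents, env, explore=False, num_steps=100):
    states = env.reset()  # hypothetical: list with one state per agent
    for agent in agents:
        agent.reset(states)

    for _ in range(num_steps):
        actions = [agent.choose_action(explore) for agent in agents]
        rewards, states = env.step(actions)  # hypothetical env API
        for agent, reward in zip(agents, rewards):
            agent.update_buffer(reward, states)

    for agent in agents:
        agent.learn_from_buffer()

    return [agent.cum_rewards for agent in agents]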