示例#1
0
    def __init__(self, index, world_size, states, num_agents,
                 collision_distance):
        """Create the agent state, its Q-network and a frozen target copy.

        Args:
            index: Stored as ``self.index``.
            world_size: Stored as ``self.world_size``.
            states: Initial value of ``self.states``.
            num_agents: Stored as ``self.num_agents``.
            collision_distance: Stored as ``self.collision_distance``.
        """
        # Identity and environment description.
        self.name = 'q agent'
        self.index = index
        self.num_agents = num_agents
        self.world_size = world_size
        self.collision_distance = collision_distance
        self.states = states

        # Experience storage and running statistics.
        self.buffer = ReplayMemory(10000)
        self.rewards = []
        self.action_traj = []
        self.num_collisions = 0
        self.cum_rewards = 0
        self.target_usage = 0

        # Hyper-parameters.
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        # NOTE(review): ``lr`` is only stored; the optimizer below is built
        # without it and therefore keeps its own default learning rate.
        self.lr = 1e-4

        # Online network plus a target network that starts as an exact copy
        # of it and is switched to evaluation mode.
        self.dimensions = [2, 5, 5, 4]
        online_net = MLP(self.dimensions).double()
        target_net = MLP(self.dimensions).double()
        target_net.load_state_dict(online_net.state_dict())
        target_net.eval()
        self.dqn = online_net
        self.target = target_net

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(online_net.parameters())
    def __init__(
            self,
            initial_feed_candidates,
            user_features,
            feed_counts,
            agent_name: str,
            feed_feature_count = 6,
            user_feature_count = 6,
            model_dims: List[int] = [50, 25],
            lr: float = 1e-3,
            boltzmann: bool = True,
            epsilon: float = 0.05,
            batch_size: int = 128,
    ):
        """Build the value network, its target copy and the bookkeeping state.

        The network input width is ``feed_counts * feed_feature_count +
        feed_feature_count + user_feature_count``; ``model_dims`` gives the
        hidden layer sizes and the network has a single output unit.

        Args:
            initial_feed_candidates: Stored both as the initial and as the
                current candidate collection.
            user_features: Stored as ``self.user_features``.
            feed_counts: Number of feeds used to size the input layer.
            agent_name: Stored as ``self.agent_name``.
            feed_feature_count: Feature count per feed.
            user_feature_count: Feature count for the user.
            model_dims: Hidden layer sizes (not mutated).
            lr: Learning rate handed to the Adam optimizer.
            boltzmann: Stored as ``self.boltzmann``.
            epsilon: Stored as ``self.epsilon``.
            batch_size: Stored as ``self.batch_size``.
        """
        # Problem description.
        self.agent_name = agent_name
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        # Both attributes start out referring to the very same object.
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates

        # Running state.
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.cum_reward_history: List[float] = []
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.running_loss = 0.0

        # Exploration / training hyper-parameters.
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99

        all_feeds_width = feed_counts * feed_feature_count
        self.num_features = (all_feeds_width + feed_feature_count
                             + user_feature_count)
        self.training_data: ReplayMemory = ReplayMemory(100000)

        # Online network.
        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        online_net = MLP(self.model_dims).double()
        online_net.initialize()
        online_net.to(device)
        self.model = online_net

        # Target network: same architecture, weights copied from the online
        # network, kept in evaluation mode.
        frozen_net = MLP(self.model_dims).double().to(device)
        frozen_net.load_state_dict(online_net.state_dict())
        frozen_net.eval()
        self.target_net = frozen_net

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(online_net.parameters(), lr=lr)
示例#3
0
    def __init__(self,
                 initial_feed_candidates,
                 user_features,
                 feed_counts: int,
                 agent_name: str,
                 feed_feature_count=6,
                 user_feature_count=6,
                 model_dims=[50, 25],
                 batch_size: int = 128,
                 interest_unknown: bool = False,
                 boltzmann: bool = True):
        """Build the regression network and the bookkeeping state.

        The network input width is ``feed_counts * feed_feature_count +
        feed_feature_count + user_feature_count``; ``model_dims`` gives the
        hidden layer sizes and the network has a single output unit.

        Args:
            initial_feed_candidates: Stored both as the initial and as the
                current candidate collection.
            user_features: Stored as ``self.user_features``.
            feed_counts: Number of feeds used to size the input layer.
            agent_name: Stored as ``self.agent_name``.
            feed_feature_count: Feature count per feed.
            user_feature_count: Feature count for the user.
            model_dims: Hidden layer sizes (not mutated).
            batch_size: Stored as ``self.batch_size``.
            interest_unknown: Accepted but not used by this constructor.
            boltzmann: Stored as ``self.boltzmann``.
        """
        # Problem description.
        self.agent_name = agent_name
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        # Both attributes start out referring to the very same object.
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates

        # Running state.
        self.cum_rewards: float = 0.
        self.cum_reward_history: List[float] = []
        self.rewards = []
        self.actions = []
        self.training_data = []
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.running_loss = 0.0

        # Exploration / training hyper-parameters (epsilon is fixed here).
        self.boltzmann: bool = boltzmann
        self.epsilon: float = 0.05
        self.batch_size: int = batch_size
        self.gamma = 0.99

        all_feeds_width = feed_counts * feed_feature_count
        self.num_features: int = (all_feeds_width + feed_feature_count
                                  + user_feature_count)
        self.buffer: SupervisedMemory = SupervisedMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        network = MLP(self.model_dims).double()
        network.initialize()
        network.to(device)
        self.model = network

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(network.parameters(), lr=1e-3)
示例#4
0
    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: List[int] = [],
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        """Build the two-output Q-network, its target copy and agent state.

        The network takes ``len(feed_units)`` inputs, uses ``model_dims`` as
        hidden layer sizes and has two output units.

        Args:
            feed_units: Deep-copied into ``self.feed_units``; its length
                fixes the input width.
            agent_name: Stored as ``self.agent_name``.
            model_dims: Hidden layer sizes (not mutated).
            lr: Learning rate handed to the Adam optimizer.
            boltzmann: Stored as ``self.boltzmann``.
            epsilon: Stored as ``self.epsilon``.
            batch_size: Stored as ``self.batch_size``.
        """
        self.agent_name = agent_name
        self.feed_units = copy.deepcopy(feed_units)

        # Running state.
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.cum_reward_history: List[float] = []
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.current_loc = [0, 0]
        self.running_loss = 0.0

        # Exploration / training hyper-parameters.
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99

        self.num_features: int = len(feed_units)
        self.training_data: ReplayMemory = ReplayMemory(100000)

        # Online network.
        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        online_net = MLP(self.model_dims).double()
        online_net.initialize()
        online_net.to(device)
        self.model = online_net

        # Target network: same architecture, weights copied from the online
        # network, kept in evaluation mode.
        frozen_net = MLP(self.model_dims).double().to(device)
        frozen_net.load_state_dict(online_net.state_dict())
        frozen_net.eval()
        self.target_net = frozen_net

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(online_net.parameters(), lr=lr)
示例#5
0
    def __init__(self,
                 embedding_size=128,
                 num_layers=2,
                 bidirectional=True,
                 cuda=False):
        """Build the token embedding, one LSTM per relation type and a scorer.

        embedding_size -- size of each RNN embedding
        num_layers -- number of RNN layers
        bidirectional -- whether the RNN is bidirectional
        cuda -- whether to use GPU
        """
        super(RNNModel, self).__init__()
        self._embedding_size = embedding_size
        self._num_layers = num_layers
        # Each direction of a bidirectional LSTM gets half the embedding
        # width (integer division).
        self._hidden_size = (embedding_size // 2
                             if bidirectional else embedding_size)

        # Embedding table with one row per entry of RNN_TOKENS.
        self.embedding = nn.Embedding(len(RNN_TOKENS), embedding_size)

        # One LSTM per relation type. They live in a plain dict, so each one
        # is also registered explicitly as a sub-module.
        self.lstms = {}
        for relation_type in RNN_RELATIONS:
            self.lstms[relation_type] = nn.LSTM(embedding_size,
                                                self._hidden_size,
                                                self._num_layers,
                                                bidirectional=bidirectional)
            self.add_module("lstm_%s" % relation_type,
                            self.lstms[relation_type])

        # Scoring MLP mapping an embedding to a single value.
        self.score = MLP([embedding_size, 1])

        # Move the module to the GPU when requested.
        self._cuda = cuda
        if cuda:
            self.cuda()
    def __init__(self, index, reward_threshold, collision_threshold,
                 world_size, states, num_agents, collision_distance):
        """Create the reward GP, three Q-networks with targets and statistics.

        Args:
            index: Stored as ``self.index``.
            reward_threshold: Stored as ``self.reward_threshold``.
            collision_threshold: Stored as ``self.collision_threshold``.
            world_size: Stored and also used as the RBF kernel length scale.
            states: Initial value of ``self.states``.
            num_agents: Number of agents; sizes the per-agent lists.
            collision_distance: Stored as ``self.collision_distance``.
        """
        # Identity and environment description.
        self.name = 'multi safe q agent'
        self.index = index
        self.num_agents = num_agents
        self.world_size = world_size
        self.states = states
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance

        # Gaussian-process reward model: RBF kernel plus white noise.
        rbf = RBF(length_scale=world_size,
                  length_scale_bounds=[(1e1, 1e5)] * 3)
        noise = WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=rbf + noise)

        # Histories and replay storage.
        self.rewards = []
        self.my_states = []
        self.action_traj = []
        self.trajs = [[] for _ in range(num_agents)]
        self.buffer = ReplayMemory(10000)

        # Hyper-parameters.
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        # NOTE(review): ``lr`` is only stored; the optimizers below are built
        # without it and therefore keep their own default learning rate.
        self.lr = 1e-3

        # Three online networks (created in this order), one optimizer each.
        self.dimensions = [3, 50, 50, 7]
        self.dqn, self.dqn_l, self.dqn_u = (
            MLP(self.dimensions).double() for _ in range(3))
        self.optimizer, self.optimizer_l, self.optimizer_u = (
            optim.RMSprop(net.parameters())
            for net in (self.dqn, self.dqn_l, self.dqn_u))

        def _frozen_copy(source_net):
            # New network with the source's weights, in evaluation mode.
            clone = MLP(self.dimensions).double()
            clone.load_state_dict(source_net.state_dict())
            clone.eval()
            return clone

        self.target = _frozen_copy(self.dqn)
        self.target_l = _frozen_copy(self.dqn_l)
        self.target_u = _frozen_copy(self.dqn_u)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        # Per-agent policy parameters, one entry per agent.
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]

        # Counters.
        self.num_collisions = 0
        self.num_unsafe = 0
        self.cum_rewards = 0
        self.target_usage = 0
class MultiAgentPlanner():
    def __init__(self, index, reward_threshold, collision_threshold,
                 world_size, states, num_agents, collision_distance):
        """Create the reward GP, three Q-networks with targets and statistics.

        Args:
            index: Position of this agent's own entry in ``states``.
            reward_threshold: Rewards below this count as unsafe steps.
            collision_threshold: Minimum estimated no-collision probability
                an action needs to stay selectable.
            world_size: Stored and also used as the RBF kernel length scale.
            states: Per-agent states, indexed by agent.
            num_agents: Number of agents; sizes the per-agent lists.
            collision_distance: Distance used by the collision checks.
        """
        # Identity and environment description.
        self.name = 'multi safe q agent'
        self.index = index
        self.num_agents = num_agents
        self.world_size = world_size
        self.states = states
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance

        # Gaussian-process reward model: RBF kernel plus white noise.
        rbf = RBF(length_scale=world_size,
                  length_scale_bounds=[(1e1, 1e5)] * 3)
        noise = WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=rbf + noise)

        # Histories and replay storage.
        self.rewards = []
        self.my_states = []
        self.action_traj = []
        self.trajs = [[] for _ in range(num_agents)]
        self.buffer = ReplayMemory(10000)

        # Hyper-parameters.
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        # NOTE(review): ``lr`` is only stored; the optimizers below are built
        # without it and therefore keep their own default learning rate.
        self.lr = 1e-3

        # Three online networks (created in this order), one optimizer each:
        # ``dqn`` is trained on observed rewards, ``dqn_l`` / ``dqn_u`` on the
        # lower / upper GP reward bounds.
        self.dimensions = [3, 50, 50, 7]
        self.dqn, self.dqn_l, self.dqn_u = (
            MLP(self.dimensions).double() for _ in range(3))
        self.optimizer, self.optimizer_l, self.optimizer_u = (
            optim.RMSprop(net.parameters())
            for net in (self.dqn, self.dqn_l, self.dqn_u))

        def _frozen_copy(source_net):
            # New network with the source's weights, in evaluation mode.
            clone = MLP(self.dimensions).double()
            clone.load_state_dict(source_net.state_dict())
            clone.eval()
            return clone

        self.target = _frozen_copy(self.dqn)
        self.target_l = _frozen_copy(self.dqn_l)
        self.target_u = _frozen_copy(self.dqn_u)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')

        # Per-agent policy parameters, one entry per agent.
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]

        # Counters.
        self.num_collisions = 0
        self.num_unsafe = 0
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore=False):
        """Pick the next action, excluding actions that are likely to collide.

        For each own action an estimated no-collision probability is built:
        it starts at 1.0 and, for every move of another agent that would end
        within ``collision_distance`` of the own next position, is multiplied
        by ``1 - p``, where ``p`` is the policy upper bound for that move.
        Actions whose estimate falls below ``collision_threshold`` are
        dropped. If no action survives, the one with the highest estimate is
        used.

        Fix over the previous revision: the distance test was inverted, so
        harmless opponent moves lowered the estimate while colliding moves
        were ignored.

        Args:
            explore: When true, sample uniformly from the surviving actions.
                Otherwise this happens with probability ``self.eps`` and the
                surviving action with the highest target-network Q-value is
                chosen the rest of the time.

        Returns:
            The chosen action; it is also appended to ``self.action_traj``.
        """
        own_state = self.states[self.index]
        candidate_moves = [(a, self._move_coordinate(own_state, a))
                           for a in Action.SET]
        no_collision_ls = [1.0 for _ in Action.SET]

        for action, next_state in candidate_moves:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue

                cur_agent_state = self.states[agent]
                for agent_action in Action.SET:
                    possible_next_agent_state = (
                        cur_agent_state + get_movement(agent_action))
                    gap = np.linalg.norm(possible_next_agent_state -
                                         next_state)
                    if not gap < self.collision_distance:
                        # This opponent move cannot cause a collision.
                        continue

                    a_prob = self._get_policy(agent, agent_action)
                    no_collision_ls[action] *= (1 - a_prob)

        # Keep only actions whose no-collision estimate meets the threshold.
        possible_actions = [
            a for a in Action.SET
            if not no_collision_ls[a] < self.collision_threshold
        ]

        best_action = Action.STAY
        if explore or np.random.binomial(1, self.eps) == 1:
            if possible_actions:
                best_action = possible_actions[np.random.choice(
                    len(possible_actions))]
        else:
            q_values = self.target(
                torch.tensor(own_state, dtype=torch.double)).tolist()
            best_q = -math.inf
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_action = action
                    best_q = q_values[action]

        if not possible_actions:
            # Nothing passed the filter: fall back to the least risky action.
            best_action = np.argmax(no_collision_ls)

        self.action_traj += [best_action]
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))

        if len(self.rewards) > 50:
            self.rewards.pop(0)
            self.my_states.pop(0)

        self.rewards += [reward]
        self.states = states

        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state -
                              states[self.index]) > self.collision_distance:
                self.num_collisions += 1
                break

        if reward < self.reward_threshold:
            self.num_unsafe += 1

        for i in range(self.num_agents):
            self.trajs[i] += [states[i]]
            if i == self.index:
                self.my_states += [states[i]]

        self.cum_rewards += reward

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.trajs = [[] for _ in range(self.num_agents)]
        self.action_traj = []
        self.states = states
        # self.epsilons = [0. for _ in range(self.num_agents)]
        # self.tau_exploits = [1. for _ in range(self.num_agents)]
        # self.tau_explores = [1. for _ in range(self.num_agents)]
        # self.rewards = []
        # self.cum_rewards = 0

    def learn_from_buffer(self):
        self.reward_gp.fit(self.my_states, self.rewards)
        self._value_func_estimate()
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            self._optimize_parameters(agent)

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return

        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        reward_batch = torch.cat(batch.reward)
        reward_l_batch = []
        reward_u_batch = []
        for state in state_batch:
            cur_state = state.tolist()
            reward, std = self.reward_gp.predict(np.array([cur_state]), True)
            reward_l_batch.append(
                torch.tensor([reward[0] - self.beta * std[0]],
                             dtype=torch.double))
            reward_u_batch.append(
                torch.tensor([reward[0] + self.beta * std[0]],
                             dtype=torch.double))

        reward_l_batch = torch.cat(reward_l_batch)
        reward_u_batch = torch.cat(reward_u_batch)
        action_batch = torch.cat(batch.action)
        next_state_batch = torch.cat(batch.next_state)

        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        state_action_values = self.dqn_u(state_batch).gather(1, action_batch)
        next_state_values = self.target_u(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_u_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer_u.zero_grad()
        loss.backward()
        for param in self.dqn_u.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_u.step()

        state_action_values = self.dqn_l(state_batch).gather(1, action_batch)
        next_state_values = self.target_l(next_state_batch).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_l_batch

        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )

        self.optimizer_l.zero_grad()
        loss.backward()
        for param in self.dqn_l.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_l.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()
            self.target_u.load_state_dict(self.dqn_u.state_dict())
            self.target_u.eval()
            self.target_l.load_state_dict(self.dqn_l.state_dict())
            self.target_l.eval()

    def _move_coordinate(self, state, action):
        """Return ``state`` displaced by the movement that ``action`` maps to."""
        return get_movement(action) + state

    def _get_policy(self, agent, action):
        epsilon = self.epsilons[agent]
        tau_explore = self.tau_explores[agent]
        tau_exploit = self.tau_exploits[agent]
        return self._compute_policy_upperbound(epsilon, tau_explore,
                                               tau_exploit, agent, action)

    def _returnable(self, state):
        """Tell whether some action from ``state`` looks safe reward-wise.

        Returns True as soon as one action leads to a state whose GP reward
        lower bound (mean minus ``beta`` standard deviations) is at least
        ``reward_threshold``; False if none does.
        """
        def _reward_lower_bound(next_state):
            mean, std = self.reward_gp.predict(np.array([next_state]), True)
            return mean[0] - self.beta * std[0]

        return any(
            _reward_lower_bound(self._move_coordinate(state, a))
            >= self.reward_threshold
            for a in Action.SET
        )

    def _optimize_parameters(self, agent):
        traj = self.trajs[agent]
        if len(traj) > 20:
            traj = traj[len(traj) - 20:]

        def _compute_log_likelihood(parameters):
            epsilon = parameters[0]
            tau_explore = parameters[1]
            tau_exploit = parameters[2]

            sum_log_likelihood = 1.0
            for step in range(1, len(traj)):
                prev_state = traj[step - 1]
                cur_state = traj[step]

                movement = np.rint(cur_state - prev_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue

                sum_log_likelihood *= (self._compute_policy_upperbound(
                    epsilon, tau_explore, tau_exploit, agent, action))

            return -np.log(sum_log_likelihood)

        res = minimize(_compute_log_likelihood,
                       np.array([0.5, 1.0, 1.0]),
                       method='L-BFGS-B',
                       bounds=np.array([(1e-6, 1.0), (0.1, 10.0),
                                        (0.1, 10.0)]))

        if not np.all(np.equal(res.x, np.array([0.5, 1.0, 1.0]))):
            self.epsilons[agent] = res.x[0]
            self.tau_explores[agent] = res.x[1]
            self.tau_exploits[agent] = res.x[2]

    def _compute_policy_upperbound(self, epsilon, tau_explore, tau_exploit,
                                   agent, action):
        q = self.dqn(torch.tensor(self.states[agent],
                                  dtype=torch.double)).detach().numpy()
        q_u = self.dqn_u(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()
        q_l = self.dqn_l(torch.tensor(self.states[agent],
                                      dtype=torch.double)).detach().numpy()

        ofu_denom = copy.copy(q)
        ofu_denom[action] = q_u[action]

        boltz_denom = copy.copy(q_l)
        boltz_denom[action] = q[action]

        explore_mean_q = np.mean(q_u / tau_explore)
        prob_ofu = np.exp(q_u[action] / tau_explore - explore_mean_q) / np.sum(
            np.exp(ofu_denom / tau_explore - explore_mean_q))

        exploit_mean_q = np.mean(q / tau_exploit)
        prob_boltz = np.exp(q[action] / tau_exploit - exploit_mean_q) / np.sum(
            np.exp(boltz_denom / tau_exploit - exploit_mean_q))

        return epsilon * prob_ofu + (1 - epsilon) * prob_boltz
    def __init__(self,
                 feed_units: List[int],
                 agent_name: str,
                 ensemble_size: int = 100,
                 prior_variance: float = 1.0,
                 model_dims: List[int] = [20],
                 lr: float = 1e-3,
                 batch_size: int = 128,
                 noise_variance=0):
        """Build an ensemble of prior-augmented Q-networks with target copies.

        For every ensemble member a prior MLP is created and put in
        evaluation mode; it is shared by that member's online ``DQNWithPrior``
        and its target ``DQNWithPrior``. The prior scale passed to each
        member is ``sqrt(prior_variance)``. Each online network gets its own
        Adam optimizer, and ``cur_net`` starts as a randomly chosen target
        network.

        Args:
            feed_units: Deep-copied into ``self.feed_units``; the input width
                is ``len(feed_units) + 1``.
            agent_name: Stored as ``self.agent_name``.
            ensemble_size: Number of ensemble members.
            prior_variance: Variance whose square root scales the priors.
            model_dims: Hidden layer sizes (not mutated).
            lr: Learning rate handed to every Adam optimizer.
            batch_size: Stored as ``self.batch_size``.
            noise_variance: Stored as ``self.noise_variance``.
        """
        self.agent_name = agent_name
        self.feed_units = copy.deepcopy(feed_units)

        # Running state.
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None

        self.num_features: int = len(feed_units) + 1
        self.noise_variance = noise_variance
        self.prior_variance = prior_variance
        self.ensemble_size: int = ensemble_size
        self.training_data = ReplayMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        prior_scale = np.sqrt(self.prior_variance)

        # Prior networks, one per ensemble member.
        priors = []
        for _ in range(self.ensemble_size):
            prior = MLP(self.model_dims)
            prior.initialize()
            prior.double()
            prior.eval()
            prior.to(device)
            priors.append(prior)

        # Online networks.
        self.models: List[DQNWithPrior] = []
        for prior in priors:
            member = DQNWithPrior(self.model_dims, prior, scale=prior_scale)
            member.initialize()
            member.double()
            member.to(device)
            self.models.append(member)

        # Target networks: weights copied from the matching online network.
        self.target_nets: List[DQNWithPrior] = []
        for prior, member in zip(priors, self.models):
            frozen = DQNWithPrior(self.model_dims, prior, scale=prior_scale)
            frozen.load_state_dict(member.state_dict())
            frozen.double()
            frozen.eval()
            frozen.to(device)
            self.target_nets.append(frozen)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizers = [
            optim.Adam(member.parameters(), lr=lr) for member in self.models
        ]

        self.cur_net = self.target_nets[np.random.choice(self.ensemble_size)]
        self.batch_size = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.cum_reward_history: List[float] = []
        self.current_feed = 0
示例#9
0
class QLearningAgent():
    """Deep Q-learning agent with an epsilon-greedy policy and a target net.

    The agent keeps an online network (``dqn``) trained from a replay buffer
    and a target network (``target``) used both for action selection and for
    bootstrapping; the target is re-synchronised every tenth update and on
    ``reset``.
    """

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance):
        """Create the agent state, its Q-network and a frozen target copy.

        Args:
            index: Position of this agent's own entry in ``states``.
            world_size: Stored as ``self.world_size``.
            states: Per-agent states, indexed by agent.
            num_agents: Stored as ``self.num_agents``.
            collision_distance: Distance below which another agent counts as
                a collision.
        """
        # Identity and environment description.
        self.name = 'q agent'
        self.index = index
        self.num_agents = num_agents
        self.world_size = world_size
        self.collision_distance = collision_distance
        self.states = states

        # Experience storage and running statistics.
        self.buffer = ReplayMemory(10000)
        self.rewards = []
        self.action_traj = []
        self.num_collisions = 0
        self.cum_rewards = 0
        self.target_usage = 0

        # Hyper-parameters.
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        # NOTE(review): ``lr`` is only stored; the optimizer below is built
        # without it and therefore keeps its own default learning rate.
        self.lr = 1e-4

        # Online network plus a target network that starts as an exact copy
        # of it and is switched to evaluation mode.
        self.dimensions = [2, 5, 5, 4]
        online_net = MLP(self.dimensions).double()
        target_net = MLP(self.dimensions).double()
        target_net.load_state_dict(online_net.state_dict())
        target_net.eval()
        self.dqn = online_net
        self.target = target_net

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(online_net.parameters())

    def choose_action(self, explore):
        """Return the next action and append it to ``action_traj``.

        The greedy action is the arg-max of the target network's output for
        the agent's own state. When ``explore`` is true, or otherwise with
        probability ``self.eps``, a uniformly random action is used instead.
        """
        own_state = torch.tensor(self.states[self.index], dtype=torch.double)
        q_values = self.target(own_state).tolist()
        best_action = np.argmax(q_values)

        if explore or np.random.binomial(1, self.eps) == 1:
            possible_actions = list(copy.copy(Action.SET))
            random_slot = np.random.choice(len(possible_actions))
            best_action = possible_actions[random_slot]

        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        """Store the latest transition and update the running statistics.

        Args:
            reward: Reward observed for the last action in ``action_traj``.
            states: New per-agent states, indexed by agent.
        """
        previous_own = self.states[self.index]
        last_action = self.action_traj[-1]
        self.buffer.push(
            torch.tensor([previous_own], dtype=torch.double),
            torch.tensor([[last_action]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states

        # At most one collision is counted per step.
        for other, other_state in enumerate(states):
            if other == self.index:
                continue
            gap = np.linalg.norm(other_state - states[self.index])
            if gap < self.collision_distance:
                self.num_collisions += 1
                break

        self.cum_rewards += reward

    def learn_from_buffer(self):
        """Run one value-function update from the replay buffer."""
        self._value_func_estimate()

    def reset(self, states):
        """Sync the target network and clear the per-episode histories."""
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.states = states
        self.rewards = []
        self.action_traj = []

    def _value_func_estimate(self):
        """Take one mini-batch Q-learning step on the online network.

        Does nothing until the buffer holds at least 32 transitions. Every
        tenth update the target network is re-synchronised.
        """
        batch_size = 32
        if len(self.buffer) < batch_size:
            return

        self.target_usage += 1
        sampled = self.buffer.sample(batch_size)
        batch = Transition(*zip(*sampled))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        next_states = torch.cat(batch.next_state)

        predicted_q = self.dqn(states).gather(1, actions)
        bootstrap_q = self.target(next_states).max(1)[0].detach()
        td_target = ((bootstrap_q * self.gamma) + rewards).unsqueeze(1)

        loss = self.loss_fn(predicted_q, td_target)

        self.optimizer.zero_grad()
        loss.backward()
        # Clip every gradient element to [-1, 1] before stepping.
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        """Return ``state`` displaced by the movement that ``action`` maps to."""
        return get_movement(action) + state
示例#10
0
    def __init__(self,
                 embedding_size=64,
                 message_size=128,
                 msg_fn_layers=2,
                 merge_fn_extra_layers=2,
                 num_passes=1,
                 edge_embedding_size=32,
                 cuda=False):
        """Build the embeddings, message/merge functions and per-class GRUs.

        embedding_size -- width of token embeddings and GRU hidden states
        message_size -- width of messages produced by the message function
        msg_fn_layers -- controls the depth of the shared message MLP
        merge_fn_extra_layers -- extra message-width layers per merge MLP
        num_passes -- number of up/down passes
        edge_embedding_size -- width of the per-edge-type embedding
        cuda -- whether to move the model (and index tensors) to the GPU
        """
        super(GNNModel, self).__init__()

        # Hyper-parameters.
        self.num_passes = num_passes
        self.embedding_size = embedding_size
        self.message_size = message_size
        self.edge_embedding_size = edge_embedding_size

        # Embedding of the known vocabulary (for now also the consts).
        self.embedding = nn.Embedding(len(GNN_TOKENS), embedding_size)
        # Leaf scoring function used for output.
        self.score = MLP([embedding_size, 1])

        # Message-function keys: class x child x direction, flattened.
        self.msg_fn_keys = []
        for Class in GNN_NODES:
            self.msg_fn_keys.extend(Class.msg_fn_keys())

        # One embedding row per edge type (i.e. per message-function key).
        self.edge_embedding = nn.Embedding(len(self.msg_fn_keys),
                                           self.edge_embedding_size)

        # Map every key to a one-element index tensor for the embedding.
        self.msg_fn_dict = {}
        for position, key in enumerate(self.msg_fn_keys):
            key_index = Variable(torch.LongTensor([position]))
            self.msg_fn_dict[key] = key_index.cuda() if cuda else key_index

        # Shared message function: node embedding + edge embedding in,
        # message out.
        input_width = self.embedding_size + self.edge_embedding_size
        hidden_widths = [self.message_size] * (msg_fn_layers - 1)
        self.msg_fn_shared = MLP([input_width] + hidden_widths +
                                 [self.message_size])

        # Merge function for every class that merges at least one message.
        self.merge_fn = {}
        for Class in GNN_NODES:
            if not Class.nmerge > 0:
                continue
            shrinking = [self.message_size * i
                         for i in range(Class.nmerge, 0, -1)]
            extra = [self.message_size] * merge_fn_extra_layers
            self.merge_fn[Class.name] = MergeMLP(shrinking + extra)

        self.lvar_epsilon = torch.nn.Parameter(torch.FloatTensor([-10.0]))

        # One GRU cell per class.
        self.gru = {
            Class.name: nn.GRUCell(input_size=self.message_size,
                                   hidden_size=self.embedding_size,
                                   bias=True)
            for Class in GNN_NODES
        }

        # Modules stored in plain dicts must be registered by hand.
        for class_name, cell in self.gru.items():
            self.add_module("gru_%s" % class_name, cell)
        for class_name, merger in self.merge_fn.items():
            self.add_module("merge_%s" % class_name, merger)

        self._cuda = cuda
        if cuda:
            self.cuda()
class SupervisedAgent(Agent):
    """Agent that learns per-action lifetime values by supervised regression.

    The state vector has one entry per feed unit: ``-1`` for units not yet
    reached, ``0`` for units already passed and ``1`` for units on which
    action ``1`` was taken.  The model emits one value per action and is
    fitted to the sum of rewards observed from a step to the end of the
    episode.
    """

    def __init__(
            self,
            feed_units: List[int],
            agent_name: str,
            model_dims: List[int] = [20],
            lr: float = 1e-3,
            boltzmann: bool = False,
            epsilon: float = 0.05,
            batch_size: int = 128,
    ):
        """Build the value model, its optimizer and the episode bookkeeping.

        Args:
            feed_units: Feed description; deep-copied, and its length sets
                the width of the state vector.
            agent_name: Label stored on the agent.
            model_dims: Layer widths placed between the input width and the
                two action outputs.  The list is only read, never mutated.
            lr: Learning rate handed to Adam.
            boltzmann: Stored on the agent; not consulted by this class.
            epsilon: Probability of replacing the greedy action with a
                uniformly random one.
            batch_size: Number of samples drawn for each gradient step.
        """
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data = []
        self.buffer: SupervisedMemory = SupervisedMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.rewards: List[float] = []
        self.actions = []

    def choose_action(self):
        """Choose the action for the current feed unit and record it.

        The greedy action is replaced by a uniformly random one with
        probability ``epsilon``.  Whatever action is returned is also the one
        stored in ``latest_action`` and reflected in
        ``history_unit_indices``, so the training data always describes the
        action that was really taken.

        Returns:
            ``0`` or ``1``.
        """
        available_actions = [0, 1]

        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.

        with torch.no_grad():
            # The model lives on `device`, so its input must be there too.
            outcomes = self.model(
                torch.tensor(features, dtype=torch.double).to(device)
            )
            _, best_index = torch.max(outcomes, 0)
            action = available_actions[best_index.item()]

        # Decide on exploration BEFORE recording anything, otherwise the
        # bookkeeping below would describe the greedy action even when a
        # random one is returned.
        if np.random.rand() < self.epsilon:
            action = int(np.random.randint(2))

        self.latest_feature = features
        self.latest_action = [action]
        if action == 1:
            self.history_unit_indices.append(self.current_feed)

        self.current_feed += 1
        return action

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        """Record the reward earned by the most recent action.

        Args:
            scroll: Not used by this agent.
            reward: Reward observed for the latest action.
        """
        self.cum_rewards += reward
        self.rewards += [reward]
        self.training_data += [self.latest_feature]
        self.actions += [self.latest_action]

    def learn_from_buffer(self):
        """Store the episode's samples and run ten regression steps.

        Every recorded step is pushed with the sum of the rewards from that
        step onwards as its target.  Training is skipped until the memory
        holds at least ``batch_size`` samples.
        """
        for i, data in enumerate(self.training_data):
            self.buffer.push(
                torch.tensor([data], dtype=torch.double).to(device),
                torch.tensor([[np.sum(self.rewards[i:])]],
                             dtype=torch.double).to(device),
                torch.tensor([self.actions[i]], dtype=torch.long).to(device),
            )

        if len(self.buffer) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.buffer.sample(self.batch_size)
            batch = SupervisedTransition(*zip(*transitions))
            state_batch = torch.cat(batch.feature)
            action_batch = torch.cat(batch.actions)
            lifetime_value_batch = torch.cat(batch.lifetime_value)

            # Only the output of the action actually taken is regressed.
            predicted_lifetime_value = self.model(state_batch).gather(1, action_batch)
            loss = self.loss_fn(predicted_lifetime_value, lifetime_value_batch)
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()

            # Element-wise gradient clipping to [-1, 1].
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble

    def reset(self):
        """Archive the episode's cumulative reward and clear episode state."""
        # Archive first: appending after zeroing would always store 0.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
        self.rewards = []
        self.actions = []
        self.training_data = []
示例#12
0
class YahooSupervisedAgent():
    """Feed-recommendation agent trained by supervised lifetime-value regression.

    Each candidate feed is scored by a single-output model.  The model input
    is laid out as ``feed_counts`` slots of ``feed_feature_count`` values for
    the feeds chosen so far (``-1`` where empty), one slot for the candidate
    being scored, and the user features at the end.
    """

    def __init__(self,
                 initial_feed_candidates,
                 user_features,
                 feed_counts: int,
                 agent_name: str,
                 feed_feature_count=6,
                 user_feature_count=6,
                 model_dims=[50, 25],
                 batch_size: int = 128,
                 interest_unknown: bool = False,
                 boltzmann: bool = True):
        """Build the scoring model, its optimizer and the episode bookkeeping.

        Args:
            initial_feed_candidates: Candidates offered at the first step;
                each must expose a ``features`` attribute.
            user_features: Feature vector written into the last
                ``user_feature_count`` entries of every model input.
            feed_counts: Number of history slots in the model input.
            agent_name: Label stored on the agent.
            feed_feature_count: Width of one feed slot.
            user_feature_count: Width of the user-feature block.
            model_dims: Layer widths placed between the input width and the
                single output.  The list is only read, never mutated.
            batch_size: Number of samples drawn for each gradient step.
            interest_unknown: Accepted but not used by this class.
            boltzmann: When true, candidates are sampled from a softmax over
                their scores instead of epsilon-greedy selection.
        """
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name

        self.cum_rewards: float = 0.
        self.rewards = []
        self.actions = []
        self.training_data = []
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        self.num_features: int = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.buffer: SupervisedMemory = SupervisedMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = 0.05
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def choose_action(self):
        """Score every current candidate and pick one.

        With ``boltzmann`` set, a candidate is sampled from the softmax of
        the scores divided by 0.05.  Otherwise the best-scoring candidate is
        taken, except that with probability ``epsilon`` a uniformly random
        one is taken instead.

        Returns:
            The chosen element of ``current_feed_candidates``.
        """
        available_actions = [
            candidate.features for candidate in self.current_feed_candidates
        ]

        features = np.array([-1. for _ in range(self.num_features)])
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) *
                     self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features

        # One input row per candidate: shared history/user part plus the
        # candidate's own features in the slot after the history slots.
        candidate_slot = slice(self.feed_counts * self.feed_feature_count,
                               (self.feed_counts + 1) * self.feed_feature_count)
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[candidate_slot] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)

        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features,
                             dtype=torch.double).to(device))

            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()

            if self.boltzmann:
                outcomes = outcomes / 0.05  # softmax temperature
                best_index = np.random.choice(
                    len(available_actions),
                    p=torch.nn.functional.softmax(outcomes.reshape(
                        (len(available_actions))),
                                                  dim=0).cpu().numpy())
            elif np.random.rand() < self.epsilon:
                # Use the configured exploration rate rather than a second,
                # hard-coded copy of it.
                best_index = np.random.choice(len(available_actions))

            best_action = self.current_feed_candidates[best_index]
            self.latest_feature = candidate_features[best_index]
            self.history_actions.append(best_action.features)

            self.current_feed += 1
            return best_action

    def update_buffer(self, scroll: bool, reward: int, new_batch):
        """Record the reward of the latest choice and install new candidates.

        Args:
            scroll: Not used by this agent.
            reward: Reward observed for the latest choice.
            new_batch: Candidates to choose from at the next step.
        """
        self.cum_rewards += reward
        self.rewards += [reward]
        self.training_data += [self.latest_feature]
        self.current_feed_candidates = new_batch

    def learn_from_buffer(self):
        """Store the episode's samples and run ten regression steps.

        Every recorded step is pushed with the sum of the rewards from that
        step onwards as its target.  Training is skipped until the memory
        holds at least ``batch_size`` samples.
        """
        for i, data in enumerate(self.training_data):
            self.buffer.push(
                torch.tensor([data], dtype=torch.double).to(device),
                torch.tensor([[np.sum(self.rewards[i:])]],
                             dtype=torch.double).to(device),
            )

        if len(self.buffer) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.buffer.sample(self.batch_size)
            batch = SupervisedTransition(*zip(*transitions))
            state_batch = torch.cat(batch.feature)
            lifetime_value_batch = torch.cat(batch.lifetime_value)

            predicted_lifetime_value = self.model(state_batch)
            loss = self.loss_fn(predicted_lifetime_value, lifetime_value_batch)
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()

            # Element-wise gradient clipping to [-1, 1].
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble

    def reset(self, user_features, initial_feeds, user_embedding):
        """Archive the episode's cumulative reward and start a new episode.

        Args:
            user_features: User feature vector for the new episode.
            initial_feeds: Candidates offered at the first step.
            user_embedding: Accepted but not used by this agent.
        """
        # Archive first: appending after zeroing would always store 0.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.history_actions = []
        self.rewards = []
        self.actions = []
        self.training_data = []
        self.current_feed = 0
        self.current_feed_candidates = initial_feeds
        self.user_features = user_features
    def __init__(
        self,
        initial_feed_candidates,
        user_features,
        feed_counts,
        agent_name: str,
        feed_feature_count = 6,
        user_feature_count = 6,
        ensemble_size: int = 10,
        prior_variance: float = 1.0,
        model_dims: List[int] = [50, 25],
        bootstrap: bool = True,
        lr: float = 1e-3,
        batch_size: int = 32,
        noise_variance = 0
    ):
        """Set up an ensemble of prior-augmented networks and their targets.

        For every ensemble member this creates a prior ``MLP`` (put in eval
        mode), a ``DQNWithPrior`` model built on that prior, a target
        ``DQNWithPrior`` initialised from the model's weights, an Adam
        optimizer and a dedicated replay memory.  One target network is then
        drawn at random as the currently active network.

        Args:
            initial_feed_candidates: Candidates offered at the first step.
            user_features: User feature vector stored on the agent.
            feed_counts: Number of history slots in the model input.
            agent_name: Label stored on the agent.
            feed_feature_count: Width of one feed slot.
            user_feature_count: Width of the user-feature block.
            ensemble_size: Number of ensemble members.
            prior_variance: Its square root is passed to ``DQNWithPrior`` as
                ``scale``.
            model_dims: Layer widths placed between the input width and the
                single output.
            bootstrap: Stored on the agent; not consulted here.
            lr: Learning rate handed to every Adam optimizer.
            batch_size: Stored on the agent.
            noise_variance: Stored on the agent; not consulted here.
        """
        # --- plain configuration -------------------------------------------
        self.agent_name = agent_name
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.bootstrap = bootstrap
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        self.noise_variance = noise_variance
        self.prior_variance = prior_variance
        self.ensemble_size: int = ensemble_size
        self.batch_size = batch_size
        self.gamma = 0.99

        # --- per-episode / running state -----------------------------------
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None

        # Input = history slots + one candidate slot + user features.
        self.num_features = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.model_dims: List[int] = [self.num_features] + model_dims + [1]

        # One replay memory per ensemble member.
        self.training_datas = [ReplayMemory(100000) for _ in range(ensemble_size)]

        # --- networks --------------------------------------------------------
        # All priors are created first, then all models, then all targets, so
        # random initialisation happens in the same order as before.
        priors = []
        for _ in range(self.ensemble_size):
            prior = MLP(self.model_dims)
            priors.append(prior)
            prior.initialize()
            prior.double()
            prior.eval()
            prior.to(device)

        self.models: List[DQNWithPrior] = []
        for prior in priors:
            model = DQNWithPrior(self.model_dims, prior, scale=np.sqrt(self.prior_variance))
            self.models.append(model)
            model.initialize()
            model.double()
            model.to(device)

        self.target_nets: List[DQNWithPrior] = []
        for prior, model in zip(priors, self.models):
            target = DQNWithPrior(self.model_dims, prior, scale=np.sqrt(self.prior_variance))
            self.target_nets.append(target)
            target.load_state_dict(model.state_dict())
            target.double()
            target.eval()
            target.to(device)

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizers = [
            optim.Adam(model.parameters(), lr=lr) for model in self.models
        ]

        # Pick the ensemble member that drives behaviour for now.
        self.cur_index = np.random.choice(self.ensemble_size)
        self.cur_net = self.target_nets[self.cur_index]

        self.running_loss = 0.0
        self.history_actions = []
        self.cum_reward_history: List[float] = []
        self.current_feed = 0
示例#14
0
class DQNAgent(Agent):
    """Epsilon-greedy DQN agent choosing a binary action per feed unit.

    The state vector has one entry per feed unit: ``-1`` for units not yet
    reached, ``0`` for units already passed and ``1`` for units on which
    action ``1`` was taken.  A target network, refreshed on ``reset``,
    provides the bootstrap values.
    """

    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: List[int] = [],
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        """Build the online and target networks and the bookkeeping state.

        Args:
            feed_units: Feed description; deep-copied, and its length sets
                the width of the state vector.
            agent_name: Label stored on the agent and used in messages.
            model_dims: Layer widths placed between the input width and the
                two action outputs.  The list is only read, never mutated.
            lr: Learning rate handed to Adam.
            boltzmann: Stored on the agent; not consulted by this class.
            epsilon: Initial probability of taking a uniformly random action;
                decays by a factor of 0.999 per call to ``learn_from_buffer``
                that trains.
            batch_size: Number of transitions drawn for each gradient step.
        """
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data: ReplayMemory = ReplayMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.current_loc = [0, 0]

    def _current_features(self) -> List[float]:
        """Return the state vector for the current position in the feed."""
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        return features

    def choose_action(self):
        """Choose the action for the current feed unit and record it.

        The greedy action is replaced by a uniformly random one with
        probability ``epsilon``.  Whatever action is returned is also the one
        stored in ``latest_action`` and reflected in
        ``history_unit_indices``, so stored transitions always describe the
        action that was really taken.

        Returns:
            ``0`` or ``1``.
        """
        available_actions = [0, 1]
        features = self._current_features()

        with torch.no_grad():
            # The model lives on `device`, so its input must be there too.
            outcomes = self.model(
                torch.tensor(features, dtype=torch.double).to(device))
            _, best_index = torch.max(outcomes, 0)
            action = available_actions[best_index.item()]

        # Decide on exploration BEFORE recording anything, otherwise the
        # bookkeeping below would describe the greedy action even when a
        # random one is returned.
        if np.random.rand() < self.epsilon:
            action = int(np.random.randint(2))

        self.latest_feature = features
        self.latest_action = [action]
        if action == 1:
            self.history_unit_indices.append(self.current_feed)

        self.current_feed += 1
        return action

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        """Store the transition produced by the most recent action.

        Args:
            scroll: Falsy when the episode ended with this step; the
                transition is then stored with ``None`` as its next state.
            reward: Reward observed for the latest action.
        """
        self.cum_rewards += reward

        next_state = None
        if scroll:
            next_state = torch.tensor([self._current_features()],
                                      dtype=torch.double).to(device)

        # Everything is stored on `device` so that training can mix these
        # tensors with the masks and zero tensors it creates there.
        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double).to(device),
            torch.tensor([self.latest_action], dtype=torch.long).to(device),
            torch.tensor([reward], dtype=torch.double).to(device),
            next_state,
        )

    def learn_from_buffer(self):
        """Run ten one-step Q-learning minibatch updates and decay epsilon.

        Training is skipped until the replay memory holds at least
        ``batch_size`` transitions.  A batch made up only of terminal
        transitions is trained with zero bootstrap values.
        """
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))

            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch).gather(
                1, action_batch)

            # Terminal transitions keep a bootstrap value of zero.
            next_state_values = torch.zeros(self.batch_size,
                                            device=device,
                                            dtype=torch.double)
            non_final_next_states = [
                s for s in batch.next_state if s is not None
            ]
            # torch.cat rejects an empty list, so only bootstrap when the
            # batch contains at least one non-terminal transition.
            if non_final_next_states:
                non_final_mask = torch.tensor(
                    [s is not None for s in batch.next_state],
                    device=device,
                    dtype=torch.bool)
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)).max(1)[0].detach()

            expected_state_action_values = self.gamma * next_state_values + reward_batch

            loss = self.loss_fn(state_action_values,
                                expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self):
        """Archive the episode's reward, sync the target net, clear state."""
        # Archive first: appending after zeroing would always store 0.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
class YahooDQNAgent():
    """DQN agent that picks one feed out of a changing set of candidates.

    Each candidate is scored by a single-output model.  The model input is
    laid out as ``feed_counts`` slots of ``feed_feature_count`` values for
    the feeds chosen so far (``-1`` where empty), one slot for the candidate
    being scored, and the user features at the end.  A target network,
    refreshed on ``reset``, provides the bootstrap values.
    """

    def __init__(
            self,
            initial_feed_candidates,
            user_features,
            feed_counts,
            agent_name: str,
            feed_feature_count = 6,
            user_feature_count = 6,
            model_dims: List[int] = [50, 25],
            lr: float = 1e-3,
            boltzmann: bool = True,
            epsilon: float = 0.05,
            batch_size: int = 128,
    ):
        """Build the online and target networks and the bookkeeping state.

        Args:
            initial_feed_candidates: Candidates offered at the first step;
                each must expose a ``features`` attribute.
            user_features: Feature vector written into the last
                ``user_feature_count`` entries of every model input.
            feed_counts: Number of history slots in the model input.
            agent_name: Label stored on the agent.
            feed_feature_count: Width of one feed slot.
            user_feature_count: Width of the user-feature block.
            model_dims: Layer widths placed between the input width and the
                single output.  The list is only read, never mutated.
            lr: Learning rate handed to Adam.
            boltzmann: When true, candidates are sampled from a softmax over
                their scores instead of epsilon-greedy selection.
            epsilon: Initial exploration probability for the epsilon-greedy
                branch; decays by a factor of 0.999 per training call.
            batch_size: Number of transitions drawn for each gradient step.
        """
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        self.num_features = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.training_data: ReplayMemory = ReplayMemory(100000)

        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def _candidate_features(self):
        """Return a 2-D array holding one model-input row per candidate."""
        width = self.feed_feature_count

        # Part shared by all rows: chosen-feed history and user features.
        features = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * width:(index + 1) * width] = action
        features[-self.user_feature_count:] = self.user_features

        # The candidate's own features go in the slot after the history.
        start = self.feed_counts * width
        rows = []
        for candidate in self.current_feed_candidates:
            row = np.copy(features)
            row[start:start + width] = candidate.features
            rows.append(row)
        return np.array(rows)

    def choose_action(self):
        """Score every current candidate and pick one.

        With ``boltzmann`` set, a candidate is sampled from the softmax of
        the scores divided by 0.05.  Otherwise the best-scoring candidate is
        taken, except that with probability ``epsilon`` a uniformly random
        one is taken instead.

        Returns:
            The chosen element of ``current_feed_candidates``.
        """
        candidate_features = self._candidate_features()
        num_candidates = len(self.current_feed_candidates)

        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features, dtype=torch.double).to(device)
            )

            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()

            if self.boltzmann:
                outcomes = outcomes / 0.05  # softmax temperature
                best_index = np.random.choice(
                    num_candidates,
                    p=torch.nn.functional.softmax(
                        outcomes.reshape(num_candidates), dim=0).cpu().numpy()
                )
            elif np.random.rand() < self.epsilon:
                # Honour the configured (and decaying) exploration rate
                # instead of a hard-coded constant.
                best_index = np.random.choice(num_candidates)

            best_action = self.current_feed_candidates[best_index]
            self.latest_feature = candidate_features[best_index]
            self.history_actions.append(best_action.features)

            self.current_feed += 1
            return best_action

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
        new_batch
    ):
        """Store the transition produced by the most recent choice.

        Args:
            scroll: Falsy when the episode ended with this step; the
                transition is then stored with ``None`` as its next state.
            reward: Reward observed for the latest choice.
            new_batch: Candidates to choose from at the next step.
        """
        self.cum_rewards += reward
        self.current_feed_candidates = new_batch

        next_state = None
        if scroll:
            # The next state is the full matrix of rows for the new
            # candidates, so the target net can take a max over them.
            next_state = torch.tensor([self._candidate_features()],
                                      dtype=torch.double).to(device)

        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double).to(device),
            torch.tensor([reward], dtype=torch.double).to(device),
            next_state,
        )

    def learn_from_buffer(self):
        """Run ten one-step Q-learning minibatch updates and decay epsilon.

        Training is skipped until the replay memory holds at least
        ``batch_size`` transitions.  A batch made up only of terminal
        transitions is trained with zero bootstrap values.
        """
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = torch.tensor(
                [s is not None for s in batch.next_state],
                device=device, dtype=torch.bool)
            state_batch = torch.cat(batch.state)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch)

            # Terminal transitions keep a bootstrap value of zero.
            next_state_values = torch.zeros(self.batch_size, device=device, dtype=torch.double)
            non_final_next_states = [s for s in batch.next_state if s is not None]
            # torch.cat rejects an empty list, so only bootstrap when the
            # batch contains at least one non-terminal transition.
            if non_final_next_states:
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)
                ).max(1)[0].reshape((-1)).detach()

            expected_state_action_values = self.gamma * next_state_values + reward_batch

            loss = self.loss_fn(state_action_values, expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()

            # Element-wise gradient clipping to [-1, 1].
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self, user_features, initial_feeds, user_embedding):
        """Archive the episode's reward, sync the target net, start anew.

        Args:
            user_features: User feature vector for the new episode.
            initial_feeds: Candidates offered at the first step.
            user_embedding: Accepted but not used by this agent.
        """
        # Archive first: appending after zeroing would always store 0.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards: float = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.current_feed_candidates = initial_feeds
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_actions = []
        self.current_feed = 0
        self.user_features = user_features
class NaiveSafeQLearningAgent():
    """Multi-agent DQN learner that avoids moves with a high collision risk.

    Before picking a move, every action whose estimated chance of staying
    collision-free falls below ``collision_threshold`` is removed from the
    set the agent may choose from.
    """

    def __init__(self, index, world_size, states, num_agents,
                 collision_distance, collision_threshold, reward_threshold):
        """Store the environment description and build both Q-networks.

        Args:
            index: Position of this agent inside ``states``.
            world_size: Indexable pair handed to ``bound_action``.
            states: Current state of every agent, indexed by agent number.
            num_agents: Total number of agents.
            collision_distance: Distance below which two agents count as
                having collided.
            collision_threshold: Minimum collision-free estimate an action
                needs in order to stay eligible.
            reward_threshold: Rewards below this value count as unsafe.
        """
        # Identity and environment description.
        self.name = 'naive q agent'
        self.index = index
        self.num_agents = num_agents
        self.world_size = world_size
        self.states = states

        # Safety parameters.
        self.collision_distance = collision_distance
        self.collision_threshold = collision_threshold
        self.reward_threshold = reward_threshold

        # Learning hyper-parameters and replay memory.
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.eps = 0.1
        # NOTE(review): lr is stored but not passed to the optimizer below.
        self.lr = 1e-4

        # Online network plus a target copy kept in eval mode.
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())

        # Per-episode traces and running counters.
        self.rewards = []
        self.action_traj = []
        self.num_collisions = 0
        self.num_unsafe = 0
        self.cum_rewards = 0
        self.target_usage = 0

    def _count_threats(self, candidate_action):
        """Count the other agents that could meet this agent after a move."""
        destination = self._move_coordinate(self.states[self.index],
                                            candidate_action)
        destination = bound_action(destination, self.world_size[0],
                                   self.world_size[1])

        threats = 0
        for other in range(self.num_agents):
            if other == self.index:
                continue

            other_state = self.states[other]
            # Too far away to reach the destination with one move.
            if np.linalg.norm(other_state -
                              destination) > 1.0 + self.collision_distance:
                continue

            # Skip agents for which get_action reports no matching move.
            offset = np.rint(destination - other_state)
            if get_action(offset, self.world_size) == -1:
                continue

            threats += 1
        return threats

    def choose_action(self, explore):
        """Pick the next move among the actions considered safe enough.

        Args:
            explore: When truthy, a random eligible action is taken instead
                of the greedy one.

        Returns:
            The chosen action, which is also appended to ``action_traj``.
        """
        eligible = list(copy.copy(Action.SET))
        safety = [1.0 for _ in Action.SET]

        # Each threatening neighbour scales the estimate by 0.75.
        for candidate in Action.SET:
            for _ in range(self._count_threats(candidate)):
                safety[candidate] *= 0.75

        for action_id, estimate in enumerate(safety):
            if estimate < self.collision_threshold and action_id in eligible:
                eligible.remove(action_id)

        own_state = torch.tensor(self.states[self.index], dtype=torch.double)
        q_values = self.target(own_state).tolist()

        # Nothing passed the filter: fall back to the least risky action.
        if len(eligible) == 0:
            fallback = np.argmax(safety)
            self.action_traj.append(fallback)
            return fallback

        chosen = Action.UP
        highest = -math.inf
        for candidate in eligible:
            value = q_values[candidate]
            if value > highest:
                highest = value
                chosen = candidate

        if explore or np.random.binomial(1, self.eps) == 1:
            chosen = eligible[np.random.choice(len(eligible))]

        self.action_traj.append(chosen)
        return chosen

    def update_buffer(self, reward, states):
        """Store the latest transition and update the safety counters.

        Args:
            reward: Reward received for the last action.
            states: New state of every agent, indexed by agent number.
        """
        previous = torch.tensor([self.states[self.index]], dtype=torch.double)
        taken = torch.tensor([[self.action_traj[-1]]], dtype=torch.long)
        gained = torch.tensor([reward], dtype=torch.double)
        current = torch.tensor([states[self.index]], dtype=torch.double)
        self.buffer.push(previous, taken, gained, current)

        self.rewards.append(reward)
        self.states = states

        if reward < self.reward_threshold:
            self.num_unsafe += 1

        # At most one collision is counted per step.
        collided = any(
            other != self.index and np.linalg.norm(
                position - states[self.index]) < self.collision_distance
            for other, position in enumerate(states))
        if collided:
            self.num_collisions += 1

        self.cum_rewards += reward

    def learn_from_buffer(self):
        """Perform one training step on the replay memory."""
        self._value_func_estimate()

    def reset(self, states):
        """Sync the target network and clear the per-episode traces.

        Args:
            states: State of every agent at the start of the new episode.
        """
        self.states = states
        self.rewards = []
        self.action_traj = []
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()

    def _value_func_estimate(self):
        """Run one Q-learning update on a batch of 32 sampled transitions.

        Does nothing while fewer than 32 transitions are stored.  The target
        network is refreshed from the online network on every tenth update.
        """
        minibatch_size = 32
        if len(self.buffer) < minibatch_size:
            return

        self.target_usage += 1
        sampled = Transition(*zip(*self.buffer.sample(minibatch_size)))

        states = torch.cat(sampled.state)
        actions = torch.cat(sampled.action)
        rewards = torch.cat(sampled.reward)
        next_states = torch.cat(sampled.next_state)

        predicted = self.dqn(states).gather(1, actions)
        bootstrap = self.target(next_states).max(1)[0].detach()
        expected = (bootstrap * self.gamma) + rewards

        loss = self.loss_fn(predicted, expected.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        # Element-wise gradient clipping to [-1, 1].
        for weight in self.dqn.parameters():
            weight.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        """Return ``state`` shifted by the movement belonging to ``action``."""
        return get_movement(action) + state