Example #1
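
# Imports assumed by this example. MLP, ReplayMemory, Transition, Agent, and
# device are project-level definitions that are not shown in this snippet.
import copy
from typing import List, Optional

import numpy as np
import torch
import torch.optim as optim
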
class DQNAgent(Agent):
    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: Optional[List[int]] = None,
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data: ReplayMemory = ReplayMemory(100000)

        # The Q-network maps the feed-state encoding to Q-values for the two actions.
        self.model_dims: List[int] = [self.num_features] + (model_dims or []) + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        # The target network is a lagged copy of the online network (re-synced
        # in reset()); it provides stable bootstrap targets for Q-learning.
        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.current_loc = [0, 0]

    def choose_action(self):
        available_actions = [0, 1]

        # State encoding: -1 = feed unit not yet reached, 0 = reached but not
        # engaged, 1 = reached and engaged.
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.

        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(features, dtype=torch.double).to(device))

            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()

            # Epsilon-greedy exploration is applied before the action is
            # recorded, so the stored transition matches the action taken.
            if np.random.rand() < self.epsilon:
                best_index = np.random.randint(len(available_actions))

            best_action = [available_actions[best_index]]
            self.latest_feature = features
            self.latest_action = best_action
            if best_action[0] == 1:
                self.history_unit_indices.append(self.current_feed)

            self.current_feed += 1
            return best_action[0]

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        self.cum_rewards += reward
        if not scroll:
            # The user stopped scrolling: store a terminal transition (no next
            # state).
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double).to(device),
                torch.tensor([self.latest_action], dtype=torch.long).to(device),
                torch.tensor([reward], dtype=torch.double).to(device),
                None,
            )
            return

        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.

        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double).to(device),
            torch.tensor([self.latest_action], dtype=torch.long).to(device),
            torch.tensor([reward], dtype=torch.double).to(device),
            torch.tensor([features], dtype=torch.double).to(device),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))

            # Terminal transitions carry next_state = None; mask them out so
            # their bootstrapped value stays zero.
            non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device,
                dtype=torch.bool)

            state_batch = torch.cat(batch.state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch).gather(
                1, action_batch)

            next_state_values = torch.zeros(self.batch_size,
                                            device=device,
                                            dtype=torch.double)
            non_final_next_states = [s for s in batch.next_state
                                     if s is not None]
            if non_final_next_states:
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)).max(1)[0].detach()

            # One-step TD target: r + gamma * max_a' Q_target(s', a').
            expected_state_action_values = self.gamma * next_state_values + reward_batch

            loss = self.loss_fn(state_action_values,
                                expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self):
        # Record the finished episode's return before clearing it.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        # Sync the target network with the online network at episode boundaries.
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
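

# A minimal usage sketch (an assumption, not part of the original agent code):
# one episode driven against a hypothetical environment object exposing
# step(action) -> (scroll, reward). It only illustrates the intended call
# order: choose_action -> update_buffer -> learn_from_buffer -> reset.
def run_dqn_episode(agent: DQNAgent, env) -> float:
    for _ in range(agent.num_features):
        action = agent.choose_action()
        scroll, reward = env.step(action)  # hypothetical environment interface
        agent.update_buffer(scroll, reward)
        if not scroll:  # the user stopped scrolling; the episode ends
            break
    agent.learn_from_buffer()
    episode_return = agent.cum_rewards
    agent.reset()
    return episode_return

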
class YahooDQNAgent:
    def __init__(
            self,
            initial_feed_candidates,
            user_features,
            feed_counts: int,
            agent_name: str,
            feed_feature_count: int = 6,
            user_feature_count: int = 6,
            model_dims: Optional[List[int]] = None,
            lr: float = 1e-3,
            boltzmann: bool = True,
            epsilon: float = 0.05,
            batch_size: int = 128,
    ):
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name
        self.interest_level = 0

        self.cum_rewards: float = 0.
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        # Feature layout: one block per feed already shown, one block for the
        # candidate being scored, then the user features.
        self.num_features = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.training_data: ReplayMemory = ReplayMemory(100000)

        if model_dims is None:
            model_dims = [50, 25]
        # The Q-network scores a single (state, candidate) feature vector.
        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)

        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()

        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def choose_action(self):
        available_actions = [candidate.features for candidate in self.current_feed_candidates]

        # State prefix: feature blocks of feeds already shown this episode,
        # -1 padding for unused slots, and the user features in the tail.
        features = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) * self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features

        # Score each candidate by writing its features into the dedicated
        # "current candidate" block of the state vector.
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[
                self.feed_counts * self.feed_feature_count:(self.feed_counts + 1) * self.feed_feature_count
            ] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)

        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features, dtype=torch.double).to(device)
            )

            # Greedy choice: the candidate with the highest predicted Q-value.
            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()

            if self.boltzmann:
                # Boltzmann exploration: sample a candidate with probability
                # softmax(Q / temperature), temperature fixed at 0.05.
                outcomes = outcomes / 0.05
                best_index = np.random.choice(
                    len(available_actions),
                    p=torch.nn.functional.softmax(
                        outcomes.reshape(len(available_actions)), dim=0
                    ).cpu().numpy()
                )
            elif np.random.rand() < self.epsilon:
                # Epsilon-greedy exploration when Boltzmann sampling is off.
                best_index = np.random.choice(len(available_actions))

            best_action = self.current_feed_candidates[best_index]
            self.latest_feature = candidate_features[best_index]
            self.history_actions.append(best_action.features)

            self.current_feed += 1
            return best_action

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
        new_batch
    ):
        self.cum_rewards += reward
        self.current_feed_candidates = new_batch
        if not scroll:
            # The user stopped scrolling: store a terminal transition with no
            # next-step candidate set.
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double).to(device),
                torch.tensor([reward], dtype=torch.double).to(device),
                None,
            )
            return

        available_actions = [candidate.features for candidate in self.current_feed_candidates]
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) * self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features

        # Next state: the candidate feature matrix for the next feed slot.
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[
                self.feed_counts * self.feed_feature_count:(self.feed_counts + 1) * self.feed_feature_count
            ] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)

        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double).to(device),
            torch.tensor([reward], dtype=torch.double).to(device),
            torch.tensor([candidate_features], dtype=torch.double).to(device),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return

        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device,
                dtype=torch.bool)
            state_batch = torch.cat(batch.state)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch)

            # Bootstrapped value of the next state: the best candidate's
            # Q-value under the target network; stays zero for terminal
            # transitions.
            next_state_values = torch.zeros(self.batch_size,
                                            device=device,
                                            dtype=torch.double)
            non_final_next_states = [s for s in batch.next_state
                                     if s is not None]
            if non_final_next_states:
                next_state_values[non_final_mask] = self.target_net(
                    torch.cat(non_final_next_states)).max(1)[0].reshape(-1).detach()

            # One-step TD target: r + gamma * V(s').
            expected_state_action_values = self.gamma * next_state_values + reward_batch

            loss = self.loss_fn(state_action_values, expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()

            self.optimizer.zero_grad()
            loss.backward()

            # Clip each gradient element to [-1, 1] for stability.
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()

        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon


    def reset(self, user_features, initial_feeds, user_embedding):
        # user_embedding is accepted for interface compatibility but unused here.
        # Record the finished episode's return before clearing it.
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.current_feed_candidates = initial_feeds
        # Sync the target network with the online network at episode boundaries.
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_actions = []
        self.current_feed = 0
        self.user_features = user_features
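

# A minimal usage sketch (an assumption, not part of the original agent code):
# one episode for YahooDQNAgent against a hypothetical environment whose
# step(candidate) returns (scroll, reward, next_candidates), where each
# candidate object exposes a .features vector as the agent expects.
def run_yahoo_episode(agent: YahooDQNAgent, env) -> float:
    for _ in range(agent.feed_counts):
        candidate = agent.choose_action()
        scroll, reward, next_candidates = env.step(candidate)  # hypothetical
        agent.update_buffer(scroll, reward, next_candidates)
        if not scroll:  # the user stopped scrolling; the episode ends
            break
    agent.learn_from_buffer()
    return agent.cum_rewards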