Example #1
    def get_obs(self, obs):
        items, rew, history, targets = ReturnStateTuple(*list(zip(*obs)))
        if len(obs) == 1:
            # Single observation: reshape to (seq_len, 1) so the layout matches
            # the time-first padding (batch_first=False) used for larger batches
            items_padded = torch.tensor(items,
                                        device=self.device,
                                        dtype=torch.long).view(-1, 1)
            if rew[0] is not None:
                rewards = torch.tensor(rew, device=self.device).view(-1, 1)
            else:
                rewards = None
            if history[0] is not None:
                history = torch.tensor(history, device=self.device)
        else:
            items = [torch.from_numpy(item) for item in items]
            items_padded = pad_sequence(items,
                                        batch_first=False).to(self.device)
            if rew[0] is not None:
                rewards = [torch.from_numpy(r) for r in rew]
                rewards = pad_sequence(rewards,
                                       batch_first=False).to(self.device)
            else:
                rewards = None

            if history[0] is not None:
                history = torch.from_numpy(np.stack(history)).to(self.device)

        return ReturnStateTuple(items_padded, rewards, history, None)
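These examples never show how ReturnStateTuple itself is defined. A minimal sketch that is consistent with the way it is constructed and accessed here (four positional fields, of which items, history and targets appear by name) would be a plain namedtuple; the name of the second field is an assumption:

from collections import namedtuple

# Assumed definition: only .items, .history and .targets are accessed by name
# in these snippets, so the name of the second (reward) field is a guess.
ReturnStateTuple = namedtuple("ReturnStateTuple",
                              ["items", "rewards", "history", "targets"])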
Example #2
def unwrap_state(states, device):
    """
    Converts a list of multiple ReturnStateTuples of numpy arrays to one ReturnStateTuple of padded tensors
    """
    state_tuple = ReturnStateTuple(*zip(*states))
    items = [torch.from_numpy(item) for item in state_tuple.items]
    items_padded = pad_sequence(items, batch_first=False).to(device)
    if any(state_tuple.targets):
        targets = torch.as_tensor(np.concatenate(state_tuple.targets),
                                  device=device)
    else:
        targets = None

    if isinstance(state_tuple.history[0], np.ndarray):
        history = torch.from_numpy(np.stack(state_tuple.history)).to(device)
        return ReturnStateTuple(items_padded, None, history, targets)

    return ReturnStateTuple(items_padded, None, None, targets)
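For illustration, assuming the ReturnStateTuple sketch and unwrap_state above are in scope, a hypothetical call with two variable-length item sequences (all IDs made up) could look like this:

import numpy as np

# Two states whose item sequences have different lengths; pad_sequence pads
# them along dim 0, so batch.items has shape (max_len, batch) = (3, 2).
states = [
    ReturnStateTuple(np.array([3, 7, 1]), None, None, np.array([7])),
    ReturnStateTuple(np.array([5, 2]), None, None, np.array([2])),
]
batch = unwrap_state(states, device="cpu")
# batch.targets is tensor([7, 2]); history stays None in this case.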
Example #3
    def forward(self, action):
        with torch.no_grad():
            state = self.current_state
            state = ReturnStateTuple(
                torch.as_tensor(state.items,
                                device=self.agent.device).unsqueeze(1), None,
                None, None)
            state = self.agent.state_agg(state)
            logits = self.agent.agent(state)
            reward = torch.softmax(logits, dim=-1).cpu().numpy()

        action_rewards = reward[..., action]
        return action_rewards.flatten()
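For illustration only, assuming the object defining this forward() is called wrapper and that its agent's logits cover the whole item catalogue, a two-item slate comes back with one softmax probability per recommended item (numbers made up):

import numpy as np

# Hypothetical call; `wrapper` is a placeholder name for the object above.
action = np.array([2, 4])                  # slate of recommended item IDs
action_rewards = wrapper.forward(action)   # e.g. array([0.31, 0.05])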
Example #4
    def reset(self):
        self.user_index += 1

        if self.user_index == len(self.trajectories):
            # end of this simulator
            return None
        self.current_user_real_trajectory = np.array(
            self.trajectories.loc[self.user_index])
        self.current_trajectory = np.array(
            [self.current_user_real_trajectory[0]])

        self.last_recommendation_actions = []
        self.episode_rewards = []
        self.time_step = 0
        return ReturnStateTuple(self.current_trajectory, None, None, None)
Example #5
    def create_state_vector(self, state):
        """
        Accumulate sequential state to one vector
        :param state:
        :return:
        """
        if isinstance(state, np.ndarray):
            state = ReturnStateTuple(*[
                torch.as_tensor(a, device=self.device) if a is not None else a
                for a in state
            ])
        self.user_history_mask_items = state.items.cpu().t()

        curr_targets = state.targets
        state = self.state_agg(state)
        return state, curr_targets
Example #6
    def reset(self):
        self.user_index += 1
        if self.user_index == len(self.trajectories):
            # end of this simulator
            return None

        self.current_user_real_trajectory = np.array(
            self.trajectories.loc[self.user_index])

        time_start = self.session_start_index()
        self.current_trajectory = np.array(
            [self.current_user_real_trajectory[time_start]])
        self.last_recommendation_actions = []
        self.episode_rewards = []
        self.real_next_index = time_start + 1

        self.current_state = ReturnStateTuple(self.current_trajectory, None,
                                              None, None)
        return self.current_state
Example #7
    def step(self, action: np.ndarray):
        self.time_step += 1

        next_user_action = self.current_user_real_trajectory[self.time_step]
        reward, index_in_action_list = target_in_action_reward(
            action, next_user_action)

        self.current_trajectory = np.concatenate(
            [self.current_trajectory,
             np.array([next_user_action])])

        done = self.time_step + 1 == len(self.current_user_real_trajectory) or \
               self.max_episode_steps == self.time_step

        self.last_recommendation_actions.append(action)
        self.episode_rewards.append(reward)

        if done:
            self.store_metrics()

        return ReturnStateTuple(self.current_trajectory[-self.max_state_len:], None, None, None), reward, done, \
               {"item": index_in_action_list}
Example #8
    def step(self, action: np.ndarray):
        self.time_step += 1

        reward_list = self._receive_reward(action)
        leave_prob = self.default_leave_prob

        next_click = self._accept_item(action, reward_list)
        if next_click == -1:
            # The user accepts none of the recommended items and clicks on
            # something else instead, i.e. falls back to the logged trajectory
            next_click = self.current_user_real_trajectory[
                self.real_next_index]
            self.real_next_index += 1
            # while self.real_next_index < len(self.current_user_real_trajectory):
            #     next_click = self.current_user_real_trajectory[self.real_next_index]
            #     self.real_next_index += 1
            #     if next_click not in self.current_trajectory:
            #         break
            leave_prob = self.other_item_leave_prob

        if self.last_recommendation_actions:
            last = np.in1d(action, self.last_recommendation_actions[-1])
            percentile_repeats = last.sum() / len(last)
            # Modelling the impatience and annoyance of repetitive recommendations
            leave_prob += self.repetition_factor * percentile_repeats

        self.current_trajectory = np.concatenate(
            [self.current_trajectory,
             np.array([next_click])])

        interrupt_session = self._interrupt_trajectory(leave_prob)
        done = interrupt_session or \
               self.real_next_index == len(self.current_user_real_trajectory) or \
               0 < self.max_episode_steps == self.time_step

        self.last_recommendation_actions.append(action)

        total_reward = 0
        if self.reward_type == "item":
            total_reward = float(next_click in action)
            # if next_click in action:
            #     total_reward = reward_list[action == next_click].item()
        else:
            total_reward = max(reward_list.sum(), 0)

        if interrupt_session:
            total_reward = 0

        self.episode_rewards.append(total_reward)

        if done:
            self.store_metrics()

        info = {
            "item":
            -1 if next_click not in action else np.flatnonzero(
                next_click == action)[0]
        }

        self.current_state = ReturnStateTuple(
            self.current_trajectory[-self.max_state_len:], None, None, None)
        return self.current_state, total_reward, done, info
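To make the leave probability concrete with made-up numbers: if the user rejects the whole slate (so leave_prob starts at other_item_leave_prob = 0.2), repetition_factor is 0.5, and 3 of the 10 recommended items also appeared in the previous slate (percentile_repeats = 0.3), then the session is interrupted with probability leave_prob = 0.2 + 0.5 * 0.3 = 0.35 when _interrupt_trajectory draws against it.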