import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence


def get_obs(self, obs):
    # Unzip a batch of observations into per-field tuples.
    items, rew, history, targets = ReturnStateTuple(*list(zip(*obs)))
    if len(obs) == 1:
        # Single observation: reshape the elements to (seq_len, batch=1).
        items_padded = torch.tensor(items, device=self.device,
                                    dtype=torch.long).view(-1, 1)
        if rew[0] is not None:
            rewards = torch.tensor(rew, device=self.device).view(-1, 1)
        else:
            rewards = None
        if history[0] is not None:
            history = torch.tensor(history, device=self.device)
    else:
        # Multiple observations: pad the variable-length item sequences.
        items = [torch.from_numpy(item) for item in items]
        items_padded = pad_sequence(items, batch_first=False).to(self.device)
        if rew[0] is not None:
            rewards = [torch.from_numpy(r) for r in rew]
            rewards = pad_sequence(rewards, batch_first=False).to(self.device)
        else:
            rewards = None
        if history[0] is not None:
            history = torch.from_numpy(np.stack(history)).to(self.device)
    return ReturnStateTuple(items_padded, rewards, history, None)
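
# `ReturnStateTuple` itself is not shown in this excerpt. get_obs above (and
# unwrap_state below) unpack it as a four-field (items, rewards, history,
# targets) container, so a minimal sketch, assuming it is a plain namedtuple,
# would look like this:
from collections import namedtuple

ReturnStateTuple = namedtuple(
    "ReturnStateTuple", ["items", "rewards", "history", "targets"])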
def unwrap_state(states, device):
    """
    Converts a list of multiple ReturnStateTuples of numpy arrays to one
    ReturnStateTuple of padded tensors
    """
    state_tuple = ReturnStateTuple(*zip(*states))
    items = [torch.from_numpy(item) for item in state_tuple.items]
    items_padded = pad_sequence(items, batch_first=False).to(device)
    if any(state_tuple.targets):
        targets = torch.as_tensor(np.concatenate(state_tuple.targets),
                                  device=device)
    else:
        targets = None
    if isinstance(state_tuple.history[0], np.ndarray):
        history = torch.from_numpy(np.stack(state_tuple.history)).to(device)
        return ReturnStateTuple(items_padded, None, history, targets)
    return ReturnStateTuple(items_padded, None, None, targets)
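
# A quick, standalone illustration of the padding convention unwrap_state
# relies on (made-up item ids, not project data): pad_sequence with
# batch_first=False stacks variable-length item histories into a
# (max_len, batch) tensor and zero-pads the shorter ones.
_histories = [torch.tensor([3, 7, 7, 1]), torch.tensor([5, 2])]
_padded = pad_sequence(_histories, batch_first=False)
assert _padded.shape == (4, 2)
assert _padded[:, 1].tolist() == [5, 2, 0, 0]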
def forward(self, action):
    with torch.no_grad():
        state = self.current_state
        # Wrap the raw item history as a (seq_len, batch=1) ReturnStateTuple.
        state = ReturnStateTuple(
            torch.as_tensor(state.items,
                            device=self.agent.device).unsqueeze(1),
            None, None, None)
        state = self.agent.state_agg(state)
        logits = self.agent.agent(state)
        # Softmax over the item catalogue, then pick the scores of the
        # recommended items.
        reward = torch.softmax(logits, dim=-1).cpu().numpy()
        action_rewards = reward[..., action]
        return action_rewards.flatten()
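
# The scoring pattern used in forward above, in isolation: softmax over the
# item logits, then gather the probabilities of the recommended slate. This is
# a standalone sketch with a dummy logits tensor and a hypothetical slate, not
# project code.
_logits = torch.randn(1, 10)             # (batch, n_items)
_slate = np.array([2, 5, 7])             # hypothetical recommended item ids
_slate_scores = torch.softmax(_logits, dim=-1).cpu().numpy()[..., _slate]
assert _slate_scores.flatten().shape == (3,)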
def reset(self):
    self.user_index += 1
    if self.user_index == len(self.trajectories):
        # end of this simulator
        return None
    self.current_user_real_trajectory = np.array(
        self.trajectories.loc[self.user_index])
    self.current_trajectory = np.array(
        [self.current_user_real_trajectory[0]])
    self.last_recommendation_actions = []
    self.episode_rewards = []
    self.time_step = 0
    return ReturnStateTuple(self.current_trajectory, None, None, None)
def create_state_vector(self, state):
    """
    Accumulate the sequential state into one state vector.

    :param state: ReturnStateTuple of the current observation
    :return: aggregated state vector and the current targets
    """
    if isinstance(state.items, np.ndarray):
        # Convert numpy fields of the state tuple to tensors on the model
        # device; fields that are None are passed through unchanged.
        state = ReturnStateTuple(*[
            torch.as_tensor(a, device=self.device) if a is not None else a
            for a in state
        ])
    self.user_history_mask_items = state.items.cpu().t()
    curr_targets = state.targets
    state = self.state_agg(state)
    return state, curr_targets
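
# `state_agg` is not part of this excerpt. A minimal hypothetical stand-in
# that matches the way it is called here (ReturnStateTuple of padded item-id
# tensors in, fixed-size state vector out) could embed the item ids and
# average them over the time dimension:
import torch.nn as nn


class MeanStateAgg(nn.Module):
    """Hypothetical aggregator: mean item embedding over the sequence."""

    def __init__(self, n_items, dim):
        super().__init__()
        self.embedding = nn.Embedding(n_items, dim, padding_idx=0)

    def forward(self, state):
        emb = self.embedding(state.items)    # (seq_len, batch, dim)
        return emb.mean(dim=0)               # (batch, dim)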
def reset(self):
    self.user_index += 1
    if self.user_index == len(self.trajectories):
        # end of this simulator
        return None
    self.current_user_real_trajectory = np.array(
        self.trajectories.loc[self.user_index])
    time_start = self.session_start_index()
    self.current_trajectory = np.array(
        [self.current_user_real_trajectory[time_start]])
    self.last_recommendation_actions = []
    self.episode_rewards = []
    self.time_step = 0
    self.real_next_index = time_start + 1
    self.current_state = ReturnStateTuple(self.current_trajectory, None, None,
                                          None)
    return self.current_state
def step(self, action: np.ndarray):
    self.time_step += 1
    next_user_action = self.current_user_real_trajectory[self.time_step]
    reward, index_in_action_list = target_in_action_reward(
        action, next_user_action)
    self.current_trajectory = np.concatenate(
        [self.current_trajectory, np.array([next_user_action])])
    done = self.time_step + 1 == len(self.current_user_real_trajectory) or \
        self.max_episode_steps == self.time_step
    self.last_recommendation_actions.append(action)
    self.episode_rewards.append(reward)
    if done:
        self.store_metrics()
    next_state = ReturnStateTuple(
        self.current_trajectory[-self.max_state_len:], None, None, None)
    return next_state, reward, done, {"item": index_in_action_list}
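
# `target_in_action_reward` is referenced above but not included in this
# excerpt. A hypothetical stand-in that is consistent with how its two return
# values are used (a scalar reward plus the clicked item's index in the slate,
# or -1 when the user's next item is not in the slate):
def target_in_action_reward(action, target):
    hits = np.flatnonzero(action == target)
    if hits.size > 0:
        return 1.0, int(hits[0])
    return 0.0, -1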
def step(self, action: np.ndarray):
    self.time_step += 1
    reward_list = self._receive_reward(action)
    leave_prob = self.default_leave_prob
    next_click = self._accept_item(action, reward_list)
    if next_click == -1:
        # We do not accept the action, but click on something else,
        # i.e. go back to the user log
        next_click = self.current_user_real_trajectory[self.real_next_index]
        self.real_next_index += 1
        # while self.real_next_index < len(self.current_user_real_trajectory):
        #     next_click = self.current_user_real_trajectory[self.real_next_index]
        #     self.real_next_index += 1
        #     if next_click not in self.current_trajectory:
        #         break
        leave_prob = self.other_item_leave_prob
    if self.last_recommendation_actions:
        last = np.in1d(action, self.last_recommendation_actions[-1])
        percentile_repeats = last.sum() / len(last)
        # Modelling the impatience and annoyance of repetitive recommendations
        leave_prob += self.repetition_factor * percentile_repeats
    self.current_trajectory = np.concatenate(
        [self.current_trajectory, np.array([next_click])])
    interrupt_session = self._interrupt_trajectory(leave_prob)
    # Episode ends if the user leaves, the user log is exhausted, or the
    # (optional) step limit is hit.
    done = interrupt_session or \
        self.real_next_index == len(self.current_user_real_trajectory) or \
        0 < self.max_episode_steps == self.time_step
    self.last_recommendation_actions.append(action)
    if self.reward_type == "item":
        total_reward = float(next_click in action)
        # if next_click in action:
        #     total_reward = reward_list[action == next_click].item()
    else:
        total_reward = max(reward_list.sum(), 0)
    if interrupt_session:
        total_reward = 0
    self.episode_rewards.append(total_reward)
    if done:
        self.store_metrics()
    info = {
        "item": -1 if next_click not in action
        else np.flatnonzero(next_click == action)[0]
    }
    self.current_state = ReturnStateTuple(
        self.current_trajectory[-self.max_state_len:], None, None, None)
    return self.current_state, total_reward, done, info
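
# A typical interaction loop for the simulator above (sketch only: `simulator`
# is assumed to be an instance of the class these reset/step methods belong
# to, and `recommend` is a hypothetical policy that returns a numpy array of
# recommended item ids for the current state):
def run_episode(simulator, recommend, slate_size=10):
    state = simulator.reset()
    if state is None:                # every user trajectory has been replayed
        return None
    total_reward, done = 0.0, False
    while not done:
        action = recommend(state, slate_size)
        state, reward, done, info = simulator.step(action)
        total_reward += reward
    return total_reward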