def _neg_sample_by_pair_wise_sampling(self, inter_feat, neg_iids):
    """Attach sampled negative items to a batch for pairwise training.

    The positive interactions are tiled ``self.times`` times; the negative
    item ids are joined with their item features, every negative field is
    prefixed with ``self.neg_prefix``, and the result is merged back into
    the tiled batch.

    Args:
        inter_feat (Interaction): batch of positive interactions.
        neg_iids (torch.Tensor): sampled negative item ids.

    Returns:
        Interaction: tiled batch carrying prefixed negative-item columns.
    """
    tiled = inter_feat.repeat(self.times)
    negatives = self.dataset.join(Interaction({self.iid_field: neg_iids}))
    negatives.add_prefix(self.neg_prefix)
    tiled.update(negatives)
    return tiled
def __init__(self, config, dataset, sampler, shuffle=False):
    """Build the full user-id table and force shuffling on.

    Args:
        config (Config): dataloader configuration.
        dataset (Dataset): dataset providing ``uid_field`` and ``user_num``.
        sampler (Sampler): sampler forwarded to the parent loader.
        shuffle (bool, optional): ignored if ``False`` — this loader always
            shuffles, and a warning is emitted when the caller disabled it.
    """
    if shuffle is False:
        shuffle = True
        # NOTE(review): self.logger is used before super().__init__ runs —
        # confirm the base class makes `logger` available this early.
        self.logger.warning('UserDataLoader must shuffle the data.')
    self.uid_field = dataset.uid_field
    all_user_ids = torch.arange(dataset.user_num)
    self.user_list = Interaction({self.uid_field: all_user_ids})
    super().__init__(config, dataset, sampler, shuffle=shuffle)
class UserDataLoader(AbstractDataLoader):
    """Dataloader whose batches contain only user ids.

    Args:
        config (Config): The config of dataloader.
        dataset (Dataset): The dataset of dataloader.
        batch_size (int, optional): The batch_size of dataloader. Defaults to ``1``.
        dl_format (InputType, optional): The input type of dataloader.
            Defaults to :obj:`~recbole.utils.enum_type.InputType.POINTWISE`.
        shuffle (bool, optional): Whether the dataloader will be shuffle after a round.
            Defaults to ``False``.

    Attributes:
        shuffle (bool): Always forced to ``True`` for this loader (see
            :meth:`setup`), regardless of the constructor argument.
    """
    dl_type = DataLoaderType.ORIGIN

    def __init__(self, config, dataset, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False):
        self.uid_field = dataset.uid_field
        all_user_ids = torch.arange(dataset.user_num)
        self.user_list = Interaction({self.uid_field: all_user_ids})
        super().__init__(config=config, dataset=dataset, batch_size=batch_size,
                         dl_format=dl_format, shuffle=shuffle)

    def setup(self):
        """Force :attr:`shuffle` to ``True``, warning if the caller disabled it."""
        if self.shuffle is False:
            self.shuffle = True
            self.logger.warning('UserDataLoader must shuffle the data')

    @property
    def pr_end(self):
        # One pass over every known user id.
        return len(self.user_list)

    def _shuffle(self):
        self.user_list.shuffle()

    def _next_batch_data(self):
        start = self.pr
        stop = start + self.step
        self.pr = stop
        return self.user_list[start:stop]
def __init__(self, config, dataset, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False):
    """Record the user-id field, materialize the full user-id table, and
    delegate the remaining construction to the parent loader.

    Args:
        config (Config): dataloader configuration.
        dataset (Dataset): dataset providing ``uid_field`` and ``user_num``.
        batch_size (int, optional): batch size. Defaults to ``1``.
        dl_format (InputType, optional): input type. Defaults to POINTWISE.
        shuffle (bool, optional): whether to shuffle between epochs.
    """
    self.uid_field = dataset.uid_field
    every_user = torch.arange(dataset.user_num)
    self.user_list = Interaction({self.uid_field: every_user})
    super().__init__(config=config, dataset=dataset, batch_size=batch_size,
                     dl_format=dl_format, shuffle=shuffle)
def augmentation(self, item_list_index, target_index, item_list_length):
    """Assemble the augmented sequential batch.

    Args:
        item_list_index (numpy.ndarray): slices selecting each history list
            inside ``self.dataset.inter_feat``.
        target_index (numpy.ndarray): row index of each prediction target.
        item_list_length (numpy.ndarray): length of each history list.

    Returns:
        Interaction: target rows extended with zero-padded ``*_list`` fields
        plus the list-length field.
    """
    num_samples = len(item_list_index)
    augmented = self.dataset.inter_feat[target_index]
    extra_fields = {
        self.item_list_length_field: torch.tensor(item_list_length),
    }
    for field in self.dataset.inter_feat:
        if field == self.uid_field:
            continue  # user id has no per-step history counterpart
        list_field = getattr(self, f'{field}_list_field')
        seq_len = self.dataset.field2seqlen[list_field]
        if isinstance(seq_len, int):
            shape = (num_samples, seq_len)
        else:
            shape = (num_samples,) + seq_len
        ftype = self.dataset.field2type[list_field]
        if ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ]:
            dtype = torch.int64
        else:
            dtype = torch.float64
        # Zero-initialized buffer doubles as right-padding.
        padded = torch.zeros(shape, dtype=dtype)
        values = self.dataset.inter_feat[field]
        for row, (history_slice, length) in enumerate(zip(item_list_index, item_list_length)):
            padded[row][:length] = values[history_slice]
        extra_fields[list_field] = padded
    augmented.update(Interaction(extra_fields))
    return augmented
def __init__(self, config, dataset, sampler, shuffle=False):
    """Full-sort evaluation loader: pre-group each user's positive items.

    For non-sequential models, sorts the dataset by user id and performs one
    linear pass to collect every user's positive-item set, delegating the
    per-user bookkeeping to ``self._set_user_property``.

    Args:
        config (Config): global configuration; ``MODEL_TYPE`` decides whether
            the per-user grouping is built.
        dataset (Dataset): evaluation dataset; sorted in place by user id.
        sampler: provides ``used_ids`` (per-user collection of seen items).
        shuffle (bool, optional): forwarded to the parent loader.
    """
    self.uid_field = dataset.uid_field
    self.iid_field = dataset.iid_field
    # Sequential models evaluate on raw (augmented) rows, so the grouping
    # below is only needed for the non-sequential case.
    self.is_sequential = config['MODEL_TYPE'] == ModelType.SEQUENTIAL
    if not self.is_sequential:
        user_num = dataset.user_num
        self.uid_list = []
        # per-user positive-interaction counts
        self.uid2items_num = np.zeros(user_num, dtype=np.int64)
        # object arrays: one entry (or None) per user id
        self.uid2positive_item = np.array([None] * user_num)
        self.uid2history_item = np.array([None] * user_num)
        # Sorting groups each user's rows contiguously, enabling the
        # single-pass accumulation below.
        dataset.sort(by=self.uid_field, ascending=True)
        last_uid = None
        positive_item = set()
        uid2used_item = sampler.used_ids
        for uid, iid in zip(dataset.inter_feat[self.uid_field].numpy(),
                            dataset.inter_feat[self.iid_field].numpy()):
            if uid != last_uid:
                # Flush the previous user's accumulated positives.
                # NOTE(review): the very first iteration passes last_uid=None
                # — presumably _set_user_property ignores a None uid; confirm.
                self._set_user_property(last_uid, uid2used_item[last_uid],
                                        positive_item)
                last_uid = uid
                self.uid_list.append(uid)
                positive_item = set()
            positive_item.add(iid)
        # Flush the final user, which the loop above never reaches.
        self._set_user_property(last_uid, uid2used_item[last_uid],
                                positive_item)
        self.uid_list = torch.tensor(self.uid_list, dtype=torch.int64)
        self.user_df = dataset.join(
            Interaction({self.uid_field: self.uid_list}))
    super().__init__(config, dataset, sampler, shuffle=shuffle)
def _neg_sampling(self, inter_feat):
    """Generate negative samples for a batch of positive interactions.

    Three strategies are supported:

    * dynamic hard-negative mining (``neg_sample_args['dynamic']`` present
      and not ``'none'``): draw ``neg_sample_num * candidate_num`` candidates
      per positive, score them with the current model, and keep the
      highest-scoring candidate out of each group of ``candidate_num``;
    * ``strategy == 'by'``: plain sampler-based negative sampling of
      ``neg_sample_num`` items per positive;
    * anything else: the batch is returned unchanged.

    Args:
        inter_feat (Interaction): batch of positive user-item interactions.

    Returns:
        Interaction: output of ``self.sampling_func`` (or the untouched batch).
    """
    # `.get` with a 'none' default replaces the `'dynamic' in d.keys()` +
    # inequality pair: absent or explicitly 'none' both skip this branch.
    if self.neg_sample_args.get('dynamic', 'none') != 'none':
        candidate_num = self.neg_sample_args['dynamic']
        user_ids = inter_feat[self.uid_field]
        item_ids = inter_feat[self.iid_field]
        neg_candidate_ids = self.sampler.sample_by_user_ids(
            user_ids, item_ids, self.neg_sample_num * candidate_num)
        # Score candidates in eval mode so dropout/batch-norm noise does not
        # perturb the ranking used for hard-negative selection.
        self.model.eval()
        interaction = copy.deepcopy(inter_feat).to(self.model.device)
        interaction = interaction.repeat(self.neg_sample_num * candidate_num)
        neg_item_feat = Interaction(
            {self.iid_field: neg_candidate_ids.to(self.model.device)})
        interaction.update(neg_item_feat)
        # One row per candidate slot, one column per (positive, sample) pair.
        scores = self.model.predict(interaction).reshape(candidate_num, -1)
        # Row index of the best-scoring candidate in each column group.
        indices = torch.max(scores, dim=0)[1].detach()
        neg_candidate_ids = neg_candidate_ids.reshape(candidate_num, -1)
        # NOTE(review): `indices` lives on the model device while
        # `neg_candidate_ids` comes from the sampler — confirm both tensors
        # are on the same device before this fancy-indexing step.
        neg_item_ids = neg_candidate_ids[
            indices, torch.arange(neg_candidate_ids.shape[1])].view(-1)
        self.model.train()
        return self.sampling_func(inter_feat, neg_item_ids)
    elif self.neg_sample_args['strategy'] == 'by':
        user_ids = inter_feat[self.uid_field]
        item_ids = inter_feat[self.iid_field]
        neg_item_ids = self.sampler.sample_by_user_ids(
            user_ids, item_ids, self.neg_sample_num)
        return self.sampling_func(inter_feat, neg_item_ids)
    else:
        return inter_feat
def _next_batch_data(self):
    """Slice the next window of KG triples and attach sampled negative tails.

    Returns:
        Interaction: the current KG batch, extended in place with a
        negative-tail column keyed by ``self.neg_tid_field``.
    """
    batch = self.dataset.kg_feat[self.pr:self.pr + self.step]
    neg_tails = self.sampler.sample_by_entity_ids(
        batch[self.hid_field], self.neg_sample_num)
    batch.update(Interaction({self.neg_tid_field: neg_tails}))
    self.pr += self.step
    return batch
def _neg_sample_by_point_wise_sampling(self, data, neg_iids):
    """Tile the positives, overwrite the tail copies with sampled negative
    item ids, and attach 0/1 labels (1 for the original positives).

    Args:
        data (Interaction): batch of positive interactions.
        neg_iids (torch.Tensor): sampled negative item ids.

    Returns:
        Interaction: point-wise training batch with a label column.
    """
    n_pos = len(data)
    tiled = data.repeat(self.times)
    tiled[self.iid_field][n_pos:] = neg_iids
    labels = torch.zeros(n_pos * self.times)
    labels[:n_pos] = 1.0
    tiled.update(Interaction({self.label_field: labels}))
    return tiled
def _neg_sample_by_point_wise_sampling(self, inter_feat, neg_item_ids):
    """Tile positives ``self.times`` times, substitute the sampled negative
    item ids into the tail copies, re-join item/user features for the new
    rows, and attach binary labels (1 for the original positives).

    Args:
        inter_feat (Interaction): batch of positive interactions.
        neg_item_ids (torch.Tensor): sampled negative item ids.

    Returns:
        Interaction: point-wise batch with joined features and labels.
    """
    n_pos = len(inter_feat)
    combined = inter_feat.repeat(self.times)
    combined[self.iid_field][n_pos:] = neg_item_ids
    # Join so the substituted item ids pick up their feature columns.
    combined = self.dataset.join(combined)
    labels = torch.zeros(n_pos * self.times)
    labels[:n_pos] = 1.0
    combined.update(Interaction({self.label_field: labels}))
    return combined
def full_sort_scores(uid_series, model, test_data, device=None):
    """Calculate the scores of all items for each user in uid_series.

    Note:
        The score of [pad] and history items will be set into -inf.

    Args:
        uid_series (numpy.ndarray or list): User id series.
        model (AbstractRecommender): Model to predict.
        test_data (FullSortEvalDataLoader): The test_data of model.
        device (torch.device, optional): The device which model will run on.
            Defaults to ``None``. Note: ``device=None`` is equivalent to
            ``device=torch.device('cpu')``.

    Returns:
        torch.Tensor: the scores of all items for each user in uid_series,
        shape ``(len(uid_series), item_num)``.
    """
    device = device or torch.device('cpu')
    uid_series = torch.tensor(uid_series)
    uid_field = test_data.dataset.uid_field
    dataset = test_data.dataset
    model.eval()
    if not test_data.is_sequential:
        # One input row per requested user, joined with user features.
        input_interaction = dataset.join(Interaction({uid_field: uid_series}))
        history_item = test_data.uid2history_item[list(uid_series)]
        # Build (row, col) index pairs that address every history item of
        # every requested user in the final score matrix.
        history_row = torch.cat([
            torch.full_like(hist_iid, i)
            for i, hist_iid in enumerate(history_item)
        ])
        history_col = torch.cat(list(history_item))
        history_index = history_row, history_col
    else:
        # Sequential models: select the raw interaction rows belonging to
        # the requested users; history masking does not apply here.
        _, index = (
            dataset.inter_feat[uid_field] == uid_series[:, None]).nonzero(
                as_tuple=True)
        input_interaction = dataset[index]
        history_index = None
    # Get scores of all items
    input_interaction = input_interaction.to(device)
    try:
        scores = model.full_sort_predict(input_interaction)
    except NotImplementedError:
        # Fallback: tile each user row across every item and score the
        # (user, item) pairs with the plain predict() interface.
        input_interaction = input_interaction.repeat(dataset.item_num)
        input_interaction.update(
            test_data.dataset.get_item_feature().to(device).repeat(
                len(uid_series)))
        scores = model.predict(input_interaction)
    scores = scores.view(-1, dataset.item_num)
    scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    if history_index is not None:
        scores[history_index] = -np.inf  # set scores of history items to -inf
    return scores
class UserDataLoader(AbstractDataLoader):
    """Dataloader whose batches contain only user ids.

    Args:
        config (Config): The config of dataloader.
        dataset (Dataset): The dataset of dataloader.
        sampler (Sampler): The sampler of dataloader.
        shuffle (bool, optional): Whether the dataloader will be shuffle after a round.
            Defaults to ``False``.

    Attributes:
        shuffle (bool): Always forced to ``True`` for this loader, regardless
            of the constructor argument.
    """

    def __init__(self, config, dataset, sampler, shuffle=False):
        if shuffle is False:
            shuffle = True
            # NOTE(review): self.logger is used before super().__init__ runs —
            # confirm the base class makes `logger` available this early.
            self.logger.warning('UserDataLoader must shuffle the data.')
        self.uid_field = dataset.uid_field
        all_user_ids = torch.arange(dataset.user_num)
        self.user_list = Interaction({self.uid_field: all_user_ids})
        super().__init__(config, dataset, sampler, shuffle=shuffle)

    def _init_batch_size_and_step(self):
        # Whole-batch stepping: one stride per training batch.
        train_bs = self.config['train_batch_size']
        self.step = train_bs
        self.set_batch_size(train_bs)

    @property
    def pr_end(self):
        # One pass covers every known user id.
        return len(self.user_list)

    def _shuffle(self):
        self.user_list.shuffle()

    def _next_batch_data(self):
        start = self.pr
        stop = start + self.step
        self.pr = stop
        return self.user_list[start:stop]
def _neg_sample_by_point_wise_sampling(self, inter_feat, pos_idx, neg_iids, neg_idx, pos_iids):
    """Build a point-wise batch from observed positives AND observed negatives.

    Positives are tiled ``neg_sample_by + 1`` times and the tail copies get
    sampled negative item ids (originals labeled 1, copies 0). If observed
    negatives exist, they are tiled ``round(neg_sample_by / 3) + 1`` times
    with sampled positive item ids in the tail copies (originals labeled 0,
    copies 1), and both parts are concatenated.

    Args:
        inter_feat (Interaction): full interaction batch.
        pos_idx: index selecting the positive rows.
        neg_iids (torch.Tensor): sampled negative item ids for the positives.
        neg_idx: index selecting the negative rows.
        pos_iids (torch.Tensor): sampled positive item ids for the negatives.

    Returns:
        Interaction: concatenated labeled batch.
    """
    n_pos = len(pos_idx[0])
    n_neg = len(neg_idx[0])
    pos_repeat = self.neg_sample_by + 1
    pos_part = inter_feat[pos_idx].repeat(pos_repeat)
    pos_part[self.iid_field][n_pos:] = neg_iids
    pos_part = self.dataset.join(pos_part)
    pos_labels = torch.zeros(n_pos * pos_repeat)
    pos_labels[:n_pos] = 1
    pos_part.update(Interaction({self.label_field: pos_labels}))
    if n_neg > 0:
        neg_repeat = round(self.neg_sample_by / 3) + 1
        neg_part = inter_feat[neg_idx].repeat(neg_repeat)
        neg_part[self.iid_field][n_neg:] = pos_iids
        neg_part = self.dataset.join(neg_part)
        neg_labels = torch.ones(n_neg * neg_repeat)
        neg_labels[:n_neg] = 0
        neg_part.update(Interaction({self.label_field: neg_labels}))
        return cat_interactions([pos_part, neg_part])
    return cat_interactions([pos_part])
def _spilt_predict(self, interaction, batch_size):
    """Predict a large interaction in test-batch-sized chunks and
    concatenate the per-chunk results.

    NOTE(review): "spilt" looks like a typo of "split"; the name is kept
    because callers depend on it.

    Args:
        interaction (Interaction): full batch to score.
        batch_size (int): total number of rows in ``interaction``.

    Returns:
        torch.Tensor: concatenated predictions, one row per input row.
    """
    chunked = {
        key: tensor.split(self.test_batch_size, dim=0)
        for key, tensor in interaction.interaction.items()
    }
    # Ceiling division: number of chunks needed to cover the batch.
    n_blocks = (batch_size + self.test_batch_size - 1) // self.test_batch_size
    outputs = []
    for idx in range(n_blocks):
        piece = Interaction({key: parts[idx] for key, parts in chunked.items()})
        outputs.append(self.model.predict(piece.to(self.device)))
    return torch.cat(outputs, dim=0)
def __init__(self, config, dataset, sampler, neg_sample_args,
             batch_size=1, dl_format=InputType.POINTWISE, shuffle=False):
    """Full-ranking evaluation loader: pre-groups each user's positives.

    Sorts the dataset by user id and performs one linear pass collecting
    each user's positive-item set, delegating per-user bookkeeping to
    ``self._set_user_property``.

    Args:
        config (Config): dataloader configuration.
        dataset (Dataset): evaluation dataset; sorted in place by user id.
        sampler: provides ``used_ids`` (per-user collection of seen items).
        neg_sample_args (dict): must declare the ``'full'`` strategy.
        batch_size (int, optional): batch size. Defaults to ``1``.
        dl_format (InputType, optional): input type. Defaults to POINTWISE.
        shuffle (bool, optional): forwarded to the parent loader.

    Raises:
        ValueError: if ``neg_sample_args['strategy']`` is not ``'full'``.
    """
    if neg_sample_args['strategy'] != 'full':
        raise ValueError(
            'neg_sample strategy in GeneralFullDataLoader() should be `full`'
        )
    uid_field = dataset.uid_field
    iid_field = dataset.iid_field
    user_num = dataset.user_num
    self.uid_list = []
    # per-user positive-interaction counts
    self.uid2items_num = np.zeros(user_num, dtype=np.int64)
    # object arrays: one entry (or None) per user id; the swap indices are
    # presumably filled by _set_user_property for score reordering — confirm.
    self.uid2swap_idx = np.array([None] * user_num)
    self.uid2rev_swap_idx = np.array([None] * user_num)
    self.uid2history_item = np.array([None] * user_num)
    # Sorting groups each user's rows contiguously, enabling the
    # single-pass accumulation below.
    dataset.sort(by=uid_field, ascending=True)
    last_uid = None
    positive_item = set()
    uid2used_item = sampler.used_ids
    for uid, iid in zip(dataset.inter_feat[uid_field].numpy(),
                        dataset.inter_feat[iid_field].numpy()):
        if uid != last_uid:
            # Flush the previous user's accumulated positives.
            # NOTE(review): the first iteration passes last_uid=None —
            # presumably _set_user_property ignores a None uid; confirm.
            self._set_user_property(last_uid, uid2used_item[last_uid],
                                    positive_item)
            last_uid = uid
            self.uid_list.append(uid)
            positive_item = set()
        positive_item.add(iid)
    # Flush the final user, which the loop above never reaches.
    self._set_user_property(last_uid, uid2used_item[last_uid], positive_item)
    self.uid_list = torch.tensor(self.uid_list)
    self.user_df = dataset.join(Interaction({uid_field: self.uid_list}))
    super().__init__(config, dataset, sampler, neg_sample_args,
                     batch_size=batch_size, dl_format=dl_format,
                     shuffle=shuffle)
def inter_matrix(self, form='coo', value_field=None):
    """Get sparse matrix that describe interactions between user_id and item_id.

    Sparse matrix has shape (user_num, item_num). For a row of <src, tgt>,
    ``matrix[src, tgt] = 1`` if ``value_field`` is ``None``, else
    ``matrix[src, tgt] = self.inter_feat[src, tgt]``.

    NOTE(review): this variant first rebuilds a flat interaction table from
    sequence-augmented data — rows whose list length is 1 contribute the
    first element of each ``*_list`` column — before constructing the
    matrix; confirm against the non-sequential base implementation.

    Args:
        form (str, optional): Sparse matrix format. Defaults to ``coo``.
        value_field (str, optional): Data of sparse matrix, which should
            exist in ``df_feat``. Defaults to ``None``.

    Returns:
        scipy.sparse: Sparse matrix in form ``coo`` or ``csr``.

    Raises:
        ValueError: if the dataset lacks a user-id or item-id field.
    """
    if not self.uid_field or not self.iid_field:
        raise ValueError(
            'dataset does not exist uid/iid, thus can not converted to sparse matrix.'
        )
    # Rows whose augmented history has length exactly 1.
    l1_idx = (self.inter_feat[self.item_list_length_field] == 1)
    l1_inter_dict = self.inter_feat[l1_idx].interaction
    new_dict = {}
    list_suffix = self.config['LIST_SUFFIX']
    candidate_field_set = set()
    for field in l1_inter_dict:
        if field != self.uid_field and field + list_suffix in l1_inter_dict:
            # Field has a list counterpart: append the first list element
            # of each length-1 row to the flat column.
            candidate_field_set.add(field)
            new_dict[field] = torch.cat([
                self.inter_feat[field],
                l1_inter_dict[field + list_suffix][:, 0]
            ])
        elif (not field.endswith(list_suffix)) and (
                field != self.item_list_length_field):
            # Plain field: duplicate the length-1 rows' values as-is.
            new_dict[field] = torch.cat(
                [self.inter_feat[field], l1_inter_dict[field]])
    local_inter_feat = Interaction(new_dict)
    return self._create_sparse_matrix(local_inter_feat, self.uid_field,
                                      self.iid_field, form, value_field)
class CollectTestCases(object):
    """Fixture data shared by collector unit tests."""
    # NOTE(review): the two positional lists passed alongside the empty
    # interaction dict are presumably per-user positive-length and
    # user-length lists — confirm against the Interaction constructor.
    interaction0 = Interaction({}, [0, 2, 3, 4], [2, 3, 4, 5])
    # 14 raw prediction scores laid out to match interaction0.
    scores_tensor0 = torch.Tensor(
        [0.1, 0.2, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.2, 0.1, 0.4, 0.3])
def data_augmentation(self):
    """Augmentation processing for sequential dataset.

    E.g., ``u1`` has purchase sequence ``<i1, i2, i3, i4>``,
    then after augmentation, we will generate three cases.

    ``u1, <i1> | i2``

    (Which means given user_id ``u1`` and item_seq ``<i1>``,
    we need to predict the next item ``i2``.)

    The other cases are below:

    ``u1, <i1, i2> | i3``

    ``u1, <i1, i2, i3> | i4``
    """
    self.logger.debug('data_augmentation')
    self._aug_presets()
    self._check_field('uid_field', 'time_field')
    max_item_list_len = self.config['MAX_ITEM_LIST_LENGTH']
    # Sorting by (user, time) groups each user's events chronologically so
    # one linear pass can emit every prefix of every user sequence.
    self.sort(by=[self.uid_field, self.time_field], ascending=True)
    last_uid = None
    uid_list, item_list_index, target_index, item_list_length = [], [], [], []
    seq_start = 0
    for i, uid in enumerate(self.inter_feat[self.uid_field].numpy()):
        if last_uid != uid:
            # New user: restart the history window; the user's first event
            # has no history, so it is not emitted as a sample.
            last_uid = uid
            seq_start = i
        else:
            # Slide the window start so the history never exceeds
            # MAX_ITEM_LIST_LENGTH items.
            if i - seq_start > max_item_list_len:
                seq_start += 1
            uid_list.append(uid)
            item_list_index.append(slice(seq_start, i))
            target_index.append(i)
            item_list_length.append(i - seq_start)
    # NOTE(review): uid_list is built but not used after this point —
    # confirm whether it is intentionally kept only for symmetry.
    uid_list = np.array(uid_list)
    item_list_index = np.array(item_list_index)
    target_index = np.array(target_index)
    item_list_length = np.array(item_list_length, dtype=np.int64)
    new_length = len(item_list_index)
    # One output row per (history, target) pair, seeded from the target row.
    new_data = self.inter_feat[target_index]
    new_dict = {
        self.item_list_length_field: torch.tensor(item_list_length),
    }
    for field in self.inter_feat:
        if field != self.uid_field:
            list_field = getattr(self, f'{field}_list_field')
            list_len = self.field2seqlen[list_field]
            shape = (new_length, list_len) if isinstance(
                list_len, int) else (new_length, ) + list_len
            list_ftype = self.field2type[list_field]
            # Token-like fields are stored as int64, everything else float64.
            dtype = torch.int64 if list_ftype in [
                FeatureType.TOKEN, FeatureType.TOKEN_SEQ
            ] else torch.float64
            # Zero-initialized buffer doubles as right-padding.
            new_dict[list_field] = torch.zeros(shape, dtype=dtype)
            value = self.inter_feat[field]
            for i, (index, length) in enumerate(
                    zip(item_list_index, item_list_length)):
                new_dict[list_field][i][:length] = value[index]
    new_data.update(Interaction(new_dict))
    # Replace the raw interactions with the augmented table.
    self.inter_feat = new_data
def _neg_sample_by_pair_wise_sampling(self, data, neg_iids):
    """Tile the batch ``self.times`` times and attach the sampled negative
    item ids under the negative-item field for pairwise loss.

    Args:
        data (Interaction): batch of positive interactions.
        neg_iids (torch.Tensor): sampled negative item ids.

    Returns:
        Interaction: tiled batch carrying the negative-item column.
    """
    tiled = data.repeat(self.times)
    tiled.update(Interaction({self.neg_item_id: neg_iids}))
    return tiled
def _neg_sampling(self, kg_feat):
    """Sample negative tail entities for each head and append them in place.

    Args:
        kg_feat (Interaction): batch of knowledge-graph triples.

    Returns:
        Interaction: the same batch, extended with a negative-tail column
        keyed by ``self.neg_tid_field``.
    """
    heads = kg_feat[self.hid_field]
    negatives = self.sampler.sample_by_entity_ids(heads, self.neg_sample_num)
    kg_feat.update(Interaction({self.neg_tid_field: negatives}))
    return kg_feat