Example #1
    def _neg_sample_by_pair_wise_sampling(self, inter_feat, neg_iids):
        inter_feat = inter_feat.repeat(self.times)
        neg_item_feat = Interaction({self.iid_field: neg_iids})
        neg_item_feat = self.dataset.join(neg_item_feat)
        neg_item_feat.add_prefix(self.neg_prefix)
        inter_feat.update(neg_item_feat)
        return inter_feat
Example #2
    def __init__(self, config, dataset, sampler, shuffle=False):
        if shuffle is False:
            shuffle = True
            self.logger.warning('UserDataLoader must shuffle the data.')

        self.uid_field = dataset.uid_field
        self.user_list = Interaction(
            {self.uid_field: torch.arange(dataset.user_num)})

        super().__init__(config, dataset, sampler, shuffle=shuffle)
Example #3
class UserDataLoader(AbstractDataLoader):
    """:class:`UserDataLoader` will return a batch of data which only contains user-id when it is iterated.

    Args:
        config (Config): The config of dataloader.
        dataset (Dataset): The dataset of dataloader.
        batch_size (int, optional): The batch_size of dataloader. Defaults to ``1``.
        dl_format (InputType, optional): The input type of dataloader. Defaults to
            :obj:`~recbole.utils.enum_type.InputType.POINTWISE`.
        shuffle (bool, optional): Whether the dataloader will be shuffled after a round. Defaults to ``False``.

    Attributes:
        shuffle (bool): Whether the dataloader will be shuffled after a round.
            However, in :class:`UserDataLoader`, it's guaranteed to be ``True``.
    """
    dl_type = DataLoaderType.ORIGIN

    def __init__(self,
                 config,
                 dataset,
                 batch_size=1,
                 dl_format=InputType.POINTWISE,
                 shuffle=False):
        self.uid_field = dataset.uid_field
        self.user_list = Interaction(
            {self.uid_field: torch.arange(dataset.user_num)})

        super().__init__(config=config,
                         dataset=dataset,
                         batch_size=batch_size,
                         dl_format=dl_format,
                         shuffle=shuffle)

    def setup(self):
        """Make sure that the :attr:`shuffle` is True. If :attr:`shuffle` is False, it will be changed to True
        and give a warning to user.
        """
        if self.shuffle is False:
            self.shuffle = True
            self.logger.warning('UserDataLoader must shuffle the data')

    @property
    def pr_end(self):
        return len(self.user_list)

    def _shuffle(self):
        self.user_list.shuffle()

    def _next_batch_data(self):
        cur_data = self.user_list[self.pr:self.pr + self.step]
        self.pr += self.step
        return cur_data
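The batching in `_next_batch_data` is plain pointer-plus-step slicing over the (shuffled) user list. A minimal standalone sketch of that pattern using only `torch` (the names `user_list`, `step` and `pr` mirror the attributes above; no RecBole objects are assumed):

import torch

user_list = torch.arange(10)                             # stand-in for the Interaction of user ids
user_list = user_list[torch.randperm(len(user_list))]    # what _shuffle() does to the user list
step, pr = 4, 0                                          # batch size and read pointer
while pr < len(user_list):                               # pr_end condition
    batch = user_list[pr:pr + step]                      # _next_batch_data()
    pr += step
    print(batch)                                         # three batches of 4, 4 and 2 users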
Example #4
    def __init__(self,
                 config,
                 dataset,
                 batch_size=1,
                 dl_format=InputType.POINTWISE,
                 shuffle=False):
        self.uid_field = dataset.uid_field
        self.user_list = Interaction(
            {self.uid_field: torch.arange(dataset.user_num)})

        super().__init__(config=config,
                         dataset=dataset,
                         batch_size=batch_size,
                         dl_format=dl_format,
                         shuffle=shuffle)
Example #5
    def augmentation(self, item_list_index, target_index, item_list_length):
        """Data augmentation.

        Args:
            item_list_index (numpy.ndarray): the index of history items list in interaction.
            target_index (numpy.ndarray): the index of items to be predicted in interaction.
            item_list_length (numpy.ndarray): history list length.

        Returns:
            dict: the augmented data.
        """
        new_length = len(item_list_index)
        new_data = self.dataset.inter_feat[target_index]
        new_dict = {
            self.item_list_length_field: torch.tensor(item_list_length),
        }

        for field in self.dataset.inter_feat:
            if field != self.uid_field:
                list_field = getattr(self, f'{field}_list_field')
                list_len = self.dataset.field2seqlen[list_field]
                shape = (new_length, list_len) if isinstance(list_len, int) else (new_length,) + list_len
                list_ftype = self.dataset.field2type[list_field]
                dtype = torch.int64 if list_ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ] else torch.float64
                new_dict[list_field] = torch.zeros(shape, dtype=dtype)

                value = self.dataset.inter_feat[field]
                for i, (index, length) in enumerate(zip(item_list_index, item_list_length)):
                    new_dict[list_field][i][:length] = value[index]

        new_data.update(Interaction(new_dict))
        return new_data
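The inner loop above writes each variable-length history into a fixed-width, zero-initialized tensor, left-aligned. The padding step in isolation, with hypothetical field values and `max_len` standing in for the configured list length:

import torch

histories = [torch.tensor([3, 7]), torch.tensor([3, 7, 9, 2])]   # two users' item histories
max_len = 5                                                       # e.g. MAX_ITEM_LIST_LENGTH
item_id_list = torch.zeros((len(histories), max_len), dtype=torch.int64)
for i, hist in enumerate(histories):
    item_id_list[i, :len(hist)] = hist   # same write as new_dict[list_field][i][:length] = value[index]
# item_id_list:
# tensor([[3, 7, 0, 0, 0],
#         [3, 7, 9, 2, 0]])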
Example #6
    def __init__(self, config, dataset, sampler, shuffle=False):
        self.uid_field = dataset.uid_field
        self.iid_field = dataset.iid_field
        self.is_sequential = config['MODEL_TYPE'] == ModelType.SEQUENTIAL
        if not self.is_sequential:
            user_num = dataset.user_num
            self.uid_list = []
            self.uid2items_num = np.zeros(user_num, dtype=np.int64)
            self.uid2positive_item = np.array([None] * user_num)
            self.uid2history_item = np.array([None] * user_num)

            dataset.sort(by=self.uid_field, ascending=True)
            last_uid = None
            positive_item = set()
            uid2used_item = sampler.used_ids
            for uid, iid in zip(dataset.inter_feat[self.uid_field].numpy(),
                                dataset.inter_feat[self.iid_field].numpy()):
                if uid != last_uid:
                    self._set_user_property(last_uid, uid2used_item[last_uid],
                                            positive_item)
                    last_uid = uid
                    self.uid_list.append(uid)
                    positive_item = set()
                positive_item.add(iid)
            self._set_user_property(last_uid, uid2used_item[last_uid],
                                    positive_item)
            self.uid_list = torch.tensor(self.uid_list, dtype=torch.int64)
            self.user_df = dataset.join(
                Interaction({self.uid_field: self.uid_list}))

        super().__init__(config, dataset, sampler, shuffle=shuffle)
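Because the interactions are sorted by user id first, the loop above is a simple group-by: it flushes the accumulated `positive_item` set every time the user id changes, and once more after the loop for the last user. The accumulation pattern in isolation, with toy ids:

import numpy as np

uids = np.array([1, 1, 2, 2, 2, 5])
iids = np.array([10, 11, 10, 12, 13, 11])
uid2positive = {}
last_uid, positive_item = None, set()
for uid, iid in zip(uids, iids):
    if uid != last_uid:
        if last_uid is not None:
            uid2positive[last_uid] = positive_item   # flush the finished user
        last_uid, positive_item = uid, set()
    positive_item.add(iid)
uid2positive[last_uid] = positive_item               # flush the last user
# {1: {10, 11}, 2: {10, 12, 13}, 5: {11}}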
Example #7
    def _neg_sampling(self, inter_feat):
        if ('dynamic' in self.neg_sample_args.keys()
                and self.neg_sample_args['dynamic'] != 'none'):
            candidate_num = self.neg_sample_args['dynamic']
            user_ids = inter_feat[self.uid_field]
            item_ids = inter_feat[self.iid_field]
            neg_candidate_ids = self.sampler.sample_by_user_ids(
                user_ids, item_ids, self.neg_sample_num * candidate_num)
            self.model.eval()
            interaction = copy.deepcopy(inter_feat).to(self.model.device)
            interaction = interaction.repeat(self.neg_sample_num * candidate_num)
            neg_item_feat = Interaction(
                {self.iid_field: neg_candidate_ids.to(self.model.device)})
            interaction.update(neg_item_feat)
            scores = self.model.predict(interaction).reshape(candidate_num, -1)
            indices = torch.max(scores, dim=0)[1].detach()
            neg_candidate_ids = neg_candidate_ids.reshape(candidate_num, -1)
            neg_item_ids = neg_candidate_ids[
                indices, [i for i in range(neg_candidate_ids.shape[1])]].view(-1)
            self.model.train()
            return self.sampling_func(inter_feat, neg_item_ids)
        elif self.neg_sample_args['strategy'] == 'by':
            user_ids = inter_feat[self.uid_field]
            item_ids = inter_feat[self.iid_field]
            neg_item_ids = self.sampler.sample_by_user_ids(
                user_ids, item_ids, self.neg_sample_num)
            return self.sampling_func(inter_feat, neg_item_ids)
        else:
            return inter_feat
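The dynamic branch samples `candidate_num` candidates for every required negative, scores them with the current model, and keeps only the highest-scoring candidate per slot (a hard-negative choice). The index gymnastics on their own, with random tensors standing in for the model scores:

import torch

candidate_num, n_inter = 3, 4
scores = torch.rand(candidate_num, n_inter)                      # one row of scores per candidate
neg_candidate_ids = torch.randint(1, 100, (candidate_num, n_inter))
best = torch.max(scores, dim=0)[1]                               # index of the best candidate per interaction
neg_item_ids = neg_candidate_ids[best, torch.arange(n_inter)]    # pick that candidate's item id per column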
Example #8
    def _next_batch_data(self):
        cur_data = self.dataset.kg_feat[self.pr:self.pr + self.step]
        head_ids = cur_data[self.hid_field]
        neg_tail_ids = self.sampler.sample_by_entity_ids(
            head_ids, self.neg_sample_num)
        cur_data.update(Interaction({self.neg_tid_field: neg_tail_ids}))
        self.pr += self.step
        return cur_data
Example #9
    def _neg_sample_by_point_wise_sampling(self, data, neg_iids):
        pos_inter_num = len(data)
        new_data = data.repeat(self.times)
        new_data[self.iid_field][pos_inter_num:] = neg_iids
        labels = torch.zeros(pos_inter_num * self.times)
        labels[:pos_inter_num] = 1.0
        new_data.update(Interaction({self.label_field: labels}))
        return new_data
Example #10
    def _neg_sample_by_point_wise_sampling(self, inter_feat, neg_item_ids):
        pos_inter_num = len(inter_feat)
        new_data = inter_feat.repeat(self.times)
        new_data[self.iid_field][pos_inter_num:] = neg_item_ids
        new_data = self.dataset.join(new_data)
        labels = torch.zeros(pos_inter_num * self.times)
        labels[:pos_inter_num] = 1.0
        new_data.update(Interaction({self.label_field: labels}))
        return new_data
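In both point-wise variants above, `repeat(self.times)` stacks the positive rows first and then the same rows again for the sampled negatives, so the label vector is simply ones followed by zeros. A toy illustration, assuming `times` is 1 plus the number of negatives per positive:

import torch

pos_inter_num, times = 3, 2              # three positives, one negative per positive
item_ids = torch.tensor([11, 12, 13])    # positive item ids
neg_iids = torch.tensor([54, 55, 56])    # sampled negative item ids

new_items = item_ids.repeat(times)
new_items[pos_inter_num:] = neg_iids     # rows 3..5 now hold the negatives
labels = torch.zeros(pos_inter_num * times)
labels[:pos_inter_num] = 1.0
# new_items -> tensor([11, 12, 13, 54, 55, 56]); labels -> tensor([1., 1., 1., 0., 0., 0.])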
Example #11
def full_sort_scores(uid_series, model, test_data, device=None):
    """Calculate the scores of all items for each user in uid_series.

    Note:
        The scores of [pad] and history items will be set to -inf.

    Args:
        uid_series (numpy.ndarray or list): User id series.
        model (AbstractRecommender): Model to predict.
        test_data (FullSortEvalDataLoader): The test_data of model.
        device (torch.device, optional): The device which model will run on. Defaults to ``None``.
            Note: ``device=None`` is equivalent to ``device=torch.device('cpu')``.

    Returns:
        torch.Tensor: the scores of all items for each user in uid_series.
    """
    device = device or torch.device('cpu')
    uid_series = torch.tensor(uid_series)
    uid_field = test_data.dataset.uid_field
    dataset = test_data.dataset
    model.eval()

    if not test_data.is_sequential:
        input_interaction = dataset.join(Interaction({uid_field: uid_series}))
        history_item = test_data.uid2history_item[list(uid_series)]
        history_row = torch.cat([
            torch.full_like(hist_iid, i)
            for i, hist_iid in enumerate(history_item)
        ])
        history_col = torch.cat(list(history_item))
        history_index = history_row, history_col
    else:
        _, index = (
            dataset.inter_feat[uid_field] == uid_series[:, None]).nonzero(
                as_tuple=True)
        input_interaction = dataset[index]
        history_index = None

    # Get scores of all items
    input_interaction = input_interaction.to(device)
    try:
        scores = model.full_sort_predict(input_interaction)
    except NotImplementedError:
        input_interaction = input_interaction.repeat(dataset.item_num)
        input_interaction.update(
            test_data.dataset.get_item_feature().to(device).repeat(
                len(uid_series)))
        scores = model.predict(input_interaction)

    scores = scores.view(-1, dataset.item_num)
    scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    if history_index is not None:
        scores[history_index] = -np.inf  # set scores of history items to -inf

    return scores
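A hedged usage sketch of `full_sort_scores`: it assumes a trained `model` and a `FullSortEvalDataLoader` named `test_data` already exist, and that `test_data.uid_list` holds internal user ids (as it does in the loaders above); the top-k step is plain torch on the returned score matrix:

import torch

uid_series = test_data.uid_list[:3].numpy()               # score the first three evaluated users
scores = full_sort_scores(uid_series, model, test_data, device=model.device)
topk_scores, topk_iids = torch.topk(scores, k=10, dim=1)  # ten best internal item ids per user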
Example #12
class UserDataLoader(AbstractDataLoader):
    """:class:`UserDataLoader` will return a batch of data which only contains user-id when it is iterated.

    Args:
        config (Config): The config of dataloader.
        dataset (Dataset): The dataset of dataloader.
        sampler (Sampler): The sampler of dataloader.
        shuffle (bool, optional): Whether the dataloader will be shuffled after a round. Defaults to ``False``.

    Attributes:
        shuffle (bool): Whether the dataloader will be shuffled after a round.
            However, in :class:`UserDataLoader`, it's guaranteed to be ``True``.
    """
    def __init__(self, config, dataset, sampler, shuffle=False):
        if shuffle is False:
            shuffle = True
            self.logger.warning('UserDataLoader must shuffle the data.')

        self.uid_field = dataset.uid_field
        self.user_list = Interaction(
            {self.uid_field: torch.arange(dataset.user_num)})

        super().__init__(config, dataset, sampler, shuffle=shuffle)

    def _init_batch_size_and_step(self):
        batch_size = self.config['train_batch_size']
        self.step = batch_size
        self.set_batch_size(batch_size)

    @property
    def pr_end(self):
        return len(self.user_list)

    def _shuffle(self):
        self.user_list.shuffle()

    def _next_batch_data(self):
        cur_data = self.user_list[self.pr:self.pr + self.step]
        self.pr += self.step
        return cur_data
Example #13
    def _neg_sample_by_point_wise_sampling(self, inter_feat, pos_idx, neg_iids,
                                            neg_idx, pos_iids):
        pos_inter_num = len(pos_idx[0])
        neg_inter_num = len(neg_idx[0])
        new_data_pos = inter_feat[pos_idx].repeat(self.neg_sample_by + 1)
        new_data_pos[self.iid_field][pos_inter_num:] = neg_iids
        new_data_pos = self.dataset.join(new_data_pos)
        labels_pos = torch.zeros(pos_inter_num * (self.neg_sample_by + 1))
        labels_pos[:pos_inter_num] = 1
        new_data_pos.update(Interaction({self.label_field: labels_pos}))
        if neg_inter_num > 0:
            new_data_neg = inter_feat[neg_idx].repeat(
                round(self.neg_sample_by / 3) + 1)
            new_data_neg[self.iid_field][neg_inter_num:] = pos_iids
            new_data_neg = self.dataset.join(new_data_neg)
            labels_neg = torch.ones(neg_inter_num *
                                    (round(self.neg_sample_by / 3) + 1))
            labels_neg[:neg_inter_num] = 0
            new_data_neg.update(Interaction({self.label_field: labels_neg}))
            new_data = cat_interactions([new_data_pos, new_data_neg])
        else:
            new_data = cat_interactions([new_data_pos])
        return new_data
Example #14
    def _spilt_predict(self, interaction, batch_size):
        spilt_interaction = dict()
        for key, tensor in interaction.interaction.items():
            spilt_interaction[key] = tensor.split(self.test_batch_size, dim=0)
        num_block = (batch_size + self.test_batch_size - 1) // self.test_batch_size
        result_list = []
        for i in range(num_block):
            current_interaction = dict()
            for key, spilt_tensor in spilt_interaction.items():
                current_interaction[key] = spilt_tensor[i]
            result = self.model.predict(
                Interaction(current_interaction).to(self.device))
            result_list.append(result)
        return torch.cat(result_list, dim=0)
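`_spilt_predict` chunks every field tensor into `test_batch_size`-sized blocks with `Tensor.split`, predicts block by block, and concatenates the results, so a very large evaluation interaction never hits the model in one pass. The chunking pattern on its own, with a plain dict standing in for an Interaction:

import torch

interaction = {'user_id': torch.arange(10), 'item_id': torch.arange(10, 20)}
test_batch_size, batch_size = 4, 10
num_block = (batch_size + test_batch_size - 1) // test_batch_size   # ceil division -> 3 blocks

split = {key: t.split(test_batch_size, dim=0) for key, t in interaction.items()}
for i in range(num_block):
    block = {key: chunks[i] for key, chunks in split.items()}        # one mini-batch per block
    # model.predict(Interaction(block).to(device)) would run here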
Example #15
    def __init__(self,
                 config,
                 dataset,
                 sampler,
                 neg_sample_args,
                 batch_size=1,
                 dl_format=InputType.POINTWISE,
                 shuffle=False):
        if neg_sample_args['strategy'] != 'full':
            raise ValueError(
                'neg_sample strategy in GeneralFullDataLoader() should be `full`'
            )

        uid_field = dataset.uid_field
        iid_field = dataset.iid_field
        user_num = dataset.user_num
        self.uid_list = []
        self.uid2items_num = np.zeros(user_num, dtype=np.int64)
        self.uid2swap_idx = np.array([None] * user_num)
        self.uid2rev_swap_idx = np.array([None] * user_num)
        self.uid2history_item = np.array([None] * user_num)

        dataset.sort(by=uid_field, ascending=True)
        last_uid = None
        positive_item = set()
        uid2used_item = sampler.used_ids
        for uid, iid in zip(dataset.inter_feat[uid_field].numpy(),
                            dataset.inter_feat[iid_field].numpy()):
            if uid != last_uid:
                self._set_user_property(last_uid, uid2used_item[last_uid],
                                        positive_item)
                last_uid = uid
                self.uid_list.append(uid)
                positive_item = set()
            positive_item.add(iid)
        self._set_user_property(last_uid, uid2used_item[last_uid],
                                positive_item)
        self.uid_list = torch.tensor(self.uid_list)
        self.user_df = dataset.join(Interaction({uid_field: self.uid_list}))

        super().__init__(config,
                         dataset,
                         sampler,
                         neg_sample_args,
                         batch_size=batch_size,
                         dl_format=dl_format,
                         shuffle=shuffle)
Example #16
    def inter_matrix(self, form='coo', value_field=None):
        """Get sparse matrix that describes the interactions between user_id and item_id.
        Sparse matrix has shape (user_num, item_num).
        For a row of <src, tgt>, ``matrix[src, tgt] = 1`` if ``value_field`` is ``None``,
        else ``matrix[src, tgt] = self.inter_feat[src, tgt]``.

        Args:
            form (str, optional): Sparse matrix format. Defaults to ``coo``.
            value_field (str, optional): Data of sparse matrix, which should exist in ``df_feat``.
                Defaults to ``None``.

        Returns:
            scipy.sparse: Sparse matrix in form ``coo`` or ``csr``.
        """
        if not self.uid_field or not self.iid_field:
            raise ValueError(
                'dataset does not have uid/iid, thus it cannot be converted to a sparse matrix.'
            )

        l1_idx = (self.inter_feat[self.item_list_length_field] == 1)
        l1_inter_dict = self.inter_feat[l1_idx].interaction
        new_dict = {}
        list_suffix = self.config['LIST_SUFFIX']
        candidate_field_set = set()
        for field in l1_inter_dict:
            if field != self.uid_field and field + list_suffix in l1_inter_dict:
                candidate_field_set.add(field)
                new_dict[field] = torch.cat([
                    self.inter_feat[field],
                    l1_inter_dict[field + list_suffix][:, 0]
                ])
            elif (not field.endswith(list_suffix)) and (
                    field != self.item_list_length_field):
                new_dict[field] = torch.cat(
                    [self.inter_feat[field], l1_inter_dict[field]])
        local_inter_feat = Interaction(new_dict)
        return self._create_sparse_matrix(local_inter_feat, self.uid_field,
                                          self.iid_field, form, value_field)
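The final `_create_sparse_matrix` call turns the concatenated user/item columns into a (user_num, item_num) sparse matrix. A rough standalone illustration of the `coo` form with `scipy.sparse` (this sketches the idea only, not RecBole's actual helper):

import numpy as np
from scipy.sparse import coo_matrix

user_ids = np.array([0, 0, 1, 2])          # row indices
item_ids = np.array([1, 3, 2, 3])          # column indices
data = np.ones(len(user_ids))              # value_field is None -> every interaction counts as 1
mat = coo_matrix((data, (user_ids, item_ids)), shape=(3, 4))
# mat.tocsr() gives the `csr` form instead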
Example #17
class CollectTestCases(object):
    interaction0 = Interaction({}, [0, 2, 3, 4], [2, 3, 4, 5])
    scores_tensor0 = torch.Tensor(
        [0.1, 0.2, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.2, 0.1, 0.4, 0.3])
Example #18
    def data_augmentation(self):
        """Augmentation processing for sequential dataset.

        E.g., ``u1`` has purchase sequence ``<i1, i2, i3, i4>``,
        then after augmentation, we will generate three cases.

        ``u1, <i1> | i2``

        (Which means given user_id ``u1`` and item_seq ``<i1>``,
        we need to predict the next item ``i2``.)

        The other cases are below:

        ``u1, <i1, i2> | i3``

        ``u1, <i1, i2, i3> | i4``
        """
        self.logger.debug('data_augmentation')

        self._aug_presets()

        self._check_field('uid_field', 'time_field')
        max_item_list_len = self.config['MAX_ITEM_LIST_LENGTH']
        self.sort(by=[self.uid_field, self.time_field], ascending=True)
        last_uid = None
        uid_list, item_list_index, target_index, item_list_length = [], [], [], []
        seq_start = 0
        for i, uid in enumerate(self.inter_feat[self.uid_field].numpy()):
            if last_uid != uid:
                last_uid = uid
                seq_start = i
            else:
                if i - seq_start > max_item_list_len:
                    seq_start += 1
                uid_list.append(uid)
                item_list_index.append(slice(seq_start, i))
                target_index.append(i)
                item_list_length.append(i - seq_start)

        uid_list = np.array(uid_list)
        item_list_index = np.array(item_list_index)
        target_index = np.array(target_index)
        item_list_length = np.array(item_list_length, dtype=np.int64)

        new_length = len(item_list_index)
        new_data = self.inter_feat[target_index]
        new_dict = {
            self.item_list_length_field: torch.tensor(item_list_length),
        }

        for field in self.inter_feat:
            if field != self.uid_field:
                list_field = getattr(self, f'{field}_list_field')
                list_len = self.field2seqlen[list_field]
                shape = (new_length, list_len) if isinstance(
                    list_len, int) else (new_length, ) + list_len
                list_ftype = self.field2type[list_field]
                dtype = torch.int64 if list_ftype in [
                    FeatureType.TOKEN, FeatureType.TOKEN_SEQ
                ] else torch.float64
                new_dict[list_field] = torch.zeros(shape, dtype=dtype)

                value = self.inter_feat[field]
                for i, (index, length) in enumerate(
                        zip(item_list_index, item_list_length)):
                    new_dict[list_field][i][:length] = value[index]

        new_data.update(Interaction(new_dict))
        self.inter_feat = new_data
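The loop over the user-sorted interactions turns each sequence into (growing prefix, next item) training cases, exactly the ``u1, <i1, i2> | i3``-style cases described in the docstring. The index bookkeeping in isolation (the truncation branch for over-long histories is left out):

import numpy as np

uids = np.array([1, 1, 1, 1, 2, 2])          # interactions sorted by user id, then time
last_uid, seq_start = None, 0
item_list_index, target_index = [], []
for i, uid in enumerate(uids):
    if uid != last_uid:
        last_uid, seq_start = uid, i         # a new user starts a new sequence
    else:
        item_list_index.append(slice(seq_start, i))   # history slice <seq_start .. i-1>
        target_index.append(i)                        # the item at position i is the target
# item_list_index -> [slice(0, 1), slice(0, 2), slice(0, 3), slice(4, 5)]
# target_index    -> [1, 2, 3, 5]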
Example #19
    def _neg_sample_by_pair_wise_sampling(self, data, neg_iids):
        new_data = data.repeat(self.times)
        new_data.update(Interaction({self.neg_item_id: neg_iids}))
        return new_data
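Unlike the point-wise variants, the pair-wise samplers (Examples #1 and #19) keep one row per positive and attach the sampled negative as an extra, prefixed column instead of doubling the rows with 0/1 labels. A dict-level sketch of that update, simplified to one negative per positive, with `neg_` standing in for `self.neg_prefix`:

import torch

data = {'user_id': torch.tensor([1, 2, 3]), 'item_id': torch.tensor([11, 12, 13])}
neg_iids = torch.tensor([54, 55, 56])      # one sampled negative per positive row
data['neg_item_id'] = neg_iids             # roughly what Interaction.update adds after add_prefix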
Example #20
    def _neg_sampling(self, kg_feat):
        hids = kg_feat[self.hid_field]
        neg_tids = self.sampler.sample_by_entity_ids(hids, self.neg_sample_num)
        kg_feat.update(Interaction({self.neg_tid_field: neg_tids}))
        return kg_feat