Пример #1
0
    def _iter_indices_mask(self, x, y, indices):
        """Yield one boolean mask per sample, each selecting a single sample.

        Parameters
        ----------
        x, y
            Not read by this method.
        indices : numpy.ndarray of int
            Sample positions. When ``self._shuffle`` is truthy the array is
            shuffled IN PLACE (the caller's array is modified), which changes
            the order in which the masks are produced.

        Yields
        ------
        numpy.ndarray of bool
            A fresh array of length ``len(indices)`` whose only ``True``
            entry is at the position taken from ``indices``.
        """
        if self._shuffle:
            check_random_state(self._random_state).shuffle(indices)

        n_samples = len(indices)
        for i in indices:
            # The ``np.bool`` alias was removed in NumPy 1.24; the builtin
            # ``bool`` names the same dtype on every NumPy version.
            mask = np.zeros(n_samples, dtype=bool)
            mask[i] = True
            yield mask
Пример #2
0
    def _iter_indices_mask(self, x, y, indices):
        """Yield one boolean mask per fold, each selecting that fold's samples.

        The samples are cut into ``self._folds`` consecutive chunks of
        ``indices``; the first ``len(x) % self._folds`` chunks get one extra
        sample so that every sample lands in exactly one fold.

        Parameters
        ----------
        x : sized
            Only ``len(x)`` is used, to compute the fold sizes.
        y
            Not read by this method.
        indices : numpy.ndarray of int
            Sample positions. When ``self._shuffle`` is truthy the array is
            shuffled IN PLACE (the caller's array is modified) before the
            folds are cut.

        Yields
        ------
        numpy.ndarray of bool
            A fresh array of length ``len(indices)`` that is ``True`` at the
            positions belonging to the current fold.
        """
        if self._shuffle:
            check_random_state(self._random_state).shuffle(indices)

        n_splits = self._folds
        n_samples = len(x)
        # The ``np.int`` / ``np.bool`` aliases were removed in NumPy 1.24;
        # the builtins name the same dtypes on every NumPy version.
        fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1

        start = 0
        for fold_size in fold_sizes:
            stop = start + fold_size
            mask = np.zeros(len(indices), dtype=bool)
            mask[indices[start:stop]] = True
            yield mask
            start = stop
Пример #3
0
 def __init__(self,
              ratio=0.8,
              times=10,
              user_idx=0,
              item_idx=1,
              random_state=None):
     """Store the hold-out configuration on the instance.

     Parameters
     ----------
     ratio : float, default 0.8
         Stored as ``self._ratio``.
         NOTE(review): presumably the train fraction -- confirm against
         the split methods of this class.
     times : int, default 10
         Stored as ``self._times``.
         NOTE(review): presumably the number of repetitions -- confirm.
     user_idx, item_idx : default 0 and 1
         Stored as ``self._user_idx`` / ``self._item_idx``.
         NOTE(review): presumably column labels of the user and item
         fields -- confirm.
     random_state : default None
         Passed through ``check_random_state`` and the result is stored
         as ``self._random_state``.
     """
     super(UserHoldOut, self).__init__()
     self._times, self._ratio = times, ratio
     self._user_idx, self._item_idx = user_idx, item_idx
     self._random_state = check_random_state(random_state)
Пример #4
0
    def split(self, x, y):
        """Yield ``self._times`` shuffled hold-out splits with a filtered test set.

        On every repetition the row labels are reshuffled, the first
        ``int(n_rows * self._ratio)`` of them form the train part, and the
        remaining rows are kept as test rows only if both their user
        (column ``self._user_idx``) and their item (column
        ``self._item_idx``) also occur in the train part.

        Parameters
        ----------
        x
            Data accepted by ``pandas.DataFrame`` and by ``len``.
            NOTE(review): rows are looked up by label with the shuffled
            ``0..len(x)-1`` integers, so this assumes the frame has the
            default integer index -- confirm against callers.
        y
            Not read by this method.

        Yields
        ------
        (numpy.ndarray, pandas.Index)
            ``train_idx`` (an independent copy) and ``test_idx``.
        """
        data = pd.DataFrame(x)
        n_samples = data.shape[0]
        indices = np.arange(len(x))
        # Loop-invariant: the hold-out size never changes between repetitions.
        train_size = int(n_samples * self._ratio)

        for _ in range(self._times):
            check_random_state(self._random_state).shuffle(indices)

            # Copy the slice: ``indices`` is reshuffled in place on the next
            # repetition, so a plain view handed to the caller would silently
            # change after it has been yielded.
            train_idx = indices[:train_size].copy()

            # ``DataFrame.ix`` was removed in pandas 1.0; on an integer index
            # it performed label-based lookup, which is what ``.loc`` does.
            train_split = data.loc[train_idx]
            test_split = data.loc[indices[train_size:]]

            # Keep only test rows whose user AND item were seen in training.
            train_users = train_split[self._user_idx].unique()
            train_items = train_split[self._item_idx].unique()
            known = (test_split[self._user_idx].isin(train_users)
                     & test_split[self._item_idx].isin(train_items))
            test_idx = test_split.index[known]
            yield train_idx, test_idx
Пример #5
0
    def _iter_indices_mask(self, x, y, indices):
        """Yield ``self._times`` boolean masks built by a per-user hold-out.

        For every repetition and every user (rows grouped by column
        ``self._user_idx``), the user's rows are shuffled and the first
        ``int((1 - self._ratio) * n_rows_of_user)`` of them are marked
        ``True``. Marked rows whose item (column ``self._item_idx``) does
        not occur among the unmarked rows are then unmarked again.

        Parameters
        ----------
        x
            Data accepted by ``pandas.DataFrame``.
            NOTE(review): row labels are used directly as positions in the
            mask, so this assumes the default ``0..n-1`` integer index --
            confirm against callers.
        y
            Not read by this method.
        indices : numpy.ndarray of int
            Indexed with the boolean mask to obtain the row labels of each
            side; it must therefore have one entry per row of ``x``.

        Yields
        ------
        numpy.ndarray of bool
            A fresh mask with one entry per row of ``x``.
        """
        data = pd.DataFrame(x)[[self._user_idx, self._item_idx]]
        n_samples = data.shape[0]

        # Group on the configured user column (it used to be hard-coded to
        # column ``0``, which is wrong or missing when ``user_idx != 0``).
        # The grouping does not depend on the repetition, so do it once.
        rows_per_user = [
            np.array(group.index.values).reshape(-1)
            for _, group in data.groupby(self._user_idx)
        ]

        for _ in range(self._times):
            # ``np.bool`` was removed in NumPy 1.24; use the builtin ``bool``.
            mask = np.zeros(n_samples, dtype=bool)
            for user_rows in rows_per_user:
                # Shuffle a private copy so every repetition starts from the
                # same row order and no pandas-owned buffer is mutated.
                shuffled = user_rows.copy()
                n_observed = int((1 - self._ratio) * len(shuffled))
                check_random_state(self._random_state).shuffle(shuffled)
                mask[shuffled[:n_observed]] = True

            # ``DataFrame.ix`` was removed in pandas 1.0; on an integer index
            # it performed label-based lookup, which is what ``.loc`` does.
            train_split = data.loc[indices[np.logical_not(mask)]]
            test_split = data.loc[indices[mask]]

            # Unmark rows whose item never appears among the unmarked rows.
            train_items = train_split[self._item_idx].unique()
            unseen = ~test_split[self._item_idx].isin(train_items)
            mask[test_split.index[unseen]] = False
            yield mask
Пример #6
0
    def split(self, x, y):
        """Yield ``self._times`` shuffled hold-out splits of ``0..len(x)-1``.

        On every repetition the positions are reshuffled; the first
        ``int(len(x) * self._ratio)`` of them form the train part and the
        rest the test part.

        Parameters
        ----------
        x : sized
            Only ``len(x)`` is used.
        y
            Not read by this method.

        Yields
        ------
        (numpy.ndarray, numpy.ndarray)
            ``train_idx`` and ``test_idx``, each an independent copy.
        """
        n_samples = len(x)
        indices = np.arange(n_samples)
        # Loop-invariant: the hold-out size never changes between repetitions.
        train_size = int(n_samples * self._ratio)

        for _ in range(self._times):
            check_random_state(self._random_state).shuffle(indices)

            # Copy both slices: ``indices`` is reshuffled in place on the
            # next repetition, so plain views handed to the caller would
            # silently change after they have been yielded.
            train_idx = indices[:train_size].copy()
            test_idx = indices[train_size:].copy()

            # NOTE: test rows whose user or item is absent from the train
            # part are deliberately NOT dropped here; every remaining
            # position is returned as a test index.
            yield train_idx, test_idx