import numpy as np
import pandas as pd

from sklearn.utils import check_random_state


def _iter_indices_mask(self, x, y, indices):
    # Leave-one-out: yield one boolean test mask per sample.
    if self._shuffle:
        check_random_state(self._random_state).shuffle(indices)
    mask = np.zeros(len(indices), dtype=bool)
    for i in indices:
        new_mask = mask.copy()
        new_mask[i] = True
        yield new_mask

def _iter_indices_mask(self, x, y, indices):
    # K-fold: yield one boolean test mask per fold.
    if self._shuffle:
        check_random_state(self._random_state).shuffle(indices)
    n_splits = self._folds
    # Distribute samples as evenly as possible across the folds.
    fold_sizes = (len(x) // n_splits) * np.ones(n_splits, dtype=int)
    fold_sizes[:len(x) % n_splits] += 1
    current = 0
    mask = np.zeros(len(indices), dtype=bool)
    for fold_size in fold_sizes:
        start, stop = current, current + fold_size
        copy_mask = np.copy(mask)
        copy_mask[indices[start:stop]] = True
        current = stop
        yield copy_mask
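
# A minimal consumption sketch (an assumption, not part of the original code):
# the iterators above yield one boolean test mask per split, indexed by sample
# position, so a caller would typically recover positional train/test indices
# along these lines. The helper name below is illustrative only.
def _mask_to_train_test(mask):
    mask = np.asarray(mask, dtype=bool)
    # True marks test samples; the complement is the training set.
    return np.flatnonzero(~mask), np.flatnonzero(mask)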

def __init__(self, ratio=0.8, times=10, user_idx=0, item_idx=1,
             random_state=None):
    # ratio: fraction of the interactions kept for training in each split.
    # times: number of repeated hold-out splits to generate.
    # user_idx / item_idx: column positions of the user and item ids in x.
    super(UserHoldOut, self).__init__()
    self._times = times
    self._ratio = ratio
    self._user_idx = user_idx
    self._item_idx = item_idx
    self._random_state = check_random_state(random_state)

def split(self, x, y):
    data = pd.DataFrame(x)
    n_samples = data.shape[0]
    indices = np.arange(len(x))
    for i in range(self._times):
        check_random_state(self._random_state).shuffle(indices)
        train_size = int(n_samples * self._ratio)
        # Split data according to the shuffled index and the holdout size.
        train_idx = indices[:train_size]
        train_split = data.iloc[indices[:train_size]]
        test_split = data.iloc[indices[train_size:]]
        # Remove new users and new items from the test split, so every test
        # interaction involves a user and an item seen during training.
        train_users = train_split[self._user_idx].unique()
        train_items = train_split[self._item_idx].unique()
        test_idx = test_split.index[
            (test_split[self._user_idx].isin(train_users)) &
            (test_split[self._item_idx].isin(train_items))]
        yield train_idx, test_idx

def _iter_indices_mask(self, x, y, indices):
    data = pd.DataFrame(x)[[self._user_idx, self._item_idx]]
    mask = np.zeros(data.shape[0], dtype=bool)
    for i in range(self._times):
        copy_mask = np.copy(mask)
        grouped = data.groupby(self._user_idx)
        for user, g in grouped:
            # Hold out (1 - ratio) of each user's interactions for testing.
            idx_shuffled = g.index.values.copy()
            n_held_out = int((1 - self._ratio) * len(idx_shuffled))
            check_random_state(self._random_state).shuffle(idx_shuffled)
            copy_mask[idx_shuffled[:n_held_out]] = True
        # Cleaning: remove new items from the test split, so every test
        # interaction involves an item that also appears in training.
        train_split = data.iloc[indices[np.logical_not(copy_mask)]]
        test_split = data.iloc[indices[copy_mask]]
        train_items = train_split[self._item_idx].unique()
        test_idx = test_split.index[
            ~test_split[self._item_idx].isin(train_items)]
        copy_mask[test_idx] = False
        yield copy_mask

def split(self, x, y):
    data = pd.DataFrame(x)
    n_samples = data.shape[0]
    indices = np.arange(len(x))
    for i in range(self._times):
        check_random_state(self._random_state).shuffle(indices)
        train_size = int(n_samples * self._ratio)
        # Split data according to the shuffled index and the holdout size.
        train_idx = indices[:train_size]
        test_idx = indices[train_size:]
        # This block of code checks whether the user and the item in the
        # test set also belong to the train set; otherwise they are dropped:
        # train_split = data.iloc[indices[:train_size]]
        # test_split = data.iloc[indices[train_size:]]
        # train_users = train_split[self._user_idx].unique()
        # train_items = train_split[self._item_idx].unique()
        # test_idx = test_split.index[
        #     (test_split[self._user_idx].isin(train_users)) &
        #     (test_split[self._item_idx].isin(train_items))]
        yield train_idx, test_idx
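
# Usage sketch (an assumption about how these splitters are driven, not part
# of the original code; the toy triples below are made up): each row of x is a
# (user, item, rating) triple with the user id in column 0 and the item id in
# column 1, matching the UserHoldOut defaults, and split() yields positional
# train/test indices `times` times.
if __name__ == "__main__":
    x = np.array([[0, 10, 4.0], [0, 11, 3.0], [0, 12, 2.5],
                  [1, 10, 5.0], [1, 12, 2.0], [1, 11, 3.5],
                  [2, 11, 1.0], [2, 12, 4.5], [2, 10, 3.0]])
    y = x[:, 2]
    cv = UserHoldOut(ratio=0.8, times=3, random_state=42)
    for train_idx, test_idx in cv.split(x, y):
        x_train, x_test = x[train_idx], x[test_idx]
        print(len(train_idx), "train samples /", len(test_idx), "test samples")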