def test_2dim_list(self):
    """Test that AliasTable rejects a 2-dimension list as input.

    The previous ``try/except ValueError`` version passed silently when
    no exception was raised at all; ``assertRaises`` makes the test fail
    if ``AliasTable`` accepts the nested-list input.
    """
    obj_freq = [[1, 2, 3], [4, 5, 6]]
    with self.assertRaises(ValueError):
        AliasTable(obj_freq)
def __init__(
    self,
    split_dataset,
    intersect=True,
    binarize=True,
    bin_thld=0.0,
    normalize=False,
):
    """Initialize from a pre-split (train, valid, test) dataset.

    Args:
        split_dataset (tuple): ``(train, valid, test)`` DataFrames; only
            ``train`` is used to build the user/item pools and samplers.
        intersect (bool): If True, call ``self._intersect()``
            (presumably restricts valid/test to train users/items — confirm).
        binarize (bool): If True, call ``self._binarize(bin_thld)``.
        bin_thld (float): Threshold passed to ``_binarize``.
        normalize (bool): If True, call ``self._normalize()``.
    """
    self.train, self.valid, self.test = split_dataset
    self.user_pool = list(self.train[DEFAULT_USER_COL].unique())
    self.item_pool = list(self.train[DEFAULT_ITEM_COL].unique())
    self.n_users = len(self.user_pool)
    self.n_items = len(self.item_pool)
    # list(range(...)) replaces the redundant [i for i in range(...)].
    self.user_id_pool = list(range(self.n_users))
    self.item_id_pool = list(range(self.n_items))
    if intersect:
        self._intersect()
    if binarize:
        self._binarize(bin_thld)
    if normalize:
        self._normalize()
    self._re_index()
    # Alias tables keyed on train-set frequencies, used for
    # frequency-weighted sampling of items and users.
    self.item_sampler = AliasTable(
        self.train[DEFAULT_ITEM_COL].value_counts().to_dict()
    )
    self.user_sampler = AliasTable(
        self.train[DEFAULT_USER_COL].value_counts().to_dict()
    )
def test_dict(self, mock_uniform, mock_randint):
    """Test AliasTable with dictionary as input."""
    obj_freq = {100: 6, 102: 4, 103: 1, 104: 1}
    self.aliasTable = AliasTable(obj_freq)
    # vocab_size should equal the number of distinct keys
    self.assertEqual(self.aliasTable.vocab_size, 4)
    # check prob_arr and alias_arr built from the frequencies
    expected_probs = [1.0, 0.67, 0.33, 0.33]
    expected_alias = [0, 0, 0, 1]
    for idx in range(4):
        self.assertEqual(round(self.aliasTable.prob_arr[idx], 2), expected_probs[idx])
        self.assertEqual(self.aliasTable.alias_arr[idx], expected_alias[idx])
    # pin the random draws so sampling is deterministic
    mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
    mock_randint.return_value = [2, 2, 0, 1]
    result = self.aliasTable.sample(4)
    # sampled values are the original dict keys, not internal indices
    for idx, expected in enumerate([103, 100, 100, 102]):
        self.assertEqual(result[idx], expected)
def test_list(self, mock_uniform, mock_randint):
    """Test AliasTable with list as input."""
    obj_freq = [6, 4, 1, 1]
    self.aliasTable = AliasTable(obj_freq)
    # vocab_size should equal the number of entries
    self.assertEqual(self.aliasTable.vocab_size, 4)
    # check prob_arr and alias_arr built from the frequencies
    expected_probs = [1.0, 0.67, 0.33, 0.33]
    expected_alias = [0, 0, 0, 1]
    for idx in range(4):
        self.assertEqual(round(self.aliasTable.prob_arr[idx], 2), expected_probs[idx])
        self.assertEqual(self.aliasTable.alias_arr[idx], expected_alias[idx])
    # pin the random draws so sampling is deterministic
    mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
    mock_randint.return_value = [2, 2, 0, 1]
    result = self.aliasTable.sample(4)
    # with a list input, sampled values are positional indices
    for idx, expected in enumerate([2, 0, 0, 1]):
        self.assertEqual(result[idx], expected)
def __init__(self, config):
    """Init GroceryData Class.

    Args:
        config (dict): Configuration mapping; reads keys under
            ``config["dataset"]`` and ``config["model"]`` (full schema
            not visible from this file).
    """
    self.config = config
    self.n_users = 0
    self.n_items = 0
    self.sub_set = 0
    self.random_dim = 512
    # subset of the dataset. use a small set of users and items if >0, otherwise use full dataset
    # NOTE(review): membership is tested on the TOP-LEVEL config but the
    # value is read from the NESTED section for both keys below — confirm
    # this asymmetry is intended and the nested keys always exist.
    if "sub_set" in config:
        self.sub_set = config["dataset"]["sub_set"]
    if "random_dim" in config:
        self.random_dim = config["model"]["random_dim"]
    # data preprocessing for training and test data
    # To be replaced with new data method
    train, valid, test = load_split_dataset(config)
    # train is intersected against only the FIRST test copy (test[0]);
    # valid/test appear to be lists of split copies — see _reindex_list.
    self.train = self._intersect_train_test(train, test[0])
    self.n_train = len(self.train.index)
    self.valid = self._reindex_list(valid)
    self.test = self._reindex_list(test)
    # Alias tables keyed on train-set frequencies for weighted sampling.
    self.item_sampler = AliasTable(
        self.train[DEFAULT_ITEM_COL].value_counts().to_dict())
    self.user_sampler = AliasTable(
        self.train[DEFAULT_USER_COL].value_counts().to_dict())
    # Feature initialization only when feature types are configured.
    if ("item_fea_type" in self.config["dataset"]
            or "user_fea_type" in self.config["dataset"]):
        self.init_item_fea()
        self.init_user_fea()
def split_data(
    data,
    split_type,
    test_rate,
    random=False,
    n_negative=100,
    save_dir=None,
    by_user=False,
    n_test=10,
):
    """Split data by split_type and other parameters.

    Args:
        data (DataFrame): interaction DataFrame to be split.
        split_type (string): options can be:

            - random
            - random_basket
            - leave_one_out
            - leave_one_basket
            - temporal
            - temporal_basket

        test_rate (float): percentage of the test data. Note that percentage of
            the validation data will be the same as testing.
        random (bool): Whether random leave one item/basket as testing.
            Only for leave_one_out and leave_one_basket.
        n_negative (int): Number of negative samples for testing and
            validation data. A negative value means "keep all negatives"
            (see below).
        save_dir (string or Path): Default None. If specified, the split
            data will be saved to the dir.
        by_user (bool): Default False.

            - True: user-based split,
            - False: global split,

        n_test (int): Default 10. The number of testing and validation copies.

    Returns:
        DataFrame: The split data (with DEFAULT_FLAG_COL marking
        train/validate/test rows). Note that the returned data will not
        have negative samples. Returns None on an unknown split_type.
    """
    print(f"Splitting data by {split_type} ...")
    if n_negative < 0 and n_test > 1:
        # n_negative < 0: validate and testing sets will contain all the
        # negative items, so only one validate and one testing copy is needed.
        n_test = 1
    # Dispatch to the concrete splitter; each returns the same DataFrame
    # annotated with a train/validate/test flag column.
    if split_type == "random":
        data = random_split(data, test_rate, by_user)
    elif split_type == "random_basket":
        data = random_basket_split(data, test_rate, by_user)
    elif split_type == "leave_one_out":
        data = leave_one_out(data, random)
    elif split_type == "leave_one_basket":
        data = leave_one_basket(data, random)
    elif split_type == "temporal":
        data = temporal_split(data, test_rate, by_user)
    elif split_type == "temporal_basket":
        data = temporal_basket_split(data, test_rate, by_user)
    else:
        print("[ERROR] wrong split_type.")
        return None
    tp_train = data[data[DEFAULT_FLAG_COL] == "train"]
    tp_validate = data[data[DEFAULT_FLAG_COL] == "validate"]
    tp_test = data[data[DEFAULT_FLAG_COL] == "test"]
    # Without a save_dir there is nothing more to do on disk.
    if save_dir is None:
        return data
    parameterized_path = generate_parameterized_path(
        test_rate=test_rate, random=random, n_negative=n_negative, by_user=by_user
    )
    save_split_data(tp_train, save_dir, split_type, parameterized_path, "train.npz")
    # keep the original validation and test sets.
    save_split_data(tp_validate, save_dir, split_type, parameterized_path, "valid.npz")
    save_split_data(tp_test, save_dir, split_type, parameterized_path, "test.npz")
    # Frequency-weighted sampler over ALL interactions (not just train),
    # used to draw negative items below.
    item_sampler = AliasTable(data[DEFAULT_ITEM_COL].value_counts().to_dict())
    n_items = tp_train[DEFAULT_ITEM_COL].nunique()
    # Largest per-user positive count in each eval set; a user needs
    # n_negative items outside their positives, hence the check below.
    valid_neg_max = (
        tp_validate.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].count().max()
    )
    test_neg_max = tp_test.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].count().max()
    if n_items - valid_neg_max < n_negative or n_items - test_neg_max < n_negative:
        raise RuntimeError(
            "This dataset do not have sufficient negative items for sampling! \n"
            + f"valid_neg_max: {n_items - valid_neg_max}, "
            + f"test_neg_max: {n_items - test_neg_max},"
            + f"n_negative: {n_negative}\nPlease directly use valid.npz and test.npz."
        )
    # Write n_test independently negative-sampled copies of each eval set.
    for i in range(n_test):
        tp_validate_new = feed_neg_sample(tp_validate, n_negative, item_sampler)
        tp_test_new = feed_neg_sample(tp_test, n_negative, item_sampler)
        save_split_data(
            tp_validate_new,
            save_dir,
            split_type,
            parameterized_path,
            "valid_" + str(i) + ".npz",
        )
        save_split_data(
            tp_test_new,
            save_dir,
            split_type,
            parameterized_path,
            "test_" + str(i) + ".npz",
        )
    return data
class TestAliasTable(unittest.TestCase):
    """Unit tests for AliasTable construction and sampling."""

    @mock.patch("numpy.random.randint")
    @mock.patch("numpy.random.uniform")
    def test_list(self, mock_uniform, mock_randint):
        """Test AliasTable with list as input."""
        obj_freq = [6, 4, 1, 1]
        self.aliasTable = AliasTable(obj_freq)
        # check vocab_size
        self.assertEqual(self.aliasTable.vocab_size, 4)
        # check prob_arr
        self.assertEqual(round(self.aliasTable.prob_arr[0], 2), 1.0)
        self.assertEqual(round(self.aliasTable.prob_arr[1], 2), 0.67)
        self.assertEqual(round(self.aliasTable.prob_arr[2], 2), 0.33)
        self.assertEqual(round(self.aliasTable.prob_arr[3], 2), 0.33)
        # check alias_arr
        self.assertEqual(self.aliasTable.alias_arr[0], 0)
        self.assertEqual(self.aliasTable.alias_arr[1], 0)
        self.assertEqual(self.aliasTable.alias_arr[2], 0)
        self.assertEqual(self.aliasTable.alias_arr[3], 1)
        # pin the random draws so sampling is deterministic
        mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
        mock_randint.return_value = [2, 2, 0, 1]
        result = self.aliasTable.sample(4)
        # with a list input, sampled values are positional indices
        self.assertEqual(result[0], 2)
        self.assertEqual(result[1], 0)
        self.assertEqual(result[2], 0)
        self.assertEqual(result[3], 1)

    @mock.patch("numpy.random.randint")
    @mock.patch("numpy.random.uniform")
    def test_dict(self, mock_uniform, mock_randint):
        """Test AliasTable with dictionary as input."""
        obj_freq = {100: 6, 102: 4, 103: 1, 104: 1}
        self.aliasTable = AliasTable(obj_freq)
        # check vocab_size (was commented "vacab_size" — typo fixed)
        self.assertEqual(self.aliasTable.vocab_size, 4)
        # check prob_arr
        self.assertEqual(round(self.aliasTable.prob_arr[0], 2), 1.0)
        self.assertEqual(round(self.aliasTable.prob_arr[1], 2), 0.67)
        self.assertEqual(round(self.aliasTable.prob_arr[2], 2), 0.33)
        self.assertEqual(round(self.aliasTable.prob_arr[3], 2), 0.33)
        # check alias_arr
        self.assertEqual(self.aliasTable.alias_arr[0], 0)
        self.assertEqual(self.aliasTable.alias_arr[1], 0)
        self.assertEqual(self.aliasTable.alias_arr[2], 0)
        self.assertEqual(self.aliasTable.alias_arr[3], 1)
        # pin the random draws so sampling is deterministic
        mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
        mock_randint.return_value = [2, 2, 0, 1]
        result = self.aliasTable.sample(4)
        # sampled values are the original dict keys, not internal indices
        self.assertEqual(result[0], 103)
        self.assertEqual(result[1], 100)
        self.assertEqual(result[2], 100)
        self.assertEqual(result[3], 102)

    def test_2dim_list(self):
        """Test that AliasTable rejects a 2-dimension list as input.

        The previous try/except version passed silently when no exception
        was raised; assertRaises fails if AliasTable accepts the input.
        """
        obj_freq = [[1, 2, 3], [4, 5, 6]]
        with self.assertRaises(ValueError):
            AliasTable(obj_freq)