示例#1
0
 def test_2dim_list(self):
     """Check that AliasTable rejects a 2-dimension list as input.

     The construction is wrapped in ``assertRaises`` so the test fails when
     no ``ValueError`` is raised. A plain try/except would pass silently in
     that case, and asserting the exception type inside ``except ValueError``
     is always true.
     """
     obj_freq = [[1, 2, 3], [4, 5, 6]]
     with self.assertRaises(ValueError):
         self.aliasTable = AliasTable(obj_freq)
示例#2
0
    def __init__(
        self,
        split_dataset,
        intersect=True,
        binarize=True,
        bin_thld=0.0,
        normalize=False,
    ):
        """Set up the dataset from a (train, valid, test) split.

        Args:
            split_dataset: Three-element iterable unpacked into the train,
                valid and test sets. The train set is indexed with
                DEFAULT_USER_COL / DEFAULT_ITEM_COL (presumably a pandas
                DataFrame, given the ``unique``/``value_counts`` calls).
            intersect (bool): Call ``self._intersect()`` when true.
            binarize (bool): Call ``self._binarize(bin_thld)`` when true.
            bin_thld (float): Threshold forwarded to ``self._binarize``.
            normalize (bool): Call ``self._normalize()`` when true.
        """
        self.train, self.valid, self.test = split_dataset

        # Pools and counts are taken from the train set before any of the
        # optional preprocessing steps below run.
        distinct_users = self.train[DEFAULT_USER_COL].unique()
        self.user_pool = list(distinct_users)
        distinct_items = self.train[DEFAULT_ITEM_COL].unique()
        self.item_pool = list(distinct_items)
        self.n_users = len(self.user_pool)
        self.n_items = len(self.item_pool)
        self.user_id_pool = list(range(self.n_users))
        self.item_id_pool = list(range(self.n_items))

        # Optional preprocessing, always in this order.
        if intersect:
            self._intersect()
        if binarize:
            self._binarize(bin_thld)
        if normalize:
            self._normalize()

        self._re_index()

        # Samplers are built from the train-set frequencies as they stand
        # after preprocessing and re-indexing.
        item_counts = self.train[DEFAULT_ITEM_COL].value_counts().to_dict()
        self.item_sampler = AliasTable(item_counts)
        user_counts = self.train[DEFAULT_USER_COL].value_counts().to_dict()
        self.user_sampler = AliasTable(user_counts)
示例#3
0
    def test_dict(self, mock_uniform, mock_randint):
        """Check AliasTable built from an {object: frequency} dictionary.

        ``mock_uniform`` and ``mock_randint`` are mock objects supplied by the
        caller (presumably ``mock.patch`` decorators outside this view).
        """
        table = AliasTable({100: 6, 102: 4, 103: 1, 104: 1})
        self.aliasTable = table

        # check vocab_size
        self.assertEqual(table.vocab_size, 4)

        # check prob_arr, compared after rounding to two decimals
        for idx, expected in enumerate((1.0, 0.67, 0.33, 0.33)):
            self.assertEqual(round(table.prob_arr[idx], 2), expected)

        # check alias_arr
        for idx, expected in enumerate((0, 0, 0, 1)):
            self.assertEqual(table.alias_arr[idx], expected)

        # fix the values the mocked random functions hand back
        mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
        mock_randint.return_value = [2, 2, 0, 1]

        # with a dict input the samples are the dictionary keys
        drawn = table.sample(4)
        for idx, expected in enumerate((103, 100, 100, 102)):
            self.assertEqual(drawn[idx], expected)
示例#4
0
    def test_list(self, mock_uniform, mock_randint):
        """Check AliasTable built from a plain list of frequencies.

        ``mock_uniform`` and ``mock_randint`` are mock objects supplied by the
        caller (presumably ``mock.patch`` decorators outside this view).
        """
        table = AliasTable([6, 4, 1, 1])
        self.aliasTable = table

        # check vocab_size
        self.assertEqual(table.vocab_size, 4)

        # check prob_arr, compared after rounding to two decimals
        for idx, expected in enumerate((1.0, 0.67, 0.33, 0.33)):
            self.assertEqual(round(table.prob_arr[idx], 2), expected)

        # check alias_arr
        for idx, expected in enumerate((0, 0, 0, 1)):
            self.assertEqual(table.alias_arr[idx], expected)

        # fix the values the mocked random functions hand back
        mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
        mock_randint.return_value = [2, 2, 0, 1]

        # with a list input the samples are positions in the list
        drawn = table.sample(4)
        for idx, expected in enumerate((2, 0, 0, 1)):
            self.assertEqual(drawn[idx], expected)
示例#5
0
    def __init__(self, config):
        """Init GroceryData Class.

        Args:
            config: Nested configuration mapping. ``config["dataset"]`` must
                exist (it is read unconditionally below). Optional entries:
                ``config["dataset"]["sub_set"]`` overrides ``self.sub_set``
                (default 0) and ``config["model"]["random_dim"]`` overrides
                ``self.random_dim`` (default 512). The whole mapping is also
                passed to ``load_split_dataset``.
        """
        self.config = config
        self.n_users = 0
        self.n_items = 0
        self.sub_set = 0
        self.random_dim = 512
        # subset of the dataset. use a small set of users and items if >0, otherwise use full dataset
        # The options live in nested sections, so membership is tested on the
        # same section that is read. Testing the top-level mapping would
        # either skip the override or raise KeyError on the nested lookup.
        if "dataset" in config and "sub_set" in config["dataset"]:
            self.sub_set = config["dataset"]["sub_set"]
        if "model" in config and "random_dim" in config["model"]:
            self.random_dim = config["model"]["random_dim"]
        # data preprocessing for training and test data
        # To be replaced with new data method
        train, valid, test = load_split_dataset(config)
        # Only the first test set is used when intersecting with train.
        self.train = self._intersect_train_test(train, test[0])
        self.n_train = len(self.train.index)
        self.valid = self._reindex_list(valid)
        self.test = self._reindex_list(test)
        # Samplers are built from item / user frequencies in the train set.
        self.item_sampler = AliasTable(
            self.train[DEFAULT_ITEM_COL].value_counts().to_dict())
        self.user_sampler = AliasTable(
            self.train[DEFAULT_USER_COL].value_counts().to_dict())
        # Both feature initialisers run when either feature type is configured.
        if ("item_fea_type" in self.config["dataset"]
                or "user_fea_type" in self.config["dataset"]):
            self.init_item_fea()
            self.init_user_fea()
示例#6
0
def split_data(
    data,
    split_type,
    test_rate,
    random=False,
    n_negative=100,
    save_dir=None,
    by_user=False,
    n_test=10,
):
    """Split an interaction DataFrame and optionally save the result.

    Args:
        data (DataFrame): interaction DataFrame to be split.
        split_type (string): one of
                        - random
                        - random_basket
                        - leave_one_out
                        - leave_one_basket
                        - temporal
                        - temporal_basket
        test_rate (float): percentage of the test data, forwarded to the
            random and temporal splitters.
        random (bool): forwarded to leave_one_out and leave_one_basket only.
        n_negative (int): Number of negative samples drawn for each saved
            validation/testing copy. When negative, n_test is capped at 1.
        save_dir (string or Path): Default None. When given, the train/valid/
            test subsets and the negative-sampled copies are saved there.
        by_user (bool): Default False. Forwarded to the random and temporal
            splitters.
        n_test (int): Default 10. Number of negative-sampled validation and
            testing copies to save.

    Returns:
        DataFrame: the split data (without negative samples), or None when
        split_type is not one of the options above.

    Raises:
        RuntimeError: when saving and the train item count minus the largest
            per-user validation/test item count is below n_negative.
    """
    print(f"Splitting data by {split_type} ...")
    if n_negative < 0 and n_test > 1:
        # n_negative < 0, validate and testing sets of splits will contain all the negative items.
        # There will be only one validate and one testing sets.
        n_test = 1

    # Each entry is evaluated lazily, so only the selected splitter is called.
    splitters = (
        ("random", lambda: random_split(data, test_rate, by_user)),
        ("random_basket", lambda: random_basket_split(data, test_rate, by_user)),
        ("leave_one_out", lambda: leave_one_out(data, random)),
        ("leave_one_basket", lambda: leave_one_basket(data, random)),
        ("temporal", lambda: temporal_split(data, test_rate, by_user)),
        ("temporal_basket", lambda: temporal_basket_split(data, test_rate, by_user)),
    )
    for option, run_split in splitters:
        if split_type == option:
            data = run_split()
            break
    else:
        print("[ERROR] wrong split_type.")
        return None

    flag_col = data[DEFAULT_FLAG_COL]
    tp_train = data[flag_col == "train"]
    tp_validate = data[flag_col == "validate"]
    tp_test = data[flag_col == "test"]
    if save_dir is None:
        return data

    parameterized_path = generate_parameterized_path(
        test_rate=test_rate, random=random, n_negative=n_negative, by_user=by_user
    )

    # keep the original validation and test sets alongside the train set.
    originals = (
        (tp_train, "train.npz"),
        (tp_validate, "valid.npz"),
        (tp_test, "test.npz"),
    )
    for subset, file_name in originals:
        save_split_data(subset, save_dir, split_type, parameterized_path, file_name)

    item_sampler = AliasTable(data[DEFAULT_ITEM_COL].value_counts().to_dict())
    n_items = tp_train[DEFAULT_ITEM_COL].nunique()
    valid_neg_max = (
        tp_validate.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].count().max()
    )
    test_neg_max = tp_test.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].count().max()

    # Items left over for the user with the most validation / test items.
    valid_room = n_items - valid_neg_max
    test_room = n_items - test_neg_max
    if valid_room < n_negative or test_room < n_negative:
        raise RuntimeError(
            "This dataset do not have sufficient negative items for sampling! \n"
            f"valid_neg_max: {valid_room}, "
            f"test_neg_max: {test_room},"
            f"n_negative: {n_negative}\nPlease directly use valid.npz and test.npz."
        )

    for copy_idx in range(n_test):
        # Sample validation first, then test, before saving either copy.
        tp_validate_new = feed_neg_sample(tp_validate, n_negative, item_sampler)
        tp_test_new = feed_neg_sample(tp_test, n_negative, item_sampler)
        save_split_data(
            tp_validate_new,
            save_dir,
            split_type,
            parameterized_path,
            f"valid_{copy_idx}.npz",
        )
        save_split_data(
            tp_test_new,
            save_dir,
            split_type,
            parameterized_path,
            f"test_{copy_idx}.npz",
        )
    return data
示例#7
0
class TestAliasTable(unittest.TestCase):
    """Unit tests for AliasTable with list, dictionary and 2-dimension input."""

    # mock.patch decorators apply bottom-up: the innermost patch
    # (numpy.random.uniform) is passed as the first mock argument.
    @mock.patch("numpy.random.randint")
    @mock.patch("numpy.random.uniform")
    def test_list(self, mock_uniform, mock_randint):
        """Test AliasTable with list as input."""
        obj_freq = [6, 4, 1, 1]
        self.aliasTable = AliasTable(obj_freq)

        # check vocab_size
        self.assertEqual(self.aliasTable.vocab_size, 4)

        # check prob_arr
        self.assertEqual(round(self.aliasTable.prob_arr[0], 2), 1.0)
        self.assertEqual(round(self.aliasTable.prob_arr[1], 2), 0.67)
        self.assertEqual(round(self.aliasTable.prob_arr[2], 2), 0.33)
        self.assertEqual(round(self.aliasTable.prob_arr[3], 2), 0.33)

        # check alias_arr
        self.assertEqual(self.aliasTable.alias_arr[0], 0)
        self.assertEqual(self.aliasTable.alias_arr[1], 0)
        self.assertEqual(self.aliasTable.alias_arr[2], 0)
        self.assertEqual(self.aliasTable.alias_arr[3], 1)

        # fix the values returned by the patched numpy random functions
        mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
        mock_randint.return_value = [2, 2, 0, 1]

        # with a list input the samples are positions in the list
        result = self.aliasTable.sample(4)
        self.assertEqual(result[0], 2)
        self.assertEqual(result[1], 0)
        self.assertEqual(result[2], 0)
        self.assertEqual(result[3], 1)

    @mock.patch("numpy.random.randint")
    @mock.patch("numpy.random.uniform")
    def test_dict(self, mock_uniform, mock_randint):
        """Test AliasTable with dictionary as input."""
        obj_freq = {100: 6, 102: 4, 103: 1, 104: 1}
        self.aliasTable = AliasTable(obj_freq)

        # check vocab_size
        self.assertEqual(self.aliasTable.vocab_size, 4)

        # check prob_arr
        self.assertEqual(round(self.aliasTable.prob_arr[0], 2), 1.0)
        self.assertEqual(round(self.aliasTable.prob_arr[1], 2), 0.67)
        self.assertEqual(round(self.aliasTable.prob_arr[2], 2), 0.33)
        self.assertEqual(round(self.aliasTable.prob_arr[3], 2), 0.33)

        # check alias_arr
        self.assertEqual(self.aliasTable.alias_arr[0], 0)
        self.assertEqual(self.aliasTable.alias_arr[1], 0)
        self.assertEqual(self.aliasTable.alias_arr[2], 0)
        self.assertEqual(self.aliasTable.alias_arr[3], 1)

        # fix the values returned by the patched numpy random functions
        mock_uniform.side_effect = [0.1, 0.64, 0.8, 0.6]
        mock_randint.return_value = [2, 2, 0, 1]

        # with a dict input the samples are the dictionary keys
        result = self.aliasTable.sample(4)
        self.assertEqual(result[0], 103)
        self.assertEqual(result[1], 100)
        self.assertEqual(result[2], 100)
        self.assertEqual(result[3], 102)

    def test_2dim_list(self):
        """Test AliasTable with 2-dimension list as input.

        The construction must raise ValueError. ``assertRaises`` makes the
        test fail when nothing is raised, whereas a plain try/except would
        pass silently in that case.
        """
        obj_freq = [[1, 2, 3], [4, 5, 6]]
        with self.assertRaises(ValueError):
            self.aliasTable = AliasTable(obj_freq)