Exemplo n.º 1
0
def test_min_rating_filter():
    """Check that min_rating_filter_pandas drops users/items with too few ratings."""
    # User k appears k times and item k appears k times, so every rating count
    # from 1 to 5 is represented on both the user and the item side.
    python_dataset = pd.DataFrame({
        DEFAULT_USER_COL: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5],
        DEFAULT_ITEM_COL: [5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1],
        DEFAULT_RATING_COL:
        np.random.randint(1, 6, 15),
    })

    def count_filtered_rows(data, filter_by="user"):
        """Return the number of rows in each user (or item) group of ``data``."""
        split_by_column = DEFAULT_USER_COL if filter_by == "user" else DEFAULT_ITEM_COL
        # Iterating the groupby already yields each group's frame, so there is
        # no need to look it up again by name.
        return [len(group) for _, group in data.groupby(split_by_column)]

    df_user = min_rating_filter_pandas(python_dataset,
                                       min_rating=3,
                                       filter_by="user")
    df_item = min_rating_filter_pandas(python_dataset,
                                       min_rating=2,
                                       filter_by="item")
    user_rating_counts = count_filtered_rows(df_user, filter_by="user")
    item_rating_counts = count_filtered_rows(df_item, filter_by="item")

    # Every surviving user/item must meet the threshold it was filtered with.
    assert all(u >= 3 for u in user_rating_counts)
    assert all(i >= 2 for i in item_rating_counts)
Exemplo n.º 2
0
def test_min_rating_filter(python_dataset):
    """Test min rating filter

    Filters ``python_dataset`` with ``min_rating=5`` by user and by item, and
    checks that every remaining user/item group has at least 5 rows.

    Args:
        python_dataset (pd.DataFrame): ratings data; presumably supplied by a
            pytest fixture defined elsewhere.
    """
    min_rating = 5
    df_rating = python_dataset

    def count_filtered_rows(data, filter_by="user"):
        """Return the number of rows in each user (or item) group of ``data``."""
        split_by_column = DEFAULT_USER_COL if filter_by == "user" else DEFAULT_ITEM_COL
        # Iterating the groupby already yields each group's frame.
        return [group.shape[0] for _, group in data.groupby(split_by_column)]

    df_user = min_rating_filter_pandas(df_rating,
                                       min_rating=min_rating,
                                       filter_by="user")
    df_item = min_rating_filter_pandas(df_rating,
                                       min_rating=min_rating,
                                       filter_by="item")

    user_rating_counts = count_filtered_rows(df_user, filter_by="user")
    item_rating_counts = count_filtered_rows(df_item, filter_by="item")

    # Compare against the threshold explicitly: a bare ``all(counts)`` only
    # checks that counts are non-zero, which is always true for groupby groups.
    assert all(u >= min_rating for u in user_rating_counts)
    assert all(i >= min_rating for i in item_rating_counts)
def test_min_rating_filter():
    """Verify min_rating_filter_pandas keeps only users/items with enough ratings."""
    # User k and item k each occur exactly k times (k = 1..5).
    python_dataset = pd.DataFrame(
        {
            DEFAULT_USER_COL: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5],
            DEFAULT_ITEM_COL: [5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1],
            DEFAULT_RATING_COL: np.random.randint(1, 6, 15)
        }
    )

    def count_filtered_rows(data, filter_by="user"):
        """Return per-group row counts, grouping by the user or the item column."""
        split_by_column = DEFAULT_USER_COL if filter_by == "user" else DEFAULT_ITEM_COL

        row_counts = []
        # Use the group frame yielded by the iteration instead of fetching it
        # a second time with get_group().
        for _, group in data.groupby(split_by_column):
            row_counts.append(group.shape[0])

        return row_counts

    df_user = min_rating_filter_pandas(python_dataset, min_rating=3, filter_by="user")
    df_item = min_rating_filter_pandas(python_dataset, min_rating=2, filter_by="item")
    user_rating_counts = count_filtered_rows(df_user, filter_by="user")
    item_rating_counts = count_filtered_rows(df_item, filter_by="item")

    # Each remaining group must satisfy the threshold used when filtering.
    assert all(u >= 3 for u in user_rating_counts)
    assert all(i >= 2 for i in item_rating_counts)
Exemplo n.º 4
0
def _do_stratification(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Split data per user or per item, either randomly or chronologically.

    Args:
        data (pd.DataFrame): DataFrame to be split.
        ratio (float or list): Split ratio(s), handed to ``process_split_ratio``.
            When that reports a single split, ``ratio`` is expanded to
            ``[ratio, 1 - ratio]``.
        min_rating (int): Minimum number of ratings for user or item. Values
            above 1 run ``min_rating_filter_pandas`` before splitting.
        filter_by (str): Either "user" or "item"; selects the column that is
            filtered on and grouped by.
        is_random (bool): Forwarded as ``shuffle`` to the per-group splitter.
            When falsy, rows are sorted by ``col_timestamp`` before grouping.
        seed (int): Seed forwarded to the per-group splitter.
        col_user (str): Column name of user IDs.
        col_item (str): Column name of item IDs.
        col_timestamp (str): Column name of timestamps; only required when
            ``is_random`` is falsy.

    Returns:
        list: One pd.DataFrame per split ratio, without the "split_index"
        helper column. If no group is left to split, every frame is empty.

    Raises:
        ValueError: If ``filter_by`` or ``min_rating`` is invalid, or a
            required column is missing from ``data``.
    """
    # A few preliminary checks.
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random and col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # For chronological splitting, order rows by timestamp so that each group
    # is split in time order; for random splitting the splitter shuffles.
    # Uses the same truthiness test as the timestamp-column check above.
    if not is_random:
        data = data.sort_values(col_timestamp)

    # Split each group and stack that group's splits back together. The
    # iteration already yields the group frame, so no get_group() lookup.
    splits = [
        pd.concat(
            split_pandas_data_with_ratios(
                group, ratio, shuffle=is_random, seed=seed))
        for _, group in data.groupby(split_by_column)
    ]

    # pd.concat() rejects an empty list; with nothing to split, return empty
    # frames that keep the input schema.
    if not splits:
        return [data.iloc[:0].copy() for _ in ratio]

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list
Exemplo n.º 5
0
def python_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Pandas chronological splitter
    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes proportions of ratings which is specified by the split ratio(s).
    The split is stratified.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two parts and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pd.DataFrame.

    Raises:
        ValueError: If ``filter_by`` is neither "user" nor "item", or if
            ``min_rating`` is smaller than 1.
    """
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    num_of_splits = len(ratio)

    # One bucket per split. Pieces are gathered here and concatenated once at
    # the end, rather than re-copying the accumulated frame for every group.
    pieces = [[] for _ in range(num_of_splits)]

    # Sort by timestamp so that each group is split in chronological order.
    df_grouped = data.sort_values(col_timestamp).groupby(split_by_column)
    for _, group in df_grouped:
        group_splits = split_pandas_data_with_ratios(
            group, ratio, resample=False)
        for x in range(num_of_splits):
            pieces[x].append(group_splits[x])

    # A split that received no pieces is returned as its own empty DataFrame.
    return [
        pd.concat(bucket) if bucket else pd.DataFrame({})
        for bucket in pieces
    ]
Exemplo n.º 6
0
def _do_stratification(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Stratified split of ``data`` by user or item, random or chronological.

    Args:
        data (pd.DataFrame): DataFrame to be split.
        ratio (float or list): Split ratio(s), processed by
            ``process_split_ratio``. When that reports a single split,
            ``ratio`` becomes ``[ratio, 1 - ratio]``.
        min_rating (int): Minimum number of ratings for user or item. When
            greater than 1, ``min_rating_filter_pandas`` is applied first.
        filter_by (str): Either "user" or "item"; decides which column is
            used for both filtering and grouping.
        is_random (bool): Passed to the per-group splitter as ``shuffle``.
            When falsy, ``data`` is sorted by ``col_timestamp`` beforehand.
        seed (int): Seed passed to the per-group splitter.
        col_user (str): Column name of user IDs.
        col_item (str): Column name of item IDs.
        col_timestamp (str): Column name of timestamps; must exist only when
            ``is_random`` is falsy.

    Returns:
        list: One pd.DataFrame per split ratio with the "split_index" helper
        column removed. All frames are empty when there is no group to split.

    Raises:
        ValueError: On an invalid ``filter_by`` / ``min_rating`` or when a
            required column is missing from ``data``.
    """
    # A few preliminary checks.
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random and col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # Chronological splitting needs rows in timestamp order; random splitting
    # leaves the order alone and lets the splitter shuffle. The condition
    # mirrors the ``not is_random`` schema check above.
    ordered = data if is_random else data.sort_values(col_timestamp)

    # Split by each group and aggregate splits together.
    splits = []
    for _, group in ordered.groupby(split_by_column):
        # ``group`` is the frame for this key; no second get_group() lookup.
        group_splits = split_pandas_data_with_ratios(
            group, ratio, shuffle=is_random, seed=seed
        )

        # Concatenate the list of split dataframes.
        splits.append(pd.concat(group_splits))

    # pd.concat() cannot take an empty list, so return schema-preserving
    # empty frames when nothing was grouped.
    if not splits:
        return [ordered.iloc[:0].copy() for _ in ratio]

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list
Exemplo n.º 7
0
def python_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Pandas chronological splitter
    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes proportions of ratings which is specified by the split ratio(s).
    The split is stratified.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two parts and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pd.DataFrame. If no group is left to
        split, every frame is empty.

    Raises:
        ValueError: If ``filter_by`` or ``min_rating`` is invalid, or if the
            user, item or timestamp column is missing from ``data``.
    """
    # A few preliminary checks.
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # Sort by timestamp so each group is split in chronological order, then
    # split every group and stack that group's splits back together. The
    # iteration already yields the group frame, so no get_group() lookup.
    df_grouped = data.sort_values(col_timestamp).groupby(split_by_column)
    splits = [
        pd.concat(split_pandas_data_with_ratios(group, ratio, shuffle=False))
        for _, group in df_grouped
    ]

    # pd.concat() rejects an empty list; with nothing to split, return empty
    # frames that keep the input schema.
    if not splits:
        return [data.iloc[:0].copy() for _ in ratio]

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list