def test_min_rating_filter():
    """Check that min_rating_filter_pandas keeps only well-populated groups."""
    # Hand-built frame: user k owns k rows, and item k appears in k rows.
    # Rating values are random; only the row counts per group matter here.
    python_dataset = pd.DataFrame(
        {
            DEFAULT_USER_COL: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5],
            DEFAULT_ITEM_COL: [5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1],
            DEFAULT_RATING_COL: np.random.randint(1, 6, 15),
        }
    )

    def count_filtered_rows(data, filter_by="user"):
        """Return the row count of every user (or item) group in ``data``."""
        if filter_by == "user":
            key_col = DEFAULT_USER_COL
        else:
            key_col = DEFAULT_ITEM_COL
        return [len(rows) for _, rows in data.groupby(key_col)]

    users_kept = min_rating_filter_pandas(python_dataset, min_rating=3, filter_by="user")
    items_kept = min_rating_filter_pandas(python_dataset, min_rating=2, filter_by="item")

    per_user = count_filtered_rows(users_kept, filter_by="user")
    per_item = count_filtered_rows(items_kept, filter_by="item")

    # Every surviving group must meet the threshold it was filtered with.
    assert all(n >= 3 for n in per_user)
    assert all(n >= 2 for n in per_item)
def test_min_rating_filter(python_dataset):
    """Test min rating filter.

    Filters the ``python_dataset`` fixture by user and by item and checks that
    every group remaining after the filter holds at least ``min_rating`` rows.
    """
    min_rating = 5
    df_rating = python_dataset

    def count_filtered_rows(data, filter_by="user"):
        """Return the row count of every user (or item) group in ``data``."""
        split_by_column = DEFAULT_USER_COL if filter_by == "user" else DEFAULT_ITEM_COL
        return [group.shape[0] for _, group in data.groupby(split_by_column)]

    df_user = min_rating_filter_pandas(df_rating, min_rating=min_rating, filter_by="user")
    df_item = min_rating_filter_pandas(df_rating, min_rating=min_rating, filter_by="item")

    user_rating_counts = count_filtered_rows(df_user, filter_by="user")
    item_rating_counts = count_filtered_rows(df_item, filter_by="item")

    # Compare against the threshold explicitly: a bare ``all(counts)`` only
    # checks that the counts are non-zero, which holds for any groupby group
    # and therefore would pass even if the filter did nothing.
    assert all(count >= min_rating for count in user_rating_counts)
    assert all(count >= min_rating for count in item_rating_counts)
def test_min_rating_filter():
    """Verify the minimum-rating filter on a small hand-crafted dataset."""
    user_ids = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5]
    item_ids = [5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1]
    python_dataset = pd.DataFrame(
        {
            DEFAULT_USER_COL: user_ids,
            DEFAULT_ITEM_COL: item_ids,
            DEFAULT_RATING_COL: np.random.randint(1, 6, 15),
        }
    )

    def count_filtered_rows(data, filter_by="user"):
        """Collect the size of each group when grouping by user or by item."""
        column = DEFAULT_USER_COL if filter_by == "user" else DEFAULT_ITEM_COL
        sizes = [members.shape[0] for _, members in data.groupby(column)]
        return sizes

    by_user = min_rating_filter_pandas(python_dataset, min_rating=3, filter_by="user")
    by_item = min_rating_filter_pandas(python_dataset, min_rating=2, filter_by="item")

    user_sizes = count_filtered_rows(by_user, filter_by="user")
    item_sizes = count_filtered_rows(by_item, filter_by="item")

    # No surviving user may have fewer than 3 rows, no surviving item fewer than 2.
    assert not any(size < 3 for size in user_sizes)
    assert not any(size < 2 for size in item_sizes)
def _do_stratification(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Split data group by group (per user or per item) and merge the pieces.

    Each user (or item) group is split on its own with
    ``split_pandas_data_with_ratios`` and the per-group pieces belonging to the
    same split are then gathered into one DataFrame per split.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Split ratio(s), normalized by
            ``process_split_ratio``. A single value is expanded to
            ``[ratio, 1 - ratio]``.
        min_rating (int): Minimum number of ratings for user or item; values
            above 1 trigger ``min_rating_filter_pandas`` before splitting.
        filter_by (str): Either "user" or "item"; selects both the column to
            filter on and the column to group by.
        is_random (bool): If truthy, rows are passed to the split helper with
            ``shuffle`` enabled; if falsy, rows are ordered by timestamp first
            and split without shuffling.
        seed (int): Seed forwarded to ``split_pandas_data_with_ratios``.
        col_user (str): Column name of user IDs.
        col_item (str): Column name of item IDs.
        col_timestamp (str): Column name of timestamps (required only when
            ``is_random`` is falsy).

    Returns:
        list: One pd.DataFrame per split ratio, without the ``split_index``
        helper column.

    Raises:
        ValueError: If ``filter_by`` or ``min_rating`` is invalid, or a
            required column is missing from ``data``.
    """
    # A few preliminary checks.
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random and col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # For chronological splitting the rows are ordered by timestamp so each
    # group is cut in time order rather than randomly. The same truthiness
    # test as the schema check above is used, so falsy values other than the
    # literal ``False`` (e.g. 0 or numpy.False_) are sorted as well.
    if not is_random:
        data = data.sort_values(col_timestamp)
    df_grouped = data.groupby(split_by_column)

    # Split by each group and aggregate splits together.
    splits = []
    for _, group in df_grouped:
        group_splits = split_pandas_data_with_ratios(
            group, ratio, shuffle=is_random, seed=seed
        )

        # Concatenate the list of split dataframes of this group.
        splits.append(pd.concat(group_splits))

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index and drop the helper column afterwards.
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list
def python_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Pandas chronological splitter.

    This function splits data in a chronological manner. That is, for each
    user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single
            float number it splits data into two halves and the ratio argument
            indicates the ratio of training data set; if it is a list of float
            numbers, the splitter splits data into several portions
            corresponding to the split ratios. If a list is provided and the
            ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the
            two is to filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pd.DataFrame.

    Raises:
        ValueError: If ``filter_by`` is neither "user" nor "item", or
            ``min_rating`` is smaller than 1.
    """
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    # Sort data by group key and timestamp.
    # NOTE(review): the rows are sorted again by timestamp alone right before
    # grouping below, so this ordering presumably only influences how rows
    # with equal timestamps are arranged - confirm before removing it.
    data = data.sort_values(
        by=[split_by_column, col_timestamp], axis=0, ascending=False
    )

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    num_of_splits = len(ratio)

    # Gather the per-group pieces of every split and concatenate each split
    # once at the end. Growing a DataFrame with pd.concat inside the loop
    # copies all accumulated rows on every iteration and needs an empty
    # placeholder frame as the starting value.
    parts_per_split = [[] for _ in range(num_of_splits)]

    df_grouped = data.sort_values(col_timestamp).groupby(split_by_column)
    for _, group in df_grouped:
        group_splits = split_pandas_data_with_ratios(group, ratio, resample=False)

        for x in range(num_of_splits):
            parts_per_split[x].append(group_splits[x])

    # With no groups at all, fall back to empty frames as before.
    return [
        pd.concat(parts) if parts else pd.DataFrame({})
        for parts in parts_per_split
    ]
def _do_stratification(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Split data group by group (per user or per item) and merge the pieces.

    Each user (or item) group is split on its own with
    ``split_pandas_data_with_ratios`` and the per-group pieces belonging to the
    same split are then gathered into one DataFrame per split.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Split ratio(s), normalized by
            ``process_split_ratio``. A single value is expanded to
            ``[ratio, 1 - ratio]``.
        min_rating (int): Minimum number of ratings for user or item; values
            above 1 trigger ``min_rating_filter_pandas`` before splitting.
        filter_by (str): Either "user" or "item"; selects both the column to
            filter on and the column to group by.
        is_random (bool): If truthy, rows are passed to the split helper with
            ``shuffle`` enabled; if falsy, rows are ordered by timestamp first
            and split without shuffling.
        seed (int): Seed forwarded to ``split_pandas_data_with_ratios``.
        col_user (str): Column name of user IDs.
        col_item (str): Column name of item IDs.
        col_timestamp (str): Column name of timestamps (required only when
            ``is_random`` is falsy).

    Returns:
        list: One pd.DataFrame per split ratio, without the ``split_index``
        helper column.

    Raises:
        ValueError: If ``filter_by`` or ``min_rating`` is invalid, or a
            required column is missing from ``data``.
    """
    # A few preliminary checks.
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random and col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # For chronological splitting the rows are ordered by timestamp so each
    # group is cut in time order rather than randomly. The same truthiness
    # test as the schema check above is used, so falsy values other than the
    # literal ``False`` (e.g. 0 or numpy.False_) are sorted as well.
    if not is_random:
        data = data.sort_values(col_timestamp)
    df_grouped = data.groupby(split_by_column)

    # Split by each group and aggregate splits together.
    splits = []
    for _, group in df_grouped:
        group_splits = split_pandas_data_with_ratios(
            group, ratio, shuffle=is_random, seed=seed
        )

        # Concatenate the list of split dataframes of this group.
        splits.append(pd.concat(group_splits))

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index and drop the helper column afterwards.
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list
def python_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Pandas chronological splitter.

    This function splits data in a chronological manner. That is, for each
    user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single
            float number it splits data into two halves and the ratio argument
            indicates the ratio of training data set; if it is a list of float
            numbers, the splitter splits data into several portions
            corresponding to the split ratios. If a list is provided and the
            ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the
            two is to filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pd.DataFrame.

    Raises:
        ValueError: If ``filter_by`` or ``min_rating`` is invalid, or the
            user, item or timestamp column is missing from ``data``.
    """
    # A few preliminary checks.
    if filter_by not in ("user", "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if col_timestamp not in data.columns:
        raise ValueError("Schema of data not valid. Missing Timestamp Col")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    ratio = ratio if multi_split else [ratio, 1 - ratio]

    if min_rating > 1:
        data = min_rating_filter_pandas(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    # Order rows by timestamp before grouping so that every group is cut in
    # time order; shuffling is disabled for the same reason.
    df_grouped = data.sort_values(col_timestamp).groupby(split_by_column)

    # Split by each group and aggregate splits together.
    splits = []
    for _, group in df_grouped:
        group_splits = split_pandas_data_with_ratios(group, ratio, shuffle=False)

        # Concatenate the list of split dataframes of this group.
        splits.append(pd.concat(group_splits))

    # Concatenate splits for all the groups together.
    splits_all = pd.concat(splits)

    # Take split by split_index and drop the helper column afterwards.
    splits_list = [
        splits_all[splits_all["split_index"] == x].drop("split_index", axis=1)
        for x in range(len(ratio))
    ]

    return splits_list