def test_data_splitters_imbalanced_binary_tv():
    """Check the balanced train/validation splitter on a 1:9 binary target.

    Builds 1000 rows whose target has 100 samples of class 0 and 900 of
    class 1, runs ``BalancedClassificationDataTVSplit`` constructed with no
    arguments, and asserts for every yielded split that:

    * the test set holds 250 rows,
    * the train set holds fewer than 500 rows,
    * the majority class makes up 80% (a 4:1 ratio) of the train rows.
    """
    n_rows = 1000
    X = pd.DataFrame(
        {
            "a": list(range(n_rows)),
            "b": [i % 5 for i in range(n_rows)],
        }
    )
    # make y a 9:1 class ratio
    y = pd.Series([0] * 100 + [1] * 900)
    splitter = BalancedClassificationDataTVSplit()

    # Materialize the splits so the test cannot pass vacuously when the
    # splitter yields nothing.
    splits = list(splitter.split(X, y))
    assert splits, "splitter yielded no train/test splits"

    for train_indices, test_indices in splits:
        assert len(test_indices) == 250  # test_size defaults to 0.25
        # remaining data will still preserve 9:1 ratio, which we want to get to 4:1
        # we don't know the exact number since we don't stratify split
        assert len(train_indices) < 500
        # we can only test the balance of the train since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        majority_share = max(y_train_counts.values)
        # Proportions are floats; compare with a tight tolerance rather than
        # relying on exact floating-point equality.
        assert abs(majority_share - 0.8) < 1e-9
def test_data_splitters_imbalanced_multiclass_tv():
    """Check the balanced train/validation splitter on a 1:8:1 multiclass target.

    Builds 1500 rows whose target has 150 samples of class 0, 1200 of
    class 1 and 150 of class 2, runs ``BalancedClassificationDataTVSplit``
    constructed with no arguments, and asserts for every yielded split that:

    * the test set holds 375 rows,
    * the train set holds fewer than 1000 rows,
    * the majority class share of the train rows lies strictly between
      0.6 and 0.7 (i.e. around 2/3).
    """
    n_rows = 1500
    X = pd.DataFrame(
        {
            "a": list(range(n_rows)),
            "b": [i % 5 for i in range(n_rows)],
        }
    )
    # make y a 8:1:1 class ratio
    y = pd.Series([0] * 150 + [1] * 1200 + [2] * 150)
    splitter = BalancedClassificationDataTVSplit()

    # Materialize the splits so the test cannot pass vacuously when the
    # splitter yields nothing.
    splits = list(splitter.split(X, y))
    assert splits, "splitter yielded no train/test splits"

    for train_indices, test_indices in splits:
        assert len(test_indices) == 375  # test_size defaults to 0.25
        # we don't know the exact number since we don't stratify split
        assert len(train_indices) < 1000
        # we can only test the balance of the train since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        # assert the values are around 2/3 for the majority class
        majority_share = max(y_train_counts.values)
        assert 6 / 10 < majority_share < 7 / 10