def test_tvsplit_size():
    """Check that fractional and absolute train/test sizes pick the same rows.

    With 10 rows and no shuffle argument, a 0.3/0.2 split and a 3/2 split
    are both expected to yield positions [0, 1, 2] for training and [3, 4]
    for validation.
    """
    features = pd.DataFrame({'col1': np.arange(0, 10)})
    target = pd.Series(np.arange(100, 110), name='target')

    # Same expectation for the float (proportion) and int (row count) forms.
    size_settings = (
        {'test_size': 0.2, 'train_size': 0.3},
        {'test_size': 2, 'train_size': 3},
    )
    for sizes in size_settings:
        folds = list(TrainingValidationSplit(**sizes).split(features, y=target))
        assert len(folds) == 1
        assert len(folds[0]) == 2
        train_idx, valid_idx = folds[0]
        np.testing.assert_equal(train_idx, [0, 1, 2])
        np.testing.assert_equal(valid_idx, [3, 4])
def test_tvsplit_always_within_bounds_with_custom_index(random_state):
    """Check that split output is positional even when the frame has a custom index.

    The frame's index labels start at 20000, so any value outside [0, N)
    in the split output would mean labels leaked through instead of positions.
    """
    n_rows = 11000
    index_labels = np.arange(20000, 20000 + n_rows)
    features = pd.DataFrame({'col1': np.arange(0, n_rows)}, index=index_labels)

    tv_split = TrainingValidationSplit(train_size=0.75, shuffle=True, random_state=random_state)
    folds = list(tv_split.split(features, y=None))

    # Position 0 is the training part, position 1 the validation part.
    for part in (0, 1):
        indices = folds[0][part]
        assert np.all(np.logical_and(indices < n_rows, indices >= 0))
# Example #3
0
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    The splitter is selected from the problem type: time series problems use an unshuffled
    TrainingValidationSplit, regression problems use ShuffleSplit, and classification problems
    use StratifiedShuffleSplit. Only a single split is produced.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
            Accepted for interface compatibility; this function does not currently read it.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets

    Raises:
        ValueError: If problem_type is not identified as a time series, regression or classification problem.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    # The time series check comes first so it takes precedence over the
    # generic regression/classification branches.
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    else:
        # Without this branch the splitter would stay unset and the call below
        # would fail with an unhelpful AttributeError on None.
        raise ValueError(f"Cannot split data: unsupported problem type {problem_type!r}")

    # Every splitter yields (train_indices, test_indices); only the first split is used.
    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))

    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    return X_train, X_test, y_train, y_test
def test_tvsplit_stratify():
    """Check the exact indices of a shuffled, stratified 5/5 split with seed 0.

    The target holds five classes with two samples each and is also passed
    as the ``stratify`` argument.
    """
    features = pd.DataFrame({'col1': np.arange(0, 10)})
    target = pd.Series(np.repeat(np.arange(5), 2), name='target')

    tv_split = TrainingValidationSplit(
        train_size=5,
        test_size=5,
        shuffle=True,
        stratify=target,
        random_state=0,
    )
    folds = list(tv_split.split(features, y=target))

    assert len(folds) == 1
    assert len(folds[0]) == 2
    train_idx, valid_idx = folds[0]
    np.testing.assert_equal(train_idx, [1, 4, 2, 8, 7])
    np.testing.assert_equal(valid_idx, [3, 6, 9, 0, 5])
def test_tvsplit_shuffle():
    """Check the exact indices of a shuffled split with default sizes and seed 0."""
    features = pd.DataFrame({'col1': np.arange(0, 10)})
    target = pd.Series(np.arange(100, 110), name='target')

    tv_split = TrainingValidationSplit(shuffle=True, random_state=0)
    folds = list(tv_split.split(features, y=target))

    assert len(folds) == 1
    assert len(folds[0]) == 2
    train_idx, valid_idx = folds[0]
    np.testing.assert_equal(train_idx, [9, 1, 6, 7, 3, 0, 5])
    np.testing.assert_equal(valid_idx, [2, 8, 4])
def test_tvsplit_default():
    """Check the split produced by a splitter constructed with no arguments.

    For 10 rows the expected result is the first seven positions for
    training and the last three for validation.
    """
    features = pd.DataFrame({'col1': np.arange(0, 10)})
    target = pd.Series(np.arange(100, 110), name='target')

    folds = list(TrainingValidationSplit().split(features, y=target))

    assert len(folds) == 1
    assert len(folds[0]) == 2
    train_idx, valid_idx = folds[0]
    # sklearn's train_test_split defaults to a 75/25 split, which lands on
    # 7 training rows and 3 validation rows here.
    np.testing.assert_equal(train_idx, [0, 1, 2, 3, 4, 5, 6])
    np.testing.assert_equal(valid_idx, [7, 8, 9])