Example #1
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): Type of supervised learning problem. See evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets
    """

    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
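    # Choose the splitter by problem type: time series data keeps its row order,
    # regression uses a plain shuffle split, and classification uses a stratified shuffle split.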
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))

    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    return X_train, X_test, y_train, y_test
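
A minimal usage sketch for the function above, assuming it is exposed as evalml.preprocessing.split_data in this version of the library; the toy DataFrame and the string problem type "binary" are illustrative assumptions:

import numpy as np
import pandas as pd

from evalml.preprocessing import split_data  # assumed import path for this evalml version

# Toy binary-classification data: 100 rows, two numeric features, balanced target.
X = pd.DataFrame({"a": np.arange(100), "b": np.arange(100) * 2})
y = pd.Series(np.tile([0, 1], 50))

# 80/20 split; classification problems are routed through StratifiedShuffleSplit above.
X_train, X_test, y_train, y_test = split_data(X, y, problem_type="binary", test_size=0.2, random_seed=0)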
def test_tvsplit_always_within_bounds_with_custom_index(random_state):
    N = 11000
    X = pd.DataFrame({'col1': np.arange(0, N)}, index=np.arange(20000, 20000 + N))
    splitter = TrainingValidationSplit(train_size=0.75, shuffle=True, random_state=random_state)
    splits = list(splitter.split(X, y=None))
    assert np.all(np.logical_and(splits[0][0] < N, splits[0][0] >= 0))
    assert np.all(np.logical_and(splits[0][1] < N, splits[0][1] >= 0))
def test_tvsplit_stratify():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(5).repeat(2), name='target')
    splitter = TrainingValidationSplit(train_size=5, test_size=5, shuffle=True, stratify=y, random_state=0)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [1, 4, 2, 8, 7])
    np.testing.assert_equal(splits[0][1], [3, 6, 9, 0, 5])
def test_tvsplit_shuffle():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit(shuffle=True, random_state=0)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [9, 1, 6, 7, 3, 0, 5])
    np.testing.assert_equal(splits[0][1], [2, 8, 4])
def test_tvsplit_default():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit()
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    # sklearn train_test_split will do a 75/25 split by default
    np.testing.assert_equal(splits[0][0], [0, 1, 2, 3, 4, 5, 6])
    np.testing.assert_equal(splits[0][1], [7, 8, 9])
Example #6
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (pd.DataFrame, ww.DataTable): The input training data of shape [n_samples, n_features].
        y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: data splitting method.
    """
    problem_type = handle_problem_types(problem_type)
    data_splitter = None
    if problem_type == ProblemTypes.REGRESSION:
        data_splitter = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        data_splitter = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    elif is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError("problem_configuration is required for time series problem types")
        data_splitter = TimeSeriesSplit(n_splits=n_splits, gap=problem_configuration.get('gap'),
                                        max_delay=problem_configuration.get('max_delay'))
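    # Very large datasets fall back to a single train/validation split instead of cross-validation.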
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        data_splitter = TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=True)
    return data_splitter
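
A short sketch of how this factory might be called; the import paths and toy data below are assumptions for illustration, not part of the example above:

import numpy as np
import pandas as pd

from evalml.automl.utils import make_data_splitter  # assumed import path
from evalml.problem_types import ProblemTypes

X = pd.DataFrame({"feature": np.arange(30)})
y = pd.Series(np.arange(30), dtype=float)

# Regression yields KFold, binary/multiclass yield StratifiedKFold, and datasets with
# more rows than _LARGE_DATA_ROW_THRESHOLD fall back to a single TrainingValidationSplit.
splitter = make_data_splitter(X, y, ProblemTypes.REGRESSION, n_splits=3)
for train_indices, test_indices in splitter.split(X, y):
    print(len(train_indices), len(test_indices))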
Example #7
def make_data_splitter(X,
                       y,
                       problem_type,
                       problem_configuration=None,
                       n_splits=3,
                       shuffle=True,
                       random_state=None,
                       random_seed=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables. Defaults to None.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (None, int): Deprecated - use random_seed instead.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method.
    """
    random_seed = deprecate_arg("random_state", "random_seed", random_state,
                                random_seed)
    problem_type = handle_problem_types(problem_type)
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError(
                "problem_configuration is required for time series problem types"
            )
        return TimeSeriesSplit(
            n_splits=n_splits,
            gap=problem_configuration.get('gap'),
            max_delay=problem_configuration.get('max_delay'))
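    # Large datasets get a single train/validation split (balanced for classification) rather than full CV.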
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        if problem_type == ProblemTypes.REGRESSION:
            return TrainingValidationSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
        elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
            return BalancedClassificationDataTVSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION,
                shuffle=shuffle,
                random_seed=random_seed)
    if problem_type == ProblemTypes.REGRESSION:
        return KFold(n_splits=n_splits,
                     random_state=random_seed,
                     shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        return BalancedClassificationDataCVSplit(n_splits=n_splits,
                                                 random_seed=random_seed,
                                                 shuffle=shuffle)
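
Time series problem types require a problem_configuration dict in this variant; a hedged sketch (import paths, toy data, and the gap/max_delay values are illustrative assumptions):

import numpy as np
import pandas as pd

from evalml.automl.utils import make_data_splitter  # assumed import path
from evalml.problem_types import ProblemTypes

X = pd.DataFrame({"feature": np.arange(40)})
y = pd.Series(np.arange(40), dtype=float)

# Omitting problem_configuration here raises ValueError for time series problem types.
ts_splitter = make_data_splitter(
    X, y,
    ProblemTypes.TIME_SERIES_REGRESSION,
    problem_configuration={"gap": 0, "max_delay": 2},
    n_splits=3,
)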
def test_tvsplit_size():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit(test_size=0.2, train_size=0.3)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [0, 1, 2])
    np.testing.assert_equal(splits[0][1], [3, 4])

    splitter = TrainingValidationSplit(test_size=2, train_size=3)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [0, 1, 2])
    np.testing.assert_equal(splits[0][1], [3, 4])
Example #9
    y = pd.Series(y)
    X.iloc[0, :] = value
    data_split = splitter()
    # handles both TV and CV iterations
    next(data_split.split(X, y))
    data_split.transform_sample(X, y)


@pytest.mark.parametrize(
    'balanced_splitter,data_splitter',
    [(BalancedClassificationDataTVSplit(sampling_ratio=1,
                                        min_samples=50,
                                        test_size=0.2,
                                        shuffle=True,
                                        random_seed=0),
      TrainingValidationSplit(test_size=0.2, shuffle=True, random_seed=0)),
     (BalancedClassificationDataCVSplit(sampling_ratio=1,
                                        min_samples=50,
                                        shuffle=True,
                                        n_splits=3,
                                        random_seed=0),
      StratifiedKFold(shuffle=True, n_splits=3, random_state=0))])
@pytest.mark.parametrize('data_type', ['np', 'pd', 'ww'])
def test_data_splitters_data_type(data_type, balanced_splitter, data_splitter,
                                  make_data_type, X_y_binary):
    X, y = X_y_binary
    # make imbalanced
    X_extended = np.append(X, X, 0)
    y_extended = np.append(y, np.array([0] * len(y)), 0)
    sample_method = BalancedClassificationSampler(sampling_ratio=1,
                                                  min_samples=50,
def test_tvsplit_nsplits():
    assert TrainingValidationSplit().get_n_splits() == 1
Example #11
import time

from evalml.automl.utils import AutoMLConfig
from evalml.exceptions import PipelineScoreError
from evalml.objectives.utils import get_objective
from evalml.pipelines import BinaryClassificationPipeline
from evalml.preprocessing.data_splitters import TrainingValidationSplit


# Top-level replacement for an AutoML object, used to supply data for testing purposes.
def err_call(*args, **kwargs):
    """No-op"""


ensembling_indices = [0]
data_splitter = TrainingValidationSplit()
problem_type = "binary"
objective = get_objective("Log Loss Binary", return_instance=True)
additional_objectives = []
optimize_thresholds = False
error_callback = err_call
random_seed = 0
automl_data = AutoMLConfig(ensembling_indices=ensembling_indices,
                           data_splitter=data_splitter,
                           problem_type=problem_type,
                           objective=objective,
                           additional_objectives=additional_objectives,
                           optimize_thresholds=optimize_thresholds,
                           error_callback=error_callback,
                           random_seed=random_seed)