예제 #1
0
def test_data_splitters_imbalanced_binary_tv():
    """Check the train/validation split produced for a 9:1 binary target.

    Builds a 1000-row frame with 100 samples of class 0 and 900 of class 1,
    splits it with a default ``BalancedClassificationDataTVSplit`` and, for
    every yielded split, asserts that:

    * the test part holds 250 indices,
    * the train part holds fewer than 500 indices,
    * the largest class share in the train target is 0.8 (a 4:1 ratio).
    """
    # Local import so the block does not depend on a module-level import.
    import math

    n_rows = 1000
    X = pd.DataFrame({
        "a": list(range(n_rows)),
        "b": [i % 5 for i in range(n_rows)]
    })
    # make y a 9:1 class ratio
    y = pd.Series([0] * 100 + [1] * 900)
    splitter = BalancedClassificationDataTVSplit()

    for train_indices, test_indices in splitter.split(X, y):
        assert len(test_indices) == 250  # test_size defaults to 0.25
        # remaining data will still preserve 9:1 ratio, which we want to get to 4:1
        # we don't know the exact number since we don't stratify split
        assert len(train_indices) < 500
        # we can only test the balance of the train since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        # The share is a float quotient, so compare with a tolerance instead
        # of relying on exact floating-point equality.
        assert math.isclose(max(y_train_counts.values), 0.8)
예제 #2
0
def test_data_splitters_imbalanced_multiclass_tv():
    """Check the train/validation split produced for an 8:1:1 multiclass target.

    A 1500-row frame (150 / 1200 / 150 samples for classes 0 / 1 / 2) is split
    with a default ``BalancedClassificationDataTVSplit``. Each yielded split
    must have 375 test indices, fewer than 1000 train indices, and a largest
    train-class share strictly between 0.6 and 0.7.
    """
    n_rows = 1500
    X = pd.DataFrame({
        "a": list(range(n_rows)),
        "b": [row % 5 for row in range(n_rows)]
    })
    # 8:1:1 class ratio
    y = pd.Series([0] * 150 + [1] * 1200 + [2] * 150)
    tv_splitter = BalancedClassificationDataTVSplit()

    for train_idx, test_idx in tv_splitter.split(X, y):
        # test_size defaults to 0.25
        assert len(test_idx) == 375
        # the split is not stratified, so only an upper bound is known
        assert len(train_idx) < 1000
        # only the train balance can be checked, for the same reason
        class_shares = y.iloc[train_idx].value_counts(normalize=True)
        largest_share = max(class_shares.values)
        # the majority class should sit at roughly 2/3 of the train target
        assert largest_share < 7 / 10
        assert largest_share > 6 / 10
예제 #3
0
파일: utils.py 프로젝트: joalmjoalm/evalml
def make_data_splitter(X,
                       y,
                       problem_type,
                       problem_configuration=None,
                       n_splits=3,
                       shuffle=True,
                       random_state=None,
                       random_seed=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features].
            Only its row count (``X.shape[0]``) is inspected here.
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
            Not used by this function.
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables. Defaults to None.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (None, int): Deprecated - use random_seed instead.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method, or None when the
            problem type is neither a time series type, regression, binary nor multiclass.

    Raises:
        ValueError: If the problem type is a time series type and problem_configuration is
            missing or empty.
    """
    random_seed = deprecate_arg("random_state", "random_seed", random_state,
                                random_seed)
    problem_type = handle_problem_types(problem_type)

    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError(
                "problem_configuration is required for time series problem types"
            )
        return TimeSeriesSplit(
            n_splits=n_splits,
            gap=problem_configuration.get('gap'),
            max_delay=problem_configuration.get('max_delay'))

    is_regression = problem_type == ProblemTypes.REGRESSION
    is_classification = problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]

    # Above the row threshold a single train/validation split is used
    # instead of cross-validation.
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        if is_regression:
            # Forward the seed so this branch honours random_seed just like
            # the classification branch below.
            return TrainingValidationSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION,
                shuffle=shuffle,
                random_seed=random_seed)
        if is_classification:
            return BalancedClassificationDataTVSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION,
                shuffle=shuffle,
                random_seed=random_seed)

    if is_regression:
        # scikit-learn's KFold rejects a non-None random_state when shuffle is
        # False (the seed would have no effect), so only pass it when shuffling.
        return KFold(n_splits=n_splits,
                     random_state=random_seed if shuffle else None,
                     shuffle=shuffle)
    if is_classification:
        return BalancedClassificationDataCVSplit(n_splits=n_splits,
                                                 random_seed=random_seed,
                                                 shuffle=shuffle)
    # No splitter is defined for any other problem type.
    return None
예제 #4
0
def test_data_splitter_no_error(splitter, value, X_y_binary):
    """Smoke-test a splitter on data whose first row is overwritten with ``value``.

    ``splitter`` is called with no arguments to build the splitter under test.
    The test passes when drawing the first split and calling
    ``transform_sample`` both complete without raising.
    """
    features, target = X_y_binary
    features = pd.DataFrame(features)
    target = pd.Series(target)
    # Fill the whole first row with the value under test.
    features.iloc[0, :] = value

    splitter_instance = splitter()
    # Drawing only the first split covers both TV and CV style splitters.
    next(splitter_instance.split(features, target))
    splitter_instance.transform_sample(features, target)


@pytest.mark.parametrize(
    'balanced_splitter,data_splitter',
    [(BalancedClassificationDataTVSplit(sampling_ratio=1,
                                        min_samples=50,
                                        test_size=0.2,
                                        shuffle=True,
                                        random_seed=0),
      TrainingValidationSplit(test_size=0.2, shuffle=True, random_seed=0)),
     (BalancedClassificationDataCVSplit(sampling_ratio=1,
                                        min_samples=50,
                                        shuffle=True,
                                        n_splits=3,
                                        random_seed=0),
      StratifiedKFold(shuffle=True, n_splits=3, random_state=0))])
@pytest.mark.parametrize('data_type', ['np', 'pd', 'ww'])
def test_data_splitters_data_type(data_type, balanced_splitter, data_splitter,
                                  make_data_type, X_y_binary):
    X, y = X_y_binary
    # make imbalanced
    X_extended = np.append(X, X, 0)