Пример #1
0
def test_time_series_split(max_delay, gap, X_none, y_none):
    """Check the train/test windows produced by TimeSeriesSplit.

    X and/or y are replaced by None when the matching flag is set; the index
    of every input that remains is compared with the expected date windows.
    """
    def day(number):
        # Build an October 2020 date label for the given day of the month.
        return f"2020-10-{number}"

    X = pd.DataFrame({"features": range(1, 32)})
    y = pd.Series(range(1, 32))

    # A date index is not required by the splitter; it only makes the
    # expected windows below easier to read.
    y.index = pd.date_range("2020-10-01", "2020-10-31")
    X.index = pd.date_range("2020-10-01", "2020-10-31")

    # One (train window, test window) pair per expected split.
    expected = [
        (pd.date_range("2020-10-01", day(10 + gap)),
         pd.date_range(day(11 - max_delay), day(17 + gap))),
        (pd.date_range("2020-10-01", day(17 + gap)),
         pd.date_range(day(18 - max_delay), day(24 + gap))),
        (pd.date_range("2020-10-01", day(24 + gap)),
         pd.date_range(day(25 - max_delay), "2020-10-31")),
    ]

    X = None if X_none else X
    y = None if y_none else y
    # Only the inputs that were actually handed to the splitter get checked.
    provided = [data for data in (X, y) if data is not None]

    splitter = TimeSeriesSplit(gap=gap, max_delay=max_delay)
    for fold, (train_rows, test_rows) in enumerate(splitter.split(X, y)):
        expected_train, expected_test = expected[fold]
        for data in provided:
            pd.testing.assert_index_equal(data.iloc[train_rows].index, expected_train)
            pd.testing.assert_index_equal(data.iloc[test_rows].index, expected_test)
Пример #2
0
def test_time_series_split_n_splits_too_big():
    """Splitting must raise a ValueError when the requested splits do not fit the data."""
    # With 15 rows, each split would have 15 // 5 = 3 data points, which is
    # smaller than the number of data points required for max_delay and gap.
    small_X = pd.DataFrame({"features": range(15)})
    ts_split = TimeSeriesSplit(n_splits=4, gap=7, max_delay=3)
    expected_message = "Please use a smaller number of splits or collect more data."

    with pytest.raises(ValueError, match=expected_message):
        # split() is consumed with list() so every split is actually produced.
        list(ts_split.split(small_X))
Пример #3
0
def test_time_series_split_init():
    """Check the configured split count and the rejection of missing/empty data.

    get_n_splits() must report the n_splits passed to the constructor, and
    split() must raise a ValueError when X and y are both None or both empty.
    """
    ts_split = TimeSeriesSplit(gap=3, max_delay=4, n_splits=5)
    assert ts_split.get_n_splits() == 5

    error_message = "Both X and y cannot be None or empty in TimeSeriesSplit.split"

    with pytest.raises(ValueError, match=error_message):
        _ = list(ts_split.split(X=None, y=None))

    # An explicit dtype keeps pandas from warning about the default dtype of
    # an empty Series, which would fail the test under warnings-as-errors.
    empty_y = pd.Series([], dtype="float64")
    with pytest.raises(ValueError, match=error_message):
        _ = list(ts_split.split(X=pd.DataFrame(), y=empty_y))
Пример #4
0
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Time series problems always get a TimeSeriesSplit, regardless of data size, so
    that the row order is never shuffled. Other problems get a single
    training/validation split when the data has more than _LARGE_DATA_ROW_THRESHOLD
    rows, and k-fold cross validation otherwise.

    Arguments:
        X (pd.DataFrame, ww.DataTable): The input training data of shape [n_samples, n_features].
        y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
        problem_type (ProblemType): the type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables.
        n_splits (int, None): the number of CV splits, if applicable. Default 3.
        shuffle (bool): whether or not to shuffle the data before splitting, if applicable. Default True.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: data splitting method, or None if the
            problem type is not handled and the data is below the large-data threshold.

    Raises:
        ValueError: if the problem type is a time series type and problem_configuration is missing or empty.
    """
    problem_type = handle_problem_types(problem_type)

    # Time series is handled first: the large-data split below may shuffle rows,
    # which would leak future observations into the training data.
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError("problem_configuration is required for time series problem types")
        return TimeSeriesSplit(n_splits=n_splits, gap=problem_configuration.get('gap'),
                               max_delay=problem_configuration.get('max_delay'))

    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        return TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)

    # scikit-learn rejects a non-None random_state when shuffle is False, so the
    # seed is only forwarded when it can have an effect.
    fold_random_state = random_state if shuffle else None
    if problem_type == ProblemTypes.REGRESSION:
        return KFold(n_splits=n_splits, random_state=fold_random_state, shuffle=shuffle)
    if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        return StratifiedKFold(n_splits=n_splits, random_state=fold_random_state, shuffle=shuffle)
    return None
Пример #5
0
def make_data_splitter(X,
                       y,
                       problem_type,
                       problem_configuration=None,
                       n_splits=3,
                       shuffle=True,
                       random_state=None,
                       random_seed=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Time series problems always get a TimeSeriesSplit. Regression and classification
    problems get a single training/validation split when the data has more than
    _LARGE_DATA_ROW_THRESHOLD rows, and cross validation otherwise.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables. Defaults to None.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (None, int): Deprecated - use random_seed instead.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method, or None if the
            problem type is not one of the types handled here.

    Raises:
        ValueError: If the problem type is a time series type and problem_configuration is missing or empty.
    """
    random_seed = deprecate_arg("random_state", "random_seed", random_state,
                                random_seed)
    problem_type = handle_problem_types(problem_type)
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError(
                "problem_configuration is required for time series problem types"
            )
        return TimeSeriesSplit(
            n_splits=n_splits,
            gap=problem_configuration.get('gap'),
            max_delay=problem_configuration.get('max_delay'))
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        if problem_type == ProblemTypes.REGRESSION:
            return TrainingValidationSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
        elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
            return BalancedClassificationDataTVSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION,
                shuffle=shuffle,
                random_seed=random_seed)
    if problem_type == ProblemTypes.REGRESSION:
        # scikit-learn rejects a non-None random_state when shuffle is False,
        # so the seed is only forwarded when it can have an effect.
        return KFold(n_splits=n_splits,
                     random_state=random_seed if shuffle else None,
                     shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        return BalancedClassificationDataCVSplit(n_splits=n_splits,
                                                 random_seed=random_seed,
                                                 shuffle=shuffle)
    # No splitter is defined for any other problem type.
    return None