示例#1
0
def test_stratified_kfold_split_size():
    """Check the train/test row counts of every fold for 10 and 5 folds."""
    frame = tc.SFrame({"id": range(100), 'label': [0] * 50 + [1] * 50})
    # (number of folds, expected train rows, expected test rows)
    expectations = ((10, 90, 10), (5, 80, 20))
    for n_folds, n_train, n_test in expectations:
        for train_part, test_part in StratifiedKFold(frame, 'label', n_folds):
            assert len(train_part) == n_train
            assert len(test_part) == n_test
示例#2
0
def test_stratified_kfold_label_dist():
    """Check the per-label row counts in the train and test part of each fold.

    Covers a balanced 50/50 dataset (10 and 5 folds) and an imbalanced
    90/10 dataset (10 folds).
    """

    def label_counts(part):
        # Number of rows with label 0 and with label 1, in that order.
        return tuple(len(part[part["label"] == value]) for value in (0, 1))

    # (rows labelled 0, rows labelled 1, folds,
    #  expected (label 0, label 1) counts in train, same for test)
    scenarios = (
        (50, 50, 10, (45, 45), (5, 5)),
        (50, 50, 5, (40, 40), (10, 10)),
        (90, 10, 10, (81, 9), (9, 1)),
    )
    for zeros, ones, n_folds, want_train, want_test in scenarios:
        frame = tc.SFrame({
            "id": range(100),
            'label': [0] * zeros + [1] * ones,
        })
        for train_part, test_part in StratifiedKFold(frame, 'label', n_folds):
            assert label_counts(train_part) == want_train
            assert label_counts(test_part) == want_test
示例#3
0
def test_cross_val_basic():
    """Cross-validate a decision tree and expect a score of 1.0 everywhere.

    The "id" feature maps one-to-one onto the label ("a" -> 0, "b" -> 1),
    and the test expects every reported metric to be exactly 1.0.
    """
    frame = tc.SFrame({
        "id": ["a"] * 50 + ["b"] * 50,
        'label': [0] * 50 + [1] * 50,
    })
    expected = dict.fromkeys(('recall', 'auc', 'precision', 'accuracy'), 1.0)
    scores = cross_val_score(
        StratifiedKFold(frame, 'label', 5),
        tc.decision_tree_classifier.create,
        {'target': 'label'},
    )
    assert scores == expected
import turicreate as tc
from turicreate_cross_validation.cross_validation import shuffle_sframe, StratifiedKFold, cross_val_score

if __name__ == "__main__":
    # Demo: 5-fold stratified cross-validation of a random forest classifier
    # on the mushroom CSV dataset.
    mushrooms = tc.SFrame.read_csv(
        'https://static.turi.com/datasets/xgboost/mushroom.csv')
    # Replace the textual label with the element-wise comparison against 'p'.
    mushrooms['label'] = (mushrooms['label'] == 'p')
    mushrooms = shuffle_sframe(mushrooms)
    cross_val_score(
        StratifiedKFold(mushrooms, 'label', 5),
        tc.random_forest_classifier.create,
        {'target': 'label'},
    )
示例#5
0
def test_StratifiedKFold_with_wrong_label():
    """Splitting on a column missing from the data must raise ToolkitError."""
    frame = tc.SFrame({"id": range(100), 'label': [0] * 50 + [1] * 50})
    with pytest.raises(ToolkitError):
        # Both construction and iteration stay inside the context manager,
        # so the error is accepted from either step.
        for _train, _test in StratifiedKFold(frame, 'label2', 5):
            pass
示例#6
0
def test_cross_val_score_with_wrong_label():
    """A model target missing from the data makes cross_val_score raise."""
    frame = tc.SFrame({"id": range(100), 'label': [0] * 50 + [1] * 50})
    # The folds use the valid column; only the model's target is wrong, so
    # fold construction is kept outside the raises-context.
    valid_folds = StratifiedKFold(frame, 'label', 5)
    bad_params = {'target': 'label2'}
    with pytest.raises(ToolkitError):
        cross_val_score(
            valid_folds,
            tc.random_forest_classifier.create,
            bad_params,
        )
示例#7
0
def test_stratified_kfold_split_intersect():
    """Train and test parts of every fold are disjoint and cover all rows.

    The dataset has 100 rows with distinct ids, so the two parts of a fold
    must contain 100 rows in total and 100 distinct rows once combined.
    """
    data = tc.SFrame({"id": range(100), 'label': [0] * 50 + [1] * 50})
    for train, test in StratifiedKFold(data, 'label', 10):
        assert len(train) + len(test) == 100
        # De-duplicate AFTER concatenating. De-duplicating each part first
        # and then appending would count a row shared by both parts twice,
        # so an overlap between train and test would go unnoticed.
        assert len(train.append(test).unique()) == 100
示例#8
0
def test_stratified_kfold_split_unique():
    """Neither the train nor the test part of a fold contains duplicate rows."""
    frame = tc.SFrame({"id": range(100), 'label': [0] * 50 + [1] * 50})
    for train_part, test_part in StratifiedKFold(frame, 'label', 10):
        for part in (train_part, test_part):
            # De-duplication must not remove any row.
            assert len(part) == len(part.unique())
示例#9
0
def test_stratified_kfold_split_number():
    """Iterating StratifiedKFold yields exactly the requested number of folds."""
    frame = tc.SFrame({"id": range(100), 'label': [0] * 50 + [1] * 50})
    for n_folds in (10, 5):
        produced = list(StratifiedKFold(frame, 'label', n_folds))
        assert len(produced) == n_folds