Example #1
def test_root_import_all_completeness():
    EXCEPTIONS = ('utils', 'tests', 'base', 'setup')
    for _, modname, _ in pkgutil.walk_packages(path=sklearn.__path__,
                                               onerror=lambda _: None):
        if '.' in modname or modname.startswith('_') or modname in EXCEPTIONS:
            continue
        assert_in(modname, sklearn.__all__)
Example #2
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    dump_svmlight_file(X.astype(dtype),
                                       y,
                                       f,
                                       comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in("scikit-learn %s" % sklearn.__version__, comment)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in(["one", "zero"][zero_based] + "-based", comment)

                    X2, y2 = load_svmlight_file(f,
                                                dtype=dtype,
                                                zero_based=zero_based)
                    assert_equal(X2.dtype, dtype)
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(X_dense.astype(dtype),
                                                  X2_dense, 4)
                        assert_array_almost_equal(y_dense.astype(dtype), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(X_dense.astype(dtype),
                                                  X2_dense, 15)
                        assert_array_almost_equal(y_dense.astype(dtype), y2,
                                                  15)
Example #3
def test_n_iter_without_progress():
    # Use a dummy negative n_iter_without_progress and check output on stdout
    random_state = check_random_state(0)
    X = random_state.randn(100, 10)
    for method in ["barnes_hut", "exact"]:
        tsne = TSNE(n_iter_without_progress=-1,
                    verbose=2,
                    learning_rate=1e8,
                    random_state=0,
                    method=method,
                    n_iter=351,
                    init="random")
        tsne._N_ITER_CHECK = 1
        tsne._EXPLORATION_N_ITER = 0

        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            tsne.fit_transform(X)
        finally:
            out = sys.stdout.getvalue()
            sys.stdout.close()
            sys.stdout = old_stdout

        # The output needs to contain the value of n_iter_without_progress
        assert_in(
            "did not make any progress during the "
            "last -1 episodes. Finished.", out)
Example #4
def test_similarity_tree():
    # Test that rules are split correctly
    rules = [
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a > 2 and b > 45", (0.5, 0.3, 0)),
        ("a > 2 and b > 40", (0.5, 0.2, 0)),
        ("a <= 2 and b <= 45", (1, 1, 0)),
        ("a > 2 and c <= 3", (1, 1, 0)),
        ("b > 45", (1, 1, 0)),
    ]

    sk = SkopeRules(max_depth_duplication=2)
    rulesets = sk._find_similar_rulesets(rules)
    # Assert that some pairs of rules land in the same bag
    idx_bags_rules = []
    for idx_rule, r in enumerate(rules):
        idx_bags_for_rule = []
        for idx_bag, bag in enumerate(rulesets):
            if r in bag:
                idx_bags_for_rule.append(idx_bag)
        idx_bags_rules.append(idx_bags_for_rule)

    assert_equal(idx_bags_rules[0], idx_bags_rules[1])
    assert_not_equal(idx_bags_rules[0], idx_bags_rules[2])
    # Assert the best rules are kept
    final_rules = sk.deduplicate(rules)
    assert_in(rules[0], final_rules)
    assert_in(rules[2], final_rules)
    assert_not_in(rules[3], final_rules)
Example #5
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                dump_svmlight_file(X.astype(dtype), y, f, zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        4,
                    )
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        15,
                    )
                assert_array_equal(y, y2)
Example #6
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                # we need to pass a comment to get the version info in;
                # LibSVM doesn't grok comments so they're not put in by
                # default anymore.
                dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                   zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype,
                                            zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 4)
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 15)
                assert_array_equal(y, y2)
Example #8
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    for X in (Xs, Xd):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64]:
                f = BytesIO()
                dump_svmlight_file(X.astype(dtype), y, f,
                                   zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                assert_in("scikit-learn %s" % sklearn.__version__, comment)
                comment = f.readline()
                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype,
                                            zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 4)
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype), X2.toarray(), 15)
                assert_array_equal(y, y2)
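The two readline() assertions in these dump tests depend on dump_svmlight_file writing a comment header: the scikit-learn version on one line and the indexing convention on the next. Note that ["one", "zero"][zero_based] works because a bool indexes a list as 0 or 1, so zero_based=True selects "zero". A minimal roundtrip sketch with toy data, independent of the datafile fixture:

import numpy as np
from io import BytesIO
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

X = np.array([[1.0, 0.0], [0.0, 2.0]])
y = np.array([0, 1])
f = BytesIO()
dump_svmlight_file(X, y, f, comment="test", zero_based=True)
f.seek(0)
X2, y2 = load_svmlight_file(f, zero_based=True)
np.testing.assert_array_almost_equal(X, X2.toarray())
np.testing.assert_array_equal(y, y2)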
Example #9
def test_sparse_precomputed():
    clf = svm.SVC(kernel='precomputed')
    sparse_gram = sparse.csr_matrix([[1, 0], [0, 1]])
    try:
        clf.fit(sparse_gram, [0, 1])
        assert not "reached"
    except TypeError as e:
        assert_in("Sparse precomputed", str(e))
Example #10
def test_big_input():
    """Test if the warning for too large inputs is appropriate."""
    X = np.repeat(10**40., 4).astype(np.float64).reshape(-1, 1)
    clf = DecisionTreeClassifier()
    try:
        clf.fit(X, [0, 1, 0, 1])
    except ValueError as e:
        assert_in("float32", str(e))
Example #12
def test_big_input():
    """Test if the warning for too large inputs is appropriate."""
    X = np.repeat(10 ** 40., 4).astype(np.float64).reshape(-1, 1)
    clf = DecisionTreeClassifier()
    try:
        clf.fit(X, [0, 1, 0, 1])
    except ValueError as e:
        assert_in("float32", str(e))
Example #13
def test_dump():
    X_sparse, y_dense = load_svmlight_file(datafile)
    X_dense = X_sparse.toarray()
    y_sparse = sp.csr_matrix(y_dense)

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    X_sliced = X_sparse[np.arange(X_sparse.shape[0])]
    y_sliced = y_sparse[np.arange(y_sparse.shape[0])]

    for X in (X_sparse, X_dense, X_sliced):
        for y in (y_sparse, y_dense, y_sliced):
            for zero_based in (True, False):
                for dtype in [np.float32, np.float64, np.int32]:
                    f = BytesIO()
                    # we need to pass a comment to get the version info in;
                    # LibSVM doesn't grok comments so they're not put in by
                    # default anymore.

                    if (sp.issparse(y) and y.shape[0] == 1):
                        # make sure y's shape is: (n_samples, n_labels)
                        # when it is sparse
                        y = y.T

                    dump_svmlight_file(X.astype(dtype), y, f, comment="test",
                                       zero_based=zero_based)
                    f.seek(0)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in("scikit-learn %s" % sklearn.__version__, comment)

                    comment = f.readline()
                    comment = str(comment, "utf-8")

                    assert_in(["one", "zero"][zero_based] + "-based", comment)

                    X2, y2 = load_svmlight_file(f, dtype=dtype,
                                                zero_based=zero_based)
                    assert_equal(X2.dtype, dtype)
                    assert_array_equal(X2.sorted_indices().indices, X2.indices)

                    X2_dense = X2.toarray()

                    if dtype == np.float32:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_dense.astype(dtype), X2_dense, 4)
                        assert_array_almost_equal(
                            y_dense.astype(dtype), y2, 4)
                    else:
                        # allow a rounding error at the last decimal place
                        assert_array_almost_equal(
                            X_dense.astype(dtype), X2_dense, 15)
                        assert_array_almost_equal(
                            y_dense.astype(dtype), y2, 15)
Example #14
def test_boundaries():
    # ensure min_samples is inclusive of core point
    core, _ = dbscan([[0], [1]], eps=2, min_samples=2)
    assert_in(0, core)
    # ensure eps is inclusive of circumference
    core, _ = dbscan([[0], [1], [1]], eps=1, min_samples=2)
    assert_in(0, core)
    core, _ = dbscan([[0], [1], [1]], eps=.99, min_samples=2)
    assert_not_in(0, core)
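For reading these assertions: sklearn.cluster.dbscan returns (core_sample_indices, labels), so assert_in(0, core) checks that the point at index 0 qualifies as a core sample. A minimal sketch of the inclusive-boundary semantics, on toy data of my own rather than the original suite's:

from sklearn.cluster import dbscan

# two points exactly eps apart: the eps-neighborhood is inclusive and counts
# the point itself, so min_samples=2 makes both points core samples
core, labels = dbscan([[0], [2]], eps=2, min_samples=2)
assert 0 in core and 1 in core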
Example #15
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    Xsliced = Xs[np.arange(Xs.shape[0])]

    for X in (Xs, Xd, Xsliced):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64, np.int32]:
                f = BytesIO()
                # we need to pass a comment to get the version info in;
                # LibSVM doesn't grok comments so they're not put in by
                # default anymore.
                dump_svmlight_file(X.astype(dtype),
                                   y,
                                   f,
                                   comment="test",
                                   zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                try:
                    comment = str(comment, "utf-8")
                except TypeError:  # fails in Python 2.x
                    pass

                assert_in("scikit-learn %s" % sklearn.__version__, comment)

                comment = f.readline()
                try:
                    comment = str(comment, "utf-8")
                except TypeError:  # fails in Python 2.x
                    pass

                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f,
                                            dtype=dtype,
                                            zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                assert_array_equal(X2.sorted_indices().indices, X2.indices)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        4)
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        15)
                assert_array_equal(y, y2)
Example #16
def check_parameters_default_constructible(name, Estimator):
    classifier = LDA()
    # test default-constructibility
    # get rid of deprecation warnings
    with warnings.catch_warnings(record=True):
        if name in META_ESTIMATORS:
            estimator = Estimator(classifier)
        else:
            estimator = Estimator()
        # test cloning
        clone(estimator)
        # test __repr__
        repr(estimator)
        # test that set_params returns self
        assert_true(estimator.set_params() is estimator)

        # test if init does nothing but set parameters
        # this is important for grid_search etc.
        # We get the default parameters from init and then
        # compare these against the actual values of the attributes.

        # this comes from getattr. Gets rid of deprecation decorator.
        init = getattr(estimator.__init__, 'deprecated_original',
                       estimator.__init__)
        try:
            args, varargs, kws, defaults = inspect.getargspec(init)
        except TypeError:
            # init is not a python function.
            # true for mixins
            return
        params = estimator.get_params()
        if name in META_ESTIMATORS:
            # they need a non-default argument
            args = args[2:]
        else:
            args = args[1:]
        if args:
            # non-empty list
            assert_equal(len(args), len(defaults))
        else:
            return
        for arg, default in zip(args, defaults):
            assert_in(type(default), [
                str, int, float, bool, tuple,
                type(None), np.float64, types.FunctionType, Memory
            ])
            if arg not in params.keys():
                # deprecated parameter, not in get_params
                assert_true(default is None)
                continue

            if isinstance(params[arg], np.ndarray):
                assert_array_equal(params[arg], default)
            else:
                assert_equal(params[arg], default)
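One portability caveat in the two checks above: inspect.getargspec was removed in Python 3.11. The same args/defaults extraction can be written with inspect.signature; a sketch on a hypothetical __init__-style function (example_init is mine, not part of the original suite):

import inspect

def example_init(self, alpha=1.0, fit_intercept=True):  # hypothetical __init__
    pass

sig = inspect.signature(example_init)
params = [p for p in sig.parameters.values() if p.name != 'self']
args = [p.name for p in params]
defaults = [p.default for p in params
            if p.default is not inspect.Parameter.empty]
assert args == ['alpha', 'fit_intercept']
assert defaults == [1.0, True]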
Example #17
def check_parameters_default_constructible(name, Estimator):
    classifier = LDA()
    # test default-constructibility
    # get rid of deprecation warnings
    with warnings.catch_warnings(record=True):
        if name in META_ESTIMATORS:
            estimator = Estimator(classifier)
        else:
            estimator = Estimator()
        # test cloning
        clone(estimator)
        # test __repr__
        repr(estimator)
        # test that set_params returns self
        assert_true(estimator.set_params() is estimator)

        # test if init does nothing but set parameters
        # this is important for grid_search etc.
        # We get the default parameters from init and then
        # compare these against the actual values of the attributes.

        # this comes from getattr. Gets rid of deprecation decorator.
        init = getattr(estimator.__init__, 'deprecated_original',
                       estimator.__init__)
        try:
            args, varargs, kws, defaults = inspect.getargspec(init)
        except TypeError:
            # init is not a python function.
            # true for mixins
            return
        params = estimator.get_params()
        if name in META_ESTIMATORS:
            # they need a non-default argument
            args = args[2:]
        else:
            args = args[1:]
        if args:
            # non-empty list
            assert_equal(len(args), len(defaults))
        else:
            return
        for arg, default in zip(args, defaults):
            assert_in(type(default), [str, int, float, bool, tuple, type(None),
                                      np.float64, types.FunctionType, Memory])
            if arg not in params.keys():
                # deprecated parameter, not in get_params
                assert_true(default is None)
                continue

            if isinstance(params[arg], np.ndarray):
                assert_array_equal(params[arg], default)
            else:
                assert_equal(params[arg], default)
Example #18
def test_dump():
    Xs, y = load_svmlight_file(datafile)
    Xd = Xs.toarray()

    # slicing a csr_matrix can unsort its .indices, so test that we sort
    # those correctly
    Xsliced = Xs[np.arange(Xs.shape[0])]

    for X in (Xs, Xd, Xsliced):
        for zero_based in (True, False):
            for dtype in [np.float32, np.float64, np.int32]:
                f = BytesIO()
                # we need to pass a comment to get the version info in;
                # LibSVM doesn't grok comments so they're not put in by
                # default anymore.
                dump_svmlight_file(X.astype(dtype), y, f, comment="test", zero_based=zero_based)
                f.seek(0)

                comment = f.readline()
                try:
                    comment = str(comment, "utf-8")
                except TypeError:  # fails in Python 2.x
                    pass

                assert_in("scikit-learn %s" % sklearn.__version__, comment)

                comment = f.readline()
                try:
                    comment = str(comment, "utf-8")
                except TypeError:  # fails in Python 2.x
                    pass

                assert_in(["one", "zero"][zero_based] + "-based", comment)

                X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based)
                assert_equal(X2.dtype, dtype)
                assert_array_equal(X2.sorted_indices().indices, X2.indices)
                if dtype == np.float32:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        4,
                    )
                else:
                    assert_array_almost_equal(
                        # allow a rounding error at the last decimal place
                        Xd.astype(dtype),
                        X2.toarray(),
                        15,
                    )
                assert_array_equal(y, y2)
Example #19
def test_friedman_mse_in_graphviz():
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    for finding in finditer("\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())
Example #20
def test_friedman_mse_in_graphviz():
    clf = DecisionTreeRegressor(criterion="friedman_mse", random_state=0)
    clf.fit(X, y)
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data)

    clf = GradientBoostingClassifier(n_estimators=2, random_state=0)
    clf.fit(X, y)
    for estimator in clf.estimators_:
        export_graphviz(estimator[0], out_file=dot_data)

    for finding in finditer(r"\[.*?samples.*?\]", dot_data.getvalue()):
        assert_in("friedman_mse", finding.group())
Example #21
def test_countvectorizer_empty_vocabulary():
    try:
        CountVectorizer(vocabulary=[])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert_in("empty vocabulary", str(e))

    try:
        v = CountVectorizer(min_df=1, max_df=1.0, stop_words="english")
        # fit on stopwords only
        v.fit(["to be or not to be", "and me too", "and so do you"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert_in("empty vocabulary", str(e))
Example #22
def test_countvectorizer_empty_vocabulary():
    try:
        CountVectorizer(vocabulary=[])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert_in("empty vocabulary", str(e).lower())

    try:
        v = CountVectorizer(max_df=1.0, stop_words="english")
        # fit on stopwords only
        v.fit(["to be or not to be", "and me too", "and so do you"])
        assert False, "we shouldn't get here"
    except ValueError as e:
        assert_in("empty vocabulary", str(e).lower())
Example #23
def test_valid_brute_metric_for_auto_algorithm():
    X = rng.rand(12, 12)
    Xcsr = csr_matrix(X)

    # check that there is a metric that is valid for brute
    # but not ball_tree (so we actually test something)
    assert_in("cosine", VALID_METRICS['brute'])
    assert_false("cosine" in VALID_METRICS['ball_tree'])

    # Metrics which don't require any additional parameters
    require_params = ['mahalanobis', 'wminkowski', 'seuclidean']
    for metric in VALID_METRICS['brute']:
        if metric != 'precomputed' and metric not in require_params:
            nn = neighbors.NearestNeighbors(n_neighbors=3,
                                            algorithm='auto',
                                            metric=metric).fit(X)
            nn.kneighbors(X)
        elif metric == 'precomputed':
            X_precomputed = rng.random_sample((10, 4))
            Y_precomputed = rng.random_sample((3, 4))
            DXX = metrics.pairwise_distances(X_precomputed, metric='euclidean')
            DYX = metrics.pairwise_distances(Y_precomputed,
                                             X_precomputed,
                                             metric='euclidean')
            nb_p = neighbors.NearestNeighbors(n_neighbors=3)
            nb_p.fit(DXX)
            nb_p.kneighbors(DYX)

    for metric in VALID_METRICS_SPARSE['brute']:
        if metric != 'precomputed' and metric not in require_params:
            nn = neighbors.NearestNeighbors(n_neighbors=3,
                                            algorithm='auto',
                                            metric=metric).fit(Xcsr)
            nn.kneighbors(Xcsr)

    # Metrics with parameters
    VI = np.dot(X, X.T)
    list_metrics = [('seuclidean', dict(V=rng.rand(12))),
                    ('wminkowski', dict(w=rng.rand(12))),
                    ('mahalanobis', dict(VI=VI))]
    for metric, params in list_metrics:
        nn = neighbors.NearestNeighbors(n_neighbors=3,
                                        algorithm='auto',
                                        metric=metric,
                                        metric_params=params).fit(X)
        nn.kneighbors(X)
Example #24
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""

    _urllib2_ref = datasets.mldata.urllib2
    datasets.mldata.urllib2 = mock_urllib2({'mock':
                                            {'label': sp.ones((150,)),
                                             'data': sp.ones((150, 4))}})
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        assert_in(mock, in_=['COL_NAMES', 'DESCR', 'target', 'data'])

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.urllib2.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urllib2 = _urllib2_ref
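Each mldata test here swaps a module-level global (datasets.mldata.urllib2 or urlopen) and restores it in a finally block. unittest.mock.patch.object automates exactly that save/replace/restore; a sketch of the pattern on a plain module attribute, since mock_urllib2 and fetch_mldata are fixtures of the original suite:

from unittest import mock
import json  # any module with a patchable attribute works for the demo

def fake_loads(s):
    return {"patched": True}

with mock.patch.object(json, "loads", fake_loads):   # swapped in...
    assert json.loads("{}") == {"patched": True}
# ...and restored on exit, even if the body raises
assert json.loads("{}") == {}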
Example #25
def test_n_iter_without_progress():
    # Make sure that the parameter n_iter_without_progress is used correctly
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)
    tsne = TSNE(n_iter_without_progress=2, verbose=2,
                random_state=0, method='exact')

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # The output needs to contain the value of n_iter_without_progress
    assert_in("did not make any progress during the "
              "last 2 episodes. Finished.", out)
Example #26
def test_fetch_one_column():
    _urllib2_ref = datasets.mldata.urllib2
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urllib2 = mock_urllib2({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        assert_in(dset, in_=['COL_NAMES', 'DESCR', 'data'], out_=['target'])

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urllib2 = _urllib2_ref
Example #27
def test_unseen_or_no_features():
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
Example #28
def test_n_iter_without_progress():
    # Use a dummy negative n_iter_without_progress and check output on stdout
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)
    tsne = TSNE(n_iter_without_progress=-1, verbose=2,
                random_state=1, method='exact')

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # The output needs to contain the value of n_iter_without_progress
    assert_in("did not make any progress during the "
              "last -1 episodes. Finished.", out)
Example #31
def test_valid_brute_metric_for_auto_algorithm():
    X = rng.rand(12, 12)
    Xcsr = csr_matrix(X)

    # check that there is a metric that is valid for brute
    # but not ball_tree (so we actually test something)
    assert_in("cosine", VALID_METRICS['brute'])
    assert_false("cosine" in VALID_METRICS['ball_tree'])

    # Metrics which don't require any additional parameters
    require_params = ['mahalanobis', 'wminkowski', 'seuclidean']
    for metric in VALID_METRICS['brute']:
        if metric != 'precomputed' and metric not in require_params:
            nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric=metric).fit(X)
            nn.kneighbors(X)
        elif metric == 'precomputed':
            X_precomputed = rng.random_sample((10, 4))
            Y_precomputed = rng.random_sample((3, 4))
            DXX = metrics.pairwise_distances(X_precomputed, metric='euclidean')
            DYX = metrics.pairwise_distances(Y_precomputed, X_precomputed,
                                             metric='euclidean')
            nb_p = neighbors.NearestNeighbors(n_neighbors=3)
            nb_p.fit(DXX)
            nb_p.kneighbors(DYX)

    for metric in VALID_METRICS_SPARSE['brute']:
        if metric != 'precomputed' and metric not in require_params:
            nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric=metric).fit(Xcsr)
            nn.kneighbors(Xcsr)

    # Metrics with parameters
    VI = np.dot(X, X.T)
    list_metrics = [('seuclidean', dict(V=rng.rand(12))),
                    ('wminkowski', dict(w=rng.rand(12))),
                    ('mahalanobis', dict(VI=VI))]
    for metric, params in list_metrics:
        nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                        metric=metric,
                                        metric_params=params).fit(X)
        nn.kneighbors(X)
Example #32
def test_download():
    """Test that fetch_mldata is able to download and cache a data set."""

    _urllib2_ref = datasets.mldata.urllib2
    datasets.mldata.urllib2 = mock_urllib2(
        {'mock': {
            'label': sp.ones((150, )),
            'data': sp.ones((150, 4))
        }})
    try:
        mock = fetch_mldata('mock', data_home=tmpdir)
        assert_in(mock, in_=['COL_NAMES', 'DESCR', 'target', 'data'])

        assert_equal(mock.target.shape, (150, ))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.urllib2.HTTPError, fetch_mldata,
                      'not_existing_name')
    finally:
        datasets.mldata.urllib2 = _urllib2_ref
Example #33
def test_fetch_one_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urlopen = mock_mldata_urlopen({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #34
def test_fetch_one_column():
    _urllib2_ref = datasets.mldata.urllib2
    try:
        dataname = 'onecol'
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        datasets.mldata.urllib2 = mock_urllib2({dataname: {'x': x}})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "data"]:
            assert_in(n, dset)
        assert_not_in("target", dset)

        assert_equal(dset.data.shape, (2, 3))
        assert_array_equal(dset.data, x)

        # transposing the data array
        dset = fetch_mldata(dataname, transpose_data=False, data_home=tmpdir)
        assert_equal(dset.data.shape, (3, 2))
    finally:
        datasets.mldata.urllib2 = _urllib2_ref
Example #35
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150,)),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150,))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError,
                      fetch_mldata, 'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #36
def test_download(tmpdata):
    """Test that fetch_mldata is able to download and cache a data set."""
    _urlopen_ref = datasets.mldata.urlopen
    datasets.mldata.urlopen = mock_mldata_urlopen({
        'mock': {
            'label': sp.ones((150, )),
            'data': sp.ones((150, 4)),
        },
    })
    try:
        mock = fetch_mldata('mock', data_home=tmpdata)
        for n in ["COL_NAMES", "DESCR", "target", "data"]:
            assert_in(n, mock)

        assert_equal(mock.target.shape, (150, ))
        assert_equal(mock.data.shape, (150, 4))

        assert_raises(datasets.mldata.HTTPError, fetch_mldata,
                      'not_existing_name')
    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #37
def test_n_iter_without_progress():
    # Make sure that the parameter n_iter_without_progress is used correctly
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)
    tsne = TSNE(n_iter_without_progress=2,
                verbose=2,
                random_state=0,
                method='exact')

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # The output needs to contain the value of n_iter_without_progress
    assert_in(
        "did not make any progress during the "
        "last 2 episodes. Finished.", out)
Example #38
def test_n_iter_without_progress():
    # Use a dummy negative n_iter_without_progress and check output on stdout
    random_state = check_random_state(0)
    X = random_state.randn(100, 2)
    tsne = TSNE(n_iter_without_progress=-1,
                verbose=2,
                random_state=1,
                method='exact')

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        tsne.fit_transform(X)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    # The output needs to contain the value of n_iter_without_progress
    assert_in(
        "did not make any progress during the "
        "last -1 episodes. Finished.", out)
Example #39
def test_n_iter_without_progress():
    # Use a dummy negative n_iter_without_progress and check output on stdout
    random_state = check_random_state(0)
    X = random_state.randn(100, 10)
    for method in ["barnes_hut", "exact"]:
        tsne = TSNE(n_iter_without_progress=-1, verbose=2, learning_rate=1e8,
                    random_state=0, method=method, n_iter=351, init="random")
        tsne._N_ITER_CHECK = 1
        tsne._EXPLORATION_N_ITER = 0

        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            tsne.fit_transform(X)
        finally:
            out = sys.stdout.getvalue()
            sys.stdout.close()
            sys.stdout = old_stdout

        # The output needs to contain the value of n_iter_without_progress
        assert_in("did not make any progress during the "
                  "last -1 episodes. Finished.", out)
Example #40
def test_sparse_random_matrix():
    """Check some statical properties of sparse random matrix"""
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density

        A = sparse_random_matrix(n_components,
                                 n_features,
                                 density=density,
                                 random_state=0)
        A = densify(A)

        # Check possible values
        values = np.unique(A)
        assert_in(np.sqrt(s) / np.sqrt(n_components), values)
        assert_in(-np.sqrt(s) / np.sqrt(n_components), values)

        if density == 1.0:
            assert_equal(np.size(values), 2)
        else:
            assert_in(0., values)
            assert_equal(np.size(values), 3)

        # Check that the random matrix follows the proper distribution.
        # Let's say that each element a_{ij} of A is drawn from
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2)
        assert_almost_equal(np.mean(A == np.sqrt(s) / np.sqrt(n_components)),
                            1 / (2 * s),
                            decimal=2)
        assert_almost_equal(np.mean(A == -np.sqrt(s) / np.sqrt(n_components)),
                            1 / (2 * s),
                            decimal=2)

        assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s,
                            decimal=2)
        assert_almost_equal(np.var(A == np.sqrt(s) / np.sqrt(n_components),
                                   ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s),
                            decimal=2)
        assert_almost_equal(np.var(A == -np.sqrt(s) / np.sqrt(n_components),
                                   ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s),
                            decimal=2)
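The variance assertions above follow because each comparison like (A == 0.0) is a Bernoulli indicator: it equals 1 with probability p = 1 - 1/s, so its variance is p(1 - p) = (1 - 1/s)(1/s), and each sign indicator has p = 1/(2s). A quick empirical check of those identities on i.i.d. draws from the three-point law described in the comment (independent of sparse_random_matrix itself):

import numpy as np

rng = np.random.RandomState(0)
s = 1 / 0.3  # density = 0.3
vals = rng.choice([-1.0, 0.0, 1.0], size=100000,
                  p=[1 / (2 * s), 1 - 1 / s, 1 / (2 * s)])
p = 1 - 1 / s
assert abs(np.mean(vals == 0.0) - p) < 0.01           # Bernoulli mean is p
assert abs(np.var(vals == 0.0) - p * (1 - p)) < 0.01  # Bernoulli variance is p(1 - p)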
Example #41
def test_sparse_random_matrix():
    # Check some statistical properties of a sparse random matrix
    n_components = 100
    n_features = 500

    for density in [0.3, 1.]:
        s = 1 / density

        A = sparse_random_matrix(n_components,
                                 n_features,
                                 density=density,
                                 random_state=0)
        A = densify(A)

        # Check possible values
        values = np.unique(A)
        assert_in(np.sqrt(s) / np.sqrt(n_components), values)
        assert_in(- np.sqrt(s) / np.sqrt(n_components), values)

        if density == 1.0:
            assert_equal(np.size(values), 2)
        else:
            assert_in(0., values)
            assert_equal(np.size(values), 3)

        # Check that the random matrix follows the proper distribution.
        # Let's say that each element a_{ij} of A is drawn from
        #
        # - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        # -  0                              with probability 1 - 1 / s
        # - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        #
        assert_almost_equal(np.mean(A == 0.0),
                            1 - 1 / s, decimal=2)
        assert_almost_equal(np.mean(A == np.sqrt(s) / np.sqrt(n_components)),
                            1 / (2 * s), decimal=2)
        assert_almost_equal(np.mean(A == - np.sqrt(s) / np.sqrt(n_components)),
                            1 / (2 * s), decimal=2)

        assert_almost_equal(np.var(A == 0.0, ddof=1),
                            (1 - 1 / s) * 1 / s, decimal=2)
        assert_almost_equal(np.var(A == np.sqrt(s) / np.sqrt(n_components),
                                   ddof=1),
                            (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2)
        assert_almost_equal(np.var(A == - np.sqrt(s) / np.sqrt(n_components),
                                   ddof=1),
                            (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2)
Example #42
def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("doesn't contain index", str(e).lower())
Example #43
def test_countvectorizer_custom_vocabulary_repeated_indeces():
    vocab = {"pizza": 0, "beer": 0}
    try:
        vect = CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("vocabulary contains repeated indices", str(e).lower())
Example #44
def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    try:
        vect = CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("doesn't contain index", str(e).lower())
Example #45
def test_fetch_multiple_column():
    _urllib2_ref = datasets.mldata.urllib2
    try:
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        y = sp.array([1, -1])
        z = sp.arange(12).reshape(4, 3)

        # by default
        dataname = 'threecol-default'
        datasets.mldata.urllib2 = mock_urllib2({dataname:
                                                ({'label': y,
                                                  'data': x,
                                                  'z': z},
                                                 ['z', 'data', 'label'])})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        assert_in(dset, in_=['COL_NAMES', 'DESCR', 'target', 'data', 'z'],
                  out_=['x', 'y'])

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by order
        dataname = 'threecol-order'
        datasets.mldata.urllib2 = mock_urllib2({dataname:
                                                ({'y': y,
                                                  'x': x,
                                                  'z': z},
                                                 ['y', 'x', 'z'])})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        assert_in(dset, in_=['COL_NAMES', 'DESCR', 'target', 'data', 'z'],
                  out_=['x', 'y'])
        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by number
        dataname = 'threecol-number'
        datasets.mldata.urllib2 = mock_urllib2({dataname:
                                                ({'y': y,
                                                  'x': x,
                                                  'z': z},
                                                 ['z', 'x', 'y'])})

        dset = fetch_mldata(dataname, target_name=2, data_name=0,
                            data_home=tmpdir)
        assert_in(dset, in_=['COL_NAMES', 'DESCR', 'target', 'data', 'x'],
                  out_=['z', 'y'])
        assert_array_equal(dset.data, z)
        assert_array_equal(dset.target, y)

        # by name
        dset = fetch_mldata(dataname, target_name='y', data_name='z',
                            data_home=tmpdir)
        assert_in(dset, in_=['COL_NAMES', 'DESCR', 'target', 'data', 'x'],
                  out_=['z', 'y'])
        assert_array_equal(dset.data, z)
        assert_array_equal(dset.target, y)
    finally:
        datasets.mldata.urllib2 = _urllib2_ref
Example #46
def test_fetch_multiple_column():
    _urlopen_ref = datasets.mldata.urlopen
    try:
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        y = sp.array([1, -1])
        z = sp.arange(12).reshape(4, 3)

        # by default
        dataname = 'threecol-default'
        datasets.mldata.urlopen = mock_mldata_urlopen({
            dataname: (
                {
                    'label': y,
                    'data': x,
                    'z': z,
                },
                ['z', 'data', 'label'],
            ),
        })

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "z"]:
            assert_in(n, dset)
        assert_not_in("x", dset)
        assert_not_in("y", dset)

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by order
        dataname = 'threecol-order'
        datasets.mldata.urlopen = mock_mldata_urlopen({
            dataname: ({'y': y, 'x': x, 'z': z},
                       ['y', 'x', 'z']),
            })

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "z"]:
            assert_in(n, dset)
        assert_not_in("x", dset)
        assert_not_in("y", dset)

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by number
        dataname = 'threecol-number'
        datasets.mldata.urlopen = mock_mldata_urlopen({
            dataname: ({'y': y, 'x': x, 'z': z},
                       ['z', 'x', 'y']),
        })

        dset = fetch_mldata(dataname, target_name=2, data_name=0,
                            data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "x"]:
            assert_in(n, dset)
        assert_not_in("y", dset)
        assert_not_in("z", dset)

        assert_array_equal(dset.data, z)
        assert_array_equal(dset.target, y)

        # by name
        dset = fetch_mldata(dataname, target_name='y', data_name='z',
                            data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "x"]:
            assert_in(n, dset)
        assert_not_in("y", dset)
        assert_not_in("z", dset)

    finally:
        datasets.mldata.urlopen = _urlopen_ref
Example #47
def test_root_import_all_completeness():
    EXCEPTIONS = ("utils", "tests", "base", "setup")
    for _, modname, _ in pkgutil.walk_packages(path=sklearn.__path__, onerror=lambda _: None):
        if "." in modname or modname.startswith("_") or modname in EXCEPTIONS:
            continue
        assert_in(modname, sklearn.__all__)
Example #48
def test_countvectorizer_custom_vocabulary_repeated_indices():
    vocab = {"pizza": 0, "beer": 0}
    try:
        CountVectorizer(vocabulary=vocab)
    except ValueError as e:
        assert_in("vocabulary contains repeated indices", str(e).lower())
Example #49
def test_fetch_multiple_column():
    _urllib2_ref = datasets.mldata.urllib2
    try:
        # create fake data set in cache
        x = sp.arange(6).reshape(2, 3)
        y = sp.array([1, -1])
        z = sp.arange(12).reshape(4, 3)

        # by default
        dataname = 'threecol-default'
        datasets.mldata.urllib2 = mock_urllib2({
            dataname: ({
                'label': y,
                'data': x,
                'z': z
            }, ['z', 'data', 'label'])
        })

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "z"]:
            assert_in(n, dset)
        assert_not_in("x", dset)
        assert_not_in("y", dset)

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by order
        dataname = 'threecol-order'
        datasets.mldata.urllib2 = mock_urllib2(
            {dataname: ({
                'y': y,
                'x': x,
                'z': z
            }, ['y', 'x', 'z'])})

        dset = fetch_mldata(dataname, data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "z"]:
            assert_in(n, dset)
        assert_not_in("x", dset)
        assert_not_in("y", dset)

        assert_array_equal(dset.data, x)
        assert_array_equal(dset.target, y)
        assert_array_equal(dset.z, z.T)

        # by number
        dataname = 'threecol-number'
        datasets.mldata.urllib2 = mock_urllib2(
            {dataname: ({
                'y': y,
                'x': x,
                'z': z
            }, ['z', 'x', 'y'])})

        dset = fetch_mldata(dataname,
                            target_name=2,
                            data_name=0,
                            data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "x"]:
            assert_in(n, dset)
        assert_not_in("y", dset)
        assert_not_in("z", dset)

        assert_array_equal(dset.data, z)
        assert_array_equal(dset.target, y)

        # by name
        dset = fetch_mldata(dataname,
                            target_name='y',
                            data_name='z',
                            data_home=tmpdir)
        for n in ["COL_NAMES", "DESCR", "target", "data", "x"]:
            assert_in(n, dset)
        assert_not_in("y", dset)
        assert_not_in("z", dset)

    finally:
        datasets.mldata.urllib2 = _urllib2_ref