예제 #1
0
def test_n_samples_leaves_roots():
    """Sanity check for the number of samples in leaves and roots"""
    X, y = make_blobs(n_samples=10)
    brc = Birch()
    brc.fit(X)
    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
    n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves() for sc in leaf.subclusters_])
    assert_equal(n_samples_leaves, X.shape[0])
    assert_equal(n_samples_root, X.shape[0])
예제 #2
0
def test_n_samples_leaves_roots():
    """Sanity check for the number of samples in leaves and roots"""
    X, y = make_blobs(n_samples=10)
    brc = Birch()
    brc.fit(X)
    n_samples_root = sum([sc.n_samples_ for sc in brc.root_.subclusters_])
    n_samples_leaves = sum([sc.n_samples_ for leaf in brc._get_leaves()
                            for sc in leaf.subclusters_])
    assert_equal(n_samples_leaves, X.shape[0])
    assert_equal(n_samples_root, X.shape[0])
예제 #3
0
def test_partial_fit():
    """Test that fit is equivalent to calling partial_fit multiple times"""
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_equal(brc_partial.subcluster_centers_, brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
예제 #4
0
def test_birch_predict():
    """Test the predict method predicts the nearest centroid."""
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3, n_samples_per_cluster=10)

    # n_samples * n_samples_per_cluster
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.0)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
예제 #5
0
def test_birch_predict():
    """Test the predict method predicts the nearest centroid."""
    rng = np.random.RandomState(0)
    X = generate_clustered_data(n_clusters=3, n_features=3,
                                n_samples_per_cluster=10)

    # n_samples * n_samples_per_cluster
    shuffle_indices = np.arange(30)
    rng.shuffle(shuffle_indices)
    X_shuffle = X[shuffle_indices, :]
    brc = Birch(n_clusters=4, threshold=1.)
    brc.fit(X_shuffle)
    centroids = brc.subcluster_centers_
    assert_array_equal(brc.labels_, brc.predict(X_shuffle))
    nearest_centroid = pairwise_distances_argmin(X_shuffle, centroids)
    assert_almost_equal(v_measure_score(nearest_centroid, brc.labels_), 1.0)
예제 #6
0
파일: birch.py 프로젝트: yutarochan/lale
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
예제 #7
0
 def __init__(self, threshold=0.5, branching_factor=50, n_clusters=3, compute_labels=True, copy=True):
     self._hyperparams = {
         'threshold': threshold,
         'branching_factor': branching_factor,
         'n_clusters': n_clusters,
         'compute_labels': compute_labels,
         'copy': copy}
     self._wrapped_model = SKLModel(**self._hyperparams)
예제 #8
0
def test_threshold():
    """Test that the leaf subclusters have a threshold lesser than radius"""
    X, y = make_blobs(n_samples=80, centers=4)
    brc = Birch(threshold=0.5, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 0.5)

    brc = Birch(threshold=5.0, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 5.)
예제 #9
0
def test_branching_factor():
    """Test that nodes have at max branching_factor number of subclusters"""
    X, y = make_blobs()
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor,
                threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)

    # Raises error when branching_factor is set to one.
    brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
    assert_raises(ValueError, brc.fit, X)
예제 #10
0
def test_sparse_X():
    """Test that sparse and dense data give same results"""
    X, y = make_blobs(n_samples=100, centers=10)
    brc = Birch(n_clusters=10)
    brc.fit(X)

    csr = sparse.csr_matrix(X)
    brc_sparse = Birch(n_clusters=10)
    brc_sparse.fit(csr)

    assert_array_equal(brc.labels_, brc_sparse.labels_)
    assert_array_equal(brc.subcluster_centers_, brc_sparse.subcluster_centers_)
예제 #11
0
class BirchImpl():
    def __init__(self,
                 threshold=0.5,
                 branching_factor=50,
                 n_clusters=3,
                 compute_labels=True,
                 copy=True):
        self._hyperparams = {
            'threshold': threshold,
            'branching_factor': branching_factor,
            'n_clusters': n_clusters,
            'compute_labels': compute_labels,
            'copy': copy
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)

    def predict(self, X):
        return self._sklearn_model.predict(X)
예제 #12
0
def test_n_clusters():
    """Test that n_clusters param works properly"""
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert_greater(len(brc1.subcluster_centers_), 10)
    assert_equal(len(np.unique(brc1.labels_)), 10)

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    assert_raises(ValueError, brc3.fit, X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(UserWarning, brc4.fit, X)
예제 #13
0
def test_threshold():
    """Test that the leaf subclusters have a threshold lesser than radius"""
    X, y = make_blobs(n_samples=80, centers=4)
    brc = Birch(threshold=0.5, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 0.5)

    brc = Birch(threshold=5.0, n_clusters=None)
    brc.fit(X)
    check_threshold(brc, 5.0)
예제 #14
0
def test_sparse_X():
    """Test that sparse and dense data give same results"""
    X, y = make_blobs(n_samples=100, centers=10)
    brc = Birch(n_clusters=10)
    brc.fit(X)

    csr = sparse.csr_matrix(X)
    brc_sparse = Birch(n_clusters=10)
    brc_sparse.fit(csr)

    assert_array_equal(brc.labels_, brc_sparse.labels_)
    assert_array_equal(brc.subcluster_centers_, brc_sparse.subcluster_centers_)
예제 #15
0
def test_branching_factor():
    """Test that nodes have at max branching_factor number of subclusters"""
    X, y = make_blobs()
    branching_factor = 9

    # Purposefully set a low threshold to maximize the subclusters.
    brc = Birch(n_clusters=None, branching_factor=branching_factor, threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)
    brc = Birch(n_clusters=3, branching_factor=branching_factor, threshold=0.01)
    brc.fit(X)
    check_branching_factor(brc.root_, branching_factor)

    # Raises error when branching_factor is set to one.
    brc = Birch(n_clusters=None, branching_factor=1, threshold=0.01)
    assert_raises(ValueError, brc.fit, X)
예제 #16
0
def test_n_clusters():
    """Test that n_clusters param works properly"""
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert_greater(len(brc1.subcluster_centers_), 10)
    assert_equal(len(np.unique(brc1.labels_)), 10)

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    assert_raises(ValueError, brc3.fit, X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.0)
    assert_warns(UserWarning, brc4.fit, X)
예제 #17
0
def test_partial_fit():
    """Test that fit is equivalent to calling partial_fit multiple times"""
    X, y = make_blobs(n_samples=100)
    brc = Birch(n_clusters=3)
    brc.fit(X)
    brc_partial = Birch(n_clusters=None)
    brc_partial.partial_fit(X[:50])
    brc_partial.partial_fit(X[50:])
    assert_array_equal(brc_partial.subcluster_centers_,
                       brc.subcluster_centers_)

    # Test that same global labels are obtained after calling partial_fit
    # with None
    brc_partial.set_params(n_clusters=3)
    brc_partial.partial_fit(None)
    assert_array_equal(brc_partial.subcluster_labels_, brc.subcluster_labels_)
예제 #18
0

clf_dict = {'ARDRegression':ARDRegression(),
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
			'DPGMM':DPGMM(),
			'DecisionTreeClassifier':DecisionTreeClassifier(),
			'DecisionTreeRegressor':DecisionTreeRegressor(),
			'DictionaryLearning':DictionaryLearning(),
			'ElasticNet':ElasticNet(),
			'ElasticNetCV':ElasticNetCV(),
			'EmpiricalCovariance':EmpiricalCovariance(),
			'ExtraTreeClassifier':ExtraTreeClassifier(),
			'ExtraTreeRegressor':ExtraTreeRegressor(),
			'ExtraTreesClassifier':ExtraTreesClassifier(),
			'ExtraTreesRegressor':ExtraTreesRegressor(),
			'FactorAnalysis':FactorAnalysis(),