示例#1
0
    def test_timings(self):
        """
        Verify that fit timings are recorded and drawn on a twinx axes
        """
        oz = KElbowVisualizer(
            KMeans(random_state=0), k=5, timings=True, locate_elbow=False
        )
        oz.fit(self.clusters.X)

        # Four timings are expected, each strictly positive
        assert len(oz.k_timers_) == 4
        assert all(t > 0 for t in oz.k_timers_)

        # The timing curve is expected on a second (twinx) axes
        assert hasattr(oz, "axes")
        assert len(oz.axes) == 2

        # Drop the timing axes and pin k_timers_ / k_values_ to fixed
        # values so the image similarity test compares known data
        oz.axes[1].remove()
        oz.k_timers_ = [
            0.01084589958190918, 0.011144161224365234,
            0.017028093338012695, 0.010634183883666992,
        ]
        oz.k_values_ = [2, 3, 4, 5]

        # draw() is normally called from fit(); redo it with the pinned data
        oz.draw()
        oz.finalize()

        self.assert_images_similar(oz)
示例#2
0
    def test_invalid_k(self):
        """
        Assert that invalid values of K raise exceptions

        Both a tuple with more than two entries and a plain string are
        expected to make the constructor raise YellowbrickValueError.
        """

        # The constructed instance is never used: only the raise matters,
        # so nothing is bound to a local name inside the context managers.
        with self.assertRaises(YellowbrickValueError):
            KElbowVisualizer(KMeans(), k=(1, 2, 3, 4, 5))

        with self.assertRaises(YellowbrickValueError):
            KElbowVisualizer(KMeans(), k="foo")
示例#3
0
    def test_invalid_k(self):
        """
        Every malformed K specification must raise YellowbrickValueError
        """
        # A tuple containing a non-numeric entry, then a bare string
        invalid_ks = ((1, 2, 3, "foo", 5), "foo")

        for bad_k in invalid_ks:
            with pytest.raises(YellowbrickValueError):
                KElbowVisualizer(KMeans(), k=bad_k)
    def test_timings(self):
        """
        Verify that fit timings are recorded and drawn on a twinx axes
        """
        oz = KElbowVisualizer(KMeans(random_state=0), k=5, timings=True)
        oz.fit(X)

        # Four timings are expected, each strictly positive
        assert len(oz.k_timers_) == 4
        assert all(t > 0 for t in oz.k_timers_)

        # The timing curve is expected on a second (twinx) axes
        assert hasattr(oz, "axes")
        assert len(oz.axes) == 2

        # Drop the timing axes and pin k_timers_ / k_values_ to fixed
        # values so the image similarity test compares known data
        oz.axes[1].remove()
        oz.k_timers_ = [
            0.01084589958190918,
            0.011144161224365234,
            0.017028093338012695,
            0.010634183883666992,
        ]
        oz.k_values_ = [2, 3, 4, 5]

        # draw() is normally called from fit(); redo it with the pinned data
        oz.draw()
        oz.poof()

        self.assert_images_similar(oz)
示例#5
0
    def test_calinski_harabaz_metric(self):
        """
        Check the k-elbow visualizer scored with the calinski-harabaz metric
        """
        oz = KElbowVisualizer(KMeans(), metric="calinski_harabaz", k=5)
        oz.fit(X)

        # NOTE(review): these reference scores are not compared against
        # k_scores_ anywhere in this block -- confirm whether a check is
        # missing or was dropped on purpose.
        expected = [
            81.662726256035683,
            50.992378259195554,
            40.952179227847012,
            37.068658049555459,
        ]
        self.assertEqual(len(oz.k_scores_), 4)
示例#6
0
    def test_silhouette_metric(self):
        """
        Check the k-elbow visualizer scored with the silhouette metric
        """
        oz = KElbowVisualizer(KMeans(), metric="silhouette", k=5)
        oz.fit(X)

        # NOTE(review): these reference scores are not compared against
        # k_scores_ anywhere in this block -- confirm whether a check is
        # missing or was dropped on purpose.
        expected = [
            0.69163638040000031,
            0.4534779796676191,
            0.24802958481973392,
            0.21792458448172247,
        ]
        self.assertEqual(len(oz.k_scores_), 4)
示例#7
0
    def test_distortion_metric(self):
        """
        Check the k-elbow visualizer scored with the distortion metric
        """
        oz = KElbowVisualizer(KMeans(), metric="distortion", k=5)
        oz.fit(X)

        # NOTE(review): these reference scores are not compared against
        # k_scores_ anywhere in this block -- confirm whether a check is
        # missing or was dropped on purpose.
        expected = [
            7.6777850157143783,
            8.3643185158057669,
            9.5203330222217666,
            8.9777589843618912,
        ]
        self.assertEqual(len(oz.k_scores_), 4)
示例#8
0
    def test_timings(self):
        """
        Verify that fit timings are recorded and drawn on a twinx axes
        """
        oz = KElbowVisualizer(KMeans(), timings=True, k=5)
        oz.fit(X)

        # Four timings are expected, each strictly positive
        timers = oz.k_timers_
        self.assertEqual(len(timers), 4)
        self.assertTrue(all(t > 0 for t in timers))

        # The timing curve is expected on a second (twinx) axes
        self.assertTrue(hasattr(oz, "axes"))
        self.assertEqual(len(oz.axes), 2)
    def test_topic_modeling_k_means(self):
        """
        Run the k-elbow visualizer on TF-IDF vectors of the hobbies corpus
        """
        # Vectorize the raw documents of the corpus
        hobbies = self.load_corpus("hobbies")
        docs = TfidfVectorizer().fit_transform(hobbies.data)

        oz = KElbowVisualizer(KMeans(), k=(4, 8))
        oz.fit(docs)
        oz.poof()

        self.assert_images_similar(oz)
示例#10
0
    def test_silhouette_metric(self):
        """
        Check image and scores of the k-elbow visualizer, silhouette metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(model, k=5, metric="silhouette", timings=False)
        oz.fit(X)
        assert len(oz.k_scores_) == 4

        oz.poof()
        self.assert_images_similar(oz)

        # Reference silhouette scores, one per fitted k
        expected = np.array([0.691636, 0.456646, 0.255174, 0.239842])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#11
0
    def test_distortion_metric(self):
        """
        Check image and scores of the k-elbow visualizer, distortion metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(model, k=5, metric="distortion", timings=False)
        oz.fit(X)
        assert len(oz.k_scores_) == 4

        oz.poof()
        self.assert_images_similar(oz)

        # Reference distortion scores, one per fitted k
        expected = np.array([7.677785, 8.364319, 8.893634, 8.013021])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#12
0
 def test_no_knee(self):
     """
     A YellowbrickWarning must be emitted when no knee can be located
     """
     X, _ = make_blobs(
         n_samples=1000, centers=3, n_features=12, random_state=12
     )

     # Text the emitted warning is matched against
     message = ("No 'knee' or 'elbow point' detected "
                "This could be due to bad clustering, no "
                "actual clusters being formed etc.")

     # Construction and fit both happen inside the warning context
     with pytest.warns(YellowbrickWarning, match=message):
         KElbowVisualizer(
             KMeans(random_state=12), k=(4, 12), locate_elbow=True
         ).fit(X)
示例#13
0
    def test_valid_k(self):
        """
        Check the k_values_ produced by each accepted form of K:
        an int k gives the values 2 .. k inclusive,
        a 2-tuple (lo, hi) gives the values lo .. hi - 1,
        any other iterable is used as given.
        """
        tens = np.arange(10, 100, 10)

        # (k argument, values expected in k_values_)
        cases = [
            (8, np.arange(2, 8 + 1)),
            ((4, 12), np.arange(4, 12)),
            (np.arange(10, 100, 10), tens),
            ([10, 20, 30, 40, 50, 60, 70, 80, 90], tens),
        ]

        for k, wanted in cases:
            oz = KElbowVisualizer(KMeans(), k=k)
            assert oz.k_values_ == list(wanted)
示例#14
0
    def test_calinski_harabaz_metric(self):
        """
        Check image and scores of the k-elbow visualizer, calinski-harabaz
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="calinski_harabaz", timings=False
        )
        oz.fit(X)
        assert len(oz.k_scores_) == 4

        oz.poof()
        self.assert_images_similar(oz)

        # Reference calinski-harabaz scores, one per fitted k
        expected = np.array([
            81.662726256035683,
            50.992378259195554,
            40.952179227847012,
            35.939494,
        ])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#15
0
    def test_locate_elbow(self):
        """
        Check the image and values produced when locate_elbow is enabled
        """
        X, _ = make_blobs(
            n_samples=1000,
            n_features=5,
            centers=3,
            shuffle=True,
            random_state=42,
        )

        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model,
            k=6,
            metric="calinski_harabasz",
            timings=False,
            locate_elbow=True,
        )
        oz.fit(X)

        # Five scores are expected, with the elbow located at k=3
        assert len(oz.k_scores_) == 5
        assert oz.elbow_value_ == 3

        oz.finalize()
        self.assert_images_similar(oz, windows_tol=2.2)

        expected = np.array([
            4286.479848,
            12463.383743,
            8766.999551,
            6950.08391,
            5865.79722,
        ])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#16
0
    def test_integrated_mini_batch_kmeans_elbow(self):
        """
        Run the k-elbow visualizer end to end with mini-batch k-means
        """
        # NOTE #182: cannot use occupancy dataset because of memory usage,
        # so a synthetic blobs data set is generated instead
        X, _ = make_blobs(
            n_samples=1000,
            n_features=12,
            centers=6,
            shuffle=True,
            random_state=42,
        )

        # Any exception raised below is reported as a test failure
        try:
            ax = plt.subplots()[1]

            model = MiniBatchKMeans(random_state=42)
            oz = KElbowVisualizer(model, k=4, ax=ax)
            oz.fit(X)
            oz.finalize()

            self.assert_images_similar(oz)
        except Exception as e:
            pytest.fail("error during k-elbow: {}".format(e))
示例#17
0
    def test_sample_weights(self):
        """
        Sample weights passed to fit must change the located elbow
        """
        seed = 1234

        # Five blobs: one with 5 samples followed by four with 30 each
        X, _ = make_blobs(
            n_samples=[5, 30, 30, 30, 30],
            n_features=5,
            random_state=seed,
            shuffle=False,
        )

        oz = KElbowVisualizer(
            KMeans(random_state=seed), k=(2, 12), timings=False
        )

        # Unweighted fit: the elbow is expected at 5
        oz.fit(X)
        assert oz.elbow_value_ == 5

        # Near-zero weight on the first 5 samples and unit weight on the
        # remaining 120 should push the elbow down to 4
        light = np.ones(5) * 0.0001
        full = np.ones(120)
        weights = np.concatenate([light, full])

        oz.fit(X, sample_weight=weights)
        assert oz.elbow_value_ == 4
示例#18
0
    def test_calinski_harabasz_metric(self):
        """
        Check image and scores of the k-elbow visualizer, calinski-harabasz
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model,
            k=5,
            metric="calinski_harabasz",
            timings=False,
            locate_elbow=False,
        )
        oz.fit(self.clusters.X)

        # With locate_elbow disabled, no elbow value is expected
        assert len(oz.k_scores_) == 4
        assert oz.elbow_value_ is None

        oz.finalize()
        self.assert_images_similar(oz)

        expected = np.array(
            [
                81.66272625603568,
                50.992378259195554,
                39.573201061900455,
                37.06865804955547,
            ]
        )
        assert_array_almost_equal(oz.k_scores_, expected)
    def TrainModel(self):
        """
        Train a KMeans model on ``self.data_train`` and save it with joblib.

        Clears the training-data list widget and the removed-columns
        collection, then asks for a save location. When the returned
        location is an empty string nothing is trained or written.
        Otherwise the training frame is cleaned in place (rows with missing
        values and duplicate rows are dropped), one-hot encoded with
        ``pd.get_dummies`` and min-max scaled. The number of clusters is
        chosen by a silhouette K-Elbow search with ``k=(4, 12)``, falling
        back to 3 when no elbow value is reported. The final KMeans is fit
        on the scaled features and dumped to ``save_location + '.joblib'``.
        """
        print(self.data_train.columns)
        self.listWidget_data_train.clear()
        self.columnsRemove.clear()
        save_location = self.GetLocation(operation='save',
                                         caption="Save as",
                                         filter="JobLib Files(*.joblib)")
        if save_location == '':
            # No location returned: skip training and saving
            return

        print(save_location, 'model train start')

        # Clean the training frame in place, then one-hot encode it
        self.data_train.dropna(inplace=True)
        self.data_train.drop_duplicates(inplace=True)
        X = pd.get_dummies(self.data_train)
        scaled_features = MinMaxScaler().fit_transform(X)

        # Search for the number of clusters on the SAME scaled features the
        # final model is trained on; the search previously ran on unscaled X.
        kmeans = KMeans(init='k-means++',
                        max_iter=300,
                        n_init=10,
                        random_state=4)
        visualizer = KElbowVisualizer(kmeans,
                                      k=(4, 12),
                                      metric='silhouette',
                                      timings=False)
        visualizer.fit(scaled_features)

        # Fall back to 3 clusters when no elbow value is reported
        clusterValue = visualizer.elbow_value_ or 3

        kmeans = KMeans(max_iter=300,
                        n_init=10,
                        random_state=4,
                        n_clusters=clusterValue)
        print(clusterValue)
        kmeans.fit(scaled_features)

        # Persist the fitted model
        dump(kmeans, save_location + '.joblib')
        print('model train done')
示例#20
0
    def test_integrated_kmeans_elbow(self):
        """
        Run the k-elbow visualizer end to end with k-means on blobs data
        """
        # NOTE #182: cannot use occupancy dataset because of memory usage,
        # so a synthetic blobs data set is generated instead
        X, _ = make_blobs(
            n_samples=1000,
            n_features=12,
            centers=6,
            shuffle=True,
            random_state=42,
        )

        # Any exception raised below is reported as a test failure
        try:
            ax = plt.subplots()[1]

            model = KMeans(random_state=42)
            oz = KElbowVisualizer(model, k=4, ax=ax)
            oz.fit(X)
            oz.poof()

            self.assert_images_similar(oz)
        except Exception as e:
            pytest.fail("error during k-elbow: {}".format(e))
示例#21
0
    def test_topic_modeling_k_means(self):
        """
        Run the k-elbow visualizer on TF-IDF vectors of the hobbies corpus
        """
        # Vectorize the raw documents of the corpus
        hobbies = load_hobbies()
        docs = TfidfVectorizer().fit_transform(hobbies.data)

        oz = KElbowVisualizer(KMeans(), k=(4, 8))
        oz.fit(docs)
        oz.finalize()

        self.assert_images_similar(oz)
示例#22
0
    def test_distortion_metric(self):
        """
        Check image and scores of the k-elbow visualizer, distortion metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="distortion", timings=False
        )
        oz.fit(X)
        self.assertEqual(len(oz.k_scores_), 4)

        oz.poof()
        self.assert_images_similar(oz)

        # Reference distortion scores, one per fitted k
        expected = np.array([7.677785, 8.364319, 8.893634, 8.013021])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#23
0
    def test_silhouette_metric(self):
        """
        Check image and scores of the k-elbow visualizer, silhouette metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="silhouette", timings=False
        )
        oz.fit(X)
        self.assertEqual(len(oz.k_scores_), 4)

        oz.poof()
        self.assert_images_similar(oz)

        # Reference silhouette scores, one per fitted k
        expected = np.array([0.691636, 0.456646, 0.255174, 0.239842])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#24
0
    def test_calinski_harabaz_metric(self):
        """
        Check image and scores of the k-elbow visualizer, calinski-harabaz
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="calinski_harabaz", timings=False
        )
        oz.fit(X)
        self.assertEqual(len(oz.k_scores_), 4)

        oz.poof()
        self.assert_images_similar(oz)

        # Reference calinski-harabaz scores, one per fitted k
        expected = np.array([
            81.662726256035683,
            50.992378259195554,
            40.952179227847012,
            35.939494,
        ])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#25
0
    def test_distortion_metric(self):
        """
        Check image and scores of the k-elbow visualizer, distortion metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="distortion", timings=False, locate_elbow=False
        )
        oz.fit(self.clusters.X)
        assert len(oz.k_scores_) == 4

        oz.finalize()
        self.assert_images_similar(oz, tol=0.03)

        # Reference distortion scores, one per fitted k
        expected = np.array([69.100065, 54.081571, 43.146921, 34.978487])
        assert_array_almost_equal(oz.k_scores_, expected)
示例#26
0
    def test_integrated_mini_batch_kmeans_elbow(self):
        """
        Run the k-elbow visualizer end to end with mini-batch k-means

        See #182: cannot use occupancy dataset because of memory usage
        """

        # Synthetic blobs data set used in place of the occupancy data
        X, _ = make_blobs(
            n_samples=1000, n_features=12, centers=6, shuffle=True
        )

        # Any exception raised below is reported as a test failure
        try:
            oz = KElbowVisualizer(MiniBatchKMeans(), k=4)
            oz.fit(X)
            oz.poof()
        except Exception as e:
            self.fail("error during k-elbow: {}".format(e))
示例#27
0
    def test_calinski_harabasz_metric(self):
        """
        Check image and scores of the k-elbow visualizer, calinski-harabasz
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model,
            k=5,
            metric="calinski_harabasz",
            timings=False,
            locate_elbow=False,
        )
        oz.fit(self.clusters.X)

        # With locate_elbow disabled, no elbow value is expected
        assert len(oz.k_scores_) == 4
        assert oz.elbow_value_ is None

        oz.finalize()
        self.assert_images_similar(oz)

        expected = np.array(
            [81.662726, 50.992378, 40.952179, 35.939494]
        )
        assert_array_almost_equal(oz.k_scores_, expected)
示例#28
0
    def test_silhouette_metric(self):
        """
        Check image and scores of the k-elbow visualizer, silhouette metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="silhouette", timings=False, locate_elbow=False
        )
        oz.fit(self.clusters.X)
        assert len(oz.k_scores_) == 4

        oz.finalize()
        self.assert_images_similar(oz)

        # Reference silhouette scores, one per fitted k
        expected = np.array(
            [
                0.6916363804000003,
                0.456645663683503,
                0.26918583373704463,
                0.25523298106687914,
            ]
        )
        assert_array_almost_equal(oz.k_scores_, expected)
示例#29
0
    def test_integrated_kmeans_elbow(self):
        """
        Run the k-elbow visualizer end to end with k-means on blobs data
        """
        # NOTE #182: cannot use occupancy dataset because of memory usage,
        # so a synthetic blobs data set is generated instead
        X, _ = make_blobs(
            n_samples=1000,
            n_features=12,
            centers=6,
            shuffle=True,
            random_state=42,
        )

        # Any exception raised below is reported as a test failure
        try:
            ax = plt.figure().add_subplot()

            model = KMeans(random_state=42)
            oz = KElbowVisualizer(model, k=4, ax=ax)
            oz.fit(X)
            oz.poof()

            self.assert_images_similar(oz)
        except Exception as e:
            self.fail("error during k-elbow: {}".format(e))
示例#30
0
    def test_distortion_metric(self):
        """
        Check image and scores of the k-elbow visualizer, distortion metric
        """
        model = KMeans(random_state=0)
        oz = KElbowVisualizer(
            model, k=5, metric="distortion", timings=False, locate_elbow=False
        )
        oz.fit(self.clusters.X)
        assert len(oz.k_scores_) == 4

        oz.finalize()
        self.assert_images_similar(oz, tol=0.03)

        # Reference distortion scores, one per fitted k
        expected = np.array(
            [
                69.10006514142941,
                54.081571290449936,
                44.491830981793605,
                33.99887993254433,
            ]
        )
        assert_array_almost_equal(oz.k_scores_, expected)
示例#31
0
    def test_set_colors_manually(self):
        """
        Test that the metric, timing and vline colors can be set manually
        """
        oz = KElbowVisualizer(KMeans(random_state=0), k=5)

        # Override the three colors after construction
        oz.metric_color = "r"
        oz.timing_color = "y"
        oz.vline_color = "c"

        # Stand in for a real fit by assigning the fitted attributes directly
        oz.k_values_ = [1, 2, 3, 4, 5, 6, 7, 8]
        oz.k_timers_ = [6.2, 8.3, 10.1, 15.8, 21.2, 27.9, 38.2, 44.9]
        oz.k_scores_ = [0.8, 0.7, 0.55, 0.48, 0.40, 0.38, 0.35, 0.30]
        oz.elbow_value_ = 5
        oz.elbow_score_ = 0.40

        # Render and compare against the baseline image
        oz.draw()
        oz.finalize()
        self.assert_images_similar(oz, tol=3.2)
示例#32
0
 def test_bad_metric(self):
     """
     An unknown metric name must make KElbow raise YellowbrickValueError
     """
     bad_metric = "foo"

     with pytest.raises(YellowbrickValueError):
         KElbowVisualizer(KMeans(), k=5, metric=bad_metric)