Example #1
    def test_input_mutations(self):
        """
        Make sure inputs to the create() method are not mutated. Note that
        'batch_size' may be mutated by the model, by design. The input data
        does have integer types, which are cast internally to floats. The
        user's data should not be changed at all.
        """

        ## Make copies of key objects
        sf = copy.copy(self.sf)
        verbose = copy.copy(self.verbose)
        K = copy.copy(self.K)
        max_iter = copy.copy(self.max_iter)
        features = copy.copy(self.sf.column_names())

        ## Create a model with the copied objects
        m = tc.kmeans.create(sf, features=features, num_clusters=K,
                             max_iterations=max_iter, verbose=verbose)

        ## Check that the copies still equal the originals
        assert_sframe_equal(sf, self.sf)
        self.assertEqual(verbose, self.verbose)
        self.assertEqual(K, self.K)
        self.assertEqual(max_iter, self.max_iter)
        self.assertEqual(features, self.sf.column_names())
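The fixture attributes referenced above (self.sf, self.K, self.max_iter, self.verbose, and self.dim in the next example) are assumed to come from the test class's setUp. A minimal sketch of such a fixture, with placeholder values, might look like this:

    def setUp(self):
        # Hypothetical fixture values; the real setUp may differ.
        self.verbose = False
        self.K = 3                  # number of clusters
        self.max_iter = 10          # cap on Lloyd iterations
        self.dim = 2                # number of float features
        self.sf = make_clustering_data(n=100, d=self.dim, seed=42)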
Example #2
    def test_custom_initial_centers(self):
        """
        Test that the user can pass hard-coded initial cluster centers, and
        that these are actually used to initialize the clusters.
        """

        ## Empty initial centers
        with self.assertRaises(ValueError):
            m = tc.kmeans.create(dataset=self.sf, initial_centers=tc.SFrame(),
                                 max_iterations=self.max_iter, verbose=False)

        ## Initial centers as an SArray of indices
        with self.assertRaises(TypeError):
            m = tc.kmeans.create(dataset=self.sf, initial_centers=tc.SArray([1, 2, 3]),
                                 max_iterations=self.max_iter, verbose=False)

        ## Initial centers with a schema that doesn't match the data
        sf_init = make_clustering_data(n=10, d=self.dim-1, seed=43)

        with self.assertRaises(ValueError):
            m = tc.kmeans.create(dataset=self.sf, initial_centers=sf_init,
                                 max_iterations=self.max_iter, verbose=False)

        ## Good initial centers
        sf_init = make_clustering_data(n=10, d=self.dim, seed=43)
        ftrs = ['float0', 'float1', 'dict0']  # exclude the int feature because integer columns *are* cast to float.

        m = tc.kmeans.create(self.sf, features=ftrs, initial_centers=sf_init,
                             max_iterations=0, verbose=False)

        model_init_centers = m.cluster_info
        assert_sframe_equal(sf_init[ftrs], model_init_centers[ftrs])
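The final assertion holds because max_iterations=0 skips the Lloyd updates entirely, so the fitted model's cluster_info is exactly the supplied centers on the float and dict features (integer columns would be cast). A minimal standalone sketch of that pattern, reusing the test's helper and assumed feature names:

data = make_clustering_data(n=100, d=2, seed=42)
centers = make_clustering_data(n=10, d=2, seed=43)
m = tc.kmeans.create(data, features=['float0', 'float1'],
                     initial_centers=centers, max_iterations=0, verbose=False)
# m.cluster_info[['float0', 'float1']] matches `centers` on those columns.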
Example #3
    def test_pickling_sgraph_types(self):

        sg_test_1 = tc.SGraph().add_vertices([
            tc.Vertex(0, {'fluffy': 1}),
            tc.Vertex(1, {
                'fluffy': 1,
                'woof': 1
            }),
            tc.Vertex(2, {})
        ])

        sg_test_2 = tc.SGraph()
        sg_test_2 = sg_test_2.add_vertices([tc.Vertex(x) for x in [0, 1, 2]])
        sg_test_2 = sg_test_2.add_edges([
            tc.Edge(0, 1, attr={'relationship': 'dislikes'}),
            tc.Edge(1, 2, attr={'relationship': 'likes'}),
            tc.Edge(1, 0, attr={'relationship': 'likes'})
        ])

        sgraph_list = [sg_test_1, sg_test_2]
        for obj in sgraph_list:
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(obj)
            pickler.close()
            unpickler = gl_pickle.GLUnpickler(self.filename)
            obj_ret = unpickler.load()
            unpickler.close()
            assert_sframe_equal(obj.get_vertices(), obj_ret.get_vertices())
            assert_sframe_equal(obj.get_edges(), obj_ret.get_edges())
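The dump/load round trip in this and the following pickling examples repeats the same four steps each time; if the suite kept growing, it could be factored into a small helper along these lines (a sketch, not part of the original tests):

    def _roundtrip(self, obj):
        # Dump `obj` with GLPickler, then load it back with GLUnpickler.
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(obj)
        pickler.close()
        unpickler = gl_pickle.GLUnpickler(self.filename)
        obj_ret = unpickler.load()
        unpickler.close()
        return obj_ret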
Example #4
    def test_predictions(self):
        """
        Test correctness of predictions on new data, by comparing to nearest
        neighbors search results. Note that this implicitly checks that integer
        features are correctly cast as floats in the predict method.
        """
        sf_train = self.sf[:-10]
        sf_predict = self.sf[-10:]

        kmeans = tc.kmeans.create(sf_train, num_clusters=3, verbose=False)
        sf_train_copy = copy.copy(sf_train)

        ## Check internal consistency - each training point's closest center
        #  should be the one for the assigned cluster. Also check that the
        #  input SFrame isn't mutated.
        yhat = kmeans.predict(sf_train)

        assert_sframe_equal(sf_train, sf_train_copy)
        self.assertTrue((yhat == kmeans.cluster_id['cluster_id']).all())

        ## Check internal consistency for prediction distances.
        yhat_dists = kmeans.predict(sf_train, output_type='distance')
        assert_allclose(yhat_dists, kmeans.cluster_id['distance'], rtol=1e-6)

        ## Check consistency with nearest neighbors.

        # get the predictions from the model and combine into a single SFrame.
        ystar_labels = kmeans.predict(sf_predict, output_type='cluster_id')
        ystar_dists = kmeans.predict(sf_predict, output_type='distance')

        ystar = tc.SFrame({
            'cluster_id': ystar_labels,
            'distance': ystar_dists
        })
        ystar = ystar.add_row_number('row_id')

        # convert type of predictions to floats so they match the types of the
        # centers in the nearest neighbors model.
        coltype_map = dict(zip(sf_predict.column_names(),
                               sf_predict.column_types()))
        for ftr, coltype in coltype_map.items():
            if coltype is int:
                sf_predict[ftr] = sf_predict[ftr].astype(float)

        knn_model = tc.nearest_neighbors.create(kmeans.cluster_info,
                                                features=kmeans.features,
                                                distance='euclidean',
                                                method='ball_tree')
        knn_dists = knn_model.query(sf_predict, k=1, radius=None)

        assert_sframe_equal(ystar[['row_id', 'cluster_id']],
                            knn_dists[['query_label', 'reference_label']],
                            check_column_names=False)

        assert_allclose(ystar['distance'], knn_dists['distance'], rtol=1e-6)
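The nearest-neighbors comparison rests on the fact that a k-means assignment is exactly a 1-nearest-neighbor query against the cluster centers under Euclidean distance. A small NumPy illustration of that identity on toy data (not tied to the test fixture):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(5, 2)          # toy query points
centers = rng.rand(3, 2)    # toy cluster centers

# Pairwise Euclidean distances, shape (5, 3).
dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)

labels = dists.argmin(axis=1)    # analogue of predict(..., output_type='cluster_id')
min_dists = dists.min(axis=1)    # analogue of predict(..., output_type='distance')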
Example #5
    def test_pickling_sframe_types(self):

        sframe_list = [
            tc.SFrame([1, 2, 3]),
            tc.SFrame([1.0, 2.0, 3.5]),
            tc.SFrame(["foo", "bar"]),
        ]
        for obj in sframe_list:
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(obj)
            pickler.close()
            unpickler = gl_pickle.GLUnpickler(self.filename)
            obj_ret = unpickler.load()
            unpickler.close()
            assert_sframe_equal(obj, obj_ret)
Example #6
    def test_relative_path(self):
        # Arrange
        sf1 = tc.SFrame(range(10))
        relative_path = 'tmp/%s' % self.filename

        # Act
        pickler = gl_pickle.GLPickler(relative_path)
        pickler.dump(sf1)
        pickler.close()
        unpickler = gl_pickle.GLUnpickler(relative_path)
        sf2 = unpickler.load()
        unpickler.close()

        # Assert
        assert_sframe_equal(sf1, sf2)

        # Clean up
        shutil.rmtree(relative_path)
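GLPickler writes its output as a directory rather than a single file, which is presumably why the cleanup uses shutil.rmtree instead of os.remove. A matching tearDown for the class-level self.filename used in the other pickling examples might look like this (an assumed sketch, with os and shutil imported):

    def tearDown(self):
        # Assumed cleanup: remove the pickle directory if a test created one.
        if os.path.exists(self.filename):
            shutil.rmtree(self.filename)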
Example #7
    def test_input_mutations(self):
        """
        Make sure inputs to the create() method are not mutated.
        """
        local_sf = copy.copy(self.sf)
        local_dist = copy.deepcopy(self.distance)
        local_radius = copy.deepcopy(self.radius)
        local_min_core_neighbors = copy.deepcopy(self.min_core_neighbors)

        local_model = tc.dbscan.create(self.sf, distance=self.distance,
                                       radius=self.radius,
                                       min_core_neighbors=self.min_core_neighbors,
                                       verbose=False)

        assert_sframe_equal(self.sf, local_sf)
        self.assertEqual(self.distance, local_dist)
        self.assertEqual(self.radius, local_radius)
        self.assertEqual(self.min_core_neighbors, local_min_core_neighbors)
Example #8
    def test_input_mutations(self):
        """
        Make sure inputs to the create() method are not mutated.
        """

        ## Make copies of key objects
        sf = self.sf[:]
        distance = copy.deepcopy(self.distance)
        verbose = self.verbose

        ## Create a model with the copied objects
        m = tc.nearest_neighbor_classifier.create(sf,
                                                  target='class',
                                                  distance=distance,
                                                  verbose=self.verbose)

        ## Check that the copies still equal the originals
        assert_sframe_equal(sf, self.sf)
        self.assertEqual(distance, self.distance)
        self.assertEqual(verbose, self.verbose)
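In this example and the DBSCAN one above, self.distance is deep-copied rather than shallow-copied, which matters if it is a composite distance: in Turi Create that is a nested list of [feature list, distance name, weight] components, so a shallow copy would share the inner lists. A hypothetical composite distance of that shape:

distance = [
    [['float0', 'float1'], 'euclidean', 1.0],   # features, distance name, weight
    [['dict0'], 'jaccard', 2.0],
]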
Example #9
    def test_combination_gl_python_types(self):

        sg_test_1 = tc.SGraph().add_vertices([
            tc.Vertex(1, {'fluffy': 1}),
            tc.Vertex(2, {
                'fluffy': 1,
                'woof': 1
            }),
            tc.Vertex(3, {})
        ])
        sarray_test_1 = tc.SArray([1, 2, 3])
        sframe_test_1 = tc.SFrame([1, 2, 3])

        obj_list = [[sg_test_1, sframe_test_1, sarray_test_1], {
            0: sg_test_1,
            1: sframe_test_1,
            2: sarray_test_1
        }]

        for obj in obj_list:
            pickler = gl_pickle.GLPickler(self.filename)
            pickler.dump(obj)
            pickler.close()
            unpickler = gl_pickle.GLUnpickler(self.filename)
            obj_ret = unpickler.load()
            unpickler.close()
            assert_sframe_equal(obj[0].get_vertices(),
                                obj_ret[0].get_vertices())
            assert_sframe_equal(obj[0].get_edges(), obj_ret[0].get_edges())
            assert_sframe_equal(obj[1], obj_ret[1])
            assert list(obj[2]) == list(obj_ret[2])