def test_simple(self):
    """Rows seen through an ID-only schema carry no feature values."""
    ds = Dataset(StubLoader(), Schema({'v': Schema.ID}))
    for (pos, (row_label, datum)) in ds:
        # The row label is the 1-based position rendered as a string.
        self.assertEqual(unicode_t(pos + 1), row_label)
        # Every column is mapped to Schema.ID, so no values survive as features.
        self.assertEqual(0, len(datum.string_values))
        self.assertEqual(0, len(datum.num_values))
        self.assertEqual(0, len(datum.binary_values))
    self.assertEqual(['1', '2', '3'], list(ds.get_ids()))
def test_simple(self):
    """A Dataset built from an ID-only schema exposes ids but no features."""
    loader = StubLoader()
    schema = Schema({'v': Schema.ID})
    dataset = Dataset(loader, schema)
    for index, (label, datum) in dataset:
        self.assertEqual(unicode_t(index + 1), label)
        # No column is mapped to a feature type, so all value lists are empty.
        for values in (datum.string_values, datum.num_values, datum.binary_values):
            self.assertEqual(0, len(values))
    self.assertEqual(['1', '2', '3'], list(dataset.get_ids()))
def test_get_ids(self):
    """get_ids() enumerates ids for static datasets and raises otherwise."""
    rows = [[10, 20, 30], [20, 10, 50], [40, 10, 30]]
    ids = ['i1', 'i2', 'i3']
    # A static dataset can enumerate its row ids.
    static_ds = Dataset.from_array(rows, ids, static=True)
    self.assertEqual(ids, [row_id for row_id in static_ds.get_ids()])
    # A non-static dataset refuses id enumeration with RuntimeError.
    stream_ds = Dataset.from_array(rows, ids, static=False)
    self.assertRaises(RuntimeError, list, stream_ds.get_ids())
def test_get_ids(self):
    """Static datasets expose ids via get_ids(); non-static ones raise."""
    data = [
        [10, 20, 30],
        [20, 10, 50],
        [40, 10, 30],
    ]
    expected = ['i1', 'i2', 'i3']
    ds = Dataset.from_array(data, expected, static=True)
    collected = []
    for rid in ds.get_ids():
        collected.append(rid)
    self.assertEqual(expected, collected)
    # Without static=True, materializing get_ids() must fail.
    ds = Dataset.from_array(data, expected, static=False)
    self.assertRaises(RuntimeError, list, ds.get_ids())
def _make_stub_dataset(self):
    """Build a small 5-row dataset (ids 'id1'..'id5', constant-row values)."""
    row_ids = ['id{0}'.format(i) for i in range(1, 6)]
    matrix = [[v, v, v] for v in range(5)]
    return Dataset.from_array(matrix, ids=row_ids)
def test_from_data(self):
    """Dataset.from_data accepts both dense arrays and scipy.sparse input."""
    # --- dense (array) input ---
    ds = Dataset.from_data(
        [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
        ['i1', 'i2', 'i3'],                          # ids
        ['k1', 'k2', 'k3'],                          # feature names
    )
    k1s, row_ids = [], []
    for (_, (row_id, datum)) in ds:
        k1s.append(dict(datum.num_values).get('k1', None))
        row_ids.append(row_id)
    self.assertEqual([10, 20, 40], k1s)
    self.assertEqual(['i1', 'i2', 'i3'], row_ids)
    # --- sparse (scipy) input ---
    ds = Dataset.from_data(
        self._create_matrix(),  # data
        ['i1', 'i2', 'i3'],     # ids
        ['k1', 'k2', 'k3'],     # feature names
    )
    k1s, k3s, row_ids = [], [], []
    for (_, (row_id, datum)) in ds:
        features = dict(datum.num_values)
        k1s.append(features.get('k1', None))
        k3s.append(features.get('k3', None))
        row_ids.append(row_id)
    # Cells absent from the sparse matrix surface as None.
    self.assertEqual([1, None, 4], k1s)
    self.assertEqual([2, 3, 6], k3s)
    self.assertEqual(['i1', 'i2', 'i3'], row_ids)
def test_from_data(self):
    """from_data handles array sources and scipy.sparse sources alike."""
    expected_ids = ['i1', 'i2', 'i3']
    names = ['k1', 'k2', 'k3']
    # Dense array source.
    ds = Dataset.from_data(
        [[10, 20, 30], [20, 10, 50], [40, 10, 30]],
        expected_ids,
        names,
    )
    pairs = [(rid, dict(d.num_values).get('k1', None)) for (_, (rid, d)) in ds]
    self.assertEqual([10, 20, 40], [v for (_, v) in pairs])
    self.assertEqual(expected_ids, [r for (r, _) in pairs])
    # Sparse matrix source; entries missing from the matrix come back as None.
    ds = Dataset.from_data(self._create_matrix(), expected_ids, names)
    triples = []
    for (_, (rid, d)) in ds:
        nums = dict(d.num_values)
        triples.append((rid, nums.get('k1', None), nums.get('k3', None)))
    self.assertEqual([1, None, 4], [t[1] for t in triples])
    self.assertEqual([2, 3, 6], [t[2] for t in triples])
    self.assertEqual(expected_ids, [t[0] for t in triples])
def test_from_array_without_ids(self):
    """When ids are omitted, from_array still yields one id per row."""
    ds = Dataset.from_array(
        [[10, 20, 30], [20, 10, 50], [40, 10, 30]],
        feature_names=['k1', 'k2', 'k3'],
    )
    seen_k1s = []
    seen_ids = []
    for (_, (generated_id, datum)) in ds:
        seen_k1s.append(dict(datum.num_values).get('k1', None))
        seen_ids.append(generated_id)
    self.assertEqual([10, 20, 40], seen_k1s)
    # Ids are auto-generated, so only their count is predictable here.
    self.assertEqual(len(seen_ids), 3)
def test_from_array_without_ids(self):
    """from_array with no ids argument assigns an id to every row."""
    dataset = Dataset.from_array(
        [
            [10, 20, 30],
            [20, 10, 50],
            [40, 10, 30],
        ],
        feature_names=['k1', 'k2', 'k3'],
    )
    rows = [(rid, dict(d.num_values).get('k1', None)) for (_, (rid, d)) in dataset]
    self.assertEqual([10, 20, 40], [k1 for (_, k1) in rows])
    # The concrete id values are generated; just check one exists per row.
    self.assertEqual(len([rid for (rid, _) in rows]), 3)
def test_from_matrix(self):
    """from_matrix maps sparse cells to named features; absent cells read None."""
    ds = Dataset.from_matrix(
        self._create_matrix(),  # data
        ['i1', 'i2', 'i3'],     # ids
        ['k1', 'k2', 'k3'],     # feature names
    )
    rows = []
    for (_, (rid, datum)) in ds:
        nums = dict(datum.num_values)
        rows.append((rid, nums.get('k1', None), nums.get('k3', None)))
    self.assertEqual([1, None, 4], [r[1] for r in rows])
    self.assertEqual([2, 3, 6], [r[2] for r in rows])
    self.assertEqual(['i1', 'i2', 'i3'], [r[0] for r in rows])
def test_predict(self):
    """The first datum of a schema-less Dataset exposes the stub's 'v' value."""
    ds = Dataset(StubLoader())
    first_datum = ds[0][1]
    self.assertEqual(['v', 1.0], first_datum.num_values[0])
def _make_stub_dataset(self):
    """Return a 5-row stub Dataset whose rows are constant triples."""
    ids = ['id1', 'id2', 'id3', 'id4', 'id5']
    X = [
        [0, 0, 0],
        [1, 1, 1],
        [2, 2, 2],
        [3, 3, 3],
        [4, 4, 4],
    ]
    return Dataset.from_array(X, ids=ids)
""" from jubakit.clustering import Clustering, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load a CSV file. loader = CSVLoader('blobs.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'cluster': Schema.ID, }, Schema.NUMBER) # Create a Dataset. dataset = Dataset(loader, schema) # Create an Clustering Service. cfg = Config(method='kmeans') clustering = Clustering.run(cfg) # Update the Clustering model. for (idx, row_id, result) in clustering.push(dataset): pass # Get clusters clusters = clustering.get_core_members(light=False) # Get centers of each cluster centers = clustering.get_k_center() # Calculate SSE: sum of squared errors