def test_replacement(self):
    """Check ``impute.AsValue`` on a discrete and a continuous column.

    For the discrete column a single variable comes back; for the
    continuous column a pair comes back, the second of which renders as
    "def"/"undef".  Finally, a domain is built from the imputed variables
    of every column and the converted table is compared with the expected
    matrix.
    """
    missing = np.nan
    rows = [
        [1.0, missing, 0.0],
        [2.0, 1.0, 3.0],
        [missing, missing, missing],
    ]
    domain = data.Domain((
        data.DiscreteVariable("A", values=["0", "1", "2"]),
        data.ContinuousVariable("B"),
        data.ContinuousVariable("C"),
    ))
    table = data.Table.from_numpy(domain, np.array(rows))

    # Discrete column: the unknown in the last row is shown as "N/A".
    as_value = impute.AsValue()(table, domain[0])
    self.assertTrue(np.all(np.isfinite(as_value.compute_value(table))))
    self.assertTrue(np.all(as_value.compute_value(table) == [1., 2., 3.]))
    self.assertEqual(
        [as_value.str_val(v) for v in as_value.compute_value(table)],
        ["1", "2", "N/A"])

    # Continuous column: two variables are returned.
    filled, indicator = impute.AsValue()(table, domain[1])
    self.assertTrue(np.all(np.isfinite(filled.compute_value(table))))
    self.assertTrue(np.all(np.isfinite(indicator.compute_value(table))))
    self.assertTrue(np.all(indicator.compute_value(table) == [0., 1., 0.]))
    self.assertEqual(
        [indicator.str_val(v) for v in indicator.compute_value(table)],
        ["undef", "def", "undef"])

    # Impute every column, then flatten single variables and pairs into
    # one list of variables.
    imputed_per_column = [impute.AsValue()(table, var)
                          for var in table.domain]
    flat_vars = []
    for imputed in imputed_per_column:
        if isinstance(imputed, (tuple, list)):
            flat_vars.extend(imputed)
        else:
            flat_vars.append(imputed)

    domain = data.Domain(flat_vars)
    idata = table.from_table(domain, table)
    expected = [
        [1, 1.0, 0, 0.0, 1],
        [2, 1.0, 1, 3.0, 1],
        [3, 1.0, 0, 1.5, 0],
    ]
    np.testing.assert_allclose(idata.X, expected)
def test_leave_discrete(self):
    """Discrete attributes and the class variable must pass through
    ``DomainDiscretizer`` untouched; only the continuous attribute is
    discretized.  Checked once with a continuous and once with a discrete
    class variable.
    """
    halves = [0] * 50 + [1] * 50
    col_a = np.array(halves).reshape((100, 1))
    col_b = np.arange(100).reshape((100, 1))
    col_c = np.ones((100, 1))
    X = np.hstack([col_a, col_b, col_c])

    for class_type in (data.ContinuousVariable, data.DiscreteVariable):
        domain = data.Domain(
            [
                data.DiscreteVariable("a", values="MF"),
                data.ContinuousVariable("b"),
                data.DiscreteVariable("c", values="AB"),
            ],
            class_type("d"))
        table = data.Table(domain, X, col_a)
        dom = discretize.DomainDiscretizer(table)

        # The very same variable objects are expected for "a" and "c".
        self.assertIs(dom[0], table.domain[0])
        self.assertEqual(dom[1].compute_value.points, [24.5, 49.5, 74.5])
        self.assertIs(dom[2], table.domain[2])
        self.assertIs(dom.class_var, table.domain.class_var)
def test_wrong_vartypes(self):
    """``data.Domain`` must raise TypeError when ``ssn`` is passed as the
    class variable or placed among the attributes.
    """
    attributes = (age, gender, income)
    invalid_argument_sets = (
        (attributes, ssn),            # ssn as class variable
        (attributes + (ssn,),),       # ssn as last attribute
        ((ssn,) + attributes,),       # ssn as first attribute
    )
    for args in invalid_argument_sets:
        with self.assertRaises(TypeError):
            data.Domain(*args)
def test_iter(self):
    """Iterating over a domain yields its attributes and skips the metas."""
    cases = (
        (age, gender, income),
        (age,),
        (),
    )
    for attributes in cases:
        d = data.Domain(attributes, metas=(ssn,))
        self.assertEqual(list(d), list(attributes))
def test_conversion_size(self):
    """``Domain.convert`` must reject input lists of the wrong length.

    Without metas only the length checked as valid elsewhere is absent
    here (3 and 5 are rejected); with three metas, lengths 3 and 6 are
    accepted and 2, 4 and 7 are rejected.
    """
    domain = data.Domain([age, gender, income], [race])
    for size in (3, 5):
        with self.assertRaises(ValueError):
            domain.convert([0] * size)

    domain = data.Domain([age, income], [race], [gender, education, ssn])
    for size in (2, 4, 7):
        with self.assertRaises(ValueError):
            domain.convert([0] * size)

    # These lengths must convert without raising.
    for size in (3, 6):
        domain.convert([0] * size)
def test_get_conversion(self):
    """Check the index mappings produced by ``Domain.get_conversion``.

    Each conversion is checked for its ``source`` domain and for the
    ``attributes``, ``class_vars`` and ``metas`` index lists.  Some
    conversions are requested repeatedly and interleaved on purpose
    (presumably to exercise cached conversions -- confirm against the
    ``Domain`` implementation), so the call order below is kept as is.

    The last part temporarily sets ``income.get_value_from``; the previous
    state of the shared ``income`` fixture is restored afterwards so the
    change cannot leak into other tests.
    """
    d = data.Domain((age, gender, income), metas=(ssn, race))
    e = data.Domain((gender, race), None, metas=(age, gender, ssn))
    f = data.Domain((gender,), (race, income), metas=(age, income, ssn))
    g = data.Domain((), metas=(age, gender, ssn))

    def check(conversion, source, attributes, class_vars, metas):
        # Compare one conversion object against the expected mappings.
        self.assertIs(conversion.source, source)
        self.assertEqual(conversion.attributes, attributes)
        self.assertEqual(conversion.class_vars, class_vars)
        self.assertEqual(conversion.metas, metas)

    check(e.get_conversion(d), d, [1, -2], [], [0, 1, -1])
    check(e.get_conversion(d), d, [1, -2], [], [0, 1, -1])
    check(f.get_conversion(d), d, [1], [-2, 2], [0, 2, -1])
    check(e.get_conversion(d), d, [1, -2], [], [0, 1, -1])
    check(f.get_conversion(d), d, [1], [-2, 2], [0, 2, -1])
    check(g.get_conversion(f), f, [], [], [-1, 0, -3])

    def x():
        return 42

    # Remember whether the attribute existed so it can be restored exactly.
    not_set = object()
    previous = getattr(income, "get_value_from", not_set)
    income.get_value_from = x
    try:
        # Variables missing from the source map to None, or to the
        # variable's ``get_value_from`` callable when one is set.
        check(f.get_conversion(g), g, [-2], [None, x], [-1, x, -3])
    finally:
        if previous is not_set:
            del income.get_value_from
        else:
            income.get_value_from = previous
def test_converts_input_domain_if_needed(self):
    """A fitted PCA projector must accept a table whose domain holds only
    the first five features of the training domain.  The test passes if
    the call does not raise.
    """
    self.create_normal_dataset()
    pca = linear.Pca(variance_covered=.99)
    projector = pca(self.dataset)

    reduced_domain = data.Domain(self.dataset.domain.features[:5])
    new_examples = data.Table(reduced_domain, [[1., 2., 3., 4., 5.]])
    projector(new_examples)
def create_dataset_with_classes(self):
    """Store in ``self.dataset`` a random table with one class variable
    "C" and four additional class variables "MC0".."MC3".

    The feature columns come from ``prepare_dataset`` with one random
    integer component of length 10; five uniformly random columns are
    appended for the class values.
    """
    component = [random.randint(0, 5) for _ in range(10)]
    domain, features = prepare_dataset(components=[component])

    base_features = domain.features
    class_var = feature.Discrete("C", values=["F", "T"])
    extra_class_vars = [
        feature.Discrete("MC%i" % i, values=["F", "T"]) for i in range(4)
    ]
    domain = data.Domain(base_features, class_var,
                         class_vars=extra_class_vars)

    # One column for "C" plus four for the "MC*" variables.
    class_columns = np.random.random((len(features), 5))
    self.dataset = data.Table(domain, np.hstack((features, class_columns)))
def test_replacement(self):
    """Check ``impute.AsValue`` on the table from ``self._create_table()``.

    For the first column a single variable comes back; for the second a
    pair comes back, the second of which renders as "def"/"undef".
    Finally, a domain is built from the imputed variables of every column
    in ``table.domain.variables`` and the converted table is compared with
    the expected matrix.
    """
    table = self._create_table()
    domain = table.domain

    # First column: the unknown in the last row is shown as "N/A".
    as_value = impute.AsValue()(table, domain[0])
    self.assertTrue(np.all(np.isfinite(as_value.compute_value(table))))
    self.assertTrue(np.all(as_value.compute_value(table) == [1., 2., 3.]))
    self.assertEqual(
        [as_value.str_val(v) for v in as_value.compute_value(table)],
        ["1", "2", "N/A"])

    # Second column: two variables are returned.
    filled, indicator = impute.AsValue()(table, domain[1])
    self.assertTrue(np.all(np.isfinite(filled.compute_value(table))))
    self.assertTrue(np.all(np.isfinite(indicator.compute_value(table))))
    self.assertTrue(np.all(indicator.compute_value(table) == [0., 1., 0.]))
    self.assertEqual(
        [indicator.str_val(v) for v in indicator.compute_value(table)],
        ["undef", "def", "undef"])

    # Impute every variable, then flatten single variables and pairs into
    # one list of variables.
    imputed_per_column = [impute.AsValue()(table, var)
                          for var in table.domain.variables]
    flat_vars = []
    for imputed in imputed_per_column:
        if isinstance(imputed, (tuple, list)):
            flat_vars.extend(imputed)
        else:
            flat_vars.append(imputed)

    domain = data.Domain(flat_vars)
    idata = table.from_table(domain, table)
    expected = [
        [1, 1.0, 0, 0.0, 1],
        [2, 1.0, 1, 3.0, 1],
        [3, 1.0, 0, 1.5, 0],
    ]
    np.testing.assert_allclose(idata.X, expected)
def test_replacement(self):
    """``impute.Random`` must fill every unknown with a finite value,
    leave the original table untouched and keep known values unchanged.
    """
    missing = np.nan
    rows = [
        [1.0, missing, 0.0],
        [2.0, 1.0, 3.0],
        [missing, missing, missing],
    ]
    unknowns = np.isnan(rows)

    domain = data.Domain((
        data.DiscreteVariable("A", values=["0", "1", "2"]),
        data.ContinuousVariable("B"),
        data.ContinuousVariable("C"),
    ))
    table = data.Table.from_numpy(domain, np.array(rows))

    # Build the imputed variables for all three columns first, then check
    # them, in the same order.
    imputed_vars = [impute.Random()(table, domain[i]) for i in range(3)]
    for variable in imputed_vars:
        self.assertTrue(np.all(np.isfinite(variable.compute_value(table))))

    imputer = preprocess.Impute(method=impute.Random())
    itable = imputer(table)
    self.assertTrue(np.all(np.isfinite(itable.X)))

    # Original data should keep unknowns
    self.assertTrue(np.all(unknowns == np.isnan(table.X)))
    # Values that were known must be carried over unchanged.
    known = ~unknowns
    self.assertTrue(np.all(itable.X[known] == table.X[known]))
def _construct_sparse():
    """Return a 5x20 sparse (CSR) table for tests.

    The domain has ten discrete attributes d0..d9 (values a, b, c), ten
    continuous attributes c0..c9 and a discrete class "y" (values a, b, c).
    The fourth row stores no entries at all.
    """
    discrete = [data.DiscreteVariable("d%i" % i, values=list("abc"))
                for i in range(10)]
    continuous = [data.ContinuousVariable("c%i" % i) for i in range(10)]
    class_var = data.DiscreteVariable("y", values=list("abc"))
    domain = data.Domain(discrete + continuous, class_var)

    # Stored entries by column (blank = not stored):
    #
    # col:   1  2  3  4  5  6  8  9  13   14  16  17  18
    # ---------------------------------------------------
    # row 0: 2     2  1  1  2     1  1    1   2   0   2
    # row 1:    1  1  0  0  1  2          2   1   0
    # row 2:       1     2  0
    # row 3:
    # row 4:    2        0  1        1.1
    #
    sdata = np.array([
        2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,   # row 0
        1, 1, 0, 0, 1, 2, 2, 1, 0,         # row 1
        1, 2, 0,                           # row 2
        2, 0, 1, 1.1,                      # row 4 (row 3 is empty)
    ])
    indices = [
        1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
        2, 3, 4, 5, 6, 8, 14, 16, 17,
        3, 5, 6,
        2, 5, 6, 13,
    ]
    indptr = [0, 11, 20, 23, 23, 27]

    X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20))
    Y = np.array([[1, 2, 1, 0, 0]]).T
    return data.Table.from_numpy(domain, X, Y)
def _create_table(self):
    """Return a 3x3 table (discrete "A", continuous "B" and "C") with
    unknown values, including a last row that is entirely unknown.
    """
    missing = np.nan
    rows = [
        [1.0, missing, 0.0],
        [2.0, 1.0, 3.0],
        [missing, missing, missing],
    ]
    variables = (
        data.DiscreteVariable("A", values=("0", "1", "2")),
        data.ContinuousVariable("B"),
        data.ContinuousVariable("C"),
    )
    domain = data.Domain(variables)
    return data.Table.from_numpy(domain, np.array(rows))
def test_init_no_class_false(self):
    """A domain built with ``None`` as the class has no class variables
    and no metas; its variables are exactly the attributes.
    """
    attributes = (age, gender, income)
    domain = data.Domain(attributes, None)

    self.assertEqual(domain.variables, attributes)
    self.assertEqual(domain.attributes, attributes)
    self.assertEqual(domain.class_var, None)
    self.assertEqual(domain.class_vars, ())
    self.assertEqual(domain.metas, ())

    expected_indices = {"AGE": 0, "Gender": 1, "income": 2}
    self.assertEqual(domain.indices, expected_indices)
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    Returns a new table that keeps only the continuous attributes (class
    variable and metas are carried over).  ``X`` is replaced by the output
    of scikit-learn's ``Imputer`` and, unless sparse, passed through
    ``np.squeeze``.
    """
    continuous_attrs = [
        attr for attr in table.domain.attributes if attr.is_continuous
    ]
    new_domain = data.Domain(
        continuous_attrs, table.domain.class_var, table.domain.metas)
    new_data = data.Table(new_domain, table)

    imputer = skl_preprocessing.Imputer()
    new_data.X = imputer.fit_transform(new_data.X)

    # Sparse matrices are stored as they are; dense ones are squeezed.
    X = new_data.X
    if not sparse.issparse(X):
        X = np.squeeze(X)
    new_data.X = X
    return new_data
def setUpClass(cls):
    """Load the iris table and build a small two-column continuous table
    (with one unknown in the second column), then store the distributions
    of both columns in ``cls.n1`` and ``cls.n2``.
    """
    cls.iris = data.Table("iris")

    domain = data.Domain(attributes=[
        data.ContinuousVariable('n1'),
        data.ContinuousVariable('n2'),
    ])
    # Columns are written as rows and transposed into a 10x2 matrix.
    columns = np.array([
        range(10),
        [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9],
    ])
    cls.data = data.Table.from_numpy(domain, X=columns.T)

    cls.n1, cls.n2 = distribution.get_distributions(cls.data)
def prepare_dataset(components=((),), n=150):
    """Generate random data as linear combinations of ``components``.

    ``components`` is a 2-D array-like with one row per component; ``n``
    rows are produced by multiplying standard-normal coefficients with the
    components.  Returns ``(domain, matrix)`` where the domain holds one
    continuous feature "A<i>" per column.
    """
    if not isinstance(components, np.ndarray):
        components = np.array(components)
    n_components, n_columns = components.shape

    weights = np.random.normal(0., 1., (n, n_components))
    values = np.dot(weights, components)

    feature_vars = [feature.Continuous("A%d" % i) for i in range(n_columns)]
    domain = data.Domain(feature_vars, False)
    return domain, values
def test_var_from_domain(self):
    """``var_from_domain`` returns a foreign variable unchanged unless the
    second argument is True, in which case IndexError is expected;
    integer input with ``no_index=True`` must raise TypeError.
    """
    domain = data.Domain((age, gender, income), metas=(ssn, race))

    self.assertEqual(domain.var_from_domain(incomeA), incomeA)
    self.assertEqual(domain.var_from_domain(incomeA, False), incomeA)

    with self.assertRaises(IndexError):
        domain.var_from_domain(incomeA, True)

    for index in (1, -1):
        with self.assertRaises(TypeError):
            domain.var_from_domain(index, no_index=True)
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    An empty table is returned as is.  Otherwise the result keeps only the
    continuous attributes (class variables and metas are carried over) and
    is passed through ``SklImpute``.
    """
    if len(table) == 0:
        return table

    continuous_attrs = [
        attr for attr in table.domain.attributes if attr.is_continuous
    ]
    new_domain = data.Domain(
        continuous_attrs, table.domain.class_vars, table.domain.metas)
    converted = data.Table(new_domain, table)
    return SklImpute(converted)
def test_init_class_list(self):
    """A one-element list of class variables yields a single class
    variable that follows the attributes in ``variables``.
    """
    attributes = (age, gender, income)
    domain = data.Domain(attributes, [race])

    self.assertEqual(domain.variables, attributes + (race,))
    self.assertEqual(domain.attributes, attributes)
    self.assertEqual(domain.class_var, race)
    self.assertEqual(domain.class_vars, (race,))
    self.assertEqual(domain.metas, ())

    expected_indices = {"AGE": 0, "Gender": 1, "income": 2, "race": 3}
    self.assertEqual(domain.indices, expected_indices)
def test_init_metas(self):
    """Metas are stored separately and indexed with negative numbers.

    ``race`` is both the class variable and a meta here; the expected
    ``indices`` entry for it is the meta index (-2).
    """
    attributes = (age, gender, income)
    metas = (ssn, race)
    domain = data.Domain(attributes, race, metas=metas)

    self.assertEqual(domain.variables, attributes + (race, ))
    self.assertEqual(domain.attributes, attributes)
    self.assertEqual(domain.class_var, race)
    self.assertEqual(domain.class_vars, (race, ))
    self.assertEqual(domain.metas, metas)

    expected_indices = {
        "AGE": 0, "Gender": 1, "income": 2, "SSN": -1, "race": -2,
    }
    self.assertEqual(domain.indices, expected_indices)
def test_init_multi_class(self):
    """With two class variables ``class_var`` is None while ``class_vars``
    holds both, and both follow the attributes in ``variables``.
    """
    attributes = (age, gender, income)
    classes = (education, race)
    domain = data.Domain(attributes, classes)

    self.assertEqual(domain.variables, attributes + (education, race))
    self.assertEqual(domain.attributes, attributes)
    self.assertIsNone(domain.class_var)
    self.assertEqual(domain.class_vars, (education, race))
    self.assertEqual(domain.metas, ())

    expected_indices = {
        "AGE": 0, "Gender": 1, "income": 2, "education": 3, "race": 4,
    }
    self.assertEqual(domain.indices, expected_indices)
def prepareTable(self, rows, attr, vars, class_var_domain):
    """Prepare a domain and a random 0/1 attribute matrix.

    Sets ``self.domain`` to a domain with ``attr`` discrete attributes
    ("Feature <i>"), ``vars`` discrete class variables ("Class <i>", with
    values ``range(class_var_domain)``) and no metas, and ``self.x`` to a
    ``rows`` x ``attr`` integer matrix of random zeros and ones.
    """
    attr_vars = [
        data.DiscreteVariable(name="Feature %i" % i) for i in range(attr)
    ]
    class_vars = [
        data.DiscreteVariable(name="Class %i" % i,
                              values=range(class_var_domain))
        for i in range(vars)
    ]
    meta_vars = []
    self.domain = data.Domain(attr_vars, class_vars, meta_vars)

    # randint's upper bound is exclusive, so (0, 2) draws from {0, 1};
    # this replaces the deprecated np.random.random_integers(0, 1, ...).
    self.x = np.random.randint(0, 2, (rows, attr))
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    Returns a new table that keeps only attributes that are instances of
    ``data.ContinuousVariable`` plus the class variable.  ``X`` is replaced
    by the output of scikit-learn's ``Imputer`` and, unless sparse, passed
    through ``np.squeeze``.
    """
    continuous_attrs = []
    for attr in table.domain.attributes:
        if isinstance(attr, data.ContinuousVariable):
            continuous_attrs.append(attr)
    new_domain = data.Domain(continuous_attrs, table.domain.class_var)
    new_data = data.Table(new_domain, table)

    imputer = skl_preprocessing.Imputer()
    new_data.X = imputer.fit_transform(new_data.X)

    # Sparse matrices are stored as they are; dense ones are squeezed.
    X = new_data.X
    if not sparse.issparse(X):
        X = np.squeeze(X)
    new_data.X = X
    return new_data
def test_conversion(self):
    """``Domain.convert`` returns a (values, metas) pair.

    When the input list omits the metas, the expected metas are
    ``data.Unknown`` for the first two and ``None`` for the last; when
    they are supplied, they are converted as well.
    """
    domain = data.Domain([age, income], [race], [gender, education, ssn])
    expected_values = np.array([42, 13, 0])

    # Input without metas.
    values, metas = domain.convert([42, 13, "White"])
    assert_array_equal(values, expected_values)
    assert_array_equal(
        metas, np.array([data.Unknown, data.Unknown, None]))

    # Input with metas appended.
    full_row = [42, 13, "White", "M", "HS", "1234567"]
    values, metas = domain.convert(full_row)
    assert_array_equal(values, np.array([42, 13, 0]))
    assert_array_equal(metas, np.array([0, 1, "1234567"], dtype=object))
def test_get_item_error(self):
    """Indexing a domain with an out-of-range index, a foreign variable or
    an unknown name raises IndexError; a list key raises TypeError.
    """
    domain = data.Domain((age, gender, income), metas=(ssn, race))

    for bad_key in (3, -3, incomeA, "no_such_thing"):
        with self.assertRaises(IndexError):
            _ = domain[bad_key]

    with self.assertRaises(TypeError):
        _ = domain[[2]]
def test_index_error(self):
    """``Domain.index`` raises ValueError for an out-of-range index, a
    foreign variable or an unknown name, and TypeError for a list.
    """
    domain = data.Domain((age, gender, income), metas=(ssn, race))

    for bad_key in (3, -3, incomeA, "no_such_thing"):
        with self.assertRaises(ValueError):
            domain.index(bad_key)

    with self.assertRaises(TypeError):
        domain.index([2])
def test_default(self):
    """``impute.Default`` stores the default value on ``compute_value``,
    whether it is given to the constructor or as the ``default`` keyword
    of the call.
    """
    missing = np.nan
    rows = [
        [missing, 0.0],
        [1.0, 3.0],
        [missing, missing],
    ]
    domain = data.Domain((
        data.DiscreteVariable("B", values=("a", "b", "c")),
        data.ContinuousVariable("C"),
    ))
    table = data.Table.from_numpy(domain, np.array(rows))

    # Default passed to the constructor.
    from_constructor = impute.Default(42)(table, domain["C"])
    self.assertEqual(from_constructor.compute_value.value, 42)

    # Default passed at call time.
    from_call = impute.Default()(table, domain["C"], default=42)
    self.assertEqual(from_call.compute_value.value, 42)
def setUp(self):
    """Prepare reference frequencies (absolute and relative) and a small
    two-column discrete table with unknowns, and store the distributions
    of its columns in ``self.rgb`` and ``self.num``.
    """
    self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
    total = sum(self.freqs)
    self.rfreqs = [freq / total for freq in self.freqs]

    domain = data.Domain(attributes=[
        data.DiscreteVariable('rgb', values=('r', 'g', 'b', 'a')),
        data.DiscreteVariable('num', values=('1', '2', '3')),
    ])
    # Columns are written as rows and transposed into an 8x2 matrix.
    columns = np.array([
        [0, 2, 0, 1, 1, 0, np.nan, 1],
        [0, 2, 0, np.nan, 1, 2, np.nan, 1],
    ])
    self.data = data.Table.from_numpy(domain, X=columns.T)

    self.rgb, self.num = distribution.get_distributions(self.data)
def test_get_item(self):
    """A domain can be indexed by variable, by name or by integer index;
    metas use negative indices.
    """
    domain = data.Domain((age, gender, income), metas=(ssn, race))

    lookups = (
        (age, age), ("AGE", age), (0, age),
        (income, income), ("income", income), (2, income),
        (ssn, ssn), ("SSN", ssn), (-1, ssn),
        (-2, race),
    )
    for key, expected in lookups:
        self.assertEqual(domain[key], expected)
def test_index(self):
    """``Domain.index`` accepts a variable, a name or an integer index and
    returns the integer index; metas map to negative indices.
    """
    domain = data.Domain((age, gender, income), metas=(ssn, race))

    lookups = (
        (age, 0), ("AGE", 0), (0, 0),
        (income, 2), ("income", 2), (2, 2),
        (ssn, -1), ("SSN", -1), (-1, -1),
        (-2, -2),
    )
    for key, expected in lookups:
        self.assertEqual(domain.index(key), expected)