示例#1
0
    def test_replacement(self):
        # AsValue imputation is checked per variable first (discrete "A",
        # then continuous "B") and finally on the fully converted table.
        missing = np.nan
        rows = [
            [1.0, missing, 0.0],
            [2.0, 1.0, 3.0],
            [missing, missing, missing],
        ]
        source_domain = data.Domain((
            data.DiscreteVariable("A", values=["0", "1", "2"]),
            data.ContinuousVariable("B"),
            data.ContinuousVariable("C"),
        ))
        table = data.Table.from_numpy(source_domain, np.array(rows))

        # Discrete variable: a single replacement variable comes back.
        var_a = impute.AsValue()(table, source_domain[0])
        self.assertTrue(np.all(np.isfinite(var_a.compute_value(table))))
        self.assertTrue(np.all(var_a.compute_value(table) == [1., 2., 3.]))
        labels_a = [var_a.str_val(v) for v in var_a.compute_value(table)]
        self.assertEqual(labels_a, ["1", "2", "N/A"])

        # Continuous variable: a pair comes back; the second member tells
        # whether the original value was defined.
        filled_b, defined_b = impute.AsValue()(table, source_domain[1])
        self.assertTrue(np.all(np.isfinite(filled_b.compute_value(table))))
        self.assertTrue(np.all(np.isfinite(defined_b.compute_value(table))))
        self.assertTrue(
            np.all(defined_b.compute_value(table) == [0., 1., 0.]))
        labels_b = [defined_b.str_val(v)
                    for v in defined_b.compute_value(table)]
        self.assertEqual(labels_b, ["undef", "def", "undef"])

        # Impute every variable, flattening the (value, indicator) pairs.
        imputed = [impute.AsValue()(table, var) for var in table.domain]
        flat_vars = []
        for item in imputed:
            if isinstance(item, (tuple, list)):
                flat_vars.extend(item)
            else:
                flat_vars.append(item)
        imputed_domain = data.Domain(flat_vars)
        idata = table.from_table(imputed_domain, table)

        expected = [
            [1, 1.0, 0, 0.0, 1],
            [2, 1.0, 1, 3.0, 1],
            [3, 1.0, 0, 1.5, 0],
        ]
        np.testing.assert_allclose(idata.X, expected)
示例#2
0
    def test_leave_discrete(self):
        # The same expectations are checked twice: once with a continuous
        # and once with a discrete class variable "d". Discrete attributes
        # and the class variable must be the very same objects afterwards;
        # the continuous attribute "b" gets the cut points below.
        halves = np.array([0] * 50 + [1] * 50).reshape((100, 1))
        ramp = np.arange(100).reshape((100, 1))
        ones = np.ones((100, 1))
        X = np.hstack([halves, ramp, ones])

        for class_type in (data.ContinuousVariable, data.DiscreteVariable):
            domain = data.Domain([
                data.DiscreteVariable("a", values="MF"),
                data.ContinuousVariable("b"),
                data.DiscreteVariable("c", values="AB"),
            ], class_type("d"))
            table = data.Table(domain, X, halves)
            dom = discretize.DomainDiscretizer(table)

            self.assertIs(dom[0], table.domain[0])
            self.assertEqual(dom[1].compute_value.points,
                             [24.5, 49.5, 74.5])
            self.assertIs(dom[2], table.domain[2])
            self.assertIs(dom.class_var, table.domain.class_var)
示例#3
0
 def test_wrong_vartypes(self):
     # Domain construction is expected to raise TypeError whenever ssn is
     # used as the class variable or placed among the attributes.
     attrs = (age, gender, income)
     rejected_calls = (
         (attrs, ssn),
         (attrs + (ssn,),),
         ((ssn,) + attrs,),
     )
     for call_args in rejected_calls:
         with self.assertRaises(TypeError):
             data.Domain(*call_args)
示例#4
0
    def test_iter(self):
        # Iterating a domain yields its attributes only; the meta (ssn)
        # never shows up, whatever the number of attributes.
        for attrs in ((age, gender, income), (age,), ()):
            domain = data.Domain(attrs, metas=(ssn,))
            self.assertEqual([var for var in domain], list(attrs))
示例#5
0
    def test_conversion_size(self):
        # Four variables, no metas: lists of 3 or 5 values are rejected.
        domain = data.Domain([age, gender, income], [race])
        for size in (3, 5):
            with self.assertRaises(ValueError):
                domain.convert([0] * size)

        # Three variables plus three metas: only lengths 3 and 6 pass.
        domain = data.Domain([age, income], [race],
                             [gender, education, ssn])
        for size in (2, 4, 7):
            with self.assertRaises(ValueError):
                domain.convert([0] * size)
        for size in (3, 6):
            domain.convert([0] * size)
示例#6
0
    def test_get_conversion(self):
        # Checks the index mappings returned by Domain.get_conversion.
        # Each mapping lists, per target variable, its position in the
        # source domain: non-negative for attributes/classes, negative for
        # metas. The last section shows that a variable missing from the
        # source maps to None, or to its get_value_from when one is set.
        from unittest import mock

        d = data.Domain((age, gender, income), metas=(ssn, race))
        e = data.Domain((gender, race), None, metas=(age, gender, ssn))
        f = data.Domain((gender,), (race, income), metas=(age, income, ssn))
        g = data.Domain((), metas=(age, gender, ssn))

        d_to_e = e.get_conversion(d)
        self.assertIs(d_to_e.source, d)
        self.assertEqual(d_to_e.attributes, [1, -2])
        self.assertEqual(d_to_e.class_vars, [])
        self.assertEqual(d_to_e.metas, [0, 1, -1])

        # NOTE(review): the repeated lookups below presumably exercise a
        # cache of recent conversions -- confirm against Domain.
        d_to_e = e.get_conversion(d)
        self.assertIs(d_to_e.source, d)
        self.assertEqual(d_to_e.attributes, [1, -2])
        self.assertEqual(d_to_e.class_vars, [])
        self.assertEqual(d_to_e.metas, [0, 1, -1])

        d_to_f = f.get_conversion(d)
        self.assertIs(d_to_f.source, d)
        self.assertEqual(d_to_f.attributes, [1])
        self.assertEqual(d_to_f.class_vars, [-2, 2])
        self.assertEqual(d_to_f.metas, [0, 2, -1])

        d_to_e = e.get_conversion(d)
        self.assertIs(d_to_e.source, d)
        self.assertEqual(d_to_e.attributes, [1, -2])
        self.assertEqual(d_to_e.class_vars, [])
        self.assertEqual(d_to_e.metas, [0, 1, -1])

        d_to_f = f.get_conversion(d)
        self.assertIs(d_to_f.source, d)
        self.assertEqual(d_to_f.attributes, [1])
        self.assertEqual(d_to_f.class_vars, [-2, 2])
        self.assertEqual(d_to_f.metas, [0, 2, -1])

        f_to_g = g.get_conversion(f)
        self.assertIs(f_to_g.source, f)
        self.assertEqual(f_to_g.attributes, [])
        self.assertEqual(f_to_g.class_vars, [])
        self.assertEqual(f_to_g.metas, [-1, 0, -3])

        def x():
            return 42

        # `income` is a fixture shared with other tests, so the attribute
        # is patched only for this section and restored afterwards instead
        # of being overwritten permanently.
        with mock.patch.object(income, "get_value_from", x):
            g_to_f = f.get_conversion(g)
            self.assertIs(g_to_f.source, g)
            self.assertEqual(g_to_f.attributes, [-2])
            self.assertEqual(g_to_f.class_vars, [None, x])
            self.assertEqual(g_to_f.metas, [-1, x, -3])
    def test_converts_input_domain_if_needed(self):
        # The projector is fitted on self.dataset and then applied to a
        # one-row table whose domain holds only the first five features;
        # the call simply has to succeed.
        self.create_normal_dataset()
        projector = linear.Pca(variance_covered=.99)(self.dataset)

        reduced_domain = data.Domain(self.dataset.domain.features[:5])
        single_row = [[1., 2., 3., 4., 5.]]
        new_examples = data.Table(reduced_domain, single_row)

        projector(new_examples)
    def create_dataset_with_classes(self):
        """Store in ``self.dataset`` a random table with class columns.

        The features come from ``prepare_dataset`` with one random
        10-element component; the domain gets a class "C" and four extra
        class variables "MC0".."MC3", and five random columns are appended
        to the feature matrix.
        """
        component = [random.randint(0, 5) for _ in range(10)]
        base_domain, features = prepare_dataset(components=[component])

        main_class = feature.Discrete("C", values=["F", "T"])
        extra_classes = [
            feature.Discrete("MC%i" % i, values=["F", "T"])
            for i in range(4)
        ]
        domain = data.Domain(base_domain.features, main_class,
                             class_vars=extra_classes)

        class_columns = np.random.random((len(features), 5))
        self.dataset = data.Table(domain,
                                  np.hstack((features, class_columns)))
    def test_replacement(self):
        # AsValue imputation on the table built by _create_table: checked
        # for the first two variables, then on the converted table.
        table = self._create_table()
        source_domain = table.domain

        # First variable: a single replacement variable comes back.
        var_a = impute.AsValue()(table, source_domain[0])
        self.assertTrue(np.all(np.isfinite(var_a.compute_value(table))))
        self.assertTrue(np.all(var_a.compute_value(table) == [1., 2., 3.]))
        labels_a = [var_a.str_val(v) for v in var_a.compute_value(table)]
        self.assertEqual(labels_a, ["1", "2", "N/A"])

        # Second variable: a pair comes back; the second member tells
        # whether the original value was defined.
        filled_b, defined_b = impute.AsValue()(table, source_domain[1])
        self.assertTrue(np.all(np.isfinite(filled_b.compute_value(table))))
        self.assertTrue(np.all(np.isfinite(defined_b.compute_value(table))))
        self.assertTrue(
            np.all(defined_b.compute_value(table) == [0., 1., 0.]))
        labels_b = [defined_b.str_val(v)
                    for v in defined_b.compute_value(table)]
        self.assertEqual(labels_b, ["undef", "def", "undef"])

        # Impute every variable, flattening the (value, indicator) pairs.
        imputed = [impute.AsValue()(table, var)
                   for var in table.domain.variables]
        flat_vars = []
        for item in imputed:
            if isinstance(item, (tuple, list)):
                flat_vars.extend(item)
            else:
                flat_vars.append(item)
        imputed_domain = data.Domain(flat_vars)
        idata = table.from_table(imputed_domain, table)

        expected = [
            [1, 1.0, 0, 0.0, 1],
            [2, 1.0, 1, 3.0, 1],
            [3, 1.0, 0, 1.5, 0],
        ]
        np.testing.assert_allclose(idata.X, expected)
示例#10
0
    def test_replacement(self):
        # Random imputation must leave no unknowns in its output while the
        # source table and the already-known values stay untouched.
        missing = np.nan
        rows = [
            [1.0, missing, 0.0],
            [2.0, 1.0, 3.0],
            [missing, missing, missing],
        ]
        unknowns = np.isnan(rows)

        var_a = data.DiscreteVariable("A", values=["0", "1", "2"])
        var_b = data.ContinuousVariable("B")
        var_c = data.ContinuousVariable("C")
        domain = data.Domain((var_a, var_b, var_c))
        table = data.Table.from_numpy(domain, np.array(rows))

        # Per-variable imputation: every computed column is finite.
        replacements = [impute.Random()(table, domain[i]) for i in range(3)]
        for replacement in replacements:
            column = replacement.compute_value(table)
            self.assertTrue(np.all(np.isfinite(column)))

        # Whole-table imputation through the preprocessor.
        itable = preprocess.Impute(method=impute.Random())(table)
        self.assertTrue(np.all(np.isfinite(itable.X)))

        # Original data should keep unknowns
        self.assertTrue(np.all(unknowns == np.isnan(table.X)))
        known = ~unknowns
        self.assertTrue(np.all(itable.X[known] == table.X[known]))
示例#11
0
    def _construct_sparse():
        """Return a small 5-row table whose X is a 5x20 CSR matrix.

        The domain has ten discrete attributes "d0".."d9" (values a, b, c),
        ten continuous attributes "c0".."c9" and a discrete class "y"
        (values a, b, c). The function takes no arguments; it is presumably
        wrapped as a static method outside this view -- confirm.
        """
        domain = data.Domain([
            data.DiscreteVariable("d%i" % i, values=list("abc"))
            for i in range(10)
        ] + [data.ContinuousVariable("c%i" % i) for i in range(10)],
                             data.DiscreteVariable("y", values=list("abc")))

        # Hand-drawn layout of the stored entries; the header row gives the
        # column index.
        #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
        #------------------------------------------------------------
        #     2     2  1  1  2        1           1  1     2  0  2
        #        1  1  0  0  1     2                 2     1  0
        #           1     2  0
        #
        #        2        0  1                   1.1
        #
        # CSR triplet: 27 stored values (some of them explicit zeros) ...
        sdata = np.array([
            2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 0, 1, 2,
            0, 2, 0, 1, 1.1
        ])
        # ... their column indices ...
        indices = [
            1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18, 2, 3, 4, 5, 6, 8, 14, 16, 17,
            3, 5, 6, 2, 5, 6, 13
        ]
        # ... and row boundaries: rows hold 11, 9, 3, 0 and 4 entries, so
        # the fourth row is entirely empty.
        indptr = [0, 11, 20, 23, 23, 27]
        X = sp.csr_matrix((sdata, indices, indptr), shape=(5, 20))
        # Class values as a 5x1 column.
        Y = np.array([[1, 2, 1, 0, 0]]).T
        return data.Table.from_numpy(domain, X, Y)
 def _create_table(self):
     """Return a 3x3 table (discrete "A", continuous "B" and "C") with
     several missing values, the last row being entirely unknown."""
     missing = np.nan
     rows = [
         [1.0, missing, 0.0],
         [2.0, 1.0, 3.0],
         [missing, missing, missing],
     ]
     var_a = data.DiscreteVariable("A", values=("0", "1", "2"))
     var_b = data.ContinuousVariable("B")
     var_c = data.ContinuousVariable("C")
     return data.Table.from_numpy(
         data.Domain((var_a, var_b, var_c)), np.array(rows))
示例#13
0
 def test_init_no_class_false(self):
     # Passing None as the class yields a class-less domain: no class
     # variable(s), no metas, and indices covering the attributes only.
     attributes = (age, gender, income)
     d = data.Domain(attributes, None)
     self.assertEqual(d.variables, attributes)
     self.assertEqual(d.attributes, attributes)
     # Identity check for None, as in test_init_multi_class.
     self.assertIsNone(d.class_var)
     self.assertEqual(d.class_vars, ())
     self.assertEqual(d.metas, ())
     self.assertEqual(d.indices, {"AGE": 0, "Gender": 1, "income": 2})
示例#14
0
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    An empty table is returned unchanged, because there is nothing to fit
    the imputer on. Otherwise a new table is built that keeps the
    continuous attributes, all class variables and the metas of `table`,
    and its X is replaced by the imputer's output.

    :param table: the table to preprocess
    :return: `table` itself when it is empty, else the new imputed table
    """
    if not len(table):
        return table
    continuous_attrs = [
        a for a in table.domain.attributes if a.is_continuous]
    # class_vars (not class_var) so that multi-target tables keep their
    # class variables as well.
    new_domain = data.Domain(
        continuous_attrs, table.domain.class_vars, table.domain.metas)
    new_data = data.Table(new_domain, table)
    imputed = skl_preprocessing.Imputer().fit_transform(new_data.X)
    # NOTE(review): squeeze turns a dense single-row or single-column X
    # into a 1-D array; kept as-is for callers -- confirm it is intended.
    new_data.X = imputed if sparse.issparse(imputed) else np.squeeze(imputed)
    return new_data
示例#15
0
    def setUpClass(cls):
        # Shared fixtures: the "iris" table, a 10x2 numeric table (n2 has
        # one missing value) and the distributions of its two columns.
        cls.iris = data.Table("iris")

        numeric_domain = data.Domain(attributes=[
            data.ContinuousVariable('n1'),
            data.ContinuousVariable('n2'),
        ])
        n1_values = range(10)
        n2_values = [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]
        cls.data = data.Table.from_numpy(
            numeric_domain, X=np.array([n1_values, n2_values]).T)
        cls.n1, cls.n2 = distribution.get_distributions(cls.data)
示例#16
0
def prepare_dataset(components=((),), n=150):
    """Generate random data as linear combinations of `components`.

    `n` rows of normally distributed coefficients are multiplied with the
    component matrix; the result is returned with a domain of continuous
    features "A0", "A1", ... (one per component column).

    :param components: 2-D array-like, one component per row
    :param n: number of rows to generate
    :return: tuple ``(domain, values)``
    """
    if not isinstance(components, np.ndarray):
        components = np.array(components)

    n_components, n_features = components.shape
    weights = np.random.normal(0., 1., (n, n_components))
    values = np.dot(weights, components)

    attributes = [feature.Continuous("A%d" % i) for i in range(n_features)]
    return data.Domain(attributes, False), values
示例#17
0
 def test_var_from_domain(self):
     # incomeA is returned as-is unless the check flag is True, in which
     # case IndexError is expected; integers are rejected with TypeError
     # when no_index is set.
     domain = data.Domain((age, gender, income), metas=(ssn, race))
     for call_args in ((incomeA,), (incomeA, False)):
         self.assertEqual(domain.var_from_domain(*call_args), incomeA)
     with self.assertRaises(IndexError):
         domain.var_from_domain(incomeA, True)
     for position in (1, -1):
         with self.assertRaises(TypeError):
             domain.var_from_domain(position, no_index=True)
示例#18
0
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    Empty tables are returned unchanged.
    """
    if len(table) == 0:
        return table

    numeric_attrs = []
    for attr in table.domain.attributes:
        if attr.is_continuous:
            numeric_attrs.append(attr)

    reduced_domain = data.Domain(
        numeric_attrs, table.domain.class_vars, table.domain.metas)
    return SklImpute(data.Table(reduced_domain, table))
示例#19
0
 def test_init_class_list(self):
     # A one-element class list behaves like a single class variable.
     attrs = (age, gender, income)
     domain = data.Domain(attrs, [race])
     expected_indices = {"AGE": 0, "Gender": 1, "income": 2, "race": 3}

     self.assertEqual(domain.variables, attrs + (race,))
     self.assertEqual(domain.attributes, attrs)
     self.assertEqual(domain.class_var, race)
     self.assertEqual(domain.class_vars, (race,))
     self.assertEqual(domain.metas, ())
     self.assertEqual(domain.indices, expected_indices)
示例#20
0
 def test_init_metas(self):
     # race is both the class and a meta here; the expected indices give
     # metas negative positions and "race" resolves to its meta slot.
     attrs = (age, gender, income)
     meta_vars = (ssn, race)
     domain = data.Domain(attrs, race, metas=meta_vars)
     expected_indices = {
         "AGE": 0, "Gender": 1, "income": 2, "SSN": -1, "race": -2,
     }

     self.assertEqual(domain.variables, attrs + (race, ))
     self.assertEqual(domain.attributes, attrs)
     self.assertEqual(domain.class_var, race)
     self.assertEqual(domain.class_vars, (race, ))
     self.assertEqual(domain.metas, meta_vars)
     self.assertEqual(domain.indices, expected_indices)
示例#21
0
 def test_init_multi_class(self):
     # With two class variables there is no single class_var.
     attrs = (age, gender, income)
     classes = (education, race)
     domain = data.Domain(attrs, classes)
     expected_indices = {
         "AGE": 0, "Gender": 1, "income": 2, "education": 3, "race": 4,
     }

     self.assertEqual(domain.variables, attrs + (education, race))
     self.assertEqual(domain.attributes, attrs)
     self.assertIsNone(domain.class_var)
     self.assertEqual(domain.class_vars, (education, race))
     self.assertEqual(domain.metas, ())
     self.assertEqual(domain.indices, expected_indices)
示例#22
0
 def prepareTable(self, rows, attr, vars, class_var_domain):
     """Set up ``self.domain`` and a random 0/1 matrix ``self.x``.

     :param rows: number of rows of ``self.x``
     :param attr: number of discrete features ("Feature 0", ...); also
         the number of columns of ``self.x``
     :param vars: number of class variables ("Class 0", ...)
     :param class_var_domain: each class variable is created with
         ``values=range(class_var_domain)``
     """
     attributes = ["Feature %i" % i for i in range(attr)]
     classes = ["Class %i" % i for i in range(vars)]
     attr_vars = [data.DiscreteVariable(name=a) for a in attributes]
     class_vars = [
         data.DiscreteVariable(name=c, values=range(class_var_domain))
         for c in classes
     ]
     meta_vars = []
     self.domain = data.Domain(attr_vars, class_vars, meta_vars)
     # randint excludes its upper bound, so (0, 2) draws from {0, 1} just
     # like the deprecated random_integers(0, 1) did.
     self.x = np.random.randint(0, 2, (rows, attr))
示例#23
0
def _preprocess(table):
    """Remove categorical attributes and impute missing values.

    Only ContinuousVariable attributes and the class variable are kept;
    a dense result is additionally passed through ``np.squeeze``.
    """
    continuous_attrs = []
    for attr in table.domain.attributes:
        if isinstance(attr, data.ContinuousVariable):
            continuous_attrs.append(attr)

    reduced_domain = data.Domain(continuous_attrs, table.domain.class_var)
    new_data = data.Table(reduced_domain, table)
    new_data.X = skl_preprocessing.Imputer().fit_transform(new_data.X)

    imputed = new_data.X
    new_data.X = imputed if sparse.issparse(imputed) else np.squeeze(imputed)
    return new_data
示例#24
0
    def test_conversion(self):
        # convert() splits a flat value list into (values, metas); when
        # the meta values are left out they come back as unknowns / None.
        domain = data.Domain([age, income], [race],
                             [gender, education, ssn])
        expected_values = np.array([42, 13, 0])

        short_row = [42, 13, "White"]
        values, metas = domain.convert(short_row)
        assert_array_equal(values, expected_values)
        assert_array_equal(
            metas, np.array([data.Unknown, data.Unknown, None]))

        full_row = short_row + ["M", "HS", "1234567"]
        values, metas = domain.convert(full_row)
        assert_array_equal(values, expected_values)
        assert_array_equal(metas, np.array([0, 1, "1234567"], dtype=object))
示例#25
0
 def test_get_item_error(self):
     # Out-of-range positions, foreign variables and unknown names raise
     # IndexError; a list used as key raises TypeError.
     domain = data.Domain((age, gender, income), metas=(ssn, race))
     for missing_key in (3, -3, incomeA, "no_such_thing"):
         with self.assertRaises(IndexError):
             _ = domain[missing_key]
     with self.assertRaises(TypeError):
         _ = domain[[2]]
示例#26
0
 def test_index_error(self):
     # Out-of-range positions, foreign variables and unknown names raise
     # ValueError; a list argument raises TypeError.
     domain = data.Domain((age, gender, income), metas=(ssn, race))
     for missing_key in (3, -3, incomeA, "no_such_thing"):
         with self.assertRaises(ValueError):
             domain.index(missing_key)
     with self.assertRaises(TypeError):
         domain.index([2])
    def test_default(self):
        # The default value can be supplied to the constructor or at call
        # time; either way it ends up in compute_value.value.
        missing = np.nan
        rows = [[missing, 0.0], [1.0, 3.0], [missing, missing]]
        var_b = data.DiscreteVariable("B", values=("a", "b", "c"))
        var_c = data.ContinuousVariable("C")
        domain = data.Domain((var_b, var_c))
        table = data.Table.from_numpy(domain, np.array(rows))

        from_init = impute.Default(42)(table, domain["C"])
        self.assertEqual(from_init.compute_value.value, 42)

        from_call = impute.Default()(table, domain["C"], default=42)
        self.assertEqual(from_call.compute_value.value, 42)
示例#28
0
    def setUp(self):
        # Fixtures: raw and relative frequencies, an 8x2 table of two
        # discrete columns with missing values, and its distributions.
        self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
        total = sum(self.freqs)
        self.rfreqs = [freq / total for freq in self.freqs]

        discrete_domain = data.Domain(attributes=[
            data.DiscreteVariable('rgb', values=('r', 'g', 'b', 'a')),
            data.DiscreteVariable('num', values=('1', '2', '3')),
        ])
        rgb_column = [0, 2, 0, 1, 1, 0, np.nan, 1]
        num_column = [0, 2, 0, np.nan, 1, 2, np.nan, 1]
        self.data = data.Table.from_numpy(
            discrete_domain, X=np.array([rgb_column, num_column]).T)
        self.rgb, self.num = distribution.get_distributions(self.data)
示例#29
0
    def test_get_item(self):
        # Each variable must be reachable by itself, by name and by
        # position (negative positions address metas).
        domain = data.Domain((age, gender, income), metas=(ssn, race))
        lookups = (
            (age, "AGE", 0),
            (income, "income", 2),
            (ssn, "SSN", -1),
        )
        for variable, name, position in lookups:
            for key in (variable, name, position):
                self.assertEqual(domain[key], variable)

        self.assertEqual(domain[-2], race)
示例#30
0
    def test_index(self):
        # index() accepts a variable, a name or a position and returns
        # the position (negative for metas).
        domain = data.Domain((age, gender, income), metas=(ssn, race))
        lookups = (
            (age, "AGE", 0),
            (income, "income", 2),
            (ssn, "SSN", -1),
        )
        for variable, name, position in lookups:
            for key in (variable, name, position):
                self.assertEqual(domain.index(key), position)

        self.assertEqual(domain.index(-2), -2)