Exemplo n.º 1
0
    def test_get_distributions(self):
        """Check `get_distributions` on the "iris" table.

        Five distributions are expected: the first four continuous and the
        last one discrete. The distribution at index 2 is compared against
        a hard-coded table of values and counts, and the last one against
        [50, 50, 50].
        """
        table = data.Table("iris")
        dists = distribution.get_distributions(table)

        self.assertEqual(len(dists), 5)
        for position in range(4):
            self.assertIsInstance(dists[position], distribution.Continuous)
        self.assertIsInstance(dists[-1], distribution.Discrete)

        # (value, count) pairs expected for the variable at index 2.
        value_counts = [
            (1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12),
            (1.5, 14), (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1),
            (3.3, 2), (3.5, 2), (3.6, 1), (3.7, 1), (3.8, 1),
            (3.9, 3), (4.0, 5), (4.1, 3), (4.2, 4), (4.3, 2),
            (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5), (4.8, 4),
            (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
            (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3),
            (5.9, 2), (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1),
            (6.6, 1), (6.7, 2), (6.9, 1),
        ]
        # Transposed so that row 0 holds the values and row 1 the counts.
        expected = np.array(value_counts).T
        np.testing.assert_almost_equal(dists[2], expected)
        np.testing.assert_almost_equal(dists[-1], [50, 50, 50])
Exemplo n.º 2
0
    def test_distributions(self):
        """Check distributions computed over an `SqlTable`.

        Five distributions are expected. The first one must report a
        minimum of 4.3, a maximum of 7.9 and a mean equal to 5.8 when
        compared to one decimal place.
        """
        table = SqlTable(self.conn, self.iris, inspect_values=True)
        all_dists = get_distributions(table)
        self.assertEqual(len(all_dists), 5)

        first = all_dists[0]
        self.assertAlmostEqual(first.min(), 4.3)
        self.assertAlmostEqual(first.max(), 7.9)
        # The mean is only compared to one decimal place.
        self.assertAlmostEqual(first.mean(), 5.8, places=1)
Exemplo n.º 3
0
    def test_distributions(self):
        """Verify summary statistics of the first SQL-table distribution.

        Builds an `SqlTable` from `self.conn` and `self.iris`, expects five
        distributions, and checks min, max and mean of the first one.
        """
        sql_iris = SqlTable(self.conn, self.iris, inspect_values=True)
        computed = get_distributions(sql_iris)
        self.assertEqual(len(computed), 5)

        head = computed[0]
        # (statistic name, expected value, decimal places); `None` keeps
        # the default precision of `assertAlmostEqual`.
        checks = (
            ("min", 4.3, None),
            ("max", 7.9, None),
            ("mean", 5.8, 1),
        )
        for statistic, expected, places in checks:
            self.assertAlmostEqual(getattr(head, statistic)(), expected, places)
Exemplo n.º 4
0
    def test_get_distributions(self):
        """Verify the distributions computed for the "iris" table.

        The result must contain five entries (four `Continuous`, then one
        `Discrete`). Entry 2 is compared with a hard-coded 2-row table
        (values on top, counts below) and the last entry with
        [50, 50, 50].
        """
        iris = data.Table("iris")
        computed = distribution.get_distributions(iris)

        self.assertEqual(len(computed), 5)
        for idx in range(4):
            self.assertIsInstance(computed[idx], distribution.Continuous)
        self.assertIsInstance(computed[-1], distribution.Discrete)

        # Expected values for entry 2 and, aligned below, their counts.
        values = [
            1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.9, 3.0,
            3.3, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3,
            4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3,
            5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.3, 6.4,
            6.6, 6.7, 6.9,
        ]
        counts = [
            1, 1, 2, 7, 12, 14, 7, 4, 2, 1,
            2, 2, 1, 1, 1, 3, 5, 3, 4, 2,
            4, 8, 3, 5, 4, 5, 4, 8, 2, 2,
            2, 3, 6, 3, 3, 2, 2, 3, 1, 1,
            1, 2, 1,
        ]
        expected = np.array([values, counts])
        np.testing.assert_almost_equal(computed[2], expected)
        np.testing.assert_almost_equal(computed[-1], [50, 50, 50])
Exemplo n.º 5
0
    def setUpClass(cls):
        """Create the fixtures shared by the tests of this class.

        Stores the "iris" table, a two-column continuous table (`n1` holds
        0..9, `n2` holds a few repeated values and one NaN) and the
        distributions of both columns as `cls.n1` and `cls.n2`.
        """
        cls.iris = data.Table("iris")

        variables = [data.ContinuousVariable(name) for name in ('n1', 'n2')]
        # One inner sequence per variable; transposed below into columns.
        per_variable = [range(10), [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]]
        cls.data = data.Table.from_numpy(
            data.Domain(attributes=variables),
            X=np.array(per_variable).T)
        cls.n1, cls.n2 = distribution.get_distributions(cls.data)
Exemplo n.º 6
0
 def __call__(self, data):
     """
     Return `data` converted to a domain with normalized variables.

     Each attribute is passed to `self.normalize` together with its
     distribution. Class variables get the same treatment only when
     `self.transform_class` is set; metas are kept as they are.
     """
     dists = distribution.get_distributions(data)
     source = data.domain
     new_attrs = [self.normalize(dists[idx], attr)
                  for idx, attr in enumerate(source.attributes)]
     if self.transform_class:
         # Distributions of class variables are indexed right after
         # those of the attributes.
         first_class = len(source.attributes)
         new_class_vars = [
             self.normalize(dists[idx], class_var)
             for idx, class_var in enumerate(source.class_vars,
                                             start=first_class)]
     else:
         new_class_vars = source.class_vars
     target = Domain(new_attrs, new_class_vars, source.metas)
     return data.from_table(target, data)
Exemplo n.º 7
0
    def setUpClass(cls):
        """Prepare class-level fixtures.

        Loads the "iris" table, builds a small table with two continuous
        variables named `n1` and `n2` (the second containing one NaN), and
        stores their distributions in `cls.n1` and `cls.n2`.
        """
        cls.iris = data.Table("iris")

        domain = data.Domain(attributes=[
            data.ContinuousVariable('n1'),
            data.ContinuousVariable('n2'),
        ])
        n1_values = range(10)
        n2_values = [1, 1, 1, 5, 5, 8, 9, np.nan, 9, 9]
        # Rows of the array are variables, so transpose to get columns.
        matrix = np.array([n1_values, n2_values]).T
        cls.data = data.Table.from_numpy(domain, X=matrix)
        cls.n1, cls.n2 = distribution.get_distributions(cls.data)
Exemplo n.º 8
0
    def setUp(self):
        """Build per-test fixtures.

        Sets raw frequencies (`self.freqs`) and the same numbers divided
        by their sum (`self.rfreqs`), a table with two discrete variables
        (`rgb` and `num`) whose columns contain NaN entries, and the
        distributions of both variables (`self.rgb`, `self.num`).
        """
        self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
        total = sum(self.freqs)
        self.rfreqs = [freq / total for freq in self.freqs]

        rgb_var = data.DiscreteVariable('rgb', values=('r', 'g', 'b', 'a'))
        num_var = data.DiscreteVariable('num', values=('1', '2', '3'))
        # One row per variable; transposed into columns when passed on.
        per_variable = np.array([
            [0, 2, 0, 1, 1, 0, np.nan, 1],
            [0, 2, 0, np.nan, 1, 2, np.nan, 1],
        ])
        self.data = data.Table.from_numpy(
            data.Domain(attributes=[rgb_var, num_var]),
            X=per_variable.T)
        self.rgb, self.num = distribution.get_distributions(self.data)
Exemplo n.º 9
0
    def setUp(self):
        """Create the frequency lists, the discrete table and its distributions.

        `self.rfreqs` holds `self.freqs` divided by their sum. The table
        has two discrete variables, `rgb` and `num` (the latter created
        with `ordered=True`), and NaN entries in both columns.
        """
        self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
        freq_sum = sum(self.freqs)
        self.rfreqs = [value / freq_sum for value in self.freqs]

        domain = data.Domain(attributes=[
            data.DiscreteVariable("rgb", values=["r", "g", "b", "a"]),
            data.DiscreteVariable("num", values=["1", "2", "3"], ordered=True),
        ])
        rgb_column = [0, 2, 0, 1, 1, 0, np.nan, 1]
        num_column = [0, 2, 0, np.nan, 1, 2, np.nan, 1]
        self.data = data.Table.from_numpy(
            domain,
            X=np.array([rgb_column, num_column]).T,
        )
        self.rgb, self.num = distribution.get_distributions(self.data)
Exemplo n.º 10
0
    def setUp(self):
        """Set up frequencies, a two-variable discrete table and its distributions.

        The relative frequencies in `self.rfreqs` are `self.freqs` scaled
        by their total. `self.rgb` and `self.num` receive the
        distributions computed for the table's two variables.
        """
        self.freqs = [4.0, 20.0, 13.0, 8.0, 10.0, 41.0, 5.0]
        scale = sum(self.freqs)
        self.rfreqs = [count / scale for count in self.freqs]

        attributes = [
            data.DiscreteVariable('rgb', values=['r', 'g', 'b', 'a']),
            data.DiscreteVariable('num', values=['1', '2', '3'], ordered=True),
        ]
        # Written variable-by-variable, hence the transpose.
        values = np.array([
            [0, 2, 0, 1, 1, 0, np.nan, 1],
            [0, 2, 0, np.nan, 1, 2, np.nan, 1],
        ]).T
        self.data = data.Table.from_numpy(
            data.Domain(attributes=attributes), X=values)
        self.rgb, self.num = distribution.get_distributions(self.data)
Exemplo n.º 11
0
 def iterate_states(self, state):
     """
     Yield successive combinations of attribute indices.

     Indices refer to positions in `self.attr_ordering`. Combinations
     start at the size given by the lower bound of `self.attr_range()` and
     grow by one attribute whenever the current size is exhausted, up to
     the smaller of the upper bound and the number of attributes.

     If `state` is None, marginal distributions are computed first and
     iteration starts from the first combination; otherwise iteration
     resumes from the given `state`.

     Note that the yielded list is updated in place between steps, so
     callers that need to keep a state must copy it.
     """
     # The marginals are computed lazily here: per the original note,
     # doing this initialization in `initialize` would run
     # `score_heuristic` on every call to master's `set_data`.
     data = self.master.discrete_data
     min_attrs, max_attrs = self.attr_range()
     if min_attrs > max_attrs:
         return
     if state is None:  # first call: prepare marginals and start state
         if self._compute_class_dists():
             self.marginal = get_distribution(data, data.domain.class_var)
             self.marginal.normalize()
         else:
             self.marginal = get_distributions(data)
             for dist in self.marginal:
                 dist.normalize()
         state = list(range(min_attrs))
     size_limit = min(max_attrs, len(data.domain.attributes))
     while True:
         yield state
         # The ordering was reset while running; just stop.
         if self.attr_ordering is None:
             return
         # Advance to the next combination: bump the lowest position that
         # can still grow and reset all positions below it.
         top = len(state) - 1
         for pos in range(len(state)):
             state[pos] += 1
             if pos == top or state[pos] < state[pos + 1]:
                 break
             state[pos] = pos
         if state[-1] != len(self.attr_ordering):
             continue
         # All combinations of this size are exhausted.
         if len(state) >= size_limit:
             return
         state = list(range(len(state) + 1))
Exemplo n.º 12
0
 def iterate_states(self, state):
     """
     Yield successive combinations of attribute indices.

     Indices refer to positions in `self.attr_ordering`. On the first call
     (`state` is None) the marginal distributions are computed and the
     iteration starts with a single attribute when class distributions
     are used, and with two attributes otherwise. Once all combinations of
     one size are exhausted, the size grows by one, up to the smaller of
     `self.max_attrs` and the number of attributes.

     The yielded list is modified in place between steps; copy it if it
     has to be kept.
     """
     # The marginals are computed lazily here: per the original note,
     # doing this initialization in `initialize` would run
     # `score_heuristic` on every call to master's `set_data`.
     data = self.master.discrete_data
     if state is None:  # first call: prepare marginals and start state
         if self._compute_class_dists():
             self.marginal = get_distribution(data, data.domain.class_var)
             self.marginal.normalize()
             state = [0]
         else:
             self.marginal = get_distributions(data)
             for dist in self.marginal:
                 dist.normalize()
             state = [0, 1]
     n_attrs = len(data.domain.attributes)
     while True:
         yield state
         # The ordering was reset while running; just stop.
         if self.attr_ordering is None:
             return
         # Move to the next combination: increase the lowest position that
         # still has room and reset every position below it.
         top = len(state) - 1
         for pos in range(len(state)):
             state[pos] += 1
             if pos == top or state[pos] < state[pos + 1]:
                 break
             state[pos] = pos
         if state[-1] != len(self.attr_ordering):
             continue
         # This size is exhausted; `self.max_attrs` is re-read on purpose,
         # exactly where the original read it.
         if len(state) >= min(self.max_attrs, n_attrs):
             return
         state = list(range(len(state) + 1))
Exemplo n.º 13
0
    def test_sparse_get_distributions(self):
        """
        Check `get_distributions` on a table built from a sparse matrix.

        A 5x20 CSR matrix is combined with a domain of ten discrete
        variables followed by ten continuous ones. Every distribution is
        compared with a hard-coded expectation, first without weights and
        then after setting the weights to 1..5. In both passes each
        distribution's `unknowns` must equal the number (or total weight)
        of instances minus the sum of the expected counts.
        """
        def check(computed, expected):
            expected = np.array(expected)
            # For 2-d expectations only the second row is summed.
            counted = np.sum(
                expected[1, :] if expected.ndim == 2 else expected)
            total = np.sum(table.W) if table.has_weights() else len(table)

            np.testing.assert_almost_equal(computed, expected)
            self.assertEqual(computed.unknowns, total - counted)

        def check_all(expected_all):
            ddist = distribution.get_distributions(table)
            self.assertEqual(len(ddist), 20)
            for idx, expected in enumerate(expected_all):
                check(ddist[idx], expected)

        discrete = [data.DiscreteVariable("d%i" % i, values=list("abc"))
                    for i in range(10)]
        continuous = [data.ContinuousVariable("c%i" % i) for i in range(10)]
        domain = data.Domain(discrete + continuous)

        # Stored entries, laid out by column (top) and row:
        #
        #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
        # ------------------------------------------------------------
        #     2     2  1  1  2        1           1  1     2  0  2
        #        1  1  0  0  1     2                 2     1  0
        #           1     2  0
        #
        #        2        0  1                   1.1
        #
        stored = np.array([2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,
                           1, 1, 0, 0, 1, 2, 2, 1, 0,
                           1, 2, 0,
                           2, 0, 1, 1.1])
        columns = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
                   2, 3, 4, 5, 6, 8, 14, 16, 17,
                   3, 5, 6,
                   2, 5, 6, 13]
        row_starts = [0, 11, 20, 23, 23, 27]
        X = sp.csr_matrix((stored, columns, row_starts), shape=(5, 20))
        table = data.Table.from_numpy(domain, X)

        empty = np.zeros((2, 0))

        # Pass 1: no weights.
        check_all([
            [0, 0, 0], [0, 0, 1], [0, 1, 1], [0, 2, 1], [1, 1, 0],
            [2, 1, 1], [1, 2, 1], [0, 0, 0], [0, 0, 1], [0, 1, 0],
            empty, empty, empty,
            [[1, 1.1], [1, 1]],
            [[1, 2], [1, 1]],
            empty,
            [[1, 2], [1, 1]],
            [[0], [2]],
            [[2], [1]],
            empty,
        ])

        # Pass 2: the same data with weights 1..5.
        table.set_weights(np.array([1, 2, 3, 4, 5]))
        check_all([
            [0, 0, 0], [0, 0, 1], [0, 2, 5], [0, 5, 1], [2, 1, 0],
            [7, 1, 3], [3, 7, 1], [0, 0, 0], [0, 0, 2], [0, 1, 0],
            empty, empty, empty,
            [[1, 1.1], [1, 5]],
            [[1, 2], [1, 2]],
            empty,
            [[1, 2], [2, 1]],
            [[0], [3]],
            [[2], [1]],
            empty,
        ])
Exemplo n.º 14
0
    def __call__(self, data):
        """
        Build a domain in which discrete variables are replaced by
        continuous ones according to `self.multinomial_treatment`.

        `data` may be a `Domain` or an object exposing `.domain`. Actual
        data is required only for `Continuize.FrequentAsBase`, which reads
        value distributions. Class variables are converted only when
        `self.transform_class` is set; metas are passed through.

        Raises `ValueError` for `Continuize.ReportError` when some discrete
        variable has more than two values, and `TypeError` when
        distributions are needed but only a `Domain` was given.
        """
        treatment = self.multinomial_treatment
        convert_class = self.transform_class
        domain = data if isinstance(data, Domain) else data.domain

        if treatment == Continuize.ReportError and any(
                var.is_discrete and len(var.values) > 2 for var in domain):
            raise ValueError("data has multinomial attributes")

        needs_discrete = (
            treatment == Continuize.FrequentAsBase
            and domain.has_discrete_attributes(convert_class))
        needs_continuous = False
        if needs_discrete:
            if isinstance(data, Domain):
                raise TypeError("continuizer requires data")
            dists = distribution.get_distributions(
                data, not needs_discrete, not needs_continuous)
        # Position within `dists` of the variable being processed.
        dist_index = 0

        def transform_discrete(var):
            n_values = len(var.values)
            if n_values < 2 or treatment == Continuize.Remove:
                return []
            if treatment == Continuize.RemoveMultinomial and n_values > 2:
                return []

            if treatment == Continuize.AsOrdinal:
                return [ContinuousVariable(var.name,
                                           compute_value=Identity(var))]

            if treatment == Continuize.AsNormalizedOrdinal:
                # n_values >= 2 here, so the span is at least 1.
                span = n_values - 1
                if self.zero_based:
                    offset, factor = 0, 1 / span
                else:
                    offset, factor = span / 2, 2 / span
                return [ContinuousVariable(
                    var.name, compute_value=Normalizer(var, offset, factor))]

            # Remaining treatments produce one indicator per value,
            # leaving out the base value (if any).
            if treatment == Continuize.Indicators:
                base = -1
            elif treatment in (Continuize.FirstAsBase,
                               Continuize.RemoveMultinomial):
                base = max(var.base_value, 0)
            else:
                # NOTE(review): `dists` is bound only when `needs_discrete`
                # holds; other treatments reaching this branch would hit
                # an unbound name -- confirm whether that can happen.
                base = dists[dist_index].modus()
            # Indexed by a truth value: False -> Indicator1, True -> Indicator.
            ind_class = [Indicator1, Indicator][self.zero_based]
            return [
                ContinuousVariable("{}={}".format(var.name, value),
                                   compute_value=ind_class(var, idx))
                for idx, value in enumerate(var.values)
                if idx != base
            ]

        def transform_list(variables):
            nonlocal dist_index
            converted = []
            for var in variables:
                if var.is_discrete:
                    converted.extend(transform_discrete(var))
                    if needs_discrete:
                        dist_index += 1
                    continue
                # Non-discrete variables are kept unchanged.
                converted.append(var)
                if needs_continuous:
                    dist_index += 1
            return converted

        new_attrs = transform_list(domain.attributes)
        new_classes = (transform_list(domain.class_vars) if convert_class
                       else domain.class_vars)
        return Domain(new_attrs, new_classes, domain.metas)
Exemplo n.º 15
0
    def test_sparse_get_distributions(self):
        """
        Check `get_distributions` on a CSR-backed table containing NaNs.

        The 5x20 matrix is paired with ten discrete and ten continuous
        variables. Distributions are compared with hard-coded expectations
        without weights and again with weights 1..5. Each distribution's
        `unknowns` must equal the number (or total weight) of instances
        minus the sum of the expected counts.
        """
        def check(computed, expected):
            expected = np.array(expected)
            # For 2-d expectations only the second row is summed.
            counted = np.sum(
                expected[1, :] if expected.ndim == 2 else expected)
            total = np.sum(table.W) if table.has_weights() else len(table)

            np.testing.assert_almost_equal(computed, expected)
            self.assertEqual(computed.unknowns, total - counted)

        def check_all(expected_all):
            ddist = distribution.get_distributions(table)
            self.assertEqual(len(ddist), 20)
            for idx, expected in enumerate(expected_all):
                check(ddist[idx], expected)

        discrete = [data.DiscreteVariable("d%i" % i, values=list("abc"))
                    for i in range(10)]
        continuous = [data.ContinuousVariable("c%i" % i) for i in range(10)]
        domain = data.Domain(discrete + continuous)

        # pylint: disable=bad-whitespace
        X = sp.csr_matrix(
            # 0  1  2  3       4       5       6  7  8  9 10 11 12   13 14 15 16      17 18 19
            # --------------------------------------------------------------------------------
            [[0, 2, 0, 2,      1,      1,      2, 0, 0, 1, 0, 0, 0,   1, 1, 0, 2, np.nan, 2, 0],
             [0, 0, 1, 1, np.nan, np.nan,      1, 0, 2, 0, 0, 0, 0,   0, 2, 0, 1, np.nan, 0, 0],
             [0, 0, 0, 1,      0,      2, np.nan, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0, 0],
             [0, 0, 0, 0,      0,      0,      0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0,      0, 0, 0],
             [0, 0, 2, 0,      0,      0,      1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0,      0, 0, 0]]
        )
        # Assign a zero through item assignment on the sparse matrix.
        X[0, 0] = 0

        table = data.Table.from_numpy(domain, X)

        # Pass 1: no weights.
        only_zeros = [[0], [5]]
        check_all([
            [5, 0, 0], [4, 0, 1], [3, 1, 1], [2, 2, 1], [3, 1, 0],
            [2, 1, 1], [1, 2, 1], [5, 0, 0], [4, 0, 1], [4, 1, 0],
            only_zeros, only_zeros, only_zeros,
            [[0, 1, 1.1], [3, 1, 1]],
            [[0, 1, 2], [3, 1, 1]],
            only_zeros,
            [[0, 1, 2], [3, 1, 1]],
            [[0], [3]],
            [[0, 2], [4, 1]],
            only_zeros,
        ])

        # Pass 2: the same data with weights 1..5.
        table.set_weights(np.array([1, 2, 3, 4, 5]))
        only_zeros = [[0], [15]]
        check_all([
            [15, 0, 0], [14, 0, 1], [8, 2, 5], [9, 5, 1], [12, 1, 0],
            [9, 1, 3], [4, 7, 1], [15, 0, 0], [13, 0, 2], [14, 1, 0],
            only_zeros, only_zeros, only_zeros,
            [[0, 1, 1.1], [9, 1, 5]],
            [[0, 1, 2], [12, 1, 2]],
            only_zeros,
            [[0, 1, 2], [12, 2, 1]],
            [[0], [12]],
            [[0, 2], [14, 1]],
            only_zeros,
        ])
Exemplo n.º 16
0
    def test_sparse_get_distributions(self):
        """
        Check `get_distributions` on a table built from a sparse matrix.

        A 5x20 CSR matrix is combined with ten discrete and ten continuous
        variables. All twenty distributions are compared with hard-coded
        expectations, first without weights and then with weights 1..5.
        """
        def check_all(expected_all):
            ddist = distribution.get_distributions(table)
            self.assertEqual(len(ddist), 20)
            for idx, expected in enumerate(expected_all):
                np.testing.assert_almost_equal(ddist[idx], expected)

        discrete = [data.DiscreteVariable("d%i" % i, values=list("abc"))
                    for i in range(10)]
        continuous = [data.ContinuousVariable("c%i" % i) for i in range(10)]
        domain = data.Domain(discrete + continuous)

        # Stored entries, laid out by column (top) and row:
        #
        #  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
        # ------------------------------------------------------------
        #     2     2  1  1  2        1           1  1     2  0  2
        #        1  1  0  0  1     2                 2     1  0
        #           1     2  0
        #
        #        2        0  1                   1.1
        #
        stored = np.array([2, 2, 1, 1, 2, 1, 1, 1, 2, 0, 2,
                           1, 1, 0, 0, 1, 2, 2, 1, 0,
                           1, 2, 0,
                           2, 0, 1, 1.1])
        columns = [1, 3, 4, 5, 6, 9, 13, 14, 16, 17, 18,
                   2, 3, 4, 5, 6, 8, 14, 16, 17,
                   3, 5, 6,
                   2, 5, 6, 13]
        row_starts = [0, 11, 20, 23, 23, 27]
        X = sp.csr_matrix((stored, columns, row_starts), shape=(5, 20))
        table = data.Table.from_numpy(domain, X)

        empty = np.zeros((2, 0))

        # Pass 1: no weights.
        check_all([
            [0, 0, 0], [0, 0, 1], [0, 1, 1], [0, 2, 1], [1, 1, 0],
            [2, 1, 1], [1, 2, 1], [0, 0, 0], [0, 0, 1], [0, 1, 0],
            empty, empty, empty,
            [[1, 1.1], [1, 1]],
            [[1, 2], [1, 1]],
            empty,
            [[1, 2], [1, 1]],
            [[0], [2]],
            [[2], [1]],
            empty,
        ])

        # Pass 2: the same data with weights 1..5.
        table.set_weights(np.array([1, 2, 3, 4, 5]))
        check_all([
            [0, 0, 0], [0, 0, 1], [0, 2, 5], [0, 5, 1], [2, 1, 0],
            [7, 1, 3], [3, 7, 1], [0, 0, 0], [0, 0, 2], [0, 1, 0],
            empty, empty, empty,
            [[1, 1.1], [1, 5]],
            [[1, 2], [1, 2]],
            empty,
            [[1, 2], [2, 1]],
            [[0], [3]],
            [[2], [1]],
            empty,
        ])
    def __call__(self, data):
        """
        Return a domain whose discrete variables are turned into
        continuous ones as prescribed by `self.multinomial_treatment`.

        `data` is either a `Domain` or an object with a `.domain`
        attribute. Value distributions, and hence real data, are used only
        for `Continuize.FrequentAsBase`. Class variables are transformed
        only if `self.transform_class` is set; metas are copied over.

        Raises `ValueError` if the treatment is `Continuize.ReportError`
        and a discrete variable has more than two values, and `TypeError`
        if distributions are needed but `data` is a `Domain`.
        """
        def indicators_for(var):
            """Return the continuous replacements for one discrete `var`."""
            n_values = len(var.values)
            dropped = (
                n_values < 2
                or mode == Continuize.Remove
                or (mode == Continuize.RemoveMultinomial and n_values > 2))
            if dropped:
                return []

            if mode == Continuize.AsOrdinal:
                return [ContinuousVariable(var.name,
                                           compute_value=Identity(var))]

            if mode == Continuize.AsNormalizedOrdinal:
                # At least two values remain here, so `top` is >= 1.
                top = n_values - 1
                if self.zero_based:
                    normalizer = Normalizer(var, 0, 1 / top)
                else:
                    normalizer = Normalizer(var, top / 2, 2 / top)
                return [ContinuousVariable(var.name,
                                           compute_value=normalizer)]

            # Indicator-style treatments: one variable per value, skipping
            # the base value when there is one.
            if mode == Continuize.Indicators:
                base = -1
            elif mode in (Continuize.FirstAsBase,
                          Continuize.RemoveMultinomial):
                base = max(var.base_value, 0)
            else:
                # NOTE(review): `dists` exists only if `needs_discrete` was
                # true; confirm no other treatment can reach this branch.
                base = dists[cursor].modus()
            # Indexed by a truth value: False -> Indicator1, True -> Indicator.
            ind_class = [Indicator1, Indicator][self.zero_based]
            result = []
            for position, value in enumerate(var.values):
                if position != base:
                    result.append(ContinuousVariable(
                        "{}={}".format(var.name, value),
                        compute_value=ind_class(var, position)))
            return result

        def convert_all(variables):
            """Convert a sequence of variables, advancing the cursor."""
            nonlocal cursor
            result = []
            for var in variables:
                if var.is_discrete:
                    result += indicators_for(var)
                    advance = needs_discrete
                else:
                    # Non-discrete variables are kept as they are.
                    result.append(var)
                    advance = needs_continuous
                if advance:
                    cursor += 1
            return result

        mode = self.multinomial_treatment
        with_class = self.transform_class

        domain = data if isinstance(data, Domain) else data.domain
        if mode == Continuize.ReportError:
            if any(var.is_discrete and len(var.values) > 2
                   for var in domain):
                raise ValueError("data has multinomial attributes")
        needs_discrete = (mode == Continuize.FrequentAsBase
                          and domain.has_discrete_attributes(with_class))
        needs_continuous = False
        if needs_discrete:
            if isinstance(data, Domain):
                raise TypeError("continuizer requires data")
            dists = distribution.get_distributions(data, not needs_discrete,
                                                   not needs_continuous)
        # Index into `dists` for the variable currently being converted.
        cursor = 0
        new_attrs = convert_all(domain.attributes)
        if with_class:
            new_classes = convert_all(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes, domain.metas)
Exemplo n.º 18
0
    def __call__(self, data):
        """
        Build a continuized domain for `data`.

        Discrete variables are replaced according to
        `self.multinomial_treatment` and continuous variables are
        optionally normalized according to `self.normalize_continuous`.
        Class variables are transformed only when `self.transform_class`
        is set; metas are passed through unchanged.

        :param data: a `Domain`, or an object exposing `.domain`; actual
            data is required whenever distributions are needed (treatment
            `FrequentIsBase` or any normalization other than `Leave`).
        :return: the new `Domain`.
        :raises ValueError: if the treatment is `ReportError` and some
            discrete variable has more than two values.
        :raises TypeError: if distributions are needed but `data` is a
            `Domain`.
        """
        def transform_discrete(var):
            # `and` binds tighter than `or`: the variable is dropped when
            # it has fewer than two values, when the treatment is Ignore,
            # or when it is IgnoreMulti and there are more than two values.
            if (len(var.values) < 2 or
                    treat == self.Ignore or
                    treat == self.IgnoreMulti and len(var.values) > 2):
                return []
            if treat == self.AsOrdinal:
                new_var = ContinuousVariable(var.name)
                new_var.get_value_from = Identity(var)
                return [new_var]
            if treat == self.AsNormalizedOrdinal:
                new_var = ContinuousVariable(var.name)
                # At least two values remain here, so n_values - 1 >= 1.
                n_values = max(1, len(var.values))
                if self.zero_based:
                    new_var.get_value_from = \
                        Normalizer(var, 0, 1 / (n_values - 1))
                else:
                    new_var.get_value_from = \
                        Normalizer(var, (n_values - 1) / 2, 2 / (n_values - 1))
                return [new_var]

            # Remaining treatments create one indicator per value, leaving
            # out the base value (if any).
            new_vars = []
            if treat == self.NValues:
                base = -1
            elif treat == self.LowestIsBase or treat == self.IgnoreMulti:
                base = max(var.base_value, 0)
            else:
                # NOTE(review): `dists` is bound only when distributions
                # were requested below; confirm that no treatment other
                # than FrequentIsBase can reach this branch without them.
                base = dists[var_ptr].modus()
            # Indexed by a truth value: False -> Indicator_1, True -> Indicator.
            IndClass = [Indicator_1, Indicator][self.zero_based]
            for i, val in enumerate(var.values):
                if i == base:
                    continue
                new_var = ContinuousVariable(
                    "{}={}".format(var.name, val))
                new_var.get_value_from = IndClass(var, i)
                new_vars.append(new_var)
            return new_vars

        def transform_continuous(var):
            # Returns None (implicitly) for an unrecognized normalization
            # setting; `transform_list` skips such variables.
            if self.normalize_continuous == self.Leave:
                return var
            elif self.normalize_continuous == self.NormalizeBySpan:
                new_var = ContinuousVariable(var.name)
                dma, dmi = dists[var_ptr].max(), dists[var_ptr].min()
                diff = dma - dmi
                # Avoid dividing by a (near-)zero span.
                if diff < 1e-15:
                    diff = 1
                if self.zero_based:
                    new_var.get_value_from = Normalizer(var, dmi, 1 / diff)
                else:
                    new_var.get_value_from = Normalizer(var, (dma + dmi) / 2,
                                                    2 / diff)
                return new_var
            elif self.normalize_continuous == self.NormalizeBySD:
                new_var = ContinuousVariable(var.name)
                avg = dists[var_ptr].mean()
                sd = dists[var_ptr].standard_deviation()
                # A (near-)zero deviation would make 1 / sd blow up; fall
                # back to unit scale, mirroring the span guard above.
                if sd < 1e-15:
                    sd = 1
                new_var.get_value_from = Normalizer(var, avg, 1 / sd)
                return new_var

        def transform_list(s):
            # `var_ptr` is the position in `dists` of the variable being
            # processed; it advances only for the kinds of variables for
            # which distributions were requested.
            nonlocal var_ptr
            new_vars = []
            for var in s:
                if isinstance(var, DiscreteVariable):
                    new_vars += transform_discrete(var)
                    if needs_discrete:
                        var_ptr += 1
                else:
                    new_var = transform_continuous(var)
                    if new_var is not None:
                        new_vars.append(new_var)
                        if needs_continuous:
                            var_ptr += 1
            return new_vars

        treat = self.multinomial_treatment
        transform_class = self.transform_class
        domain = data if isinstance(data, Domain) else data.domain
        if treat == self.ReportError and any(
                isinstance(var, DiscreteVariable) and len(var.values) > 2
                for var in domain):
            raise ValueError("data has multinomial attributes")
        needs_discrete = (treat == self.FrequentIsBase and
                          domain.has_discrete_attributes(transform_class))
        needs_continuous = (not self.normalize_continuous == self.Leave and
                            domain.has_continuous_attributes(transform_class))
        if needs_discrete or needs_continuous:
            if isinstance(data, Domain):
                raise TypeError("continuizer requires data")
            # presumably the two flags tell `get_distributions` which kinds
            # of variables to skip -- verify against its signature
            dists = distribution.get_distributions(
                data, not needs_discrete, not needs_continuous)
        var_ptr = 0
        new_attrs = transform_list(domain.attributes)
        if transform_class:
            new_classes = transform_list(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes, domain.metas)
Exemplo n.º 19
0
    def test_sparse_get_distributions(self):
        """Check distributions and unknown counts computed on a sparse table.

        The table has ten discrete and ten continuous columns backed by a CSR
        matrix. Distributions are verified first without weights and then
        after assigning the instance weights 1..5.
        """
        def check(computed, expected):
            # 1-D goals are summed directly; for 2-D goals only the second
            # row is summed to obtain the number of known values.
            expected = np.array(expected)
            counts = expected[1, :] if expected.ndim == 2 else expected
            total = np.sum(d.W) if d.has_weights() else len(d)

            assert_dist_almost_equal(computed, expected)
            self.assertEqual(computed.unknowns, total - np.sum(counts))

        def check_all(expected_table):
            # Recompute the distributions and compare them column by column.
            ddist = distribution.get_distributions(d)
            self.assertEqual(len(ddist), 20)
            for index, goal in enumerate(expected_table):
                check(ddist[index], goal)

        discrete_vars = [
            data.DiscreteVariable("d%i" % i, values=tuple("abc"))
            for i in range(10)
        ]
        continuous_vars = [
            data.ContinuousVariable("c%i" % i) for i in range(10)
        ]
        domain = data.Domain(discrete_vars + continuous_vars)

        nan = np.nan
        # pylint: disable=bad-whitespace
        rows = [
            # 0  1  2  3    4    5    6  7  8  9 10 11 12   13 14 15 16   17 18 19
            # ---------------------------------------------------------------------
            [0, 2, 0, 2, 1, 1, 2, 0, 0, 1, 0, 0, 0, 1, 1, 0, 2, nan, 2, 0],
            [0, 0, 1, 1, nan, nan, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, nan, 0, 0],
            [0, 0, 0, 1, 0, 2, nan, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1.1, 0, 0, 0, 0, 0, 0],
        ]
        matrix = sp.csr_matrix(rows)
        warnings.filterwarnings("ignore", ".*", sp.SparseEfficiencyWarning)
        matrix[0, 0] = 0

        d = data.Table.from_numpy(domain, matrix)

        all_zero_discrete = [5, 0, 0]
        all_zero_continuous = [[0], [5]]
        check_all([
            # discrete columns 0-9
            all_zero_discrete,
            [4, 0, 1],
            [3, 1, 1],
            [2, 2, 1],
            [3, 1, 0],
            [2, 1, 1],
            [1, 2, 1],
            all_zero_discrete,
            [4, 0, 1],
            [4, 1, 0],
            # continuous columns 10-19
            all_zero_continuous,
            all_zero_continuous,
            all_zero_continuous,
            [[0, 1, 1.1], [3, 1, 1]],
            [[0, 1, 2], [3, 1, 1]],
            all_zero_continuous,
            [[0, 1, 2], [3, 1, 1]],
            [[0], [3]],
            [[0, 2], [4, 1]],
            all_zero_continuous,
        ])

        with d.unlocked():
            d.set_weights(np.array([1, 2, 3, 4, 5]))

        all_zero_continuous = [[0], [15]]
        check_all([
            # discrete columns 0-9
            [15, 0, 0],
            [14, 0, 1],
            [8, 2, 5],
            [9, 5, 1],
            [12, 1, 0],
            [9, 1, 3],
            [4, 7, 1],
            [15, 0, 0],
            [13, 0, 2],
            [14, 1, 0],
            # continuous columns 10-19
            all_zero_continuous,
            all_zero_continuous,
            all_zero_continuous,
            [[0, 1, 1.1], [9, 1, 5]],
            [[0, 1, 2], [12, 1, 2]],
            all_zero_continuous,
            [[0, 1, 2], [12, 2, 1]],
            [[0], [12]],
            [[0, 2], [14, 1]],
            all_zero_continuous,
        ])
Exemplo n.º 20
0
    def __call__(self, data):
        """Build a domain in which the variables of ``data`` are continuized.

        Discrete variables are handled according to
        ``self.multinomial_treatment`` and continuous ones according to
        ``self.normalize_continuous``. Class variables are transformed only
        when ``self.transform_class`` is set; metas are passed through.

        Args:
            data: either a ``Domain`` or an object exposing ``.domain``.
                Treatments that need value distributions (``FrequentIsBase``
                and any normalization other than ``Leave``) require actual
                data, not a bare ``Domain``.

        Returns:
            A new ``Domain`` built from the transformed attributes, the
            (possibly transformed) class variables and the original metas.

        Raises:
            ValueError: the treatment is ``ReportError`` and some discrete
                variable has more than two values.
            TypeError: distributions are required but ``data`` is a
                ``Domain``.
        """
        def transform_discrete(var):
            """Return the list of variables replacing discrete ``var``."""
            n_values = len(var.values)
            if (n_values < 2 or treat == self.Ignore
                    or treat == self.IgnoreMulti and n_values > 2):
                return []
            if treat == self.AsOrdinal:
                new_var = ContinuousVariable(var.name)
                new_var.get_value_from = Identity(var)
                return [new_var]
            if treat == self.AsNormalizedOrdinal:
                new_var = ContinuousVariable(var.name)
                # n_values >= 2 is guaranteed by the guard above, so the
                # divisions below cannot hit zero.
                if self.zero_based:
                    new_var.get_value_from = \
                        Normalizer(var, 0, 1 / (n_values - 1))
                else:
                    new_var.get_value_from = \
                        Normalizer(var, (n_values - 1) / 2, 2 / (n_values - 1))
                return [new_var]

            # Index of the value that gets no indicator variable.
            if treat == self.NValues:
                base = -1  # matches no index: every value gets an indicator
            elif treat == self.FrequentIsBase:
                base = dists[var_ptr].modus()
            else:
                # Distributions are computed for discrete variables only
                # under FrequentIsBase, so every other treatment reaching
                # this point (e.g. ReportError on a binary variable) must not
                # touch ``dists``; use the variable's base value instead.
                base = max(var.base_value, 0)
            ind_class = Indicator if self.zero_based else Indicator_1
            new_vars = []
            for i, val in enumerate(var.values):
                if i == base:
                    continue
                new_var = ContinuousVariable("{}={}".format(var.name, val))
                new_var.get_value_from = ind_class(var, i)
                new_vars.append(new_var)
            return new_vars

        def transform_continuous(var):
            """Return the variable replacing continuous ``var`` (or None)."""
            normalization = self.normalize_continuous
            if normalization == self.Leave:
                return var
            if normalization == self.NormalizeBySpan:
                new_var = ContinuousVariable(var.name)
                dma, dmi = dists[var_ptr].max(), dists[var_ptr].min()
                diff = dma - dmi
                # A (near-)constant column has no span; avoid dividing by 0.
                if diff < 1e-15:
                    diff = 1
                if self.zero_based:
                    new_var.get_value_from = Normalizer(var, dmi, 1 / diff)
                else:
                    new_var.get_value_from = Normalizer(
                        var, (dma + dmi) / 2, 2 / diff)
                return new_var
            if normalization == self.NormalizeBySD:
                new_var = ContinuousVariable(var.name)
                avg = dists[var_ptr].mean()
                sd = dists[var_ptr].standard_deviation()
                # Same guard as for the span: a constant column has zero
                # deviation and would otherwise trigger a division by zero.
                if sd < 1e-15:
                    sd = 1
                new_var.get_value_from = Normalizer(var, avg, 1 / sd)
                return new_var
            return None

        def transform_list(s):
            """Transform all variables in ``s``, advancing ``var_ptr``."""
            nonlocal var_ptr
            new_vars = []
            for var in s:
                if isinstance(var, DiscreteVariable):
                    new_vars += transform_discrete(var)
                    # var_ptr moves only past variables of a kind for which
                    # distributions were requested.
                    if needs_discrete:
                        var_ptr += 1
                else:
                    new_var = transform_continuous(var)
                    if new_var is not None:
                        new_vars.append(new_var)
                        if needs_continuous:
                            var_ptr += 1
            return new_vars

        treat = self.multinomial_treatment
        transform_class = self.transform_class

        domain = data if isinstance(data, Domain) else data.domain
        if treat == self.ReportError and any(
                isinstance(var, DiscreteVariable) and len(var.values) > 2
                for var in domain):
            raise ValueError("data has multinomial attributes")
        needs_discrete = (treat == self.FrequentIsBase
                          and domain.has_discrete_attributes(transform_class))
        needs_continuous = (self.normalize_continuous != self.Leave
                            and domain.has_continuous_attributes(
                                transform_class))
        if needs_discrete or needs_continuous:
            if isinstance(data, Domain):
                raise TypeError("continuizer requires data")
            # The two flags are passed negated, i.e. a kind is skipped when
            # its distributions are not needed (presumably skip-discrete /
            # skip-continuous switches -- confirm against get_distributions).
            dists = distribution.get_distributions(data, not needs_discrete,
                                                   not needs_continuous)
        var_ptr = 0
        new_attrs = transform_list(domain.attributes)
        if transform_class:
            new_classes = transform_list(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes, domain.metas)