Exemplo n.º 1
0
    def test_get_distribution(self):
        """Check get_distribution for the iris class variable and petal length."""
        iris = data.Table("iris")
        class_var = iris.domain.class_var

        # Class variable: the result is an ndarray tied to the variable,
        # without unknowns and with 50 instances for each of the three values.
        class_dist = distribution.get_distribution(iris, class_var)
        self.assertIsInstance(class_dist, np.ndarray)
        self.assertIs(class_dist.variable, class_var)
        self.assertEqual(class_dist.unknowns, 0)
        np.testing.assert_array_equal(class_dist, [50, 50, 50])

        # Petal length: expected (value, count) pairs; transposing them gives
        # one row of values and one row of counts.
        expected_pairs = [
            (1.0, 1), (1.1, 1), (1.2, 2), (1.3, 7), (1.4, 12), (1.5, 14),
            (1.6, 7), (1.7, 4), (1.9, 2), (3.0, 1), (3.3, 2), (3.5, 2),
            (3.6, 1), (3.7, 1), (3.8, 1), (3.9, 3), (4.0, 5), (4.1, 3),
            (4.2, 4), (4.3, 2), (4.4, 4), (4.5, 8), (4.6, 3), (4.7, 5),
            (4.8, 4), (4.9, 5), (5.0, 4), (5.1, 8), (5.2, 2), (5.3, 2),
            (5.4, 2), (5.5, 3), (5.6, 6), (5.7, 3), (5.8, 3), (5.9, 2),
            (6.0, 2), (6.1, 3), (6.3, 1), (6.4, 1), (6.6, 1), (6.7, 2),
            (6.9, 1),
        ]
        expected = np.array(expected_pairs).T
        petal_length = iris.columns.petal_length
        petal_dist = distribution.get_distribution(iris, petal_length)
        np.testing.assert_almost_equal(petal_dist, expected)
Exemplo n.º 2
0
def column_imputer_random(variable, data):
    """Build a random-imputation model for a single column.

    The distribution of ``variable`` in ``data`` is computed and wrapped in
    a ``RandomTransform``; the returned ``RandomImputerModel`` uses
    ``variable`` both as its input and its output variable.

    :param variable: discrete or continuous variable to impute
    :param data: data from which the distribution is computed
    :return: ``RandomImputerModel`` for ``variable``
    :raises TypeError: if ``variable`` is neither discrete nor continuous
    """
    # Discrete and continuous variables are handled identically; anything
    # else previously reached the return with `transformer` unbound.
    if not (variable.is_discrete or variable.is_continuous):
        raise TypeError("Variable must be continuous or discrete")
    dist = distribution.get_distribution(data, variable)
    transformer = RandomTransform(variable, dist)
    return RandomImputerModel((variable,), (variable,), (transformer,))
Exemplo n.º 3
0
def column_imputer_random(variable, data):
    """Build a random-imputation model for a single column.

    The distribution of ``variable`` in ``data`` is computed and wrapped in
    a ``RandomTransform``; the returned ``RandomImputerModel`` uses
    ``variable`` both as its input and its output variable.

    :param variable: discrete or continuous variable to impute
    :param data: data from which the distribution is computed
    :return: ``RandomImputerModel`` for ``variable``
    :raises TypeError: if ``variable`` is neither discrete nor continuous
    """
    supported = (Orange.data.DiscreteVariable, Orange.data.ContinuousVariable)
    # Both supported types are handled identically; anything else previously
    # reached the return with `transformer` unbound.
    if not isinstance(variable, supported):
        raise TypeError("Variable must be continuous or discrete")
    dist = distribution.get_distribution(data, variable)
    transformer = RandomTransform(variable, dist)
    return RandomImputerModel((variable,), (variable,), (transformer,))
Exemplo n.º 4
0
 def compute_box_data(self):
     """Recompute box plot statistics, distribution, contingency and labels.

     Does nothing when no attribute is selected. When there is no dataset,
     or the attribute or the group variable has no values, ``stats``,
     ``dist`` and ``conts`` are all reset to the same empty list.
     """
     attribute = self.attribute
     if not attribute:
         return
     table = self.dataset
     self.is_continuous = attribute.is_continuous
     if (table is None
             or (not self.is_continuous and not attribute.values)
             or (self.group_var and not self.group_var.values)):
         self.stats = self.dist = self.conts = []
         return

     if not self.group_var:
         # No grouping: a single distribution over the whole dataset.
         self.dist = distribution.get_distribution(table, attribute)
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist, attribute, None)]
         self.label_txts_all = [""]
     else:
         # Grouped: one contingency row per value of the group variable.
         self.dist = []
         self.conts = contingency.get_contingency(
             table, attribute, self.group_var)
         if self.is_continuous:
             self.stats = [
                 BoxData(cont, attribute, index, self.group_var)
                 for index, cont in enumerate(self.conts)
             ]
         self.label_txts_all = self.group_var.values

     # Keep only the labels and statistics of non-empty groups.
     kept_labels = []
     for stat, label in zip(self.stats, self.label_txts_all):
         if stat.n > 0:
             kept_labels.append(label)
     self.label_txts = kept_labels
     self.stats = [stat for stat in self.stats if stat.n > 0]
Exemplo n.º 5
0
    def _setup(self):
        """Rebuild the plot for the selected variable and group variable."""
        for item in (self.plot, self.plot_prob, self._legend):
            item.clear()
        self._legend.hide()

        selected = self.variable_idx
        self.var = self.cvar = None
        if selected >= 0:
            self.var = self.varmodel[selected]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            # Refill the probability combo: "(None)", the group values, "(All)".
            combo = self.cb_prob
            combo.clear()
            combo.addItem("(None)")
            combo.addItems(self.cvar.values)
            combo.addItem("(All)")
            upper = len(self.cvar.values) + 1
            self.show_prob = min(max(self.show_prob, 0), upper)

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            # Discretize the selected column(s) into equal-width bins.
            columns = (self.var, self.cvar) if self.cvar else self.var
            data = self.data[:, columns]
            n_bins = self.bins[self.smoothing_index]
            disc = Orange.preprocess.discretize.EqualWidth(n=n_bins)
            data = Orange.preprocess.Discretize(
                data, method=disc, remove_const=False)
            self.var = data.domain[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Exemplo n.º 6
0
    def _setup(self):
        """Rebuild the plot for the selected variable and group variable."""
        for item in (self.plot, self.plot_prob, self._legend):
            item.clear()
        self._legend.hide()

        selected = self.variable_idx
        self.var = self.cvar = None
        if selected >= 0:
            self.var = self.varmodel[selected]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            # Discretize the selected column(s) into equal-width bins.
            columns = (self.var, self.cvar) if self.cvar else self.var
            data = self.data[:, columns]
            n_bins = self.bins[self.smoothing_index]
            disc = Orange.preprocess.discretize.EqualWidth(n=n_bins)
            data = Orange.preprocess.Discretize(data, method=disc)
            self.var = data.domain.variables[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Exemplo n.º 7
0
 def fit_storage(self, data):
     """Fit a constant classifier from the class distribution of ``data``.

     The class distribution is normalised in place so that it sums to one;
     when its sum is not positive, every entry is set to ``1 / len(dist)``.
     """
     class_dist = distribution.get_distribution(data, data.domain.class_var)
     total = class_dist.sum()
     if total > 0:
         class_dist /= total
     else:
         # No usable counts: fall back to a uniform distribution.
         uniform = 1 / len(class_dist)
         class_dist.fill(uniform)
     return ConstantClassifier(dist=class_dist)
 def compute_box_data(self):
     """Return the box data and the matching labels.

     With a split variable the result is the contingency of the attribute
     by that variable together with the split variable's values; otherwise
     it is a one-element list holding the attribute's distribution and a
     single empty label.
     """
     if not self.split_var:
         whole = distribution.get_distribution(self.dataset, self.attribute)
         return [whole], [""]
     by_group = contingency.get_contingency(
         self.dataset, self.attribute, self.split_var)
     return (by_group, self.split_var.values)
Exemplo n.º 9
0
 def fit_storage(self, dat):
     """Fit a constant classifier from the class distribution of ``dat``.

     :raises ValueError: if the class variable is not a discrete variable
     """
     class_var = dat.domain.class_var
     if not isinstance(class_var, data.DiscreteVariable):
         raise ValueError(
             "classification.MajorityFitter expects a domain with a "
             "(single) discrete variable")
     class_dist = distribution.get_distribution(dat, class_var)
     total = class_dist.sum()
     if total > 0:
         class_dist /= total
     else:
         # No usable counts: fall back to a uniform distribution.
         uniform = 1 / len(class_dist)
         class_dist.fill(uniform)
     return ConstantClassifier(dist=class_dist)
Exemplo n.º 10
0
 def fit_storage(self, dat):
     """Fit a constant model from the class distribution of ``dat``.

     :raises ValueError: if the domain has no discrete class
     """
     domain = dat.domain
     if not domain.has_discrete_class:
         raise ValueError(
             "classification.MajorityLearner expects a domain with a "
             "(single) discrete variable")
     class_dist = distribution.get_distribution(dat, domain.class_var)
     total = class_dist.sum()
     if total > 0:
         class_dist /= total
     else:
         # No usable counts: fall back to a uniform distribution.
         uniform = 1 / len(class_dist)
         class_dist.fill(uniform)
     return ConstantModel(dist=class_dist)
Exemplo n.º 11
0
 def __call__(self, data, attribute):
     """Return a discretized copy of ``attribute`` using ``self.n`` quantiles.

     For an ``SqlTable`` the cut points come from an SQL ``quantile`` query;
     otherwise they are computed from the attribute's distribution.
     """
     n = self.n
     if type(data) == SqlTable:
         column = attribute.to_sql()
         quantiles = [(k + 1) / n for k in range(n - 1)]
         expression = 'quantile(%s, ARRAY%s)' % (column, str(quantiles))
         query = data._sql_query([expression])
         with data._execute_sql_query(query) as cur:
             # Duplicate cut points are dropped before sorting.
             points = sorted(set(cur.fetchone()[0]))
     else:
         dist = distribution.get_distribution(data, attribute)
         points = _discretize.split_eq_freq(dist, n=n)
     target = data.domain[attribute]
     return Discretizer.create_discretized_var(target, points)
Exemplo n.º 12
0
def _ensure_dist(var, data_or_dist):
    """Return a distribution for ``var`` from a distribution or from data.

    A distribution is returned unchanged after checking that its kind
    matches the variable; a storage is turned into a distribution.

    :raises TypeError: if the distribution kind does not match ``var``
    :raises ValueError: if the argument is neither a distribution nor data
    """
    if isinstance(data_or_dist, distribution.Discrete):
        kind_matches = is_discrete(var)
    elif isinstance(data_or_dist, distribution.Continuous):
        kind_matches = is_continuous(var)
    elif isinstance(data_or_dist, Orange.data.Storage):
        return distribution.get_distribution(data_or_dist, var)
    else:
        raise ValueError("Need a distribution or data.")

    if not kind_matches:
        raise TypeError
    return data_or_dist
Exemplo n.º 13
0
 def __call__(self, data, attribute):
     """Return a discretized copy of ``attribute`` using ``self.n`` quantiles.

     For an ``SqlTable`` the cut points come from an SQL ``quantile`` query;
     otherwise they are computed from the attribute's distribution.
     """
     n = self.n
     if type(data) == SqlTable:
         column = attribute.to_sql()
         quantiles = [(k + 1) / n for k in range(n - 1)]
         expression = 'quantile(%s, ARRAY%s)' % (column, str(quantiles))
         query = data._sql_query([expression])
         with data._execute_sql_query(query) as cur:
             # Duplicate cut points are dropped before sorting.
             points = sorted(set(cur.fetchone()[0]))
     else:
         dist = distribution.get_distribution(data, attribute)
         points = _discretize.split_eq_freq(dist, n)
     target = data.domain[attribute]
     return Discretizer.create_discretized_var(target, points)
Exemplo n.º 14
0
    def __call__(self, data, variable):
        """Return a copy of ``variable`` that replaces unknowns.

        The replacement is the mean for a continuous variable and the
        distribution's ``modus()`` for a discrete one.

        :raises TypeError: if the variable is neither continuous nor discrete
        """
        if is_continuous(variable):
            value = basic_stats.BasicStats(data, variable).mean
        elif is_discrete(variable):
            value = distribution.get_distribution(data, variable).modus()
        else:
            raise TypeError

        imputed = copy.copy(variable)
        imputed.compute_value = ReplaceUnknowns(variable, value)
        return imputed
Exemplo n.º 15
0
def _ensure_dist(var, data_or_dist):
    """Return a distribution for ``var`` from a distribution or from data.

    A distribution is returned unchanged after checking that its kind
    matches the variable; a storage is turned into a distribution.

    :raises TypeError: if the distribution kind does not match ``var``
    :raises ValueError: if the argument is neither a distribution nor data
    """
    if isinstance(data_or_dist, distribution.Discrete):
        kind_matches = var.is_discrete
    elif isinstance(data_or_dist, distribution.Continuous):
        kind_matches = var.is_continuous
    elif isinstance(data_or_dist, Orange.data.Storage):
        return distribution.get_distribution(data_or_dist, var)
    else:
        raise ValueError("Need a distribution or data.")

    if not kind_matches:
        raise TypeError
    return data_or_dist
Exemplo n.º 16
0
    def fit_storage(self, dat):
        """Fit a constant model from the class distribution of ``dat``.

        :raises ValueError: if the class variable is not a discrete variable
        """
        class_var = dat.domain.class_var
        if not isinstance(class_var, data.DiscreteVariable):
            raise ValueError(
                "classification.MajorityLearner expects a domain with a "
                "(single) discrete variable")
        class_dist = distribution.get_distribution(dat, class_var)
        total = class_dist.sum()
        if total > 0:
            class_dist /= total
        else:
            # No usable counts: fall back to a uniform distribution.
            uniform = 1 / len(class_dist)
            class_dist.fill(uniform)
        return ConstantModel(dist=class_dist)
Exemplo n.º 17
0
    def __call__(self, data, variable, value=None):
        """Return a copy of ``variable`` whose unknowns are replaced by ``value``.

        When ``value`` is None it defaults to the mean (continuous variable)
        or to the distribution's ``modus()`` (discrete variable).

        :raises TypeError: if no value is given and the variable is neither
            continuous nor discrete
        """
        variable = data.domain[variable]
        if value is None:
            if variable.is_continuous:
                value = basic_stats.BasicStats(data, variable).mean
            elif variable.is_discrete:
                value = distribution.get_distribution(data, variable).modus()
            else:
                raise TypeError("Variable must be continuous or discrete")

        replacer = ReplaceUnknowns(variable, value)
        return variable.copy(compute_value=replacer)
Exemplo n.º 18
0
    def fit_storage(self, data):
        """
        Build a :obj:`MeanModel` from the distribution of the class variable
        in the given data.

        :param data: data table
        :type data: Orange.data.Table
        :return: model constructed from the class distribution
        :rtype: :obj:`MeanModel`
        :raises ValueError: if the domain has no continuous class
        """
        domain = data.domain
        if not domain.has_continuous_class:
            raise ValueError(
                "regression.MeanLearner expects a domain with a "
                "(single) continuous variable")
        class_dist = distribution.get_distribution(data, domain.class_var)
        return MeanModel(class_dist)
Exemplo n.º 19
0
    def __call__(self, data, variable, value=None):
        """Return a copy of ``variable`` whose unknowns are replaced by ``value``.

        When ``value`` is None it defaults to the mean (continuous variable)
        or to the distribution's ``modus()`` (discrete variable).

        :raises TypeError: if no value is given and the variable is neither
            continuous nor discrete
        """
        variable = data.domain[variable]
        if value is None:
            if is_continuous(variable):
                value = basic_stats.BasicStats(data, variable).mean
            elif is_discrete(variable):
                value = distribution.get_distribution(data, variable).modus()
            else:
                raise TypeError("Variable must be continuous or discrete")

        imputed = copy.copy(variable)
        imputed.compute_value = ReplaceUnknowns(variable, value)
        return imputed
Exemplo n.º 20
0
    def __call__(self, data, variable, value=None):
        """Return a copy of ``variable`` whose unknowns are replaced by ``value``.

        When ``value`` is None it defaults to the mean (continuous variable)
        or to the distribution's ``modus()`` (discrete variable). The copy
        also gets a ``to_sql`` attribute built by ``ImputeSql``.

        :raises TypeError: if no value is given and the variable is neither
            continuous nor discrete
        """
        variable = data.domain[variable]
        if value is None:
            if variable.is_continuous:
                value = basic_stats.BasicStats(data, variable).mean
            elif variable.is_discrete:
                value = distribution.get_distribution(data, variable).modus()
            else:
                raise TypeError("Variable must be numeric or categorical.")

        replacer = ReplaceUnknowns(variable, value)
        imputed = variable.copy(compute_value=replacer)
        imputed.to_sql = ImputeSql(variable, value)
        return imputed
Exemplo n.º 21
0
 def __call__(self, data, attribute):
     """Return a discretized copy of ``attribute`` using ``self.n`` quantiles.

     For an ``SqlTable`` the cut points come from an SQL ``quantile`` query;
     otherwise they are computed from the attribute's distribution.
     """
     n = self.n
     if type(data) == SqlTable:
         column = attribute.to_sql()
         quantiles = [(k + 1) / n for k in range(n - 1)]
         expression = 'quantile(%s, ARRAY%s)' % (column, str(quantiles))
         query = data._sql_query([expression], use_time_sample=1000)
         with data._execute_sql_query(query) as cur:
             # Duplicate cut points are dropped before sorting.
             points = sorted(set(cur.fetchone()[0]))
     else:
         dist = distribution.get_distribution(data, attribute)
         raw_points = _discretize.split_eq_freq(dist, n)
         # np.unique handles cases in which differences are below precision
         points = list(np.unique(raw_points))
     target = data.domain[attribute]
     return Discretizer.create_discretized_var(target, points)
Exemplo n.º 22
0
        def transform(var):
            """Return a copy of ``var`` normalized by the center and scale."""
            dist = distribution.get_distribution(data, var)

            offset = 0
            if self.center:
                offset = self.center(dist)
                # Shift the first row of the distribution before the scale
                # is computed from it.
                dist[0, :] -= offset

            spread = 1
            if self.scale:
                spread = self.scale(dist)
                # Avoid dividing by a (near-)zero scale.
                if spread < 1e-15:
                    spread = 1

            normalizer = transformation.Normalizer(var, offset, 1 / spread)
            return var.copy(compute_value=normalizer)
Exemplo n.º 23
0
 def compute_box_data(self):
     """Recompute the data used for drawing from the current selection.

     Reads ``self.attribute``, ``self.dataset`` and ``self.group_var`` and
     updates ``self.stats``, ``self.dist``, ``self.conts``,
     ``self.label_txts_all`` and ``self.label_txts``. Returns early, without
     touching any state, when no attribute is selected.

     NOTE(review): not every branch assigns ``self.stats`` or ``self.conts``;
     in those branches the final filtering works on whatever value the
     attribute held before the call.
     """
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     # Nothing to compute: no data, a discrete attribute without values, or
     # a group variable without values.
     if dataset is None \
             or not attr.is_continuous and not attr.values \
             or self.group_var and not self.group_var.values:
         self.stats = []
         self.dist = self.conts = None
         return
     if self.group_var:
         self.dist = None
         missing_val_str = f"missing '{self.group_var.name}'"
         # The extra "" label is replaced by missing_val_str below;
         # presumably it pairs with the rows whose group value is unknown.
         group_var_labels = self.group_var.values + ("", )
         if self.attribute.is_continuous:
             stats, label_texts = [], []
             attr_col = dataset.get_column_view(attr)[0].astype(float)
             # Only groups with at least one element get a box and a label.
             for group, value in \
                     zip(self._group_cols(dataset, self.group_var, attr_col),
                         group_var_labels):
                 if group.size:
                     stats.append(BoxData(group, value))
                     label_texts.append(value or missing_val_str)
             self.stats = stats
             self.label_txts_all = label_texts
         else:
             self.conts = contingency.get_contingency(
                 dataset, attr, self.group_var)
             # Keep labels only for contingency rows with a positive sum.
             self.label_txts_all = [
                 v or missing_val_str for v, c in zip(
                     group_var_labels, self.conts.array_with_unknowns)
                 if np.sum(c) > 0
             ]
     else:
         self.conts = None
         if self.attribute.is_continuous:
             attr_col = dataset.get_column_view(attr)[0].astype(float)
             self.stats = [BoxData(attr_col)]
         else:
             self.dist = distribution.get_distribution(dataset, attr)
         self.label_txts_all = [""]
     # Drop statistics (and their labels) that contain no instances.
     self.label_txts = [
         txts for stat, txts in zip(self.stats, self.label_txts_all)
         if stat.n > 0
     ]
     self.stats = [stat for stat in self.stats if stat.n > 0]
Exemplo n.º 24
0
    def __call__(self, data, variable):
        """Return a copy of ``variable`` imputed randomly from its distribution.

        :raises ValueError: if the distribution of the variable is empty
        """
        variable = data.domain[variable]
        dist = distribution.get_distribution(data, variable)

        # A distribution is invalid if a continuous variable's column does not
        # contain any known values or if a discrete variable's .values == []
        if dist.size == 0:
            if variable.is_discrete:
                assert len(variable.values) == 0
                raise ValueError("'{}' has no values".format(variable))
            if variable.is_continuous:
                raise ValueError("'{}' has an unknown distribution".format(variable))

        # An all-zero distribution is replaced by a uniform one, in place.
        if variable.is_discrete and numpy.sum(dist) == 0:
            dist += 1 / len(dist)
        elif variable.is_continuous and numpy.sum(dist[1, :]) == 0:
            dist[1, :] += 1 / dist.shape[1]

        replacer = ReplaceUnknownsRandom(variable, dist)
        return variable.copy(compute_value=replacer)
Exemplo n.º 25
0
 def _disc_plot(self):
     """Draw one bar per value of the discrete variable ``self.var``."""
     var = self.var
     ticks = list(enumerate(var.values))
     self.ploti.getAxis("bottom").setTicks([ticks])
     colors = [QColor(0, 128, 255)]
     dist = distribution.get_distribution(self.data, self.var)
     for i, freq in enumerate(dist):
         # The tooltip shows the value name, its count and its share of
         # the valid data.
         tooltip = (
             "<p style='white-space:pre;'>"
             f"<b>{escape(var.values[i])}</b>: {int(freq)} "
             f"({100 * freq / len(self.valid_data):.2f} %) "
         )
         self._add_bar(
             i - 0.5, 1, 0.1, [freq], colors,
             stacked=False, expanded=False, tooltip=tooltip)
Exemplo n.º 26
0
    def fit_storage(self, dat):
        """Fit a constant model from the class distribution of ``dat``.

        When several classes share the highest probability, one of them is
        picked from a SHA-1 hash of ``dat.Y`` and passed on as ``unif_maj``.

        :raises ValueError: if the domain has no discrete class
        """
        domain = dat.domain
        if not domain.has_discrete_class:
            raise ValueError(
                "classification.MajorityLearner expects a domain "
                "with a (single) discrete variable")
        class_dist = distribution.get_distribution(dat, domain.class_var)
        total = class_dist.sum()
        if total > 0:
            class_dist /= total
        else:
            # No usable counts: fall back to a uniform distribution.
            class_dist.fill(1 / len(class_dist))

        probs = np.array(class_dist)
        ties = np.flatnonzero(probs == probs.max())
        unif_maj = None
        if len(ties) > 1:
            # The hash of the target values makes the tie-break reproducible.
            digest = sha1(bytes(dat.Y)).hexdigest()
            unif_maj = ties[int(digest, 16) % len(ties)]
        return ConstantModel(dist=class_dist, unif_maj=unif_maj)
Exemplo n.º 27
0
        def transform(var):
            """Return a copy of ``var`` normalized by the center and scale."""
            dist = distribution.get_distribution(data, var)

            offset = 0
            if self.center != self.NoCentering:
                offset = self.center(dist)
                # Shift the first row of the distribution before the scale
                # is computed from it.
                dist[0, :] -= offset

            spread = 1
            if self.scale != self.NoScaling:
                spread = self.scale(dist)
                # Avoid dividing by a (near-)zero scale.
                if spread < 1e-15:
                    spread = 1

            normalizer = transformation.Normalizer(var, offset, 1 / spread)
            scaled_var = var.copy(compute_value=normalizer)
            if spread != 1:
                scaled_var.number_of_decimals = 3
            return scaled_var
Exemplo n.º 28
0
    def fit_storage(self, dat):
        """Fit a constant model from the class distribution of ``dat``.

        When several classes share the highest probability, one of them is
        picked from a SHA-1 hash of ``dat.Y`` and passed on as ``unif_maj``.

        :raises ValueError: if the domain has no discrete class
        """
        domain = dat.domain
        if not domain.has_discrete_class:
            raise ValueError(
                "classification.MajorityLearner expects a domain "
                "with a (single) discrete variable")
        class_dist = distribution.get_distribution(dat, domain.class_var)
        total = class_dist.sum()
        if total > 0:
            class_dist /= total
        else:
            # No usable counts: fall back to a uniform distribution.
            class_dist.fill(1 / len(class_dist))

        probs = np.array(class_dist)
        ties = np.flatnonzero(probs == probs.max())
        unif_maj = None
        if len(ties) > 1:
            # The hash of the target values makes the tie-break reproducible.
            digest = sha1(bytes(dat.Y)).hexdigest()
            unif_maj = ties[int(digest, 16) % len(ties)]
        return ConstantModel(dist=class_dist, unif_maj=unif_maj)
Exemplo n.º 29
0
 def compute_box_data(self):
     """Recompute the data used for drawing from the current selection.

     Reads ``self.attribute``, ``self.dataset`` and ``self.group_var`` and
     updates ``self.stats``, ``self.dist``, ``self.conts``,
     ``self.label_txts_all`` and ``self.label_txts``. Returns early, without
     touching any state, when no attribute is selected.

     NOTE(review): ``self.stats`` is only assigned for continuous
     attributes; otherwise the final filtering works on whatever value it
     held before the call.
     """
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     # Nothing to compute: no data, a discrete attribute without values, or
     # a group variable without values.
     if dataset is None \
             or not attr.is_continuous and not attr.values \
             or self.group_var and not self.group_var.values:
         self.stats = []
         self.dist = self.conts = None
         return
     if self.group_var:
         self.dist = None
         self.conts = contingency.get_contingency(dataset, attr,
                                                  self.group_var)
         missing_val_str = f"missing '{self.group_var.name}'"
         # The extra "" label is replaced by missing_val_str below;
         # presumably it pairs with the row for unknown group values.
         group_var_labels = self.group_var.values + ("", )
         if self.attribute.is_continuous:
             stats, label_texts = [], []
             # Only contingency rows with a non-zero sum in cont[1] get a
             # box and a label.
             for cont, value in zip(self.conts.array_with_unknowns,
                                    group_var_labels):
                 if np.sum(cont[1]):
                     stats.append(BoxData(cont, value))
                     label_texts.append(value or missing_val_str)
             self.stats = stats
             self.label_txts_all = label_texts
         else:
             # Keep labels only for contingency rows with a positive sum.
             self.label_txts_all = [
                 v or missing_val_str for v, c in zip(
                     group_var_labels, self.conts.array_with_unknowns)
                 if np.sum(c) > 0
             ]
     else:
         self.dist = distribution.get_distribution(dataset, attr)
         self.conts = None
         if self.attribute.is_continuous:
             self.stats = [BoxData(self.dist, None)]
         self.label_txts_all = [""]
     # Drop statistics (and their labels) that contain no instances.
     self.label_txts = [
         txts for stat, txts in zip(self.stats, self.label_txts_all)
         if stat.n > 0
     ]
     self.stats = [stat for stat in self.stats if stat.n > 0]
Exemplo n.º 30
0
    def __call__(self, data, variable):
        """Return a copy of ``variable`` imputed randomly from its distribution.

        :raises ValueError: if the distribution of the variable is empty
        """
        variable = data.domain[variable]
        dist = distribution.get_distribution(data, variable)

        # A distribution is invalid if a continuous variable's column does not
        # contain any known values or if a discrete variable's .values == []
        if dist.size == 0:
            if variable.is_discrete:
                assert len(variable.values) == 0
                raise ValueError("'{}' has no values".format(variable))
            if variable.is_continuous:
                raise ValueError(
                    "'{}' has an unknown distribution".format(variable))

        # An all-zero distribution is replaced by a uniform one, in place.
        if variable.is_discrete and np.sum(dist) == 0:
            dist += 1 / len(dist)
        elif variable.is_continuous and np.sum(dist[1, :]) == 0:
            dist[1, :] += 1 / dist.shape[1]

        replacer = ReplaceUnknownsRandom(variable, dist)
        return variable.copy(compute_value=replacer)
Exemplo n.º 31
0
 def compute_box_data(self):
     """Recompute the data used for drawing from the current selection.

     Reads ``self.attribute``, ``self.dataset`` and ``self.group_var`` and
     updates ``self.is_continuous``, ``self.stats``, ``self.dist``,
     ``self.conts``, ``self.label_txts_all`` and ``self.label_txts``.
     Returns early, without touching any state, when no attribute is
     selected.

     NOTE(review): ``self.stats`` is only assigned for continuous
     attributes; otherwise the final filtering works on whatever value it
     held before the call.
     """
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     self.is_continuous = attr.is_continuous
     # Nothing to compute: no data, a discrete attribute without values, or
     # a group variable without values. All three attributes then refer to
     # the same empty list object.
     if dataset is None or not self.is_continuous and not attr.values or \
                     self.group_var and not self.group_var.values:
         self.stats = self.dist = self.conts = []
         return
     if self.group_var:
         self.dist = []
         self.conts = contingency.get_contingency(dataset, attr,
                                                  self.group_var)
         # One label per group value plus a trailing "missing" label;
         # presumably the latter pairs with the row for unknown group values.
         group_var_labels = self.group_var.values + [
             f"missing '{self.group_var.name}'"
         ]
         if self.is_continuous:
             stats, label_texts = [], []
             # Only contingency rows with a non-zero sum in cont[1] get a
             # box and a label.
             for i, cont in enumerate(self.conts.array_with_unknowns):
                 if np.sum(cont[1]):
                     stats.append(BoxData(cont, attr, i, self.group_var))
                     label_texts.append(group_var_labels[i])
             self.stats = stats
             self.label_txts_all = label_texts
         else:
             # Keep labels only for contingency rows with a positive sum.
             self.label_txts_all = [
                 v for v, c in zip(group_var_labels,
                                   self.conts.array_with_unknowns)
                 if np.sum(c) > 0
             ]
     else:
         self.dist = distribution.get_distribution(dataset, attr)
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist, attr, None)]
         self.label_txts_all = [""]
     # Drop statistics (and their labels) that contain no instances.
     self.label_txts = [
         txts for stat, txts in zip(self.stats, self.label_txts_all)
         if stat.n > 0
     ]
     self.stats = [stat for stat in self.stats if stat.n > 0]
Exemplo n.º 32
0
    def _setup(self):
        """Rebuild the plot for the selected variable and group variable."""
        for item in (self.plot, self.plot_prob, self._legend):
            item.clear()
        self._legend.hide()

        selected = self.variable_idx
        self.var = self.cvar = None
        if selected >= 0:
            self.var = self.varmodel[selected]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            # Refill the probability combo: "(None)", the group values, "(All)".
            combo = self.controls.show_prob
            combo.clear()
            combo.addItem("(None)")
            combo.addItems(self.cvar.values)
            combo.addItem("(All)")
            upper = len(self.cvar.values) + 1
            self.show_prob = min(max(self.show_prob, 0), upper)

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            # Project the data onto the selected column(s) and discretize
            # them into equal-width bins.
            columns = [self.var, self.cvar] if self.cvar else [self.var]
            data = Orange.data.Table(Orange.data.Domain(columns), data)
            n_bins = self.bins[self.smoothing_index]
            discretizer = Discretize(
                method=EqualWidth(n=n_bins), remove_const=False)
            data = discretizer(data)
            self.var = data.domain[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Exemplo n.º 33
0
    def _setup(self):
        """Rebuild the plot for the selected variable and group variable."""
        for item in (self.plot, self.plot_prob, self._legend):
            item.clear()
        self._legend.hide()

        selected = self.variable_idx
        self.var = self.cvar = None
        if selected >= 0:
            self.var = self.varmodel[selected]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            # Refill the probability combo: "(None)", the group values, "(All)".
            combo = self.controls.show_prob
            combo.clear()
            combo.addItem("(None)")
            combo.addItems(self.cvar.values)
            combo.addItem("(All)")
            upper = len(self.cvar.values) + 1
            self.show_prob = min(max(self.show_prob, 0), upper)

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            # Project the data onto the selected column(s) and discretize
            # them into equal-width bins.
            columns = [self.var, self.cvar] if self.cvar else [self.var]
            data = Orange.data.Table(Orange.data.Domain(columns), data)
            n_bins = self.bins[self.smoothing_index]
            discretizer = Discretize(
                method=EqualWidth(n=n_bins), remove_const=False)
            data = discretizer(data)
            self.var = data.domain[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Exemplo n.º 34
0
    def _setup(self):
        """Rebuild the plot for the selected variable and group variable."""
        for item in (self.plot, self.plot_prob, self._legend):
            item.clear()
        self._legend.hide()

        selected = self.variable_idx
        self.var = self.cvar = None
        if selected >= 0:
            self.var = self.varmodel[selected]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            # Refill the probability combo: "(None)", the group values, "(All)".
            combo = self.cb_prob
            combo.clear()
            combo.addItem("(None)")
            combo.addItems(self.cvar.values)
            combo.addItem("(All)")
            upper = len(self.cvar.values) + 1
            self.show_prob = min(max(self.show_prob, 0), upper)

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            # Discretize the selected column(s) into equal-width bins.
            columns = (self.var, self.cvar) if self.cvar else self.var
            data = self.data[:, columns]
            n_bins = self.bins[self.smoothing_index]
            disc = Orange.preprocess.discretize.EqualWidth(n=n_bins)
            data = Orange.preprocess.Discretize(
                data, method=disc, remove_const=False)
            self.var = data.domain[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Exemplo n.º 35
0
 def __call__(self, data, attribute, fixed=None):
     """Return a copy of ``attribute`` discretized into ``self.n`` equal-width bins.

     The range is taken from ``fixed[attribute.name]`` when ``fixed`` is
     given, from an SQL min/max query for an ``SqlTable``, and from the
     attribute's distribution otherwise.
     """
     n = self.n
     if fixed:
         low, high = fixed[attribute.name]
         points = self._split_eq_width_fixed(low, high, n=n)
     elif type(data) == SqlTable:
         column = attribute.to_sql()
         query = data._sql_query([
             'min(%s)::double precision' % column,
             'max(%s)::double precision' % column,
         ])
         with data._execute_sql_query(query) as cur:
             low, high = cur.fetchone()
         width = (high - low) / n
         points = [low + (k + 1) * width for k in range(n - 1)]
     else:
         # TODO: why is the whole distribution computed instead of
         # just min/max
         dist = distribution.get_distribution(data, attribute)
         points = self._split_eq_width(dist, n=n)
     target = data.domain[attribute]
     return Discretizer.create_discretized_var(target, points)
Exemplo n.º 36
0
 def _setup(self):
     """Rebuild the plot for the selected variable and group variable."""
     self.plot.clear()
     selected = self.variable_idx
     self.var = self.cvar = None
     if selected >= 0:
         self.var = self.varmodel[selected]
     if self.groupvar_idx > 0:
         self.cvar = self.groupvarmodel[self.groupvar_idx]

     # The axis name and the relative-frequency control are refreshed even
     # when no variable is selected.
     self.set_left_axis_name()
     self.enable_disable_rel_freq()
     if self.var is None:
         return

     if not self.cvar:
         self.distributions = distribution.get_distribution(
             self.data, self.var)
         self.display_distribution()
     else:
         self.contingencies = contingency.get_contingency(
             self.data, self.var, self.cvar)
         self.display_contingency()
Exemplo n.º 37
0
 def _setup(self):
     """Rebuild the plot for the selected variable and group variable."""
     self.plot.clear()
     selected = self.variable_idx
     self.var = self.cvar = None
     if selected >= 0:
         self.var = self.varmodel[selected]
     if self.groupvar_idx > 0:
         self.cvar = self.groupvarmodel[self.groupvar_idx]

     # The axis name and the relative-frequency control are refreshed even
     # when no variable is selected.
     self.set_left_axis_name()
     self.enable_disable_rel_freq()
     if self.var is None:
         return

     if not self.cvar:
         self.distributions = distribution.get_distribution(
             self.data, self.var)
         self.display_distribution()
     else:
         self.contingencies = contingency.get_contingency(
             self.data, self.var, self.cvar)
         self.display_contingency()
Exemplo n.º 38
0
        def transform(var):
            """
            Return a copy of `var` that computes normalized values.

            The distribution of `var` in the enclosing `data` is used to
            derive an offset (via `self.center`) and a scale (via
            `self.scale`); either step is skipped when the corresponding
            option is `NoCentering` / `NoScaling`. The copy gets a
            `transformation.Normalizer` as its `compute_value`.
            """
            dist = distribution.get_distribution(data, var)

            offset = 0
            if self.center != self.NoCentering:
                offset = self.center(dist)
                # shift the first row of the distribution in place so that
                # the scale below is computed on the centred distribution
                dist[0, :] -= offset

            spread = 1
            if self.scale != self.NoScaling:
                spread = self.scale(dist)
                # guard against division by a (numerically) zero scale
                if spread < 1e-15:
                    spread = 1

            normalizer = transformation.Normalizer(var, offset, 1 / spread)
            transformed_var = var.copy(compute_value=normalizer)
            if spread != 1:
                transformed_var.number_of_decimals = 3
            return transformed_var
Exemplo n.º 39
0
    def _disc_plot(self):
        """
        Draw one bar per value of the discrete variable `self.var`.

        Bars are ordered by descending frequency when `self.sort_by_freq`
        is set and by the variable's value order otherwise. The bottom
        axis ticks are labelled with the values in the same order, and
        each bar gets a tooltip with its count and its percentage of
        `self.valid_data`.
        """
        variable = self.var
        # Convert to a plain array: Distribution misbehaves in further
        # operations
        freqs = np.array(
            distribution.get_distribution(self.data, variable))

        order = (np.argsort(freqs)[::-1] if self.sort_by_freq
                 else np.arange(len(freqs)))
        labels = np.array(variable.values)[order]

        ticks = list(enumerate(labels))
        self.ploti.getAxis("bottom").setTicks([ticks])

        bar_colors = [QColor(0, 128, 255)]
        for position, (freq, label) in enumerate(zip(freqs[order], labels)):
            percentage = 100 * freq / len(self.valid_data)
            tooltip = (
                "<p style='white-space:pre;'>"
                f"<b>{escape(label)}</b>: {int(freq)} "
                f"({percentage:.2f} %) "
            )
            # bars are 1 unit wide and centred on their tick position
            self._add_bar(
                position - 0.5, 1, 0.1, [freq], bar_colors,
                stacked=False, expanded=False, tooltip=tooltip, desc=label)
Exemplo n.º 40
0
    def _setup(self):
        """
        Set up the plot for the current variable selection.

        Clears the plot, looks up the selected variable and grouping
        variable, and shows a contingency when the grouping variable is
        discrete or a plain distribution otherwise. Nothing is shown when
        no variable is selected.
        """
        self.plot.clear()

        selected = self.variable_idx
        var = self.varmodel[selected] if selected >= 0 else None

        group_index = self.groupvar_idx
        cvar = self.groupvarmodel[group_index] if group_index >= 0 else None

        if var is None:
            return

        if not is_discrete(cvar):
            self.set_distribution(
                distribution.get_distribution(self.data, var), var)
            return

        self.set_contingency(
            contingency.get_contingency(self.data, var, cvar), var, cvar)
Exemplo n.º 41
0
    def fit_storage(self, dat):
        """
        Fit a constant classifier on the class distribution of `dat`.

        The class distribution is normalized to sum to 1; when it sums to
        zero (or less), it is replaced by a uniform distribution.

        :param dat: table of data
        :type dat: Orange.data.Table
        :return: model built from the normalized class distribution
        :rtype: Orange.classification.majority.ConstantClassifier
        :raises ValueError: if the class variable is not discrete
        """
        class_var = dat.domain.class_var
        if not isinstance(class_var, data.DiscreteVariable):
            raise ValueError(
                "classification.MajorityFitter expects a domain with a "
                "(single) discrete variable")

        dist = distribution.get_distribution(dat, class_var)
        total = dist.sum()
        if total > 0:
            # normalize in place
            dist /= total
        else:
            # no usable counts: fall back to equal probabilities
            dist.fill(1 / len(dist))
        return ConstantClassifier(dist=dist)
Exemplo n.º 42
0
    def _setup(self):
        """
        Refresh the plot from the selected variable and grouping variable.

        After clearing the plot, the variable and the grouping variable
        are fetched from their models (a negative index means "none").
        Without a variable the method returns; with a discrete grouping
        variable the contingency is shown, otherwise the distribution.
        """
        self.plot.clear()

        var_index = self.variable_idx
        var = self.varmodel[var_index] if var_index >= 0 else None

        group_index = self.groupvar_idx
        cvar = self.groupvarmodel[group_index] if group_index >= 0 else None

        if var is None:
            return

        if not is_discrete(cvar):
            self.set_distribution(
                distribution.get_distribution(self.data, var), var)
            return

        self.set_contingency(
            contingency.get_contingency(self.data, var, cvar), var, cvar)
Exemplo n.º 43
0
 def iterate_states(self, state):
     """
     Generate attribute-index combinations to be scored.

     A state is a list of increasing indices. Starting from `state` (or,
     when it is None, from the first `min_attrs` indices), all
     combinations of the same size are yielded before the size grows by
     one, up to the smaller of `max_attrs` and the number of attributes.
     On the first call (`state` is None) the marginal distributions are
     computed and normalized as well.

     Note that the same list object is mutated and yielded repeatedly;
     callers that keep states must copy them.
     """
     # The marginals are computed here rather than in `initialize`,
     # because otherwise `score_heuristic` would be run on every call to
     # master's `set_data`.
     data = self.master.discrete_data
     min_attrs, max_attrs = self.attr_range()
     if min_attrs > max_attrs:
         return

     if state is None:  # first call: prepare marginals and initial state
         if self._compute_class_dists():
             self.marginal = get_distribution(data, data.domain.class_var)
             self.marginal.normalize()
         else:
             self.marginal = get_distributions(data)
             for dist in self.marginal:
                 dist.normalize()
         state = list(range(min_attrs))

     size_limit = min(max_attrs, len(data.domain.attributes))
     while True:
         yield state
         # Reset while running; just abort
         if self.attr_ordering is None:
             break
         # advance to the next combination of the same size
         for pos in range(len(state)):
             state[pos] += 1
             if pos + 1 == len(state) or state[pos] < state[pos + 1]:
                 break
             state[pos] = pos
         if state[-1] != len(self.attr_ordering):
             continue
         # combinations of this size are exhausted
         if len(state) >= size_limit:
             break
         state = list(range(len(state) + 1))
Exemplo n.º 44
0
 def compute_box_data(self):
     """
     Recompute the statistics, distributions and labels for the plot.

     Does nothing when no attribute is selected. When there is no
     dataset, or the (discrete) attribute or the grouping variable has no
     values, `stats`, `dist` and `conts` are all set to one shared empty
     list. Otherwise either the contingency by the grouping variable or
     the plain distribution is computed, and finally `label_txts` and
     `stats` are filtered to entries with `n > 0`.
     """
     attr = self.attribute
     if not attr:
         return

     dataset = self.dataset
     self.is_continuous = attr.is_continuous
     if (dataset is None
             or (not self.is_continuous and not attr.values)
             or (self.group_var and not self.group_var.values)):
         self.stats = self.dist = self.conts = []
         return

     group_var = self.group_var
     if not group_var:
         self.dist = distribution.get_distribution(dataset, attr)
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist, attr, None)]
         self.label_txts_all = [""]
     else:
         self.dist = []
         self.conts = contingency.get_contingency(dataset, attr, group_var)
         if self.is_continuous:
             kept_stats = []
             kept_labels = []
             for index, cont in enumerate(self.conts):
                 # skip groups whose second row sums to zero
                 if not np.sum(cont[1]):
                     continue
                 kept_stats.append(BoxData(cont, attr, index, group_var))
                 kept_labels.append(group_var.values[index])
             self.stats = kept_stats
             self.label_txts_all = kept_labels
         else:
             self.label_txts_all = [
                 value
                 for value, cont in zip(group_var.values, self.conts)
                 if np.sum(cont) > 0]

     # labels are filtered first, while `self.stats` is still unfiltered
     self.label_txts = [
         text
         for stat, text in zip(self.stats, self.label_txts_all)
         if stat.n > 0]
     self.stats = [stat for stat in self.stats if stat.n > 0]
Exemplo n.º 45
0
 def iterate_states(self, state):
     """
     Generate attribute-index combinations to be scored.

     A state is a list of increasing indices. When `state` is None the
     marginal distributions are computed and normalized, and iteration
     starts with a single index if class distributions are used and with
     two indices otherwise. All combinations of one size are yielded
     before the size grows by one, up to the smaller of `self.max_attrs`
     and the number of attributes.

     Note that the same list object is mutated and yielded repeatedly;
     callers that keep states must copy them.
     """
     # The marginals are computed here rather than in `initialize`,
     # because otherwise `score_heuristic` would be run on every call to
     # master's `set_data`.
     data = self.master.discrete_data
     if state is None:  # first call: prepare marginals and initial state
         by_class = self._compute_class_dists()
         if by_class:
             self.marginal = get_distribution(data, data.domain.class_var)
             self.marginal.normalize()
         else:
             self.marginal = get_distributions(data)
             for dist in self.marginal:
                 dist.normalize()
         state = [0] if by_class else [0, 1]

     n_attrs = len(data.domain.attributes)
     while True:
         yield state
         # Reset while running; just abort
         if self.attr_ordering is None:
             break
         # advance to the next combination of the same size
         for pos in range(len(state)):
             state[pos] += 1
             if pos + 1 == len(state) or state[pos] < state[pos + 1]:
                 break
             state[pos] = pos
         if state[-1] != len(self.attr_ordering):
             continue
         # combinations of this size are exhausted
         if len(state) >= min(self.max_attrs, n_attrs):
             break
         state = list(range(len(state) + 1))
Exemplo n.º 46
0
def column_imputer_modus(variable, table):
    """
    Create a column imputer that fills in the most frequent value.

    The distribution of `variable` in `table` is computed and its modus
    is passed as the default value to `column_imputer_defaults`.

    :param variable: the variable to impute
    :param table: data from which the distribution is computed
    :return: whatever `column_imputer_defaults` returns for the modus
        (previously the result was dropped and None was returned)
    """
    stat = distribution.get_distribution(table, variable)
    return column_imputer_defaults(variable, table, stat.modus())
Exemplo n.º 47
0
 def fit_storage(self, dat):
     """
     Fit a mean model on the class distribution of `dat`.

     :param dat: table of data with a continuous class variable
     :return: `MeanModel` built from the class variable's distribution
     :raises ValueError: if the class variable is not continuous
     """
     class_var = dat.domain.class_var
     if not isinstance(class_var, data.ContinuousVariable):
         raise ValueError("regression.MeanFitter expects a domain with a "
                          "(single) continuous variable")
     return MeanModel(distribution.get_distribution(dat, class_var))
Exemplo n.º 48
0
        def add_rect(x0,
                     x1,
                     y0,
                     y1,
                     condition="",
                     used_attrs=None,
                     used_vals=None,
                     attr_vals=""):
            """
            Draw one cell of the mosaic and register it as a selectable area.

            An outer, clickable rectangle spanning (x0, y0)-(x1, y1) is
            created and appended to `self.areas` together with
            `used_attrs` and `used_vals`. If `conditionaldict[attr_vals]`
            is falsy nothing else is drawn. Otherwise the interior is
            coloured either by the Pearson residual or by the class
            distribution of the cell (optionally with a bar for the prior
            class distribution and a bar for the subset data), and a
            tooltip describing the cell is set on the outer rectangle.

            :param x0, x1, y0, y1: cell bounds; degenerate cells are
                enlarged so that they remain visible
            :param condition: HTML text that starts the tooltip
            :param used_attrs: names of the attributes that define the cell
            :param used_vals: values of those attributes, in the same order
            :param attr_vals: key of the cell in `conditionaldict`
                (values joined with "-")
            """
            # Fresh lists per call: the arguments are stored in `self.areas`,
            # so a shared mutable default would be aliased between areas.
            if used_attrs is None:
                used_attrs = []
            if used_vals is None:
                used_vals = []

            area_index = len(self.areas)
            if x0 == x1:
                x1 += 1
            if y0 == y1:
                y1 += 1

            # rectangles of width and height 1 are not shown - increase
            if x1 - x0 + y1 - y0 == 2:
                y1 += 1

            if class_var and class_var.is_discrete:
                colors = [QColor(*col) for col in class_var.colors]
            else:
                colors = None

            def select_area(_, ev):
                self.select_area(area_index, ev)

            def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
                # Without a pen colour the rectangle uses CanvasRectangle's
                # defaults; the brush defaults to the pen colour.
                if pen_color is None:
                    return CanvasRectangle(self.canvas,
                                           x,
                                           y,
                                           w,
                                           h,
                                           z=z,
                                           onclick=select_area,
                                           **args)
                if brush_color is None:
                    brush_color = pen_color
                return CanvasRectangle(self.canvas,
                                       x,
                                       y,
                                       w,
                                       h,
                                       pen_color,
                                       brush_color,
                                       z=z,
                                       onclick=select_area,
                                       **args)

            def line(x1, y1, x2, y2):
                r = QGraphicsLineItem(x1, y1, x2, y2, None)
                self.canvas.addItem(r)
                r.setPen(QPen(Qt.white, 2))
                r.setZValue(30)

            outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
            self.areas.append((used_attrs, used_vals, outer_rect))
            if not conditionaldict[attr_vals]:
                return

            if self.interior_coloring == self.PEARSON:
                # expected count under independence of the used attributes
                s = sum(apriori_dists[0])
                expected = s * reduce(
                    mul, (apriori_dists[i][used_vals[i]] / float(s)
                          for i in range(len(used_vals))))
                actual = conditionaldict[attr_vals]
                pearson = (actual - expected) / sqrt(expected)
                if pearson == 0:
                    ind = 0
                else:
                    # colour index grows with log2 of the residual, capped
                    # to the range 0..3
                    ind = max(0, min(int(log(abs(pearson), 2)), 3))
                color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
                rect(x0, y0, x1 - x0, y1 - y0, -20, color)
                outer_rect.setToolTip(
                    condition + "<hr/>" + "Expected instances: %.1f<br>"
                    "Actual instances: %d<br>"
                    "Standardized (Pearson) residual: %.1f" %
                    (expected, conditionaldict[attr_vals], pearson))
            else:
                cls_values = get_variable_values_sorted(class_var)
                prior = get_distribution(data, class_var.name)
                # stack one rectangle per class value, heights proportional
                # to the class counts in this cell
                total = 0
                for i, value in enumerate(cls_values):
                    val = conditionaldict[attr_vals + "-" + value]
                    if val == 0:
                        continue
                    if i == len(cls_values) - 1:
                        # the last one takes the remainder
                        v = y1 - y0 - total
                    else:
                        v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                    rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                    total += v

                # bar on the left edge showing the prior class distribution
                if self.use_boxes and \
                        abs(x1 - x0) > bar_width and \
                        abs(y1 - y0) > bar_width:
                    total = 0
                    line(x0 + bar_width, y0, x0 + bar_width, y1)
                    n = sum(prior)
                    for i, (val, color) in enumerate(zip(prior, colors)):
                        if i == len(prior) - 1:
                            h = y1 - y0 - total
                        else:
                            h = (y1 - y0) * val / n
                        rect(x0, y0 + total, bar_width, h, 20, color)
                        total += h

                if conditionalsubsetdict:
                    if conditionalsubsetdict[attr_vals]:
                        counts = [
                            conditionalsubsetdict[attr_vals + "-" + val]
                            for val in cls_values
                        ]
                        # a single subset instance: dashed frame in the
                        # colour of its class
                        if sum(counts) == 1:
                            rect(x0 - 2,
                                 y0 - 2,
                                 x1 - x0 + 5,
                                 y1 - y0 + 5,
                                 -550,
                                 colors[counts.index(1)],
                                 Qt.white,
                                 penWidth=2,
                                 penStyle=Qt.DashLine)
                        # bar on the right edge showing the class
                        # distribution of the subset in this cell
                        if self.subset_data is not None:
                            line(x1 - bar_width, y0, x1 - bar_width, y1)
                            total = 0
                            n = conditionalsubsetdict[attr_vals]
                            if n:
                                for i, (cls, color) in \
                                        enumerate(zip(cls_values, colors)):
                                    val = conditionalsubsetdict[attr_vals +
                                                                "-" + cls]
                                    if val == 0:
                                        continue
                                    if i == len(prior) - 1:
                                        v = y1 - y0 - total
                                    else:
                                        v = ((y1 - y0) * val) / n
                                    rect(x1 - bar_width, y0 + total, bar_width,
                                         v, 15, color)
                                    total += v

                actual = [
                    conditionaldict[attr_vals + "-" + cls_values[i]]
                    for i in range(len(prior))
                ]
                n_actual = sum(actual)
                if n_actual > 0:
                    apriori = [prior[key] for key in cls_values]
                    n_apriori = sum(apriori)
                    text = "<br/>".join(
                        "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                        (cls, act, 100.0 * act / n_actual,
                         apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                        for cls, act, apr in zip(cls_values, actual, apriori))
                else:
                    text = ""
                # `text` is joined with "<br/>" and has no trailing
                # separator, so it is used as is; slicing off four
                # characters would truncate the last line of the tooltip.
                outer_rect.setToolTip("{}<hr>Instances: {}<br><br>{}".format(
                    condition, n_actual, text))
Exemplo n.º 49
0
    def update_graph(self):
        spacing = self.SPACING
        bar_width = self.BAR_WIDTH

        def draw_data(attr_list,
                      x0_x1,
                      y0_y1,
                      side,
                      condition,
                      total_attrs,
                      used_attrs=[],
                      used_vals=[],
                      attr_vals=""):
            """
            Recursively split a rectangle among the values of the attributes.

            The rectangle given by `x0_x1` and `y0_y1` is divided among the
            values of `attr_list[0]` in proportion to their counts in
            `conditionaldict`; even sides split along the x axis and odd
            sides along the y axis. For the last attribute in `attr_list`
            each part is drawn with `add_rect`; otherwise the function
            recurses on the remaining attributes with `side + 1`. Labels
            for this side are drawn (or their position stored) with
            `draw_text`.

            :param attr_list: names of the attributes still to be drawn
            :param x0_x1: pair with the horizontal bounds of the rectangle
            :param y0_y1: pair with the vertical bounds of the rectangle
            :param side: index of the side/level being drawn
            :param condition: HTML tooltip text accumulated so far
            :param total_attrs: passed through to `draw_text`
            :param used_attrs: attributes fixed by the enclosing levels
            :param used_vals: values of `used_attrs`, in the same order
            :param attr_vals: key into `conditionaldict` for the enclosing
                levels (values joined with "-")

            NOTE(review): the list defaults are shared between calls; they
            are not mutated here (extended copies are built with `+`), but
            they are handed on to `add_rect`.
            """
            x0, x1 = x0_x1
            y0, y1 = y0_y1
            # no instances in this part: draw a single empty cell and stop
            if conditionaldict[attr_vals] == 0:
                add_rect(x0,
                         x1,
                         y0,
                         y1,
                         "",
                         used_attrs,
                         used_vals,
                         attr_vals=attr_vals)
                # store coordinates for later drawing of labels
                draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                          used_attrs, used_vals, attr_vals)
                return

            attr = attr_list[0]
            # how much smaller rectangles do we draw
            edge = len(attr_list) * spacing
            values = get_variable_values_sorted(data.domain[attr])
            if side % 2:
                values = values[::-1]  # reverse names if necessary

            if side % 2 == 0:  # we are drawing on the x axis
                # remove the space needed for separating different attr. values
                whole = max(0, (x1 - x0) - edge * (len(values) - 1))
                # not enough room: spread the gaps over the whole extent
                if whole == 0:
                    edge = (x1 - x0) / float(len(values) - 1)
            else:  # we are drawing on the y axis
                whole = max(0, (y1 - y0) - edge * (len(values) - 1))
                if whole == 0:
                    edge = (y1 - y0) / float(len(values) - 1)

            # number of instances for each value within the enclosing cell
            if attr_vals == "":
                counts = [conditionaldict[val] for val in values]
            else:
                counts = [
                    conditionaldict[attr_vals + "-" + val] for val in values
                ]
            total = sum(counts)

            # if we are visualizing the third attribute and the first attribute
            # has the last value, we have to reverse the order in which the
            # boxes will be drawn otherwise, if the last cell, nearest to the
            # labels of the fourth attribute, is empty, we wouldn't be able to
            # position the labels
            valrange = list(range(len(values)))
            if len(attr_list + used_attrs) == 4 and len(used_attrs) == 2:
                attr1values = get_variable_values_sorted(
                    data.domain[used_attrs[0]])
                if used_vals[0] == attr1values[-1]:
                    valrange = valrange[::-1]

            for i in valrange:
                # offsets of the i-th part: i gaps plus the cumulative
                # share of the preceding counts
                start = i * edge + whole * float(sum(counts[:i]) / total)
                end = i * edge + whole * float(sum(counts[:i + 1]) / total)
                val = values[i]
                htmlval = getHtmlCompatibleString(val)
                if attr_vals != "":
                    newattrvals = attr_vals + "-" + val
                else:
                    newattrvals = val

                tooltip = condition + 4 * "&nbsp;" + attr + \
                    ": <b>" + htmlval + "</b><br>"
                attrs = used_attrs + [attr]
                vals = used_vals + [val]
                common_args = attrs, vals, newattrvals
                if side % 2 == 0:  # if we are moving horizontally
                    if len(attr_list) == 1:
                        add_rect(x0 + start, x0 + end, y0, y1, tooltip,
                                 *common_args)
                    else:
                        draw_data(attr_list[1:], (x0 + start, x0 + end),
                                  (y0, y1), side + 1, tooltip, total_attrs,
                                  *common_args)
                else:
                    if len(attr_list) == 1:
                        add_rect(x0, x1, y0 + start, y0 + end, tooltip,
                                 *common_args)
                    else:
                        draw_data(attr_list[1:], (x0, x1),
                                  (y0 + start, y0 + end), side + 1, tooltip,
                                  total_attrs, *common_args)

            draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                      used_attrs, used_vals, attr_vals)

        def draw_text(side, attr, x0_x1, y0_y1, total_attrs, used_attrs,
                      used_vals, attr_vals):
            """
            Draw the value labels and the attribute name for one side.

            Each side is labelled at most once (tracked in `drawn_sides`).
            When the cell has no instances (`conditionaldict[attr_vals]`
            is falsy) only its coordinates are remembered in
            `draw_positions`, so that a later call for the same side can
            reuse them. Value labels are placed at the centre of each
            value's share of the width/height; values whose entry in
            `distributiondict` is 0 get no label. Finally the attribute
            name is drawn in bold, vertically for sides 1 and 3.

            :param side: index of the side to label; it selects the text
                alignment and which edge of the rectangle is used
            :param attr: name of the attribute shown on this side
            :param x0_x1: pair with the horizontal bounds of the rectangle
            :param y0_y1: pair with the vertical bounds of the rectangle
            :param total_attrs: used together with `side` to compute the
                spacing between value labels
            :param used_attrs: attributes fixed by the enclosing levels
            :param used_vals: values of `used_attrs`, in the same order
            :param attr_vals: key into `conditionaldict` for the enclosing
                levels
            """
            x0, x1 = x0_x1
            y0, y1 = y0_y1
            if side in drawn_sides:
                return

            # the text on the right will be drawn when we are processing
            # visualization of the last value of the first attribute
            if side == 3:
                attr1values = \
                    get_variable_values_sorted(data.domain[used_attrs[0]])
                if used_vals[0] != attr1values[-1]:
                    return

            if not conditionaldict[attr_vals]:
                # empty cell: remember the first such position and wait for
                # a non-empty cell before drawing
                if side not in draw_positions:
                    draw_positions[side] = (x0, x1, y0, y1)
                return
            else:
                if side in draw_positions:
                    # restore the positions of attribute values and name
                    (x0, x1, y0, y1) = draw_positions[side]

            drawn_sides.add(side)

            values = get_variable_values_sorted(data.domain[attr])
            if side % 2:
                values = values[::-1]

            # total gap size along this side; it is subtracted from the
            # width for even sides and from the height for odd sides
            spaces = spacing * (total_attrs - side) * (len(values) - 1)
            width = x1 - x0 - spaces * (side % 2 == 0)
            height = y1 - y0 - spaces * (side % 2 == 1)

            # calculate position of first attribute
            currpos = 0

            # missing keys count as 1 here
            if attr_vals == "":
                counts = [conditionaldict.get(val, 1) for val in values]
            else:
                counts = [
                    conditionaldict.get(attr_vals + "-" + val, 1)
                    for val in values
                ]
            total = sum(counts)
            # avoid dividing by zero: fall back to equal shares
            if total == 0:
                counts = [1] * len(values)
                total = sum(counts)

            # text alignment for each side, indexed by `side`
            aligns = [
                Qt.AlignTop | Qt.AlignHCenter, Qt.AlignRight | Qt.AlignVCenter,
                Qt.AlignBottom | Qt.AlignHCenter,
                Qt.AlignLeft | Qt.AlignVCenter
            ]
            align = aligns[side]
            for i in range(len(values)):
                val = values[i]
                perc = counts[i] / float(total)
                if distributiondict[val] != 0:
                    if side == 0:
                        CanvasText(self.canvas, str(val),
                                   x0 + currpos + width * 0.5 * perc,
                                   y1 + self.ATTR_VAL_OFFSET, align)
                    elif side == 1:
                        CanvasText(self.canvas, str(val),
                                   x0 - self.ATTR_VAL_OFFSET,
                                   y0 + currpos + height * 0.5 * perc, align)
                    elif side == 2:
                        CanvasText(self.canvas, str(val),
                                   x0 + currpos + width * perc * 0.5,
                                   y0 - self.ATTR_VAL_OFFSET, align)
                    else:
                        CanvasText(self.canvas, str(val),
                                   x1 + self.ATTR_VAL_OFFSET,
                                   y0 + currpos + height * 0.5 * perc, align)

                # advance past this value's share and the following gap
                if side % 2 == 0:
                    currpos += perc * width + spacing * (total_attrs - side)
                else:
                    currpos += perc * height + spacing * (total_attrs - side)

            # attribute name, centred along the side
            if side == 0:
                CanvasText(self.canvas,
                           attr,
                           x0 + (x1 - x0) / 2,
                           y1 + self.ATTR_VAL_OFFSET + self.ATTR_NAME_OFFSET,
                           align,
                           bold=1)
            elif side == 1:
                CanvasText(self.canvas,
                           attr,
                           x0 - max_ylabel_w1 - self.ATTR_VAL_OFFSET,
                           y0 + (y1 - y0) / 2,
                           align,
                           bold=1,
                           vertical=True)
            elif side == 2:
                CanvasText(self.canvas,
                           attr,
                           x0 + (x1 - x0) / 2,
                           y0 - self.ATTR_VAL_OFFSET - self.ATTR_NAME_OFFSET,
                           align,
                           bold=1)
            else:
                CanvasText(self.canvas,
                           attr,
                           x1 + max_ylabel_w2 + self.ATTR_VAL_OFFSET,
                           y0 + (y1 - y0) / 2,
                           align,
                           bold=1,
                           vertical=True)

        def add_rect(x0,
                     x1,
                     y0,
                     y1,
                     condition="",
                     used_attrs=None,
                     used_vals=None,
                     attr_vals=""):
            """
            Draw one cell of the mosaic and register it as a selectable area.

            An outer, clickable rectangle spanning (x0, y0)-(x1, y1) is
            created and appended to `self.areas` together with
            `used_attrs` and `used_vals`. If `conditionaldict[attr_vals]`
            is falsy nothing else is drawn. Otherwise the interior is
            coloured either by the Pearson residual or by the class
            distribution of the cell (optionally with a bar for the prior
            class distribution and a bar for the subset data), and a
            tooltip describing the cell is set on the outer rectangle.

            :param x0, x1, y0, y1: cell bounds; degenerate cells are
                enlarged so that they remain visible
            :param condition: HTML text that starts the tooltip
            :param used_attrs: names of the attributes that define the cell
            :param used_vals: values of those attributes, in the same order
            :param attr_vals: key of the cell in `conditionaldict`
                (values joined with "-")
            """
            # Fresh lists per call: the arguments are stored in `self.areas`,
            # so a shared mutable default would be aliased between areas.
            if used_attrs is None:
                used_attrs = []
            if used_vals is None:
                used_vals = []

            area_index = len(self.areas)
            if x0 == x1:
                x1 += 1
            if y0 == y1:
                y1 += 1

            # rectangles of width and height 1 are not shown - increase
            if x1 - x0 + y1 - y0 == 2:
                y1 += 1

            if class_var and class_var.is_discrete:
                colors = [QColor(*col) for col in class_var.colors]
            else:
                colors = None

            def select_area(_, ev):
                self.select_area(area_index, ev)

            def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
                # Without a pen colour the rectangle uses CanvasRectangle's
                # defaults; the brush defaults to the pen colour.
                if pen_color is None:
                    return CanvasRectangle(self.canvas,
                                           x,
                                           y,
                                           w,
                                           h,
                                           z=z,
                                           onclick=select_area,
                                           **args)
                if brush_color is None:
                    brush_color = pen_color
                return CanvasRectangle(self.canvas,
                                       x,
                                       y,
                                       w,
                                       h,
                                       pen_color,
                                       brush_color,
                                       z=z,
                                       onclick=select_area,
                                       **args)

            def line(x1, y1, x2, y2):
                r = QGraphicsLineItem(x1, y1, x2, y2, None)
                self.canvas.addItem(r)
                r.setPen(QPen(Qt.white, 2))
                r.setZValue(30)

            outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
            self.areas.append((used_attrs, used_vals, outer_rect))
            if not conditionaldict[attr_vals]:
                return

            if self.interior_coloring == self.PEARSON:
                # expected count under independence of the used attributes
                s = sum(apriori_dists[0])
                expected = s * reduce(
                    mul, (apriori_dists[i][used_vals[i]] / float(s)
                          for i in range(len(used_vals))))
                actual = conditionaldict[attr_vals]
                pearson = (actual - expected) / sqrt(expected)
                if pearson == 0:
                    ind = 0
                else:
                    # colour index grows with log2 of the residual, capped
                    # to the range 0..3
                    ind = max(0, min(int(log(abs(pearson), 2)), 3))
                color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
                rect(x0, y0, x1 - x0, y1 - y0, -20, color)
                outer_rect.setToolTip(
                    condition + "<hr/>" + "Expected instances: %.1f<br>"
                    "Actual instances: %d<br>"
                    "Standardized (Pearson) residual: %.1f" %
                    (expected, conditionaldict[attr_vals], pearson))
            else:
                cls_values = get_variable_values_sorted(class_var)
                prior = get_distribution(data, class_var.name)
                # stack one rectangle per class value, heights proportional
                # to the class counts in this cell
                total = 0
                for i, value in enumerate(cls_values):
                    val = conditionaldict[attr_vals + "-" + value]
                    if val == 0:
                        continue
                    if i == len(cls_values) - 1:
                        # the last one takes the remainder
                        v = y1 - y0 - total
                    else:
                        v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                    rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                    total += v

                # bar on the left edge showing the prior class distribution
                if self.use_boxes and \
                        abs(x1 - x0) > bar_width and \
                        abs(y1 - y0) > bar_width:
                    total = 0
                    line(x0 + bar_width, y0, x0 + bar_width, y1)
                    n = sum(prior)
                    for i, (val, color) in enumerate(zip(prior, colors)):
                        if i == len(prior) - 1:
                            h = y1 - y0 - total
                        else:
                            h = (y1 - y0) * val / n
                        rect(x0, y0 + total, bar_width, h, 20, color)
                        total += h

                if conditionalsubsetdict:
                    if conditionalsubsetdict[attr_vals]:
                        counts = [
                            conditionalsubsetdict[attr_vals + "-" + val]
                            for val in cls_values
                        ]
                        # a single subset instance: dashed frame in the
                        # colour of its class
                        if sum(counts) == 1:
                            rect(x0 - 2,
                                 y0 - 2,
                                 x1 - x0 + 5,
                                 y1 - y0 + 5,
                                 -550,
                                 colors[counts.index(1)],
                                 Qt.white,
                                 penWidth=2,
                                 penStyle=Qt.DashLine)
                        # bar on the right edge showing the class
                        # distribution of the subset in this cell
                        if self.subset_data is not None:
                            line(x1 - bar_width, y0, x1 - bar_width, y1)
                            total = 0
                            n = conditionalsubsetdict[attr_vals]
                            if n:
                                for i, (cls, color) in \
                                        enumerate(zip(cls_values, colors)):
                                    val = conditionalsubsetdict[attr_vals +
                                                                "-" + cls]
                                    if val == 0:
                                        continue
                                    if i == len(prior) - 1:
                                        v = y1 - y0 - total
                                    else:
                                        v = ((y1 - y0) * val) / n
                                    rect(x1 - bar_width, y0 + total, bar_width,
                                         v, 15, color)
                                    total += v

                actual = [
                    conditionaldict[attr_vals + "-" + cls_values[i]]
                    for i in range(len(prior))
                ]
                n_actual = sum(actual)
                if n_actual > 0:
                    apriori = [prior[key] for key in cls_values]
                    n_apriori = sum(apriori)
                    text = "<br/>".join(
                        "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                        (cls, act, 100.0 * act / n_actual,
                         apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                        for cls, act, apr in zip(cls_values, actual, apriori))
                else:
                    text = ""
                # `text` is joined with "<br/>" and has no trailing
                # separator, so it is used as is; slicing off four
                # characters would truncate the last line of the tooltip.
                outer_rect.setToolTip("{}<hr>Instances: {}<br><br>{}".format(
                    condition, n_actual, text))

        def draw_legend(x0_x1, y0_y1):
            """Draw a one-row legend centred under the mosaic square.

            ``x0_x1`` and ``y0_y1`` are the horizontal and vertical extents
            of the square.  With Pearson colouring the legend lists the
            residual ranges; otherwise it lists the class values.  The last
            label is the legend title and is placed first, followed by one
            colour swatch plus label per remaining entry.
            """
            left, right = x0_x1
            _, bottom = y0_y1
            pearson = self.interior_coloring == self.PEARSON
            if pearson:
                labels = [
                    "<-8", "-8:-4", "-4:-2", "-2:2", "2:4", "4:8", ">8",
                    "Residuals:"
                ]
                palette = self.RED_COLORS[::-1] + self.BLUE_COLORS[1:]
            else:
                labels = get_variable_values_sorted(class_var) + \
                    [class_var.name + ":"]
                palette = [QColor(*col) for col in class_var.colors]

            texts = [
                CanvasText(self.canvas, label, alignment=Qt.AlignVCenter)
                for label in labels
            ]
            title = texts[-1]
            totalwidth = sum(item.boundingRect().width() for item in texts)

            # horizontal start such that the whole row is centred on the
            # middle of the square
            y = bottom + self.ATTR_NAME_OFFSET + self.ATTR_VAL_OFFSET + 35
            gap = 30
            startx = (left + right) / 2 - (totalwidth + len(texts) * gap) / 2

            title.setPos(startx + 15, y)
            title.show()
            cursor = title.boundingRect().width() + gap

            swatch = 8

            for i, entry in enumerate(texts[:-1]):
                fill = palette[i]
                # Pearson swatches get a black outline, class swatches an
                # outline in their own colour
                border = Qt.black if pearson else fill
                CanvasRectangle(self.canvas, startx + cursor, y - swatch / 2,
                                swatch, swatch, border, fill)
                entry.setPos(startx + cursor + 10, y)
                cursor += gap + entry.boundingRect().width()

        self.canvas.clear()
        self.areas = []

        data = self.discrete_data
        if data is None:
            return
        subset = self.subset_data
        attr_list = self.get_attr_list()
        class_var = data.domain.class_var
        if class_var:
            sql = type(data) == SqlTable
            name = not sql and data.name
            # save class_var because it is removed in the next line
            data = data[:, attr_list + [class_var]]
            data.domain.class_var = class_var
            if not sql:
                data.name = name
        else:
            data = data[:, attr_list]
        # TODO: check this
        # data = Preprocessor_dropMissing(data)
        if len(data) == 0:
            self.warning(5, "No valid data for current attributes.")
            return
        else:
            self.warning(5)

        if self.interior_coloring == self.PEARSON:
            apriori_dists = [
                get_distribution(data, attr) for attr in attr_list
            ]
        else:
            apriori_dists = []

        def get_max_label_width(attr):
            """Return the largest label width among the values of ``attr``.

            Each value of ``data.domain[attr]`` is rendered as a hidden
            ``CanvasText`` and the integer part of its bounding-rect width
            is taken; the result is 0 when there are no values.
            """
            widths = [
                int(CanvasText(self.canvas, label, 0, 0, bold=0, show=False)
                    .boundingRect().width())
                for label in get_variable_values_sorted(data.domain[attr])
            ]
            return max(widths + [0])

        # get the maximum width of rectangle
        xoff = 20
        width = 20
        if len(attr_list) > 1:
            text = CanvasText(self.canvas, attr_list[1], bold=1, show=0)
            max_ylabel_w1 = min(get_max_label_width(attr_list[1]), 150)
            width = 5 + text.boundingRect().height() + \
                self.ATTR_VAL_OFFSET + max_ylabel_w1
            xoff = width
            if len(attr_list) == 4:
                text = CanvasText(self.canvas, attr_list[3], bold=1, show=0)
                max_ylabel_w2 = min(get_max_label_width(attr_list[3]), 150)
                width += text.boundingRect().height() + \
                    self.ATTR_VAL_OFFSET + max_ylabel_w2 - 10

        # get the maximum height of rectangle
        height = 100
        yoff = 45
        square_size = min(self.canvas_view.width() - width - 20,
                          self.canvas_view.height() - height - 20)

        if square_size < 0:
            return  # canvas is too small to draw rectangles
        self.canvas_view.setSceneRect(0, 0, self.canvas_view.width(),
                                      self.canvas_view.height())

        drawn_sides = set()
        draw_positions = {}

        conditionaldict, distributiondict = \
            get_conditional_distribution(data, attr_list)
        conditionalsubsetdict = None
        if subset:
            conditionalsubsetdict, _ = \
                get_conditional_distribution(subset, attr_list)

        # draw rectangles
        draw_data(attr_list, (xoff, xoff + square_size),
                  (yoff, yoff + square_size), 0, "", len(attr_list))
        draw_legend((xoff, xoff + square_size), (yoff, yoff + square_size))
        self.update_selection_rects()
Exemplo n.º 50
0
    def update_graph(self):
        spacing = self.SPACING
        bar_width = self.BAR_WIDTH

        def get_counts(attr_vals, values):
            """Return ``(total, counts)`` for ``values`` under ``attr_vals``.

            Counts are looked up in ``conditionaldict`` under the key
            ``attr_vals + "-" + value`` (or just ``value`` when
            ``attr_vals`` is empty).  If they sum to zero, every value is
            given a count of 1 instead.
            """
            if attr_vals:
                keys = [attr_vals + "-" + val for val in values]
            else:
                keys = values
            counts = [conditionaldict[key] for key in keys]
            total = sum(counts)
            if not total:
                # nothing to weigh by: fall back to equal shares
                counts = [1] * len(values)
                total = len(values)
            return total, counts

        def draw_data(attr_list,
                      x0_x1,
                      y0_y1,
                      side,
                      condition,
                      total_attrs,
                      used_attrs,
                      used_vals,
                      attr_vals=""):
            """Recursively split a rectangle by the values of ``attr_list``.

            The rectangle ``x0_x1`` x ``y0_y1`` is divided along the x axis
            (even ``side``) or the y axis (odd ``side``) into one cell per
            value of ``attr_list[0]``, sized in proportion to the counts
            returned by ``get_counts``.  Each cell is either drawn with
            ``add_rect`` (last attribute) or split further by a recursive
            call with ``side + 1``.

            :param attr_list: attributes still to be split on.
            :param x0_x1: ``(x0, x1)`` horizontal extent of the rectangle.
            :param y0_y1: ``(y0, y1)`` vertical extent of the rectangle.
            :param side: recursion depth; its parity selects the axis.
            :param condition: tooltip text accumulated so far.
            :param total_attrs: total number of attributes being shown.
            :param used_attrs: attributes already split on.
            :param used_vals: their values for this rectangle.
            :param attr_vals: ``"-"``-joined values, the key into
                ``conditionaldict``.
            """
            x0, x1 = x0_x1
            y0, y1 = y0_y1
            if conditionaldict[attr_vals] == 0:
                add_rect(x0,
                         x1,
                         y0,
                         y1,
                         "",
                         used_attrs,
                         used_vals,
                         attr_vals=attr_vals)
                # store coordinates for later drawing of labels
                draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                          used_attrs, used_vals, attr_vals)
                return

            attr = attr_list[0]
            # how much smaller rectangles do we draw
            edge = len(attr_list) * spacing
            values = get_variable_values_sorted(attr)
            if side % 2:
                values = values[::-1]  # reverse names if necessary

            horizontal = side % 2 == 0
            # length of the rectangle along the axis we are splitting
            extent = (x1 - x0) if horizontal else (y1 - y0)
            gaps = len(values) - 1
            # remove the space needed for separating different attr. values
            whole = max(0, extent - edge * gaps)
            # No room is left for the cells themselves: spread the gaps over
            # the whole extent.  With a single value there are no gaps (and
            # edge is only ever multiplied by index 0), so skip the division
            # that would otherwise raise ZeroDivisionError.
            if whole == 0 and gaps > 0:
                edge = extent / float(gaps)

            total, counts = get_counts(attr_vals, values)

            # when visualizing the third attribute and the first attribute has
            # the last value, reverse the order in which the boxes are drawn;
            # otherwise, if the last cell, nearest to the labels of the fourth
            # attribute, is empty, we wouldn't be able to position the labels
            valrange = list(range(len(values)))
            if len(attr_list + used_attrs) == 4 and len(used_attrs) == 2:
                attr1values = get_variable_values_sorted(used_attrs[0])
                if used_vals[0] == attr1values[-1]:
                    valrange = valrange[::-1]

            for i in valrange:
                start = i * edge + whole * float(sum(counts[:i]) / total)
                end = i * edge + whole * float(sum(counts[:i + 1]) / total)
                val = values[i]
                htmlval = to_html(val)
                newattrvals = attr_vals + "-" + val if attr_vals else val

                tooltip = "{}&nbsp;&nbsp;&nbsp;&nbsp;{}: <b>{}</b><br/>".format(
                    condition, attr.name, htmlval)
                attrs = used_attrs + [attr]
                vals = used_vals + [val]
                args = attrs, vals, newattrvals
                if horizontal:  # if we are moving horizontally
                    cell_x, cell_y = (x0 + start, x0 + end), (y0, y1)
                else:
                    cell_x, cell_y = (x0, x1), (y0 + start, y0 + end)
                if len(attr_list) == 1:
                    add_rect(cell_x[0], cell_x[1], cell_y[0], cell_y[1],
                             tooltip, *args)
                else:
                    draw_data(attr_list[1:], cell_x, cell_y, side + 1,
                              tooltip, total_attrs, *args)
            draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                      used_attrs, used_vals, attr_vals)

        def draw_text(side, attr, x0_x1, y0_y1, total_attrs, used_attrs,
                      used_vals, attr_vals):
            """Draw the value labels and the name of ``attr`` on ``side``.

            Each side is labelled at most once (tracked in ``drawn_sides``).
            If the rectangle holds no instances, its coordinates are stored
            in ``draw_positions`` and nothing is drawn; the first non-empty
            call for that side then uses the stored coordinates.
            """
            left, right = x0_x1
            top, bottom = y0_y1
            if side in drawn_sides:
                return

            # the text on the right will be drawn when we are processing
            # visualization of the last value of the first attribute
            if side == 3:
                first_attr_values = get_variable_values_sorted(used_attrs[0])
                if used_vals[0] != first_attr_values[-1]:
                    return

            if not conditionaldict[attr_vals]:
                # empty cell: only remember where the labels would go
                if side not in draw_positions:
                    draw_positions[side] = (left, right, top, bottom)
                return
            if side in draw_positions:
                # restore the positions of attribute values and name
                left, right, top, bottom = draw_positions[side]

            drawn_sides.add(side)

            labels = get_variable_values_sorted(attr)
            if side % 2:
                labels = labels[::-1]

            # gap between neighbouring cells at this depth, and the total
            # space taken by all such gaps along the labelled axis
            gap = spacing * (total_attrs - side)
            spaces = gap * (len(labels) - 1)
            width = right - left - spaces * (side % 2 == 0)
            height = bottom - top - spaces * (side % 2 == 1)

            # running offset of the current cell along the labelled axis
            offset = 0
            total, counts = get_counts(attr_vals, labels)
            align = [
                Qt.AlignTop | Qt.AlignHCenter, Qt.AlignRight | Qt.AlignVCenter,
                Qt.AlignBottom | Qt.AlignHCenter,
                Qt.AlignLeft | Qt.AlignVCenter
            ][side]
            for count, label in zip(counts, labels):
                if distributiondict[label] == 0:
                    continue
                share = count / float(total)
                cell_width = width * share
                mid_x = left + offset + cell_width / 2
                mid_y = top + offset + height * 0.5 * share
                # candidate positions, indexed by side
                xs = (mid_x, left - self.ATTR_VAL_OFFSET,
                      mid_x, right + self.ATTR_VAL_OFFSET)
                ys = (bottom + self.ATTR_VAL_OFFSET, mid_y,
                      top - self.ATTR_VAL_OFFSET, mid_y)

                CanvasText(self.canvas,
                           label,
                           xs[side],
                           ys[side],
                           align,
                           max_width=cell_width if side == 0 else None)
                span = height if side % 2 else width
                offset += share * span + gap

            # attribute name, centred along its side
            center_x = left + (right - left) / 2
            center_y = top + (bottom - top) / 2
            xs = (center_x, left - max_ylabel_w1 - self.ATTR_VAL_OFFSET,
                  center_x, right + max_ylabel_w2 + self.ATTR_VAL_OFFSET)
            ys = (bottom + self.ATTR_VAL_OFFSET + self.ATTR_NAME_OFFSET,
                  center_y,
                  top - self.ATTR_VAL_OFFSET - self.ATTR_NAME_OFFSET,
                  center_y)
            CanvasText(self.canvas,
                       attr.name,
                       xs[side],
                       ys[side],
                       align,
                       bold=True,
                       vertical=side % 2)

        def add_rect(x0,
                     x1,
                     y0,
                     y1,
                     condition,
                     used_attrs,
                     used_vals,
                     attr_vals=""):
            """Draw one mosaic cell and register it in ``self.areas``.

            An outer rectangle covering the cell is always created and
            stored, together with ``used_attrs`` and ``used_vals``, in
            ``self.areas``.  If the cell contains instances, its interior is
            coloured either by the Pearson residual (when
            ``self.variable_color`` is None) or by the class distribution,
            and a tooltip is attached to the outer rectangle.

            :param x0, x1, y0, y1: cell coordinates.
            :param condition: text placed at the start of the tooltip.
            :param used_attrs: attributes that define the cell.
            :param used_vals: their values for this cell.
            :param attr_vals: ``"-"``-joined values, the key into
                ``conditionaldict``.
            """
            area_index = len(self.areas)
            # give zero-width / zero-height cells an extent of 1
            x1 += (x0 == x1)
            y1 += (y0 == y1)
            # rectangles of width and height 1 are not shown - increase
            y1 += (x1 - x0 + y1 - y0 == 2)
            colors = class_var and [QColor(*col) for col in class_var.colors]

            def select_area(_, ev):
                self.select_area(area_index, ev)

            def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
                # without a pen colour the rectangle uses CanvasRectangle's
                # default colours; the brush defaults to the pen colour
                if pen_color is None:
                    return CanvasRectangle(self.canvas,
                                           x,
                                           y,
                                           w,
                                           h,
                                           z=z,
                                           onclick=select_area,
                                           **args)
                if brush_color is None:
                    brush_color = pen_color
                return CanvasRectangle(self.canvas,
                                       x,
                                       y,
                                       w,
                                       h,
                                       pen_color,
                                       brush_color,
                                       z=z,
                                       onclick=select_area,
                                       **args)

            def line(x1, y1, x2, y2):
                r = QGraphicsLineItem(x1, y1, x2, y2, None)
                self.canvas.addItem(r)
                r.setPen(QPen(Qt.white, 2))
                r.setZValue(30)

            outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
            self.areas.append((used_attrs, used_vals, outer_rect))
            if not conditionaldict[attr_vals]:
                return

            if self.variable_color is None:
                # expected count: total times the product of the marginal
                # shares of the values that define this cell
                s = sum(apriori_dists[0])
                expected = s * reduce(
                    mul, (apriori_dists[i][used_vals[i]] / float(s)
                          for i in range(len(used_vals))))
                actual = conditionaldict[attr_vals]
                pearson = float((actual - expected) / sqrt(expected))
                if pearson == 0:
                    ind = 0
                else:
                    # colour index: int(log2(|residual|)) clamped to 0..3
                    ind = max(0, min(int(log(abs(pearson), 2)), 3))
                # blue shades for positive residuals, red otherwise
                color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
                rect(x0, y0, x1 - x0, y1 - y0, -20, color)
                outer_rect.setToolTip(
                    condition + "<hr/>" + "Expected instances: %.1f<br>"
                    "Actual instances: %d<br>"
                    "Standardized (Pearson) residual: %.1f" %
                    (expected, conditionaldict[attr_vals], pearson))
            else:
                cls_values = get_variable_values_sorted(class_var)
                prior = get_distribution(data, class_var.name)
                # stack one strip per class; heights are proportional to the
                # class counts in the cell, the last class takes what is left
                total = 0
                for i, value in enumerate(cls_values):
                    val = conditionaldict[attr_vals + "-" + value]
                    if val == 0:
                        continue
                    if i == len(cls_values) - 1:
                        v = y1 - y0 - total
                    else:
                        v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                    rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                    total += v

                # bar along the left edge showing the prior class
                # distribution, only when the cell is larger than the bar
                if self.use_boxes and \
                        abs(x1 - x0) > bar_width and abs(y1 - y0) > bar_width:
                    total = 0
                    line(x0 + bar_width, y0, x0 + bar_width, y1)
                    n = sum(prior)
                    for i, (val, color) in enumerate(zip(prior, colors)):
                        if i == len(prior) - 1:
                            h = y1 - y0 - total
                        else:
                            h = (y1 - y0) * val / n
                        rect(x0, y0 + total, bar_width, h, 20, color)
                        total += h

                # bar along the right edge showing the class distribution of
                # the subset instances that fall into this cell
                if conditionalsubsetdict:
                    if conditionalsubsetdict[attr_vals]:
                        if self.subset_indices is not None:
                            line(x1 - bar_width, y0, x1 - bar_width, y1)
                            total = 0
                            n = conditionalsubsetdict[attr_vals]
                            if n:
                                for i, (cls, color) in \
                                        enumerate(zip(cls_values, colors)):
                                    val = conditionalsubsetdict[attr_vals +
                                                                "-" + cls]
                                    if val == 0:
                                        continue
                                    if i == len(prior) - 1:
                                        v = y1 - y0 - total
                                    else:
                                        v = ((y1 - y0) * val) / n
                                    rect(x1 - bar_width, y0 + total, bar_width,
                                         v, 15, color)
                                    total += v

                actual = [
                    conditionaldict[attr_vals + "-" + cls_values[i]]
                    for i in range(len(prior))
                ]
                n_actual = sum(actual)
                if n_actual > 0:
                    apriori = [prior[key] for key in cls_values]
                    n_apriori = sum(apriori)
                    text = "<br/>".join(
                        "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                        (cls, act, 100.0 * act / n_actual,
                         apr / n_apriori * n_actual, 100.0 * apr / n_apriori)
                        for cls, act, apr in zip(cls_values, actual, apriori))
                else:
                    text = ""
                # join() leaves no trailing separator, so the text is used
                # whole; slicing it would cut the end of the last class line
                outer_rect.setToolTip("{}<hr>Instances: {}<br><br>{}".format(
                    condition, n_actual, text))

        def create_legend():
            """Build the legend items and return them wrapped as one item.

            With ``self.variable_color`` unset the legend lists the Pearson
            residual ranges with black-edged swatches; otherwise it lists
            the class values, each swatch edged in its own colour.  Every
            entry is a group holding a colour square and its label; the
            entries are laid out by ``wrap_legend_items``.
            """
            swatch = 8

            def make_entry(label, fill, border):
                entry = QGraphicsItemGroup()
                entry.addToGroup(
                    CanvasRectangle(None, -swatch / 2, -swatch / 2, swatch,
                                    swatch, border, fill))
                entry.addToGroup(
                    CanvasText(None, label, swatch, 0, Qt.AlignVCenter))
                return entry

            if self.variable_color is None:
                labels = [
                    "<-8", "-8:-4", "-4:-2", "-2:2", "2:4", "4:8", ">8",
                    "Residuals:"
                ]
                fills = self.RED_COLORS[::-1] + self.BLUE_COLORS[1:]
                borders = repeat(Qt.black)
            else:
                labels = get_variable_values_sorted(class_var)
                fills = borders = [QColor(*col) for col in class_var.colors]

            entries = [
                make_entry(label, fill, border)
                for label, fill, border in zip(labels, fills, borders)
            ]
            return wrap_legend_items(entries,
                                     hspacing=20,
                                     vspacing=16 + swatch,
                                     max_width=self.canvas_view.width() - xoff)

        # --- main body: everything above only defines helpers ---
        # start from an empty canvas and an empty list of selectable areas
        self.canvas.clear()
        self.areas = []

        data = self.discrete_data
        if data is None:
            return
        attr_list = self.get_disc_attr_list()
        class_var = data.domain.class_var
        # TODO: check this
        # data = Preprocessor_dropMissing(data)

        # names of the distinct variables involved (a missing class
        # variable is filtered out by the truthiness test)
        unique = [v.name for v in set(attr_list + [class_var]) if v]
        if len(data[:, unique]) == 0:
            self.Warning.no_valid_data()
            return
        else:
            self.Warning.no_valid_data.clear()

        # an attribute without values cannot be drawn: show a message for
        # the first such attribute and stop
        attrs = [attr for attr in attr_list if not attr.values]
        if attrs:
            CanvasText(self.canvas,
                       "Feature {} has no values".format(attrs[0]),
                       (self.canvas_view.width() - 120) / 2,
                       self.canvas_view.height() / 2)
            return
        # per-attribute distributions are computed only when no colouring
        # variable is set; add_rect reads them in its Pearson branch
        if self.variable_color is None:
            apriori_dists = [
                get_distribution(data, attr) for attr in attr_list
            ]
        else:
            apriori_dists = []

        def get_max_label_width(attr):
            """Return the largest integer label width among attr's values."""
            values = get_variable_values_sorted(attr)
            maxw = 0
            for val in values:
                # hidden text item, created only to measure its width
                t = CanvasText(self.canvas, val, 0, 0, bold=0, show=False)
                maxw = max(int(t.boundingRect().width()), maxw)
            return maxw

        # x offset of the mosaic square; grows to make room for the labels
        # of the second attribute
        xoff = 20

        # get the maximum width of rectangle
        width = 20
        max_ylabel_w1 = max_ylabel_w2 = 0
        if len(attr_list) > 1:
            text = CanvasText(self.canvas, attr_list[1].name, bold=1, show=0)
            # label widths are capped at 150
            max_ylabel_w1 = min(get_max_label_width(attr_list[1]), 150)
            width = 5 + text.boundingRect().height() + \
                self.ATTR_VAL_OFFSET + max_ylabel_w1
            xoff = width
            if len(attr_list) == 4:
                # the fourth attribute needs label space as well
                text = CanvasText(self.canvas,
                                  attr_list[3].name,
                                  bold=1,
                                  show=0)
                max_ylabel_w2 = min(get_max_label_width(attr_list[3]), 150)
                width += text.boundingRect().height() + \
                    self.ATTR_VAL_OFFSET + max_ylabel_w2 - 10

        # the legend is built before sizing the square because its height
        # is subtracted from the available vertical space below
        legend = create_legend()

        # get the maximum height of rectangle
        yoff = 45
        legendoff = yoff + self.ATTR_NAME_OFFSET + self.ATTR_VAL_OFFSET + 35
        square_size = min(
            self.canvas_view.width() - width - 20,
            self.canvas_view.height() - legendoff -
            legend.boundingRect().height())

        if square_size < 0:
            return  # canvas is too small to draw rectangles
        self.canvas_view.setSceneRect(0, 0, self.canvas_view.width(),
                                      self.canvas_view.height())

        # bookkeeping shared with draw_text: sides already labelled and
        # positions remembered for sides whose first cell was empty
        drawn_sides = set()
        draw_positions = {}

        conditionaldict, distributiondict = \
            get_conditional_distribution(data, attr_list)
        # the same counts for the subset rows only, when a subset is set
        conditionalsubsetdict = None
        if self.subset_indices:
            conditionalsubsetdict, _ = get_conditional_distribution(
                self.discrete_data[self.subset_indices], attr_list)

        # draw rectangles
        draw_data(attr_list, (xoff, xoff + square_size),
                  (yoff, yoff + square_size), 0, "", len(attr_list), [], [])

        # place the legend legendoff below the square, centred under it
        # when it is narrower than the square
        self.canvas.addItem(legend)
        legend.setPos(
            xoff - legend.boundingRect().x() +
            max(0, (square_size - legend.boundingRect().width()) / 2),
            legendoff + square_size)
        self.update_selection_rects()
Exemplo n.º 51
0
        def add_rect(x0, x1, y0, y1, condition="",
                     used_attrs=None, used_vals=None, attr_vals=""):
            """Draw one mosaic cell and register it in ``self.areas``.

            An outer rectangle covering the cell is always created and
            stored, together with ``used_attrs`` and ``used_vals``, in
            ``self.areas``.  If the cell contains instances, its interior is
            coloured either by the Pearson residual or by the class
            distribution, and a tooltip is attached to the outer rectangle.

            :param x0, x1, y0, y1: cell coordinates.
            :param condition: text placed at the start of the tooltip.
            :param used_attrs: attributes that define the cell; None is
                treated as an empty list.
            :param used_vals: their values for this cell; None is treated
                as an empty list.
            :param attr_vals: ``"-"``-joined values, the key into
                ``conditionaldict``.
            """
            # fresh lists per call: these objects are stored in self.areas,
            # so a shared mutable default would be aliased by every area
            if used_attrs is None:
                used_attrs = []
            if used_vals is None:
                used_vals = []

            area_index = len(self.areas)
            # give zero-width / zero-height cells an extent of 1
            if x0 == x1:
                x1 += 1
            if y0 == y1:
                y1 += 1

            # rectangles of width and height 1 are not shown - increase
            if x1 - x0 + y1 - y0 == 2:
                y1 += 1

            if class_var and class_var.is_discrete:
                colors = [QColor(*col) for col in class_var.colors]
            else:
                colors = None

            def select_area(_, ev):
                self.select_area(area_index, ev)

            def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
                # without a pen colour the rectangle uses CanvasRectangle's
                # default colours; the brush defaults to the pen colour
                if pen_color is None:
                    return CanvasRectangle(
                            self.canvas, x, y, w, h, z=z, onclick=select_area,
                            **args)
                if brush_color is None:
                    brush_color = pen_color
                return CanvasRectangle(
                        self.canvas, x, y, w, h, pen_color, brush_color, z=z,
                        onclick=select_area, **args)

            def line(x1, y1, x2, y2):
                r = QGraphicsLineItem(x1, y1, x2, y2, None)
                self.canvas.addItem(r)
                r.setPen(QPen(Qt.white, 2))
                r.setZValue(30)

            outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
            self.areas.append((used_attrs, used_vals, outer_rect))
            if not conditionaldict[attr_vals]:
                return

            if self.interior_coloring == self.PEARSON:
                # expected count: total times the product of the marginal
                # shares of the values that define this cell
                s = sum(apriori_dists[0])
                expected = s * reduce(
                        mul,
                        (apriori_dists[i][used_vals[i]] / float(s)
                         for i in range(len(used_vals))))
                actual = conditionaldict[attr_vals]
                pearson = (actual - expected) / sqrt(expected)
                if pearson == 0:
                    ind = 0
                else:
                    # colour index: int(log2(|residual|)) clamped to 0..3
                    ind = max(0, min(int(log(abs(pearson), 2)), 3))
                # blue shades for positive residuals, red otherwise
                color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
                rect(x0, y0, x1 - x0, y1 - y0, -20, color)
                outer_rect.setToolTip(
                        condition + "<hr/>" +
                        "Expected instances: %.1f<br>"
                        "Actual instances: %d<br>"
                        "Standardized (Pearson) residual: %.1f" %
                        (expected, conditionaldict[attr_vals], pearson))
            else:
                cls_values = get_variable_values_sorted(class_var)
                prior = get_distribution(data, class_var.name)
                # stack one strip per class; heights are proportional to the
                # class counts in the cell, the last class takes what is left
                total = 0
                for i, value in enumerate(cls_values):
                    val = conditionaldict[attr_vals + "-" + value]
                    if val == 0:
                        continue
                    if i == len(cls_values) - 1:
                        v = y1 - y0 - total
                    else:
                        v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                    rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                    total += v

                # bar along the left edge showing the prior class
                # distribution, only when the cell is larger than the bar
                if self.use_boxes and \
                        abs(x1 - x0) > bar_width and \
                        abs(y1 - y0) > bar_width:
                    total = 0
                    line(x0 + bar_width, y0, x0 + bar_width, y1)
                    n = sum(prior)
                    for i, (val, color) in enumerate(zip(prior, colors)):
                        if i == len(prior) - 1:
                            h = y1 - y0 - total
                        else:
                            h = (y1 - y0) * val / n
                        rect(x0, y0 + total, bar_width, h, 20, color)
                        total += h

                if conditionalsubsetdict:
                    if conditionalsubsetdict[attr_vals]:
                        counts = [conditionalsubsetdict[attr_vals + "-" + val]
                                  for val in cls_values]
                        # exactly one subset instance in this cell: draw a
                        # dashed frame in that instance's class colour
                        if sum(counts) == 1:
                            rect(x0 - 2, y0 - 2, x1 - x0 + 5, y1 - y0 + 5, -550,
                                 colors[counts.index(1)], Qt.white,
                                 penWidth=2, penStyle=Qt.DashLine)
                        # bar along the right edge showing the class
                        # distribution of the subset instances in this cell
                        if self.subset_data is not None:
                            line(x1 - bar_width, y0, x1 - bar_width, y1)
                            total = 0
                            n = conditionalsubsetdict[attr_vals]
                            if n:
                                for i, (cls, color) in \
                                        enumerate(zip(cls_values, colors)):
                                    val = conditionalsubsetdict[
                                        attr_vals + "-" + cls]
                                    if val == 0:
                                        continue
                                    if i == len(prior) - 1:
                                        v = y1 - y0 - total
                                    else:
                                        v = ((y1 - y0) * val) / n
                                    rect(x1 - bar_width, y0 + total,
                                         bar_width, v, 15, color)
                                    total += v

                actual = [conditionaldict[attr_vals + "-" + cls_values[i]]
                          for i in range(len(prior))]
                n_actual = sum(actual)
                if n_actual > 0:
                    apriori = [prior[key] for key in cls_values]
                    n_apriori = sum(apriori)
                    text = "<br/>".join(
                            "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                            (cls, act, 100.0 * act / n_actual,
                             apr / n_apriori * n_actual, 100.0 * apr / n_apriori
                             )
                            for cls, act, apr in zip(cls_values, actual, apriori
                                                     ))
                else:
                    text = ""
                # join() leaves no trailing separator, so the text is used
                # whole; slicing it would cut the end of the last class line
                outer_rect.setToolTip(
                        "{}<hr>Instances: {}<br><br>{}".format(
                                condition, n_actual, text))
Exemplo n.º 52
0
 def fit_storage(self, data):
     """Return a ``MeanModel`` built from the class distribution of ``data``."""
     target = data.domain.class_var
     return MeanModel(distribution.get_distribution(data, target))
Exemplo n.º 53
0
 def fit_storage(self, data):
     """Return a ``MeanPredictor`` for the class variable of ``data``.

     The predictor receives the class-variable distribution and a domain
     constructed from an empty tuple and the class variable alone.
     """
     target = data.domain.class_var
     class_dist = distribution.get_distribution(data, target)
     return MeanPredictor(Orange.data.Domain((), (target,)), class_dist)
Exemplo n.º 54
0
def column_imputer_modus(variable, table):
    """Create an imputer for `variable` that uses the distribution's modus.

    The distribution of `variable` over `table` is computed and its
    ``modus()`` (presumably the most frequent value -- confirm against the
    distribution class) is handed to ``column_imputer_defaults`` as the
    default value.

    Returns:
        Whatever ``column_imputer_defaults`` returns. The result was
        previously computed and then dropped, so callers always got ``None``.
    """
    stat = distribution.get_distribution(table, variable)
    return column_imputer_defaults(variable, table, stat.modus())
Exemplo n.º 55
0
 def fit_storage(self, data):
     """Return a ``MeanPredictor`` fitted on the class variable of `data`.

     The class-variable distribution is computed first; the predictor is then
     created with a domain built from an empty tuple and a tuple containing
     only that class variable.
     """
     target_dist = distribution.get_distribution(
         data, data.domain.class_var)
     target_domain = Orange.data.Domain((), (data.domain.class_var,))
     return MeanPredictor(target_domain, target_dist)
Exemplo n.º 56
0
    def update_graph(self):
        """Redraw the whole mosaic display on ``self.canvas``.

        The canvas and ``self.areas`` are reset first. The method then returns
        early (leaving the canvas empty) when ``self.discrete_data`` is
        ``None``, when the projected data has no rows (in which case
        ``Warning.no_valid_data`` is raised), or when the view is too small
        to hold the drawing square. Otherwise it computes the conditional
        distributions for the selected attributes, draws the nested
        rectangles, the axis labels and the legend, and finally calls
        ``self.update_selection_rects()``.

        The nested helpers are closures: they read ``data``, ``class_var``,
        ``conditionaldict``, ``distributiondict``, ``conditionalsubsetdict``,
        ``apriori_dists``, ``drawn_sides``, ``draw_positions`` and the
        ``max_ylabel_w*`` widths, all of which are assigned further down in
        this method before the helpers are first called.
        """
        spacing = self.SPACING
        bar_width = self.BAR_WIDTH

        def draw_data(attr_list, x0_x1, y0_y1, side, condition,
                      total_attrs, used_attrs=None, used_vals=None,
                      attr_vals=""):
            """Recursively split a region by the values of ``attr_list[0]``.

            ``x0_x1`` / ``y0_y1`` are the region's coordinate pairs, ``side``
            is the recursion depth (even: split along x, odd: split along y),
            ``condition`` is the tooltip prefix accumulated so far, and
            ``used_attrs`` / ``used_vals`` / ``attr_vals`` describe the
            attribute values already fixed on the path to this region
            (``attr_vals`` is the "-"-joined key into ``conditionaldict``).
            """
            # Fresh lists per call instead of shared mutable defaults.
            used_attrs = [] if used_attrs is None else used_attrs
            used_vals = [] if used_vals is None else used_vals
            x0, x1 = x0_x1
            y0, y1 = y0_y1
            if conditionaldict[attr_vals] == 0:
                add_rect(x0, x1, y0, y1, "",
                         used_attrs, used_vals, attr_vals=attr_vals)
                # store coordinates for later drawing of labels
                draw_text(side, attr_list[0], (x0, x1), (y0, y1), total_attrs,
                          used_attrs, used_vals, attr_vals)
                return

            attr = attr_list[0]
            # how much smaller rectangles do we draw
            edge = len(attr_list) * spacing
            values = get_variable_values_sorted(data.domain[attr])
            if side % 2:
                values = values[::-1]  # reverse names if necessary

            # With a single value there are no gaps to distribute; the
            # divisor is clamped to 1 so a zero-extent region cannot raise
            # ZeroDivisionError (edge is multiplied by index 0 in that case).
            gaps = max(1, len(values) - 1)
            if side % 2 == 0:  # we are drawing on the x axis
                # remove the space needed for separating different attr. values
                whole = max(0, (x1 - x0) - edge * (
                    len(values) - 1))
                if whole == 0:
                    edge = (x1 - x0) / float(gaps)
            else:  # we are drawing on the y axis
                whole = max(0, (y1 - y0) - edge * (len(values) - 1))
                if whole == 0:
                    edge = (y1 - y0) / float(gaps)

            if attr_vals == "":
                counts = [conditionaldict[val] for val in values]
            else:
                counts = [conditionaldict[attr_vals + "-" + val]
                          for val in values]
            total = sum(counts)

            # if we are visualizing the third attribute and the first attribute
            # has the last value, we have to reverse the order in which the
            # boxes will be drawn otherwise, if the last cell, nearest to the
            # labels of the fourth attribute, is empty, we wouldn't be able to
            # position the labels
            valrange = list(range(len(values)))
            if len(attr_list + used_attrs) == 4 and len(used_attrs) == 2:
                attr1values = get_variable_values_sorted(
                        data.domain[used_attrs[0]])
                if used_vals[0] == attr1values[-1]:
                    valrange = valrange[::-1]

            for i in valrange:
                # Each value gets a slice proportional to its count, shifted
                # by the gaps that precede it.
                start = i * edge + whole * float(sum(counts[:i]) / total)
                end = i * edge + whole * float(sum(counts[:i + 1]) / total)
                val = values[i]
                htmlval = to_html(val)
                if attr_vals != "":
                    newattrvals = attr_vals + "-" + val
                else:
                    newattrvals = val

                tooltip = condition + 4 * "&nbsp;" + attr + \
                    ": <b>" + htmlval + "</b><br>"
                attrs = used_attrs + [attr]
                vals = used_vals + [val]
                common_args = attrs, vals, newattrvals
                if side % 2 == 0:  # if we are moving horizontally
                    if len(attr_list) == 1:
                        add_rect(x0 + start, x0 + end, y0, y1,
                                 tooltip, *common_args)
                    else:
                        draw_data(attr_list[1:], (x0 + start, x0 + end),
                                  (y0, y1), side + 1,
                                  tooltip, total_attrs, *common_args)
                else:
                    if len(attr_list) == 1:
                        add_rect(x0, x1, y0 + start, y0 + end,
                                 tooltip, *common_args)
                    else:
                        draw_data(attr_list[1:], (x0, x1),
                                  (y0 + start, y0 + end), side + 1,
                                  tooltip, total_attrs, *common_args)

            draw_text(side, attr_list[0], (x0, x1), (y0, y1),
                      total_attrs, used_attrs, used_vals, attr_vals)

        def draw_text(side, attr, x0_x1, y0_y1,
                      total_attrs, used_attrs, used_vals, attr_vals):
            """Draw the value labels and the attribute name for one side.

            Each side is labelled at most once (tracked in ``drawn_sides``).
            When the region is empty, its coordinates are remembered in
            ``draw_positions`` and reused by the first non-empty call for
            the same side.
            """
            x0, x1 = x0_x1
            y0, y1 = y0_y1
            if side in drawn_sides:
                return

            # the text on the right will be drawn when we are processing
            # visualization of the last value of the first attribute
            if side == 3:
                attr1values = \
                    get_variable_values_sorted(data.domain[used_attrs[0]])
                if used_vals[0] != attr1values[-1]:
                    return

            if not conditionaldict[attr_vals]:
                if side not in draw_positions:
                    draw_positions[side] = (x0, x1, y0, y1)
                return
            else:
                if side in draw_positions:
                    # restore the positions of attribute values and name
                    (x0, x1, y0, y1) = draw_positions[side]

            drawn_sides.add(side)

            values = get_variable_values_sorted(data.domain[attr])
            if side % 2:
                values = values[::-1]

            # Only the axis this side runs along loses the inter-value gaps.
            spaces = spacing * (total_attrs - side) * (len(values) - 1)
            width = x1 - x0 - spaces * (side % 2 == 0)
            height = y1 - y0 - spaces * (side % 2 == 1)

            # calculate position of first attribute
            currpos = 0

            if attr_vals == "":
                counts = [conditionaldict.get(val, 1) for val in values]
            else:
                counts = [conditionaldict.get(attr_vals + "-" + val, 1)
                          for val in values]
            total = sum(counts)
            if total == 0:
                # No instances at all: space the labels evenly.
                counts = [1] * len(values)
                total = sum(counts)

            aligns = [Qt.AlignTop | Qt.AlignHCenter,
                      Qt.AlignRight | Qt.AlignVCenter,
                      Qt.AlignBottom | Qt.AlignHCenter,
                      Qt.AlignLeft | Qt.AlignVCenter]
            align = aligns[side]
            for i in range(len(values)):
                val = values[i]
                perc = counts[i] / float(total)
                # Values with a zero entry in distributiondict get no label,
                # but still advance the position below.
                if distributiondict[val] != 0:
                    if side == 0:
                        CanvasText(self.canvas, str(val),
                                   x0 + currpos + width * 0.5 * perc,
                                   y1 + self.ATTR_VAL_OFFSET, align)
                    elif side == 1:
                        CanvasText(self.canvas, str(val),
                                   x0 - self.ATTR_VAL_OFFSET,
                                   y0 + currpos + height * 0.5 * perc, align)
                    elif side == 2:
                        CanvasText(self.canvas, str(val),
                                   x0 + currpos + width * perc * 0.5,
                                   y0 - self.ATTR_VAL_OFFSET, align)
                    else:
                        CanvasText(self.canvas, str(val),
                                   x1 + self.ATTR_VAL_OFFSET,
                                   y0 + currpos + height * 0.5 * perc, align)

                if side % 2 == 0:
                    currpos += perc * width + spacing * (total_attrs - side)
                else:
                    currpos += perc * height + spacing * (total_attrs - side)

            # Attribute name, centered along the side and pushed outwards
            # past the value labels.
            if side == 0:
                CanvasText(
                        self.canvas, attr,
                        x0 + (x1 - x0) / 2,
                        y1 + self.ATTR_VAL_OFFSET +
                        self.ATTR_NAME_OFFSET,
                        align, bold=1)
            elif side == 1:
                CanvasText(
                        self.canvas, attr,
                        x0 - max_ylabel_w1 - self.ATTR_VAL_OFFSET,
                        y0 + (y1 - y0) / 2,
                        align, bold=1, vertical=True)
            elif side == 2:
                CanvasText(
                        self.canvas, attr,
                        x0 + (x1 - x0) / 2,
                        y0 - self.ATTR_VAL_OFFSET -
                        self.ATTR_NAME_OFFSET,
                        align, bold=1)
            else:
                CanvasText(
                        self.canvas, attr,
                        x1 + max_ylabel_w2 + self.ATTR_VAL_OFFSET,
                        y0 + (y1 - y0) / 2,
                        align, bold=1, vertical=True)

        def add_rect(x0, x1, y0, y1, condition="",
                     used_attrs=None, used_vals=None, attr_vals=""):
            """Draw one mosaic cell and register it in ``self.areas``.

            The clickable outer rectangle is always created. Its interior
            (Pearson residual color, or class-distribution bars plus the
            optional prior and subset bars) and its tooltip are added only
            when ``conditionaldict[attr_vals]`` is non-zero.
            """
            # Fresh lists per call: these are stored in self.areas, so a
            # shared default list would be aliased across cells.
            used_attrs = [] if used_attrs is None else used_attrs
            used_vals = [] if used_vals is None else used_vals
            area_index = len(self.areas)
            if x0 == x1:
                x1 += 1
            if y0 == y1:
                y1 += 1

            # rectangles of width and height 1 are not shown - increase
            if x1 - x0 + y1 - y0 == 2:
                y1 += 1

            if class_var and class_var.is_discrete:
                colors = [QColor(*col) for col in class_var.colors]
            else:
                colors = None

            def select_area(_, ev):
                """Forward a click on this cell to ``self.select_area``."""
                self.select_area(area_index, ev)

            def rect(x, y, w, h, z, pen_color=None, brush_color=None, **args):
                """Create a clickable ``CanvasRectangle`` at z-value `z`.

                Without `pen_color` the rectangle is created without explicit
                colors; without `brush_color` the pen color is used for the
                brush as well.
                """
                if pen_color is None:
                    return CanvasRectangle(
                            self.canvas, x, y, w, h, z=z, onclick=select_area,
                            **args)
                if brush_color is None:
                    brush_color = pen_color
                return CanvasRectangle(
                        self.canvas, x, y, w, h, pen_color, brush_color, z=z,
                        onclick=select_area, **args)

            def line(x1, y1, x2, y2):
                """Add a white, 2-wide separator line to the canvas."""
                r = QGraphicsLineItem(x1, y1, x2, y2, None)
                self.canvas.addItem(r)
                r.setPen(QPen(Qt.white, 2))
                r.setZValue(30)

            outer_rect = rect(x0, y0, x1 - x0, y1 - y0, 30)
            self.areas.append((used_attrs, used_vals, outer_rect))
            if not conditionaldict[attr_vals]:
                return

            if self.interior_coloring == self.PEARSON:
                # Expected count assuming the attributes are independent:
                # the total times the product of the marginal proportions.
                s = sum(apriori_dists[0])
                expected = s * reduce(
                        mul,
                        (apriori_dists[i][used_vals[i]] / float(s)
                         for i in range(len(used_vals))))
                actual = conditionaldict[attr_vals]
                pearson = (actual - expected) / sqrt(expected)
                if pearson == 0:
                    ind = 0
                else:
                    # int(log2(|residual|)) clamped to the range 0..3
                    ind = max(0, min(int(log(abs(pearson), 2)), 3))
                color = [self.RED_COLORS, self.BLUE_COLORS][pearson > 0][ind]
                rect(x0, y0, x1 - x0, y1 - y0, -20, color)
                outer_rect.setToolTip(
                        condition + "<hr/>" +
                        "Expected instances: %.1f<br>"
                        "Actual instances: %d<br>"
                        "Standardized (Pearson) residual: %.1f" %
                        (expected, conditionaldict[attr_vals], pearson))
            else:
                cls_values = get_variable_values_sorted(class_var)
                prior = get_distribution(data, class_var.name)
                total = 0
                # Stack one bar per class value; the last class takes the
                # remaining height so the bars fill the cell exactly.
                for i, value in enumerate(cls_values):
                    val = conditionaldict[attr_vals + "-" + value]
                    if val == 0:
                        continue
                    if i == len(cls_values) - 1:
                        v = y1 - y0 - total
                    else:
                        v = ((y1 - y0) * val) / conditionaldict[attr_vals]
                    rect(x0, y0 + total, x1 - x0, v, -20, colors[i])
                    total += v

                if self.use_boxes and \
                        abs(x1 - x0) > bar_width and \
                        abs(y1 - y0) > bar_width:
                    # Narrow bar on the left edge showing the prior class
                    # distribution.
                    total = 0
                    line(x0 + bar_width, y0, x0 + bar_width, y1)
                    n = sum(prior)
                    for i, (val, color) in enumerate(zip(prior, colors)):
                        if i == len(prior) - 1:
                            h = y1 - y0 - total
                        else:
                            h = (y1 - y0) * val / n
                        rect(x0, y0 + total, bar_width, h, 20, color)
                        total += h

                if conditionalsubsetdict:
                    if conditionalsubsetdict[attr_vals]:
                        counts = [conditionalsubsetdict[attr_vals + "-" + val]
                                  for val in cls_values]
                        if sum(counts) == 1:
                            # Exactly one subset instance falls in this
                            # cell: outline it with a dashed frame.
                            rect(x0 - 2, y0 - 2, x1 - x0 + 5, y1 - y0 + 5, -550,
                                 colors[counts.index(1)], Qt.white,
                                 penWidth=2, penStyle=Qt.DashLine)
                        if self.subset_data is not None:
                            # Narrow bar on the right edge showing the
                            # class distribution within the subset.
                            line(x1 - bar_width, y0, x1 - bar_width, y1)
                            total = 0
                            n = conditionalsubsetdict[attr_vals]
                            if n:
                                for i, (cls, color) in \
                                        enumerate(zip(cls_values, colors)):
                                    val = conditionalsubsetdict[
                                        attr_vals + "-" + cls]
                                    if val == 0:
                                        continue
                                    if i == len(prior) - 1:
                                        v = y1 - y0 - total
                                    else:
                                        v = ((y1 - y0) * val) / n
                                    rect(x1 - bar_width, y0 + total,
                                         bar_width, v, 15, color)
                                    total += v

                actual = [conditionaldict[attr_vals + "-" + cls_values[i]]
                          for i in range(len(prior))]
                n_actual = sum(actual)
                if n_actual > 0:
                    apriori = [prior[key] for key in cls_values]
                    n_apriori = sum(apriori)
                    text = "<br/>".join(
                            "<b>%s</b>: %d / %.1f%% (Expected %.1f / %.1f%%)" %
                            (cls, act, 100.0 * act / n_actual,
                             apr / n_apriori * n_actual, 100.0 * apr / n_apriori
                             )
                            for cls, act, apr in zip(cls_values, actual, apriori
                                                     ))
                else:
                    text = ""
                # str.join leaves no trailing separator, so the text is used
                # as is; slicing off four characters here used to truncate
                # the last class line of the tooltip.
                outer_rect.setToolTip(
                        "{}<hr>Instances: {}<br><br>{}".format(
                                condition, n_actual, text))

        def draw_legend(x0_x1, y0_y1):
            """Draw the color legend centered under the given region.

            Shows the residual ranges when coloring by Pearson residuals,
            otherwise the class values. The last element of ``names`` is the
            legend title and is positioned first.
            """
            x0, x1 = x0_x1
            y0, y1 = y0_y1
            if self.interior_coloring == self.PEARSON:
                names = ["<-8", "-8:-4", "-4:-2", "-2:2", "2:4", "4:8", ">8",
                         "Residuals:"]
                colors = self.RED_COLORS[::-1] + self.BLUE_COLORS[1:]
            else:
                names = get_variable_values_sorted(class_var) + \
                        [class_var.name + ":"]
                colors = [QColor(*col) for col in class_var.colors]

            names = [CanvasText(self.canvas, name, alignment=Qt.AlignVCenter)
                     for name in names]
            totalwidth = sum(text.boundingRect().width() for text in names)

            # compute the x position of the center of the legend
            y = y1 + self.ATTR_NAME_OFFSET + self.ATTR_VAL_OFFSET + 35
            distance = 30
            startx = (x0 + x1) / 2 - (totalwidth + (len(names)) * distance) / 2

            names[-1].setPos(startx + 15, y)
            names[-1].show()
            xoffset = names[-1].boundingRect().width() + distance

            size = 8

            for i in range(len(names) - 1):
                if self.interior_coloring == self.PEARSON:
                    edgecolor = Qt.black
                else:
                    edgecolor = colors[i]

                CanvasRectangle(self.canvas, startx + xoffset, y - size / 2,
                                size, size, edgecolor, colors[i])
                names[i].setPos(startx + xoffset + 10, y)
                xoffset += distance + names[i].boundingRect().width()

        self.canvas.clear()
        self.areas = []

        data = self.discrete_data
        if data is None:
            return
        subset = self.subset_data
        attr_list = self.get_attr_list()
        class_var = data.domain.class_var
        if class_var:
            sql = type(data) == SqlTable
            name = not sql and data.name
            # save class_var because it is removed in the next line
            data = data[:, attr_list + [class_var]]
            data.domain.class_var = class_var
            if not sql:
                data.name = name
        else:
            data = data[:, attr_list]
        # TODO: check this
        # data = Preprocessor_dropMissing(data)
        if len(data) == 0:
            self.Warning.no_valid_data()
            return
        else:
            self.Warning.no_valid_data.clear()

        # Marginal distributions are needed only for the Pearson residuals.
        if self.interior_coloring == self.PEARSON:
            apriori_dists = [get_distribution(data, attr) for attr in attr_list]
        else:
            apriori_dists = []

        def get_max_label_width(attr):
            """Return the widest rendered value label of `attr` as an int."""
            values = get_variable_values_sorted(data.domain[attr])
            maxw = 0
            for val in values:
                t = CanvasText(self.canvas, val, 0, 0, bold=0, show=False)
                maxw = max(int(t.boundingRect().width()), maxw)
            return maxw

        # get the maximum width of rectangle
        xoff = 20
        width = 20
        if len(attr_list) > 1:
            text = CanvasText(self.canvas, attr_list[1], bold=1, show=0)
            max_ylabel_w1 = min(get_max_label_width(attr_list[1]), 150)
            width = 5 + text.boundingRect().height() + \
                self.ATTR_VAL_OFFSET + max_ylabel_w1
            xoff = width
            if len(attr_list) == 4:
                text = CanvasText(self.canvas, attr_list[3], bold=1, show=0)
                max_ylabel_w2 = min(get_max_label_width(attr_list[3]), 150)
                width += text.boundingRect().height() + \
                    self.ATTR_VAL_OFFSET + max_ylabel_w2 - 10

        # get the maximum height of rectangle
        height = 100
        yoff = 45
        square_size = min(self.canvas_view.width() - width - 20,
                          self.canvas_view.height() - height - 20)

        if square_size < 0:
            return  # canvas is too small to draw rectangles
        self.canvas_view.setSceneRect(
                0, 0, self.canvas_view.width(), self.canvas_view.height())

        # Shared state for draw_text: sides already labelled, and remembered
        # coordinates of empty regions.
        drawn_sides = set()
        draw_positions = {}

        conditionaldict, distributiondict = \
            get_conditional_distribution(data, attr_list)
        conditionalsubsetdict = None
        if subset:
            conditionalsubsetdict, _ = \
                get_conditional_distribution(subset, attr_list)

        # draw rectangles
        draw_data(
            attr_list, (xoff, xoff + square_size), (yoff, yoff + square_size),
            0, "", len(attr_list))
        draw_legend((xoff, xoff + square_size), (yoff, yoff + square_size))
        self.update_selection_rects()