def test_get_contingencies(self):
    """Check contingencies computed from sparse data: all, continuous-only,
    and discrete-only (via the skipDiscrete / skipContinuous flags)."""
    def check_discrete(cont):
        # Expected rows for the sampled discrete contingency (index 5).
        self.assertIsInstance(cont, contingency.Discrete)
        np.testing.assert_almost_equal(cont[0], [1, 0, 0])
        np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
        np.testing.assert_almost_equal(cont[2], [1, 0, 0])

    def check_continuous(cont):
        # Expected (values, counts) pairs for the sampled continuous contingency.
        self.assertIsInstance(cont, contingency.Continuous)
        np.testing.assert_almost_equal(cont[0], [[], []])
        np.testing.assert_almost_equal(cont["b"], [[1], [1]])
        np.testing.assert_almost_equal(cont[2], [[2], [1]])

    data = self._construct_sparse()

    # All 20 attributes: discrete at index 5, continuous at index 14.
    conts = contingency.get_contingencies(data)
    self.assertEqual(len(conts), 20)
    check_discrete(conts[5])
    check_continuous(conts[14])

    # Discrete attributes skipped: 10 remain, same continuous one at index 4.
    conts = contingency.get_contingencies(data, skipDiscrete=True)
    self.assertEqual(len(conts), 10)
    check_continuous(conts[4])

    # Continuous attributes skipped: 10 remain, same discrete one at index 5.
    conts = contingency.get_contingencies(data, skipContinuous=True)
    self.assertEqual(len(conts), 10)
    check_discrete(conts[5])
def test_get_contingencies(self):
    """Check contingencies computed from sparse data: all, continuous-only,
    and discrete-only (via the skip_discrete / skip_continuous flags)."""
    def assert_rows(cont, expected):
        # Compare selected contingency rows against their expected values.
        for key, row in expected:
            assert_dist_equal(cont[key], row)

    disc_rows = [(0, [2, 0, 0]), ("b", [0, 1, 1]), (2, [1, 0, 0])]
    cont_rows = [(0, [[], []]), ("b", [[1], [1]]), (2, [[2], [1]])]

    data = self._construct_sparse()

    # All 20 attributes: discrete at index 5, continuous at index 14.
    conts = contingency.get_contingencies(data)
    self.assertEqual(len(conts), 20)
    self.assertIsInstance(conts[5], contingency.Discrete)
    assert_rows(conts[5], disc_rows)
    self.assertIsInstance(conts[14], contingency.Continuous)
    assert_rows(conts[14], cont_rows)

    # Discrete attributes skipped: 10 remain, same continuous one at index 4.
    conts = contingency.get_contingencies(data, skip_discrete=True)
    self.assertEqual(len(conts), 10)
    self.assertIsInstance(conts[4], contingency.Continuous)
    assert_rows(conts[4], cont_rows)

    # Continuous attributes skipped: 10 remain, same discrete one at index 5.
    conts = contingency.get_contingencies(data, skip_continuous=True)
    self.assertEqual(len(conts), 10)
    self.assertIsInstance(conts[5], contingency.Discrete)
    assert_rows(conts[5], disc_rows)
def fit_storage(self, table):
    """Fit a naive Bayes model from a data Storage.

    :param table: training data; every variable, including the class,
        must be discrete
    :return: a fitted NaiveBayesModel holding log conditional
        probabilities per attribute and the class probabilities
    :raises TypeError: if `table` is not an Orange.data.Storage
    :raises NotImplementedError: if any variable is not discrete
    :raises ValueError: if no instance has a defined target value
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    if not all(var.is_discrete for var in table.domain.variables):
        raise NotImplementedError("Only discrete variables are supported.")
    # Per-attribute contingencies with the class, and the class frequencies
    # taken from the diagonal of the class-vs-class contingency.
    cont = contingency.get_contingencies(table)
    class_freq = np.array(np.diag(
        contingency.get_contingency(table, table.domain.class_var)))
    # Number of classes that actually occur in the data.
    nclss = (class_freq != 0).sum()
    if not nclss:
        raise ValueError("Data has no defined target values")

    # Laplacian smoothing considers only classes that appear in the data,
    # in part to avoid cases where the probabilities are affected by empty
    # (or completely spurious) classes that appear because of Orange's reuse
    # of variables. See GH-2943.
    # The corresponding elements of class_probs are set to zero only after
    # mock non-zero values are used in computation of log_cont_prob to
    # prevent division by zero.
    class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
    # Smoothed log P(value | class) / P(class) per attribute; the column
    # sums are the per-value totals across classes.
    log_cont_prob = [np.log(
        (np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] + nclss)
        / class_prob[:, None])
        for c in cont]
    # Now zero out probabilities of classes absent from the data.
    class_prob[class_freq == 0] = 0
    return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def fit_storage(self, table):
    """Fit a naive Bayes model from a data Storage.

    All variables, including the class, must be discrete. Returns a
    NaiveBayesModel with smoothed log conditional probabilities and class
    probabilities.

    :raises TypeError: if `table` is not an Orange.data.Storage
    :raises NotImplementedError: if any variable is not discrete
    :raises ValueError: if no instance has a defined target value
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    if not all(var.is_discrete for var in table.domain.variables):
        raise NotImplementedError("Only discrete variables are supported.")

    attr_conts = contingency.get_contingencies(table)
    class_cont = contingency.get_contingency(table, table.domain.class_var)
    class_freq = np.array(np.diag(class_cont))
    n_nonempty = (class_freq != 0).sum()
    if not n_nonempty:
        raise ValueError("Data has no defined target values")

    # Laplacian smoothing considers only classes that appear in the data,
    # in part to avoid cases where the probabilities are affected by empty
    # (or completely spurious) classes that appear because of Orange's reuse
    # of variables. See GH-2943.
    # Entries of class_prob for absent classes are zeroed only after being
    # used as non-zero placeholders below, which prevents division by zero.
    class_prob = (class_freq + 1) / (np.sum(class_freq) + n_nonempty)

    log_cont_prob = []
    for cont in attr_conts:
        counts = np.array(cont)
        value_totals = np.sum(counts, axis=0)[None, :]
        smoothed = (counts + 1) / (value_totals + n_nonempty)
        log_cont_prob.append(np.log(smoothed / class_prob[:, None]))

    class_prob[class_freq == 0] = 0
    return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def fit_storage(self, table):
    """Fit a naive Bayes model (raw, unsmoothed counts) from a Storage.

    :param table: training data; every variable, including the class,
        must be discrete
    :return: a NaiveBayesModel built from the raw contingencies and
        class frequencies
    :raises TypeError: if `table` is not an Orange.data.Storage
    :raises NotImplementedError: if any variable is not discrete
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    if not all(isinstance(var, DiscreteVariable)
               for var in table.domain.variables):
        raise NotImplementedError("Only discrete variables are supported.")
    cont = contingency.get_contingencies(table)
    # Copy with np.array: np.diag on a 2-D input returns a read-only view
    # (numpy >= 1.9), so without the copy class_freq could not be modified
    # downstream; this also matches the sibling fit_storage implementations.
    class_freq = np.array(np.diag(
        contingency.get_contingency(table, table.domain.class_var)))
    return NaiveBayesModel(cont, class_freq, table.domain)
def test_contingencies(self):
    """get_contingencies on an SQL table whose domain mixes continuous
    attributes with a discretized (hence discrete) one."""
    table = SqlTable(self.conn, self.iris, inspect_values=True)
    # Discretize 'sepal width' and build a domain with two continuous
    # attributes plus the discretized one; 'iris' stays the class.
    width_binned = EqualWidth()(table, table.domain['sepal width'])
    table.domain = Domain(table.domain[:2] + (width_binned,),
                          table.domain['iris'])
    conts = get_contingencies(table)
    self.assertEqual(len(conts), 3)
    for cont, expected_type in zip(conts, (Continuous, Continuous, Discrete)):
        self.assertIsInstance(cont, expected_type)
def test_contingencies(self):
    """Contingencies of an SQL table are Continuous for continuous
    attributes and Discrete for a discretized attribute."""
    data = SqlTable(self.conn, self.iris, inspect_values=True)
    discretized = EqualWidth()(data, data.domain['sepal width'])
    # First two (continuous) attributes plus the discretized column;
    # 'iris' remains the class variable.
    data.domain = Domain(data.domain[:2] + (discretized,),
                         data.domain['iris'])
    contingencies = get_contingencies(data)
    self.assertEqual(len(contingencies), 3)
    self.assertIsInstance(contingencies[0], Continuous)
    self.assertIsInstance(contingencies[1], Continuous)
    self.assertIsInstance(contingencies[2], Discrete)
def draw_distributions(self):
    """Draw per-class distribution bars for the discrete attributes.

    For every discrete attribute on an axis, draws one group of bars per
    attribute value, one bar per class value, with bar width proportional
    to the count. Does nothing unless distributions are enabled, data is
    present, and the class is discrete.
    """
    if not (self.show_distributions and self.have_data and self.data_has_discrete_class):
        return
    class_count = len(self.data_domain.class_var.values)
    class_ = self.data_domain.class_var
    # Lazily build the map from each *discrete* attribute to its contingency
    # with the class (happens only if we have a discrete class), plus the
    # class variable's own contingency.
    if self.domain_contingencies is None:
        self.domain_contingencies = dict(
            zip(
                [attr for attr in self.data_domain if isinstance(attr, DiscreteVariable)],
                get_contingencies(self.raw_data, skipContinuous=True),
            )
        )
        self.domain_contingencies[class_] = get_contingency(self.raw_data, class_, class_)
    # Largest count anywhere, used to normalize bar widths ([1] avoids max of empty).
    max_count = max([contingency.max() for contingency in self.domain_contingencies.values()] or [1])
    sorted_class_values = get_variable_values_sorted(self.data_domain.class_var)
    for axis_idx, attr_idx in enumerate(self.attribute_indices):
        attr = self.data_domain[attr_idx]
        # BUG FIX: the condition was inverted ("if isinstance(...): continue"),
        # which skipped the discrete attributes and then looked up continuous
        # ones in domain_contingencies — a dict that only contains discrete
        # attributes (see its construction above), causing a KeyError. Only
        # discrete attributes can be drawn, so skip everything else.
        if not isinstance(attr, DiscreteVariable):
            continue
        contingency = self.domain_contingencies[attr]
        attr_len = len(attr.values)
        # Attribute values in display order.
        sorted_variable_values = get_variable_values_sorted(attr)
        # Create the bar curves: one group per attribute value.
        for j in range(attr_len):
            attribute_value = sorted_variable_values[j]
            value_count = contingency[:, attribute_value]
            for i in range(class_count):
                class_value = sorted_class_values[i]
                color = QColor(self.discrete_palette[i])
                color.setAlpha(self.alpha_value)
                # Width proportional to the count; the largest count spans half a unit.
                width = float(value_count[class_value] * 0.5) / float(max_count)
                y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)
                height = 0.7 / float(class_count * attr_len)
                y_low_bottom = y_off + float(class_count * height) / 2.0 - i * height
                curve = PolygonCurve(
                    QPen(color), QBrush(color),
                    xData=[axis_idx, axis_idx + width, axis_idx + width, axis_idx],
                    yData=[y_low_bottom, y_low_bottom, y_low_bottom - height, y_low_bottom - height],
                    tooltip=attr.name,
                )
                curve.attach(self)
def fit_storage(self, table):
    """Fit a naive Bayes model (raw, unsmoothed counts) from a Storage.

    All variables, including the class, must be discrete.

    :raises TypeError: if `table` is not an Orange.data.Storage
    :raises NotImplementedError: if any variable is not discrete
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    if any(not var.is_discrete for var in table.domain.variables):
        raise NotImplementedError("Only discrete variables are supported.")
    attr_conts = contingency.get_contingencies(table)
    # Class frequencies: the diagonal of the class-vs-class contingency,
    # copied into a fresh writable array.
    class_cont = contingency.get_contingency(table, table.domain.class_var)
    class_freq = np.array(np.diag(class_cont))
    return NaiveBayesModel(attr_conts, class_freq, table.domain)
def draw_distributions(self):
    """Draw class-distribution bars for the discrete attributes on the axes.

    Bars are grouped by attribute value, one bar per class value; width is
    proportional to the count, normalized by the largest count in any
    contingency. Requires distributions enabled, data, and a discrete class.
    """
    if not (self.show_distributions and self.have_data and self.data_has_discrete_class):
        return
    class_count = len(self.data_domain.class_var.values)
    class_ = self.data_domain.class_var
    # we create a hash table of possible class values (happens only if we
    # have a discrete class); domain_contingencies maps each *discrete*
    # attribute (and the class itself) to its contingency with the class
    if self.domain_contingencies is None:
        self.domain_contingencies = dict(
            zip([attr for attr in self.data_domain if isinstance(attr, DiscreteVariable)],
                get_contingencies(self.raw_data, skipContinuous=True)))
        self.domain_contingencies[class_] = get_contingency(self.raw_data, class_, class_)
    max_count = max([contingency.max() for contingency in self.domain_contingencies.values()] or [1])
    sorted_class_values = get_variable_values_sorted(self.data_domain.class_var)
    for axis_idx, attr_idx in enumerate(self.attribute_indices):
        attr = self.data_domain[attr_idx]
        # BUG FIX: the guard was inverted; skipping discrete attributes left
        # only continuous ones, which are absent from domain_contingencies
        # (built above from discrete attributes only) and have no .values,
        # so the lookup below raised KeyError. Skip non-discrete instead.
        if not isinstance(attr, DiscreteVariable):
            continue
        contingency = self.domain_contingencies[attr]
        attr_len = len(attr.values)
        # we create a hash table of variable values and their indices
        sorted_variable_values = get_variable_values_sorted(attr)
        # create bar curve
        for j in range(attr_len):
            attribute_value = sorted_variable_values[j]
            value_count = contingency[:, attribute_value]
            for i in range(class_count):
                class_value = sorted_class_values[i]
                color = QColor(self.discrete_palette[i])
                color.setAlpha(self.alpha_value)
                # largest count maps to half a unit of width
                width = float(value_count[class_value] * 0.5) / float(max_count)
                y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)
                height = 0.7 / float(class_count * attr_len)
                y_low_bottom = y_off + float(class_count * height) / 2.0 - i * height
                curve = PolygonCurve(QPen(color), QBrush(color),
                                     xData=[axis_idx, axis_idx + width, axis_idx + width, axis_idx],
                                     yData=[y_low_bottom, y_low_bottom, y_low_bottom - height, y_low_bottom - height],
                                     tooltip=attr.name)
                curve.attach(self)
def fit_storage(self, table):
    """Fit a naive Bayes model from a data Storage.

    :param table: training data; every variable, including the class,
        must be discrete
    :return: a fitted NaiveBayesModel
    :raises TypeError: if `table` is not an Orange.data.Storage
    :raises NotImplementedError: if any variable is not discrete
    :raises ValueError: if no instance has a defined target value
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    if not all(var.is_discrete for var in table.domain.variables):
        raise NotImplementedError("Only discrete variables are supported.")
    cont = contingency.get_contingencies(table)
    class_freq = np.array(np.diag(
        contingency.get_contingency(table, table.domain.class_var)))
    # BUG FIX: smoothing previously used len(class_freq) / c.shape[0], which
    # counts classes that never occur in the data (e.g. spurious values from
    # Orange's variable reuse, GH-2943) and skews the probabilities; it also
    # did not reject data without any defined target values. Laplacian
    # smoothing now considers only the classes that appear. The class_prob
    # entries for absent classes are zeroed only after being used as
    # non-zero placeholders below, to prevent division by zero.
    nclss = (class_freq != 0).sum()
    if not nclss:
        raise ValueError("Data has no defined target values")
    class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
    log_cont_prob = [np.log(
        (np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] + nclss)
        / class_prob[:, None])
        for c in cont]
    class_prob[class_freq == 0] = 0
    return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def fit_storage(self, table):
    """Fit a naive Bayes model from a data Storage.

    All variables, including the class, must be discrete.

    :raises TypeError: if `table` is not an Orange.data.Storage
    :raises NotImplementedError: if any variable is not discrete
    :raises ValueError: if no instance has a defined target value
    """
    if not isinstance(table, Storage):
        raise TypeError("Data is not a subclass of Orange.data.Storage.")
    if not all(var.is_discrete for var in table.domain.variables):
        raise NotImplementedError("Only discrete variables are supported.")
    cont = contingency.get_contingencies(table)
    class_freq = np.array(np.diag(
        contingency.get_contingency(table, table.domain.class_var)))
    # BUG FIX: Laplacian smoothing must consider only classes that appear in
    # the data — the previous len(class_freq) / c.shape[0] denominators also
    # counted empty (or completely spurious) classes introduced by Orange's
    # reuse of variables (GH-2943), and degenerate data with no defined
    # targets was not rejected. class_prob for absent classes is zeroed only
    # after serving as a non-zero placeholder in log_cont_prob, avoiding
    # division by zero.
    nclss = (class_freq != 0).sum()
    if not nclss:
        raise ValueError("Data has no defined target values")
    class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
    log_cont_prob = [np.log(
        (np.array(c) + 1) / (np.sum(np.array(c), axis=0)[None, :] + nclss)
        / class_prob[:, None])
        for c in cont]
    class_prob[class_freq == 0] = 0
    return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
def fit_storage(self, table):
    """Build a BayesStorageClassifier from raw contingency counts.

    Collects each attribute's contingency with the class and the class
    frequencies (the diagonal of the class-vs-class contingency).
    """
    attr_conts = contingency.get_contingencies(table)
    class_cont = contingency.get_contingency(table, table.domain.class_var)
    class_freq = np.diag(class_cont)
    return BayesStorageClassifier(attr_conts, class_freq, table.domain)