def set_data(self, data):
    """
    Store new input data and rebuild the list of selectable attributes.

    A `SqlTable` whose approximate length exceeds `LARGE_TABLE` is replaced
    by `data.sample_time(DEFAULT_SAMPLE_TIME)`. If any variable in the
    domain is continuous, `self.discrete_data` becomes the output of
    `Discretize(method=EqualFreq(n=4))`; otherwise it is the data itself.
    `self.attrs` is filled in place with the variables of the discretized
    domain followed by the discrete meta attributes of the input, and the
    first two of them are preselected before the context is opened.

    Args:
        data: input data table, or None
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of SqlTable
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.areas = []
    if self.data is None:
        self.attrs[:] = []
    else:
        if any(attr.is_continuous for attr in data.domain):
            self.discrete_data = Discretize(method=EqualFreq(n=4))(data)
        else:
            self.discrete_data = self.data
        discrete_metas = (
            var for var in self.data.domain.metas if var.is_discrete)
        # slice assignment keeps the identity of the self.attrs container
        self.attrs[:] = list(chain(self.discrete_data.domain, discrete_metas))
    if self.attrs:
        self.attrX = self.attrs[0].name
        # the comparison yields a bool used as an index: 1 if there are at
        # least two attributes, otherwise 0 (the same attribute twice)
        self.attrY = self.attrs[len(self.attrs) > 1].name
    else:
        self.attrX = self.attrY = None
        self.areas = self.selection = None
    self.openContext(self.data)
    self.resolve_shown_attributes()
    self.update_selection()
def set_data(self, data):
    """
    Store new input data and reconfigure the coloring controls for it.

    A `SqlTable` whose approximate length exceeds `LARGE_TABLE` is replaced
    by `data.sample_time(DEFAULT_SAMPLE_TIME)`. When the data is falsy,
    only `self.discrete_data` is reset and the method returns early.
    Otherwise `self.discrete_data` becomes the output of
    `Discretize(method=EqualFreq(n=4))` if any variable in the domain is
    continuous, or the data itself if none is. Any subset stored in
    `self.unprocessed_subset_data` is handed to `set_subset_data` at the end.

    Args:
        data: input data table, or None
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of SqlTable
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.init_combos(self.data)
    # presumably clears information messages 0-2 -- TODO confirm
    self.information([0, 1, 2])
    if not self.data:
        self.discrete_data = None
        return

    # TODO: check
    # if data.has_missing_class():
    #     self.information(1, "Examples with missing classes were removed.")

    if any(attr.is_continuous for attr in data.domain):
        self.discrete_data = Discretize(method=EqualFreq(n=4))(data)
    else:
        self.discrete_data = self.data

    if self.data.domain.class_var is None:
        self.rb_colors.setDisabled(True)
        disc_class = False
    else:
        self.rb_colors.setDisabled(False)
        disc_class = self.data.domain.has_discrete_class
        # button 2 and the bar button are only usable with a discrete class
        self.rb_colors.group.button(2).setDisabled(not disc_class)
        self.bar_button.setDisabled(not disc_class)
    self.interior_coloring = bool(disc_class)
    self.openContext(self.data)

    # if we first received subset we now call set_subset_data to process it
    if self.unprocessed_subset_data:
        self.set_subset_data(self.unprocessed_subset_data)
        self.unprocessed_subset_data = None
def discretizer(data):
    """
    Return `data` with its continuous columns discretized.

    If no variable or meta attribute in the domain is continuous, the
    input object is returned unchanged. Otherwise the data is passed
    through `Discretize` (with `EqualFreq(n=4)`, constants kept, class
    variables and metas included) and the result's `to_dense()` is returned.
    """
    every_column = chain(data.domain.variables, data.domain.metas)
    if not any(column.is_continuous for column in every_column):
        return data

    to_discrete = Discretize(
        method=EqualFreq(n=4),
        remove_const=False,
        discretize_classes=True,
        discretize_metas=True,
    )
    discretized = to_discrete(data)
    return discretized.to_dense()
def set_data(self, data):
    """
    Store new input data, discretize it if needed and reset the controls.

    A `SqlTable` whose approximate length exceeds `LARGE_TABLE` is replaced
    by `data.sample_time(DEFAULT_SAMPLE_TIME)`. `self.discrete_data` is None
    without data, the output of `Discretize` (with `EqualFreq(n=4)` and
    `discretize_classes=True`) when any variable in the domain is
    continuous, and the data itself otherwise. The vizrank is stopped and
    reset, and its button is enabled only for data with more than one row
    and at least one attribute. Without data the method returns before
    touching the coloring controls or opening the context.

    Args:
        data: input data table, or None
    """
    # isinstance (instead of an exact type comparison) also covers
    # subclasses of SqlTable
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.init_combos(self.data)
    if self.data is None:
        self.discrete_data = None
    elif any(attr.is_continuous for attr in data.domain):
        self.discrete_data = Discretize(
            method=EqualFreq(n=4), discretize_classes=True)(data)
    else:
        self.discrete_data = self.data

    self.vizrank.stop_and_reset()
    self.vizrank_button.setEnabled(
        self.data is not None
        and len(self.data) > 1
        and len(self.data.domain.attributes) >= 1)

    if self.data is None:
        return

    has_class = self.data.domain.class_var is not None
    self.rb_colors.setDisabled(not has_class)
    self.interior_coloring = (
        self.CLASS_DISTRIBUTION if has_class else self.PEARSON)

    self.openContext(self.data)

    # if we first received subset we now call set_subset_data to process it
    if self.unprocessed_subset_data:
        self.set_subset_data(self.unprocessed_subset_data)
        self.unprocessed_subset_data = None
def set_data(self, data):
    """
    Validate the input, discretize it and refresh the widget state.

    `self.disc_data` stays None (with a warning shown) when the data has
    fewer than two rows, when `data.Y` is empty, or when fewer than two
    attributes remain after constant features are removed and the data is
    discretized. An information message is shown if the remover reports
    removed features.
    """
    self.closeContext()
    self.clear_messages()
    self.data = data
    self.disc_data = None
    self.selection = []

    if data is None:
        pass
    elif len(data) < 2:
        self.Warning.not_enough_inst()
    elif data.Y.size == 0:
        self.Warning.no_class_var()
    else:
        constant_remover = Remove(Remove.RemoveConstant)
        without_constants = constant_remover(data)
        discretized = Discretize(method=EqualFreq())(without_constants)
        if constant_remover.attr_results["removed"]:
            self.Information.removed_cons_feat()
        if len(discretized.domain.attributes) >= 2:
            self.disc_data = discretized
        else:
            self.Warning.not_enough_vars()

    # `and` passes the falsy disc_data itself when there is nothing usable
    model_domain = self.disc_data and self.disc_data.domain
    self.feature_model.set_domain(model_domain)
    self.openContext(self.disc_data)
    self.apply()
    has_disc_data = self.disc_data is not None
    self.vizrank.button.setEnabled(has_disc_data)
def _bin_centers(values):
    """Map the index of every bin label in `values` to a number.

    Each label is stripped of bracket and comparison characters and split
    on '-'. If every resulting part parses as a float, a one-part label
    maps to that float and a longer one to `v[0] + (v[1] - v[0])`.
    If any part fails to parse, every label maps to its own index instead.
    """
    cleaned_values = [
        tuple(map(str.strip, label.strip('[]()<>=≥').split('-')))
        for label in values
    ]
    try:
        float_values = [[float(v) for v in vals] for vals in cleaned_values]
    except ValueError:
        # at least one label is not numeric: fall back to the bin indices
        return {index: index for index in range(len(cleaned_values))}
    # NOTE(review): for a two-part label this evaluates to the upper edge
    # v[1] (up to rounding), not the midpoint -- confirm this is intended
    return {
        index: v[0] if len(v) == 1 else v[0] + (v[1] - v[0])
        for index, v in enumerate(float_values)
    }


def create_contingencies(X, callback=None):
    """Count value combinations in sliding windows of neighbouring columns.

    `X` is discretized with `Discretize(method=EqualFreq(n=10))` and each
    discretized value is replaced by the number computed by `_bin_centers`.

    For an `SqlTable` the counting is delegated to `create_sql_contingency`,
    called for columns [0, 1], then for every triple of consecutive columns,
    then for the last two columns; `callback`, if given, is called after
    each step with the progress index and the number of columns.

    For other tables, one result is produced per column: rows containing
    NaN are skipped, and for column `l` the key is the tuple of mapped
    values of columns `l - 1` to `l + 1` (clipped to the table width).

    Args:
        X: data table
        callback: optional progress callable taking (step, total); only
            used for SQL tables

    Returns:
        list: for SQL tables, the results of `create_sql_contingency`;
            otherwise one `(keys, counts)` pair of numpy arrays per column
    """
    window_size = 1
    dim = len(X.domain)

    X_ = Discretize(method=EqualFreq(n=10))(X)
    m = [_bin_centers(var.values) for var in X_.domain]

    from Orange.data.sql.table import SqlTable
    if isinstance(X, SqlTable):
        conts = []
        al = len(X.domain)
        if al > 1:
            conts.append(create_sql_contingency(X_, [0, 1], m))
            if callback:
                callback(1, al)
            for a1, a2, a3 in zip(range(al), range(1, al), range(2, al)):
                conts.append(create_sql_contingency(X_, [a1, a2, a3], m))
                if callback:
                    callback(a3, al)
            if al > 2:
                conts.append(create_sql_contingency(X_, [al - 2, al - 1], m))
                if callback:
                    callback(al, al)
    else:
        n_columns = len(X_.domain)
        conts = [defaultdict(float) for _ in range(n_columns)]
        for r in X_:
            if any(np.isnan(r)):
                continue
            row = tuple(m[vi].get(v) for vi, v in enumerate(r))
            for l in range(n_columns):
                # None leaves the slice open where the window would
                # otherwise run past either edge of the row
                lower = l - window_size if l - window_size >= 0 else None
                upper = (l + window_size + 1
                         if l + window_size + 1 <= dim else None)
                conts[l][row[slice(lower, upper)]] += 1
        # split every {key: count} mapping into parallel key/count arrays
        conts = [zip(*x.items()) for x in conts]
        conts = [(np.array(c), np.array(cw)) for c, cw in conts]

    return conts
def set_data(self, data):
    """
    Accept new input data and prepare the widget for displaying it.

    Large SQL tables are replaced by a sample. The domain model is updated,
    and if any variable or meta attribute is continuous, the data is
    discretized into `self.discrete_data`; otherwise `self.discrete_data`
    is the data itself. `self.attrs` is rebuilt from the variables in the
    domain model and the first two are selected; the context opened
    afterwards and `resolve_shown_attributes` may override that choice.

    The selection is cleared before the context is opened. Finally the
    graph and selection are updated, the vizrank is initialized, and its
    button is enabled only for data with more than one row and more than
    one attribute.

    Args:
        data (Table): input data
    """
    if isinstance(data, SqlTable) and data.approx_len() > LARGE_TABLE:
        data = data.sample_time(DEFAULT_SAMPLE_TIME)

    self.closeContext()
    self.data = data
    self.areas = []
    self.selection = set()

    if self.data is not None:
        self.domain_model.set_domain(data.domain)
        every_column = chain(data.domain, data.domain.metas)
        if any(column.is_continuous for column in every_column):
            to_discrete = Discretize(
                method=EqualFreq(n=4),
                remove_const=False,
                discretize_classes=True,
                discretize_metas=True,
            )
            self.discrete_data = to_discrete(data)
        else:
            self.discrete_data = data
    else:
        self.attrs[:] = []
        self.domain_model.set_domain(None)

    # the model may contain non-variable entries; keep the variables only
    self.attrs = [
        item for item in self.domain_model if isinstance(item, Variable)]
    if not self.attrs:
        self.attr_x = self.attr_y = None
        self.areas = []
        self.selection = set()
    else:
        self.attr_x = self.attrs[0]
        # bool index: the second attribute if there is one, else the first
        self.attr_y = self.attrs[len(self.attrs) > 1]

    self.openContext(self.data)
    self.resolve_shown_attributes()
    self.update_graph()
    self.update_selection()

    self.vizrank.initialize()
    rankable = (
        self.data is not None
        and len(self.data) > 1
        and len(self.data.domain.attributes) > 1
    )
    self.vizrank_button.setEnabled(rankable)
def _get_discrete_data(self, data): """ Discretize continuous attributes. Return None when there is no data, no rows, or no primitive attributes. """ if (data is None or not len(data) or not any( attr.is_discrete or attr.is_continuous for attr in chain(data.domain.variables, data.domain.metas))): return None elif any(attr.is_continuous for attr in data.domain.variables): return Discretize(method=EqualFreq(n=4), remove_const=False, discretize_classes=True, discretize_metas=True)(data) else: return data
def create_contingencies(X, callback=None):
    """Count value combinations in sliding windows of neighbouring columns.

    `X` is discretized with `Discretize(method=EqualFreq(n=10))` and the
    discretized values are translated through `get_bin_centers`.

    SQL tables are handled by `create_sql_contingency`: first for columns
    [0, 1], then for each triple of consecutive columns, then for the last
    two columns, reporting progress to `callback` (if given) after each.

    For other tables, rows containing NaN are skipped; for each column the
    translated values of the column and its immediate neighbours form a
    key whose occurrences are counted.

    Returns a list: the SQL results, or one `(keys, counts)` pair of numpy
    arrays per column.
    """
    window_size = 1
    dim = len(X.domain)

    binned = Discretize(method=EqualFreq(n=10))(X)
    centers = get_bin_centers(binned)

    from Orange.data.sql.table import SqlTable
    if isinstance(X, SqlTable):
        conts = []
        al = len(X.domain)

        def add_contingency(columns, step):
            # compute one contingency and report progress
            conts.append(create_sql_contingency(binned, columns, centers))
            if callback:
                callback(step, al)

        if al > 1:
            add_contingency([0, 1], 1)
        # every run of three consecutive columns; empty when al < 3
        for first in range(al - 2):
            add_contingency([first, first + 1, first + 2], first + 2)
        if al > 2:
            add_contingency([al - 2, al - 1], al)
    else:
        n_columns = len(binned.domain)
        counters = [defaultdict(float) for _ in range(n_columns)]
        for instance in binned:
            if any(np.isnan(instance)):
                continue
            translated = tuple(
                centers[column].get(value)
                for column, value in enumerate(instance))
            for column in range(n_columns):
                start = column - window_size
                stop = column + window_size + 1
                # None leaves the slice open where the window would
                # otherwise run past either edge of the row
                window = slice(start if start >= 0 else None,
                               stop if stop <= dim else None)
                counters[column][translated[window]] += 1

        conts = []
        for counter in counters:
            keys, weights = zip(*counter.items())
            conts.append((np.array(keys), np.array(weights)))

    # disabled consistency check between two sets of contingencies:
    # for i, ((c1, cw1), (c2, cw2)) in enumerate(zip(contss, conts)):
    #     a = np.sort(np.hstack((c1, cw1[:, None])), axis=0)
    #     b = np.sort(np.hstack((c2, cw2[:, None])), axis=0)
    #     assert_almost_equal(a, b)

    return conts
def setUp(self):
    """Load the "iris" and "adult" tables and build the discretizer."""
    self.iris = Table("iris")
    self.adult = Table("adult")
    equal_freq = EqualFreq(n=3)
    self.discretizer = Discretize(equal_freq)
def test_discretization(self):
    """Run `EqualFreq(n=4)` on the "sepal length" column of an SQL table.

    The test has no assertions; it passes if the call does not raise.
    """
    sql_iris = SqlTable(self.conn, self.iris, inspect_values=True)
    column = sql_iris.domain["sepal length"]
    discretization = EqualFreq(n=4)
    discretization(sql_iris, column)
def setUp(self):
    """Load the 'iris' and 'adult' tables and build the discretizer."""
    self.iris = Table('iris')
    self.adult = Table('adult')
    equal_freq = EqualFreq(n=3)
    self.discretizer = Discretize(equal_freq)