class OWDiscretize(widget.OWWidget):
    """Widget that discretizes the numeric features of an input table.

    A default discretization method is applied to every numeric variable;
    individual variables selected in the list view can be given their own
    method.  The per-variable choices are kept in ``var_state`` (keyed by
    the variable's row in ``varmodel``) and persisted through the
    ``saved_var_states`` context setting.
    """

    name = "Discretize"
    description = "Discretize the numeric data features."
    icon = "icons/Discretize.svg"
    inputs = [
        InputSignal("Data", Orange.data.Table, "set_data",
                    doc="Input data table")
    ]
    outputs = [
        OutputSignal("Data", Orange.data.Table,
                     doc="Table with discretized features")
    ]

    settingsHandler = settings.DomainContextHandler()
    saved_var_states = settings.ContextSetting({})

    # NOTE: `default_method` is the index of the checked button in the
    # "Default Discretization" group, which has no "Default" entry; it is
    # therefore one less than the matching method constant below.
    default_method = settings.Setting(2)
    default_k = settings.Setting(3)
    autosend = settings.Setting(True)

    #: Discretization methods
    Default, Leave, MDL, EqualFreq, EqualWidth, Remove, Custom = range(7)

    want_main_area = False
    resizing_enabled = False

    def __init__(self):
        super().__init__()

        #: input data
        self.data = None
        #: Current variable discretization state
        self.var_state = {}
        #: Saved variable discretization settings (context setting)
        self.saved_var_states = {}

        self.method = 0
        self.k = 5

        box = gui.vBox(self.controlArea, self.tr("Default Discretization"))
        self.default_bbox = rbox = gui.radioButtons(
            box, self, "default_method",
            callback=self._default_disc_changed)
        rb = gui.hBox(rbox)
        self.left = gui.vBox(rb)
        right = gui.vBox(rb)
        rb.layout().setStretch(0, 1)
        rb.layout().setStretch(1, 1)
        options = self.options = [
            self.tr("Default"),
            self.tr("Leave numeric"),
            self.tr("Entropy-MDL discretization"),
            self.tr("Equal-frequency discretization"),
            self.tr("Equal-width discretization"),
            self.tr("Remove numeric variables")
        ]

        for opt in options[1:]:
            t = gui.appendRadioButton(rbox, opt)
            # This condition is ugly, but it keeps the same order of
            # options for backward compatibility of saved schemata
            [right, self.left][opt.startswith("Equal")].layout().addWidget(t)
        gui.separator(right, 18, 18)

        def _intbox(parent, attr, callback):
            # Build an indented "Num. of intervals" spin box bound to `attr`
            # and return the container that holds it.
            box = gui.indentedBox(parent)
            s = gui.spin(
                box, self, attr, minv=2, maxv=10, label="Num. of intervals:",
                callback=callback)
            s.setMaximumWidth(60)
            s.setAlignment(Qt.AlignRight)
            gui.rubber(s.box)
            return box.box

        self.k_general = _intbox(self.left, "default_k",
                                 self._default_disc_changed)
        self.k_general.layout().setContentsMargins(0, 0, 0, 0)
        vlayout = QHBoxLayout()
        box = gui.widgetBox(
            self.controlArea, "Individual Attribute Settings",
            orientation=vlayout, spacing=8)

        # List view with all attributes
        self.varview = QListView(selectionMode=QListView.ExtendedSelection)
        self.varview.setItemDelegate(DiscDelegate())
        self.varmodel = itemmodels.VariableListModel()
        self.varview.setModel(self.varmodel)
        self.varview.selectionModel().selectionChanged.connect(
            self._var_selection_changed)

        vlayout.addWidget(self.varview)
        # Controls for individual attr settings
        self.bbox = controlbox = gui.radioButtons(
            box, self, "method", callback=self._disc_method_changed)
        vlayout.addWidget(controlbox)

        for opt in options[:5]:
            gui.appendRadioButton(controlbox, opt)

        self.k_specific = _intbox(controlbox, "k", self._disc_method_changed)

        gui.appendRadioButton(controlbox, "Remove attribute")

        gui.rubber(controlbox)
        controlbox.setEnabled(False)

        self.controlbox = controlbox

        box = gui.auto_commit(
            self.controlArea, self, "autosend", "Apply",
            orientation=Qt.Horizontal,
            checkbox_label="Apply automatically")
        box.layout().insertSpacing(0, 20)
        box.layout().insertWidget(0, self.report_button)
        self._update_spin_positions()

    def set_data(self, data):
        """Input signal handler: install `data` (or clear on ``None``)."""
        self.closeContext()
        self.data = data
        if self.data is not None:
            self._initialize(data)
            self.openContext(data)
            # Restore the per variable discretization settings
            self._restore(self.saved_var_states)
            # Complete the induction of cut points
            self._update_points()
        else:
            self._clear()
        self.unconditional_commit()

    def _initialize(self, data):
        # Initialize the default variable states for new data.
        self.class_var = data.domain.class_var
        cvars = [var for var in data.domain if var.is_continuous]
        self.varmodel[:] = cvars

        has_disc_class = data.domain.has_discrete_class

        # The MDL buttons are only enabled when the class is discrete.
        self.default_bbox.buttons[self.MDL - 1].setEnabled(has_disc_class)
        self.bbox.buttons[self.MDL].setEnabled(has_disc_class)

        # If the newly disabled MDL button is checked then change it
        if not has_disc_class and self.default_method == self.MDL - 1:
            self.default_method = 0
        if not has_disc_class and self.method == self.MDL:
            self.method = 0

        # Reset (initialize) the variable discretization states.
        self._reset()

    def _restore(self, saved_state):
        # Restore variable states from a saved_state dictionary.
        def_method = self._current_default_method()
        for i, var in enumerate(self.varmodel):
            key = variable_key(var)
            if key in saved_state:
                state = saved_state[key]
                if isinstance(state.method, Default):
                    # A saved "Default" state is re-bound to the default
                    # method that is currently selected.
                    state = DState(Default(def_method), None, None)
                self._set_var_state(i, state)

    def _reset(self):
        # restore the individual variable settings back to defaults.
        def_method = self._current_default_method()
        self.var_state = {}
        for i, _ in enumerate(self.varmodel):
            state = DState(Default(def_method), None, None)
            self._set_var_state(i, state)

    def _set_var_state(self, index, state):
        # set the state of variable at `index` to `state`.
        self.var_state[index] = state
        self.varmodel.setData(self.varmodel.index(index), state, Qt.UserRole)

    def _clear(self):
        # Drop the data and all per-variable state; re-enable MDL buttons.
        self.data = None
        self.varmodel[:] = []
        self.var_state = {}
        self.saved_var_states = {}
        self.default_bbox.buttons[self.MDL - 1].setEnabled(True)
        self.bbox.buttons[self.MDL].setEnabled(True)

    def _update_points(self):
        """
        Update the induced cut points.
        """
        if self.data is None or not len(self.data):
            return

        def induce_cuts(method, data, var):
            # Returns a `(points, disc_var)` pair for `var`.
            dvar = _dispatch[type(method)](method, data, var)
            if dvar is None:
                # removed
                return [], None
            elif dvar is var:
                # no transformation took place
                return None, var
            elif is_discretized(dvar):
                return dvar.compute_value.points, dvar
            else:
                assert False

        for i, var in enumerate(self.varmodel):
            state = self.var_state[i]
            # Only states that have not been computed yet are (re)induced.
            if state.points is None and state.disc_var is None:
                points, dvar = induce_cuts(state.method, self.data, var)
                new_state = state._replace(points=points, disc_var=dvar)
                self._set_var_state(i, new_state)

    def _method_index(self, method):
        return METHODS.index((type(method), ))

    def _current_default_method(self):
        """Return the method object for the selected default method."""
        # Shift the button index to the method constants (see class note).
        method = self.default_method + 1
        k = self.default_k
        if method == OWDiscretize.Leave:
            def_method = Leave()
        elif method == OWDiscretize.MDL:
            def_method = MDL()
        elif method == OWDiscretize.EqualFreq:
            def_method = EqualFreq(k)
        elif method == OWDiscretize.EqualWidth:
            def_method = EqualWidth(k)
        elif method == OWDiscretize.Remove:
            def_method = Remove()
        else:
            assert False
        return def_method

    def _current_method(self):
        """Return the method object for the selected per-variable method."""
        if self.method == OWDiscretize.Default:
            method = Default(self._current_default_method())
        elif self.method == OWDiscretize.Leave:
            method = Leave()
        elif self.method == OWDiscretize.MDL:
            method = MDL()
        elif self.method == OWDiscretize.EqualFreq:
            method = EqualFreq(self.k)
        elif self.method == OWDiscretize.EqualWidth:
            method = EqualWidth(self.k)
        elif self.method == OWDiscretize.Remove:
            method = Remove()
        elif self.method == OWDiscretize.Custom:
            # `cutpoints` is only assigned in `_var_selection_changed`.
            method = Custom(self.cutpoints)
        else:
            assert False
        return method

    def _update_spin_positions(self):
        # Enable each interval spin box only for the equal-frequency /
        # equal-width choices and move it below the checked radio button.
        self.k_general.setDisabled(self.default_method not in [2, 3])
        if self.default_method == 2:
            self.left.layout().insertWidget(1, self.k_general)
        elif self.default_method == 3:
            self.left.layout().insertWidget(2, self.k_general)

        self.k_specific.setDisabled(self.method not in [3, 4])
        if self.method == 3:
            self.bbox.layout().insertWidget(4, self.k_specific)
        elif self.method == 4:
            self.bbox.layout().insertWidget(5, self.k_specific)

    def _default_disc_changed(self):
        # Callback for the default method buttons and default spin box.
        self._update_spin_positions()
        method = self._current_default_method()
        state = DState(Default(method), None, None)
        for i, _ in enumerate(self.varmodel):
            if isinstance(self.var_state[i].method, Default):
                self._set_var_state(i, state)
        self._update_points()
        self.commit()

    def _disc_method_changed(self):
        # Callback for the per-variable method buttons and spin box.
        self._update_spin_positions()
        indices = self.selected_indices()
        method = self._current_method()
        state = DState(method, None, None)
        for idx in indices:
            self._set_var_state(idx, state)
        self._update_points()
        self.commit()

    def _var_selection_changed(self, *args):
        indices = self.selected_indices()
        # set of all methods for the current selection
        methods = [self.var_state[i].method for i in indices]
        mset = set(methods)
        self.controlbox.setEnabled(len(mset) > 0)
        if len(mset) == 1:
            method = mset.pop()
            self.method = self._method_index(method)
            if isinstance(method, (EqualFreq, EqualWidth)):
                self.k = method.k
            elif isinstance(method, Custom):
                self.cutpoints = method.points
        else:
            # deselect the current button
            self.method = -1
            bg = self.controlbox.group
            button_group_reset(bg)
        self._update_spin_positions()

    def selected_indices(self):
        """Return the row numbers selected in the variable list view."""
        rows = self.varview.selectionModel().selectedRows()
        return [index.row() for index in rows]

    def discretized_var(self, source):
        """Return the output variable for `source`, or ``None`` if it is
        removed (or has no computed state yet)."""
        index = list(self.varmodel).index(source)
        state = self.var_state[index]
        if state.disc_var is None:
            return None
        elif state.disc_var is source:
            return source
        elif state.points == []:
            return None
        else:
            return state.disc_var

    def discretized_domain(self):
        """
        Return the current effective discretized domain.
        """
        if self.data is None:
            return None

        def disc_var(source):
            if source and source.is_continuous:
                return self.discretized_var(source)
            else:
                return source

        attributes = [disc_var(v) for v in self.data.domain.attributes]
        attributes = [v for v in attributes if v is not None]

        class_var = disc_var(self.data.domain.class_var)

        domain = Orange.data.Domain(
            attributes, class_var,
            metas=self.data.domain.metas)
        return domain

    def commit(self):
        """Send the transformed table, or ``None`` for missing/empty data."""
        output = None
        if self.data is not None and len(self.data):
            domain = self.discretized_domain()
            output = self.data.transform(domain)
        self.send("Data", output)

    def storeSpecificSettings(self):
        super().storeSpecificSettings()
        # Only the chosen methods are saved; the induced points and
        # variables are blanked out.
        self.saved_var_states = {
            variable_key(var):
                self.var_state[i]._replace(points=None, disc_var=None)
            for i, var in enumerate(self.varmodel)
        }

    def send_report(self):
        self.report_items((
            ("Default method", self.options[self.default_method + 1]),))
        if self.varmodel:
            self.report_items("Thresholds", [
                (var.name,
                 DiscDelegate.cutsText(self.var_state[i]) or "leave numeric")
                for i, var in enumerate(self.varmodel)])
class OWDistributions(widget.OWWidget):
    """Widget that plots the value distribution of one selected variable.

    The variable is chosen in a list view; a second list view optionally
    selects a discrete variable to group by, in which case a contingency
    is displayed instead of a plain distribution.
    """

    name = "Distributions"
    description = "Display value distributions of a data feature in a graph."
    icon = "icons/Distribution.svg"
    priority = 100
    inputs = [
        InputSignal("Data", Orange.data.Table, "set_data",
                    doc="Set the input data set")
    ]

    settingsHandler = settings.DomainContextHandler()
    #: Selected variable index
    variable_idx = settings.ContextSetting(-1)
    #: Selected group variable
    groupvar_idx = settings.ContextSetting(0)

    Hist, ASH, Kernel = 0, 1, 2
    #: Continuous variable density estimation method
    cont_est_type = settings.Setting(ASH)
    relative_freq = settings.Setting(False)

    def __init__(self, parent=None):
        super().__init__(parent)
        self.data = None

        self.distributions = None
        self.contingencies = None
        self.var = self.cvar = None
        varbox = gui.widgetBox(self.controlArea, "Variable")

        self.varmodel = itemmodels.VariableListModel()
        self.groupvarmodel = itemmodels.VariableListModel()

        self.varview = QtGui.QListView(
            selectionMode=QtGui.QListView.SingleSelection)
        self.varview.setSizePolicy(
            QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding)
        self.varview.setModel(self.varmodel)
        self.varview.setSelectionModel(
            itemmodels.ListSingleSelectionModel(self.varmodel))
        self.varview.selectionModel().selectionChanged.connect(
            self._on_variable_idx_changed)
        varbox.layout().addWidget(self.varview)
        gui.separator(varbox, 8, 8)
        gui.comboBox(
            varbox, self, "cont_est_type",
            label="Show continuous variables by",
            valueType=int,
            items=[
                "Histograms", "Average shifted histograms",
                "Kernel density estimators"
            ],
            callback=self._on_cont_est_type_changed)

        box = gui.widgetBox(self.controlArea, "Group by")
        self.groupvarview = QtGui.QListView(
            selectionMode=QtGui.QListView.SingleSelection)
        self.groupvarview.setFixedHeight(100)
        self.groupvarview.setSizePolicy(
            QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred)
        self.groupvarview.setModel(self.groupvarmodel)
        self.groupvarview.selectionModel().selectionChanged.connect(
            self._on_groupvar_idx_changed)
        box.layout().addWidget(self.groupvarview)
        self.cb_rel_freq = gui.checkBox(
            box, self, "relative_freq", "Show relative frequencies",
            callback=self._on_relative_freq_changed)

        plotview = pg.PlotWidget(background=None)
        self.mainArea.layout().addWidget(plotview)
        w = QtGui.QLabel()
        w.setSizePolicy(QtGui.QSizePolicy.Expanding, QtGui.QSizePolicy.Fixed)
        self.mainArea.layout().addWidget(w, Qt.AlignCenter)

        self.plot = pg.PlotItem()
        # self.plot.getViewBox().setMouseEnabled(False, False)
        self.plot.getViewBox().setMenuEnabled(False)
        plotview.setCentralItem(self.plot)

        # Draw the axes with the palette's text color.
        pen = QtGui.QPen(self.palette().color(QtGui.QPalette.Text))
        for axis in ("left", "bottom"):
            self.plot.getAxis(axis).setPen(pen)

    def set_data(self, data):
        """Input signal handler: reset the widget and show `data`."""
        self.closeContext()
        self.clear()
        self.data = data
        if self.data is not None:
            domain = self.data.domain
            self.varmodel[:] = list(domain)
            self.groupvarmodel[:] = \
                ["(None)"] + [var for var in domain if var.is_discrete]
            # Group by the class variable by default; a matching saved
            # context (opened next) may override this.
            if domain.has_discrete_class:
                self.groupvar_idx = \
                    list(self.groupvarmodel).index(domain.class_var)
            self.openContext(domain)
            # Clamp the (possibly restored) indices to the valid range.
            self.variable_idx = min(max(self.variable_idx, 0),
                                    len(self.varmodel) - 1)
            self.groupvar_idx = min(max(self.groupvar_idx, 0),
                                    len(self.groupvarmodel) - 1)
            itemmodels.select_row(self.groupvarview, self.groupvar_idx)
            itemmodels.select_row(self.varview, self.variable_idx)
            self._setup()

    def clear(self):
        """Remove plot items and reset models and selection indices."""
        self.plot.clear()
        self.varmodel[:] = []
        self.groupvarmodel[:] = []
        self.variable_idx = -1
        self.groupvar_idx = 0

    def _setup(self):
        # Resolve the selected variables and (re)compute and draw either
        # the contingency (when grouped) or the plain distribution.
        self.plot.clear()

        varidx = self.variable_idx
        self.var = self.cvar = None
        if varidx >= 0:
            self.var = self.varmodel[varidx]
        # Index 0 of the group model is the "(None)" placeholder.
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if self.var is None:
            return
        if self.cvar:
            self.contingencies = \
                contingency.get_contingency(self.data, self.var, self.cvar)
            self.display_contingency()
        else:
            self.distributions = \
                distribution.get_distribution(self.data, self.var)
            self.display_distribution()

    def _density_estimator(self):
        """Return a callable mapping a distribution to `(edges, curve)`
        according to `cont_est_type`."""
        if self.cont_est_type == OWDistributions.Hist:
            def hist(dist):
                h, edges = numpy.histogram(dist[0, :], bins=10,
                                           weights=dist[1, :])
                return edges, h
            return hist
        elif self.cont_est_type == OWDistributions.ASH:
            return lambda dist: ash_curve(dist, m=5)
        elif self.cont_est_type == OWDistributions.Kernel:
            return rect_kernel_curve

    def display_distribution(self):
        """Draw `self.distributions` for the current variable."""
        dist = self.distributions
        var = self.var
        assert len(dist) > 0
        self.plot.clear()

        bottomaxis = self.plot.getAxis("bottom")
        bottomaxis.setLabel(var.name)

        self.set_left_axis_name()
        if var and var.is_continuous:
            bottomaxis.setTicks(None)
            curve_est = self._density_estimator()
            edges, curve = curve_est(dist)
            item = pg.PlotCurveItem()
            item.setData(edges, curve, antialias=True, stepMode=True,
                         fillLevel=0, brush=QtGui.QBrush(Qt.gray),
                         pen=QtGui.QColor(Qt.white))
            self.plot.addItem(item)
        else:
            bottomaxis.setTicks([list(enumerate(var.values))])
            # One gray bar per value, centered on the value's index.
            for i, w in enumerate(dist):
                geom = QtCore.QRectF(i - 0.33, 0, 0.66, w)
                item = DistributionBarItem(geom, [1.0],
                                           [QtGui.QColor(128, 128, 128)])
                self.plot.addItem(item)

    def _on_relative_freq_changed(self):
        self.set_left_axis_name()
        if self.cvar and self.cvar.is_discrete:
            self.display_contingency()
        else:
            self.display_distribution()

    def display_contingency(self):
        """
        Set the contingency to display.
        """
        cont = self.contingencies
        var, cvar = self.var, self.cvar
        assert len(cont) > 0
        self.plot.clear()

        bottomaxis = self.plot.getAxis("bottom")
        bottomaxis.setLabel(var.name)

        palette = colorpalette.ColorPaletteGenerator(len(cvar.values))
        colors = [palette[i] for i in range(len(cvar.values))]

        if var and var.is_continuous:
            bottomaxis.setTicks(None)

            # Scale each group's curve by the group's share of the weight.
            weights = numpy.array([numpy.sum(W) for _, W in cont])
            weights /= numpy.sum(weights)

            curve_est = self._density_estimator()
            curves = [curve_est(dist) for dist in cont]
            curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)]

            # Stack the curves: each one is summed onto the previous total.
            cum_curves = [curves[0]]
            for X, Y in curves[1:]:
                cum_curves.append(sum_rect_curve(X, Y, *cum_curves[-1]))

            # Draw in reverse so that the smaller totals end up on top.
            for (X, Y), color in reversed(list(zip(cum_curves, colors))):
                item = pg.PlotCurveItem()
                pen = QtGui.QPen(QtGui.QBrush(Qt.white), 0.5)
                pen.setCosmetic(True)
                item.setData(X, Y, antialias=True, stepMode=True,
                             fillLevel=0, brush=QtGui.QBrush(color.lighter()),
                             pen=pen)
                self.plot.addItem(item)

        elif var and var.is_discrete:
            bottomaxis.setTicks([list(enumerate(var.values))])

            cont = numpy.array(cont)
            for i, (_value, dist) in enumerate(zip(var.values, cont.T)):
                dsum = sum(dist)
                geom = QtCore.QRectF(i - 0.333, 0, 0.666,
                                     100 if self.relative_freq else dsum)
                item = DistributionBarItem(geom, dist / dsum, colors)
                self.plot.addItem(item)

    def set_left_axis_name(self):
        """Label the left axis to match what is currently plotted."""
        set_label = self.plot.getAxis("left").setLabel
        if (self.var and self.var.is_continuous and
                self.cont_est_type != OWDistributions.Hist):
            set_label("Density")
        else:
            set_label(["Frequency", "Relative frequency"]
                      [self.cvar is not None and self.relative_freq])

    def enable_disable_rel_freq(self):
        # Relative frequencies are only offered for a discrete variable
        # with a group variable selected.
        self.cb_rel_freq.setDisabled(
            self.var is None or self.cvar is None or self.var.is_continuous)

    def _on_variable_idx_changed(self):
        self.variable_idx = selected_index(self.varview)
        self._setup()

    def _on_groupvar_idx_changed(self):
        self.groupvar_idx = selected_index(self.groupvarview)
        self._setup()

    def _on_cont_est_type_changed(self):
        self.set_left_axis_name()
        if self.data is not None:
            self._setup()

    def onDeleteWidget(self):
        self.plot.clear()
        super().onDeleteWidget()
class OWDistributions(widget.OWWidget):
    """Widget that plots the value distribution of one selected variable.

    Continuous variables are drawn as smoothed curves (or binned when
    `disc_cont` is set), discrete ones as bars.  A discrete "group by"
    variable can be chosen, in which case per-group curves/bars are drawn
    and, optionally, probabilities on a second (right) axis.
    """

    name = "Distributions"
    description = "Display value distributions of a data feature in a graph."
    icon = "icons/Distribution.svg"
    priority = 100
    inputs = [
        InputSignal("Data", Orange.data.Table, "set_data",
                    doc="Set the input data set")
    ]

    settingsHandler = settings.DomainContextHandler(
        match_values=settings.DomainContextHandler.MATCH_VALUES_ALL)
    #: Selected variable index
    variable_idx = settings.ContextSetting(-1)
    #: Selected group variable
    groupvar_idx = settings.ContextSetting(0)

    relative_freq = settings.Setting(False)
    disc_cont = settings.Setting(False)

    # `smoothing_index` indexes both `bins` and `smoothing_facs` below.
    smoothing_index = settings.Setting(5)
    show_prob = settings.ContextSetting(0)

    graph_name = "plot"

    ASH_HIST = 50

    bins = [2, 3, 4, 5, 8, 10, 12, 15, 20, 30, 50]
    smoothing_facs = list(
        reversed([0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 4, 6, 10]))

    def __init__(self):
        super().__init__()
        self.data = None

        self.distributions = None
        self.contingencies = None
        self.var = self.cvar = None
        varbox = gui.widgetBox(self.controlArea, "Variable")

        self.varmodel = itemmodels.VariableListModel()
        self.groupvarmodel = []

        self.varview = QtGui.QListView(
            selectionMode=QtGui.QListView.SingleSelection)
        self.varview.setSizePolicy(
            QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding)
        self.varview.setModel(self.varmodel)
        self.varview.setSelectionModel(
            itemmodels.ListSingleSelectionModel(self.varmodel))
        self.varview.selectionModel().selectionChanged.connect(
            self._on_variable_idx_changed)
        varbox.layout().addWidget(self.varview)

        box = gui.widgetBox(self.controlArea, "Precision")

        gui.separator(self.controlArea, 4, 4)

        box2 = gui.widgetBox(box, orientation="horizontal")
        self.l_smoothing_l = gui.widgetLabel(box2, "Smooth")
        gui.hSlider(box2, self, "smoothing_index",
                    minValue=0, maxValue=len(self.smoothing_facs) - 1,
                    callback=self._on_set_smoothing, createLabel=False)
        self.l_smoothing_r = gui.widgetLabel(box2, "Precise")

        self.cb_disc_cont = gui.checkBox(
            gui.indentedBox(box, sep=4), self, "disc_cont",
            "Bin continuous variables",
            callback=self._on_groupvar_idx_changed,
            tooltip="Show continuous variables as discrete.")

        box = gui.widgetBox(self.controlArea, "Group by")
        self.icons = gui.attributeIconDict
        self.groupvarview = gui.comboBox(
            box, self, "groupvar_idx",
            callback=self._on_groupvar_idx_changed,
            valueType=str, contentsLength=12)
        box2 = gui.indentedBox(box, sep=4)
        self.cb_rel_freq = gui.checkBox(
            box2, self, "relative_freq", "Show relative frequencies",
            callback=self._on_relative_freq_changed,
            tooltip="Normalize probabilities so that probabilities "
                    "for each group-by value sum to 1.")
        gui.separator(box2)
        self.cb_prob = gui.comboBox(
            box2, self, "show_prob", label="Show probabilities",
            orientation="horizontal",
            callback=self._on_relative_freq_changed,
            tooltip="Show probabilities for a chosen group-by value "
                    "(at each point probabilities for all group-by values "
                    "sum to 1).")

        self.plotview = pg.PlotWidget(background=None)
        self.plotview.setRenderHint(QtGui.QPainter.Antialiasing)
        self.mainArea.layout().addWidget(self.plotview)
        w = QtGui.QLabel()
        w.setSizePolicy(QtGui.QSizePolicy.Expanding, QtGui.QSizePolicy.Fixed)
        self.mainArea.layout().addWidget(w, Qt.AlignCenter)
        self.ploti = pg.PlotItem()
        self.plot = self.ploti.vb
        self.ploti.hideButtons()
        self.plotview.setCentralItem(self.ploti)

        # Second view box, overlaid on the main one and tied to the right
        # axis; it holds the probability items.
        self.plot_prob = pg.ViewBox()
        self.ploti.hideAxis('right')
        self.ploti.scene().addItem(self.plot_prob)
        self.ploti.getAxis("right").linkToView(self.plot_prob)
        self.ploti.getAxis("right").setLabel("Probability")
        self.plot_prob.setZValue(10)
        self.plot_prob.setXLink(self.ploti)
        self.update_views()
        self.ploti.vb.sigResized.connect(self.update_views)
        self.plot_prob.setRange(yRange=[0, 1])

        self.inline_graph_report()

        def disable_mouse(plot):
            plot.setMouseEnabled(False, False)
            plot.setMenuEnabled(False)

        disable_mouse(self.plot)
        disable_mouse(self.plot_prob)

        #: `(view box, item)` pairs consulted by `help_event` for tooltips
        self.tooltip_items = []

        self.plot.scene().installEventFilter(
            HelpEventDelegate(self.help_event, self))

        # Draw the axes with the palette's text color.
        pen = QtGui.QPen(self.palette().color(QtGui.QPalette.Text))
        for axis in ("left", "bottom"):
            self.ploti.getAxis(axis).setPen(pen)

        self._legend = LegendItem()
        self._legend.setParentItem(self.plot)
        self._legend.hide()
        self._legend.anchor((1, 0), (1, 0))

    def update_views(self):
        """Keep the probability view box aligned with the main view box."""
        self.plot_prob.setGeometry(self.plot.sceneBoundingRect())
        self.plot_prob.linkedViewChanged(self.plot, self.plot_prob.XAxis)

    def set_data(self, data):
        """Input signal handler: reset the widget and show `data`."""
        self.closeContext()
        self.clear()
        self.data = data
        if self.data is not None:
            domain = self.data.domain
            self.varmodel[:] = list(domain)

            self.groupvarview.clear()
            self.groupvarmodel = \
                ["(None)"] + [var for var in domain if var.is_discrete]
            self.groupvarview.addItem("(None)")
            for var in self.groupvarmodel[1:]:
                self.groupvarview.addItem(self.icons[var], var.name)
            # Group by the class variable by default; a matching saved
            # context (opened next) may override this.
            if domain.has_discrete_class:
                self.groupvar_idx = \
                    self.groupvarmodel[1:].index(domain.class_var) + 1
            self.openContext(domain)
            # Clamp the (possibly restored) indices to the valid range.
            self.variable_idx = min(max(self.variable_idx, 0),
                                    len(self.varmodel) - 1)
            self.groupvar_idx = min(max(self.groupvar_idx, 0),
                                    len(self.groupvarmodel) - 1)
            itemmodels.select_row(self.varview, self.variable_idx)
            self._setup()

    def clear(self):
        """Remove plot items and reset models, indices and the legend."""
        self.plot.clear()
        self.plot_prob.clear()
        self.varmodel[:] = []
        self.groupvarmodel = []
        self.variable_idx = -1
        self.groupvar_idx = 0
        self._legend.clear()
        self._legend.hide()

    def _setup_smoothing(self):
        # Relabel the precision controls: smoothing labels for a continuous
        # variable that is not binned, bin counts otherwise.
        if not self.disc_cont and self.var and self.var.is_continuous:
            self.cb_disc_cont.setText("Bin continuous variables")
            self.l_smoothing_l.setText("Smooth")
            self.l_smoothing_r.setText("Precise")
        else:
            self.cb_disc_cont.setText(
                "Bin continuous variables into {} bins".format(
                    self.bins[self.smoothing_index]))
            self.l_smoothing_l.setText(" " + str(self.bins[0]))
            self.l_smoothing_r.setText(" " + str(self.bins[-1]))

    def _setup(self):
        # Resolve the selected variables and (re)compute and draw either
        # the contingency (when grouped) or the plain distribution.
        self.plot.clear()
        self.plot_prob.clear()
        self._legend.clear()
        self._legend.hide()

        varidx = self.variable_idx
        self.var = self.cvar = None
        if varidx >= 0:
            self.var = self.varmodel[varidx]
        # Index 0 of the group model is the "(None)" placeholder.
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            self.cb_prob.clear()
            self.cb_prob.addItem("(None)")
            self.cb_prob.addItems(self.cvar.values)
            self.cb_prob.addItem("(All)")
            self.show_prob = min(max(self.show_prob, 0),
                                 len(self.cvar.values) + 1)
        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return
        if self.disc_cont:
            # Work on a discretized copy holding just the needed columns.
            data = self.data[:, (self.var, self.cvar)
                             if self.cvar else self.var]
            disc = Orange.preprocess.discretize.EqualWidth(
                n=self.bins[self.smoothing_index])
            data = Orange.preprocess.Discretize(
                data, method=disc, remove_const=False)
            self.var = data.domain[0]
        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if self.cvar:
            self.contingencies = \
                contingency.get_contingency(data, self.var, self.cvar)
            self.display_contingency()
        else:
            self.distributions = \
                distribution.get_distribution(data, self.var)
            self.display_distribution()
        self.plot.autoRange()

    def help_event(self, ev):
        """Show the tooltips of all registered items under the cursor.

        Returns True when a tooltip was shown, False otherwise.
        """
        ctooltip = []
        for vb, item in self.tooltip_items:
            if isinstance(item, pg.PlotCurveItem) and \
                    item.mouseShape().contains(
                        vb.mapSceneToView(ev.scenePos())):
                ctooltip.append(item.tooltip)
            elif isinstance(item, DistributionBarItem) and \
                    item.boundingRect().contains(
                        vb.mapSceneToView(ev.scenePos())):
                ctooltip.append(item.tooltip)
        if ctooltip:
            QToolTip.showText(ev.screenPos(), "\n\n".join(ctooltip),
                              widget=self.plotview)
            return True
        return False

    def display_distribution(self):
        """Draw `self.distributions` for the current variable."""
        dist = self.distributions
        var = self.var
        assert len(dist) > 0
        self.plot.clear()
        self.plot_prob.clear()
        self.ploti.hideAxis('right')
        self.tooltip_items = []

        bottomaxis = self.ploti.getAxis("bottom")
        bottomaxis.setLabel(var.name)
        bottomaxis.resizeEvent()

        self.set_left_axis_name()
        if var and var.is_continuous:
            bottomaxis.setTicks(None)
            if not len(dist[0]):
                return
            edges, curve = ash_curve(
                dist, None, m=OWDistributions.ASH_HIST,
                smoothing_factor=self.smoothing_facs[self.smoothing_index])
            # Shift the edges by half a step and drop the last one so the
            # curve is drawn as a line rather than as steps.
            edges = edges + (edges[1] - edges[0]) / 2
            edges = edges[:-1]
            item = pg.PlotCurveItem()
            pen = QtGui.QPen(QtGui.QBrush(Qt.white), 3)
            pen.setCosmetic(True)
            item.setData(edges, curve, antialias=True, stepMode=False,
                         fillLevel=0, brush=QtGui.QBrush(Qt.gray), pen=pen)
            self.plot.addItem(item)
            item.tooltip = "Density"
            self.tooltip_items.append((self.plot, item))
        else:
            bottomaxis.setTicks([list(enumerate(var.values))])
            for i, w in enumerate(dist):
                geom = QtCore.QRectF(i - 0.33, 0, 0.66, w)
                item = DistributionBarItem(geom, [1.0],
                                           [QtGui.QColor(128, 128, 128)])
                self.plot.addItem(item)
                item.tooltip = "Frequency for %s: %r" % (var.values[i], w)
                self.tooltip_items.append((self.plot, item))

    def _on_relative_freq_changed(self):
        self.set_left_axis_name()
        if self.cvar and self.cvar.is_discrete:
            self.display_contingency()
        else:
            self.display_distribution()
        self.plot.autoRange()

    def display_contingency(self):
        """
        Set the contingency to display.
        """
        cont = self.contingencies
        var, cvar = self.var, self.cvar
        assert len(cont) > 0
        self.plot.clear()
        self.plot_prob.clear()
        self._legend.clear()
        self.tooltip_items = []

        if self.show_prob:
            self.ploti.showAxis('right')
        else:
            self.ploti.hideAxis('right')

        bottomaxis = self.ploti.getAxis("bottom")
        bottomaxis.setLabel(var.name)
        bottomaxis.resizeEvent()

        cvar_values = cvar.values
        colors = [QtGui.QColor(*col) for col in cvar.colors]

        if var and var.is_continuous:
            bottomaxis.setTicks(None)

            # Keep only the group values that have data for this variable.
            weights, cols, cvar_values, curves = [], [], [], []
            for i, dist in enumerate(cont):
                v, W = dist
                if len(v):
                    weights.append(numpy.sum(W))
                    cols.append(colors[i])
                    cvar_values.append(cvar.values[i])
                    curves.append(ash_curve(
                        dist, cont, m=OWDistributions.ASH_HIST,
                        smoothing_factor=self.smoothing_facs[
                            self.smoothing_index]))
            weights = numpy.array(weights)
            sumw = numpy.sum(weights)
            weights /= sumw
            colors = cols
            curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)]

            curvesline = []  # from histograms to lines
            for (X, Y) in curves:
                X = X + (X[1] - X[0]) / 2
                X = X[:-1]
                X = numpy.array(X)
                Y = numpy.array(Y)
                curvesline.append((X, Y))

            # Two passes: translucent fills first, then the lines on top.
            for t in ["fill", "line"]:
                for (X, Y), color, w, cval in reversed(
                        list(zip(curvesline, colors, weights, cvar_values))):
                    item = pg.PlotCurveItem()
                    pen = QtGui.QPen(QtGui.QBrush(color), 3)
                    pen.setCosmetic(True)
                    color = QtGui.QColor(color)
                    color.setAlphaF(0.2)
                    item.setData(
                        X, Y / (w if self.relative_freq else 1),
                        antialias=True, stepMode=False,
                        fillLevel=0 if t == "fill" else None,
                        brush=QtGui.QBrush(color), pen=pen)
                    self.plot.addItem(item)
                    if t == "line":
                        item.tooltip = ("Normalized density "
                                        if self.relative_freq
                                        else "Density ") \
                            + "\n" + cvar.name + "=" + cval
                        self.tooltip_items.append((self.plot, item))

            # Without any curve there is nothing to interpolate;
            # numpy.hstack would raise on the empty list.
            if self.show_prob and curvesline:
                all_X = numpy.array(
                    numpy.unique(numpy.hstack([X for X, _ in curvesline])))
                inter_X = numpy.array(
                    numpy.linspace(all_X[0], all_X[-1], len(all_X) * 2))
                curvesinterp = [numpy.interp(inter_X, X, Y)
                                for (X, Y) in curvesline]
                sumprob = numpy.sum(curvesinterp, axis=0)
                # Only plot where the total density is not negligible.
                legal = sumprob > 0.05 * numpy.max(sumprob)

                i = len(curvesinterp) + 1
                show_all = self.show_prob == i
                for Y, color, cval in reversed(
                        list(zip(curvesinterp, colors, cvar_values))):
                    i -= 1
                    if show_all or self.show_prob == i:
                        item = pg.PlotCurveItem()
                        pen = QtGui.QPen(QtGui.QBrush(color), 3,
                                         style=QtCore.Qt.DotLine)
                        pen.setCosmetic(True)
                        prob = Y[legal] / sumprob[legal]
                        item.setData(
                            inter_X[legal], prob,
                            antialias=True, stepMode=False,
                            fillLevel=None, brush=None, pen=pen)
                        self.plot_prob.addItem(item)
                        item.tooltip = "Probability that \n" \
                            + cvar.name + "=" + cval
                        self.tooltip_items.append((self.plot_prob, item))

        elif var and var.is_discrete:
            bottomaxis.setTicks([list(enumerate(var.values))])

            cont = numpy.array(cont)

            maxh = 0  # maximal column height
            maxrh = 0  # maximal relative column height
            scvar = cont.sum(axis=1)
            # a cvar with sum=0 will always have distribution counts 0,
            # therefore we can divide it by anything
            scvar[scvar == 0] = 1
            for _, dist in zip(var.values, cont.T):
                maxh = max(maxh, max(dist))
                maxrh = max(maxrh, max(dist / scvar))

            for i, (value, dist) in enumerate(zip(var.values, cont.T)):
                dsum = sum(dist)
                geom = QtCore.QRectF(
                    i - 0.333, 0, 0.666,
                    maxrh if self.relative_freq else maxh)
                if self.relative_freq:
                    heights = dist / scvar / maxrh
                else:
                    heights = dist / maxh
                item = DistributionBarItem(geom, heights, colors)
                self.plot.addItem(item)
                tooltip = "\n".join(
                    "%s: %.*f" % (n, 3 if self.relative_freq else 1, v)
                    for n, v in zip(
                        cvar_values,
                        dist / scvar if self.relative_freq else dist))
                item.tooltip = ("Normalized frequency "
                                if self.relative_freq
                                else "Frequency ") \
                    + "(" + cvar.name + "=" + value + "):" \
                    + "\n" + tooltip
                self.tooltip_items.append((self.plot, item))

                if self.show_prob:
                    item.tooltip += "\n\nProbabilities:"
                    for ic, a in enumerate(dist):
                        # Show only the chosen group value, or every value
                        # when "(All)" (the last entry) is selected.
                        if self.show_prob - 1 != ic and \
                                self.show_prob - 1 != len(dist):
                            continue
                        position = -0.333 + ((ic + 0.5) * 0.666 / len(dist))
                        if dsum < 1e-6:
                            continue
                        prob = a / dsum
                        if not 1e-6 < prob < 1 - 1e-6:
                            continue
                        ci = 1.96 * numpy.sqrt(prob * (1 - prob) / dsum)
                        item.tooltip += "\n%s: %.3f ± %.3f" % (
                            cvar_values[ic], prob, ci)
                        mark = pg.ScatterPlotItem()
                        bar = pg.ErrorBarItem()
                        pen = QtGui.QPen(QtGui.QBrush(QtGui.QColor(0)), 1)
                        pen.setCosmetic(True)
                        # The error bar is clipped so it stays in [0, 1].
                        bar.setData(x=[i + position], y=[prob],
                                    bottom=min(numpy.array([ci]), prob),
                                    top=min(numpy.array([ci]), 1 - prob),
                                    beam=numpy.array([0.05]),
                                    brush=QtGui.QColor(1), pen=pen)
                        mark.setData([i + position], [prob],
                                     antialias=True, symbol="o",
                                     fillLevel=None, pxMode=True, size=10,
                                     brush=QtGui.QColor(colors[ic]), pen=pen)
                        self.plot_prob.addItem(bar)
                        self.plot_prob.addItem(mark)

        for color, name in zip(colors, cvar_values):
            self._legend.addItem(
                ScatterPlotItem(pen=color, brush=color, size=10, shape="s"),
                escape(name))
        self._legend.show()

    def set_left_axis_name(self):
        """Label the left axis to match what is currently plotted."""
        leftaxis = self.ploti.getAxis("left")
        set_label = leftaxis.setLabel
        if self.var and self.var.is_continuous:
            set_label(["Density", "Relative density"]
                      [self.cvar is not None and self.relative_freq])
        else:
            set_label(["Frequency", "Relative frequency"]
                      [self.cvar is not None and self.relative_freq])
        leftaxis.resizeEvent()

    def enable_disable_rel_freq(self):
        # Both group-dependent controls need a variable and a group variable.
        self.cb_prob.setDisabled(self.var is None or self.cvar is None)
        self.cb_rel_freq.setDisabled(self.var is None or self.cvar is None)

    def _on_variable_idx_changed(self):
        self.variable_idx = selected_index(self.varview)
        self._setup()

    def _on_groupvar_idx_changed(self):
        self._setup()

    def _on_set_smoothing(self):
        self._setup()

    def onDeleteWidget(self):
        self.plot.clear()
        super().onDeleteWidget()

    def get_widget_name_extension(self):
        if self.variable_idx >= 0:
            return self.varmodel[self.variable_idx]

    def send_report(self):
        if self.variable_idx < 0:
            return
        self.report_plot()
        text = "Distribution of '{}'".format(
            self.varmodel[self.variable_idx])
        if self.groupvar_idx:
            group_var = self.groupvarmodel[self.groupvar_idx]
            prob = self.cb_prob
            # True when a single group value (neither "(None)" nor
            # "(All)") is selected in the probabilities combo box.
            indiv_probs = 0 < prob.currentIndex() < prob.count() - 1
            if not indiv_probs or self.relative_freq:
                text += " grouped by '{}'".format(group_var)
                if self.relative_freq:
                    text += " (relative frequencies)"
            if indiv_probs:
                text += "; probabilites for '{}={}'".format(
                    group_var, prob.currentText())
        self.report_caption(text)
class OWDiscretize(widget.OWWidget):
    """
    Widget that discretizes the continuous variables of a data table.

    The "Data" input is routed to :meth:`set_data`; the "Data" output is
    the input table converted to the domain built by
    :meth:`discretized_domain`.

    Every continuous variable listed in ``varmodel`` has a ``DState``
    record in ``var_state`` (keyed by the variable's row in the model)
    carrying ``method``, ``points`` and ``disc_var`` fields.  A state whose
    ``points`` and ``disc_var`` are both ``None`` has not been computed
    yet; :meth:`_update_points` fills it in.
    """
    name = "Discretize"
    description = "Discretize the continuous data features."
    icon = "icons/Discretize.svg"
    inputs = [InputSignal("Data", Orange.data.Table, "set_data",
                          doc="Input data table")]
    outputs = [OutputSignal("Data", Orange.data.Table,
                            doc="Table with discretized features")]

    settingsHandler = settings.DomainContextHandler()
    saved_var_states = settings.ContextSetting({})

    # NOTE: `default_method` indexes the radio buttons of the default box,
    # which has no "Default" entry, so it is one less than the matching
    # method constant below (see `_current_default_method`).
    default_method = settings.Setting(2)
    default_k = settings.Setting(3)
    autosend = settings.Setting(True)

    #: Discretization methods
    Default, Leave, MDL, EqualFreq, EqualWidth, Remove, Custom = range(7)

    want_main_area = False

    def __init__(self):
        """
        Build the control area.

        Creates the "Default Discretization" radio group, the
        "Individual Attribute Settings" box (variable list view plus a
        per-variable radio group) and the auto-commit controls.  The order
        in which radio buttons are appended defines the integer values of
        ``default_method`` and ``method``, so it must not be changed.
        """
        super().__init__()

        #: input data
        self.data = None
        #: Current variable discretization state
        self.var_state = {}
        #: Saved variable discretization settings (context setting)
        self.saved_var_states = {}

        self.method = 0
        self.k = 5

        box = gui.widgetBox(
            self.controlArea, self.tr("Default Discretization"))
        self.default_bbox = rbox = gui.radioButtons(
            box, self, "default_method", callback=self._default_disc_changed)
        options = [
            self.tr("Default"),
            self.tr("Leave continuous"),
            self.tr("Entropy-MDL discretization"),
            self.tr("Equal-frequency discretization"),
            self.tr("Equal-width discretization"),
            self.tr("Remove continuous attributes")
        ]

        # The default box skips the "Default" option itself.
        for opt in options[1:5]:
            gui.appendRadioButton(rbox, opt)

        s = gui.hSlider(gui.indentedBox(rbox),
                        self, "default_k", minValue=2, maxValue=10,
                        label="Num. of intervals:",
                        callback=self._default_disc_changed)
        s.setTracking(False)

        gui.appendRadioButton(rbox, options[-1])

        vlayout = QHBoxLayout()
        box = gui.widgetBox(
            self.controlArea, "Individual Attribute Settings",
            orientation=vlayout, spacing=8
        )

        # List view with all attributes
        self.varview = QListView(selectionMode=QListView.ExtendedSelection)
        self.varview.setItemDelegate(DiscDelegate())
        self.varmodel = itemmodels.VariableListModel()
        self.varview.setModel(self.varmodel)
        self.varview.selectionModel().selectionChanged.connect(
            self._var_selection_changed
        )

        vlayout.addWidget(self.varview)
        # Controls for individual attr settings
        self.bbox = controlbox = gui.radioButtons(
            box, self, "method", callback=self._disc_method_changed
        )
        vlayout.addWidget(controlbox)

        for opt in options[:5]:
            gui.appendRadioButton(controlbox, opt)

        s = gui.hSlider(gui.indentedBox(controlbox),
                        self, "k", minValue=2, maxValue=10,
                        label="Num. of intervals:",
                        callback=self._disc_method_changed)
        s.setTracking(False)

        gui.appendRadioButton(controlbox, "Remove attribute")

        gui.rubber(controlbox)
        # Nothing is selected in the list view yet.
        controlbox.setEnabled(False)

        self.controlbox = controlbox

        gui.auto_commit(self.controlArea, self, "autosend", "Apply",
                        orientation="horizontal",
                        checkbox_label="Send data after every change")

    def set_data(self, data):
        """
        Input handler for the "Data" signal.

        Closes the current settings context, stores `data`, and either
        initializes the widget for it (restoring saved per-variable
        settings and computing the cut points) or clears the widget when
        `data` is None.  The output is always re-sent afterwards.
        """
        self.closeContext()
        self.data = data
        if self.data is not None:
            self._initialize(data)
            self.openContext(data)
            # Restore the per variable discretization settings
            self._restore(self.saved_var_states)
            # Complete the induction of cut points
            self._update_points()
        else:
            self._clear()
        self.unconditional_commit()

    def _initialize(self, data):
        """
        Initialize the default variable states for new data.

        Fills ``varmodel`` with the continuous variables of the domain and
        enables the MDL buttons only when the domain has a discrete class.
        """
        self.class_var = data.domain.class_var
        cvars = [var for var in data.domain if var.is_continuous]
        self.varmodel[:] = cvars

        has_disc_class = data.domain.has_discrete_class

        # The default box has no "Default" button, hence the `- 1`.
        self.default_bbox.buttons[self.MDL - 1].setEnabled(has_disc_class)
        self.bbox.buttons[self.MDL].setEnabled(has_disc_class)

        # If the newly disabled MDL button is checked then change it
        if not has_disc_class and self.default_method == self.MDL - 1:
            self.default_method = 0
        if not has_disc_class and self.method == self.MDL:
            self.method = 0

        # Reset (initialize) the variable discretization states.
        self._reset()

    def _restore(self, saved_state):
        """
        Restore variable states from a `saved_state` dictionary.

        `saved_state` maps ``variable_key(var)`` to a state record.  A
        saved state whose method is ``Default`` is rebuilt around the
        *current* default method instead of the stored one.
        """
        def_method = self._current_default_method()
        for i, var in enumerate(self.varmodel):
            key = variable_key(var)
            if key in saved_state:
                state = saved_state[key]
                if isinstance(state.method, Default):
                    state = DState(Default(def_method), None, None)
                self._set_var_state(i, state)

    def _reset(self):
        """
        Restore the individual variable settings back to defaults.

        Every variable in ``varmodel`` gets a fresh, not yet computed
        state that follows the current default method.
        """
        def_method = self._current_default_method()
        self.var_state = {}
        for i in range(len(self.varmodel)):
            state = DState(Default(def_method), None, None)
            self._set_var_state(i, state)

    def _set_var_state(self, index, state):
        """
        Set the state of the variable at row `index` to `state`.

        The state is stored in ``var_state`` and also written into the
        list model under ``Qt.UserRole``.
        """
        self.var_state[index] = state
        self.varmodel.setData(self.varmodel.index(index), state, Qt.UserRole)

    def _clear(self):
        """
        Drop the data and all per-variable state and re-enable the MDL
        buttons in both radio groups.
        """
        self.data = None
        self.varmodel[:] = []
        self.var_state = {}
        self.saved_var_states = {}
        self.default_bbox.buttons[self.MDL - 1].setEnabled(True)
        self.bbox.buttons[self.MDL].setEnabled(True)

    def _update_points(self):
        """
        Update the induced cut points.

        Computes ``points`` and ``disc_var`` for every variable whose state
        has neither set yet, then calls :meth:`commit`.
        """
        def induce_cuts(method, data, var):
            """
            Apply `method` to `var` and return a ``(points, disc_var)``
            pair: ``([], None)`` when the variable is removed,
            ``(None, var)`` when it is left untouched, and the cut points
            with the new variable when it was discretized.
            """
            dvar = _dispatch[type(method)](method, data, var)
            if dvar is None:
                # removed
                return [], None
            elif dvar is var:
                # no transformation took place
                return None, var
            elif is_discretized(dvar):
                return dvar.compute_value.points, dvar
            else:
                # Explicit raise: a bare `assert False` is stripped under
                # `python -O` and would silently return None here.
                raise AssertionError(
                    "unexpected discretization result: %r" % (dvar,))

        for i, var in enumerate(self.varmodel):
            state = self.var_state[i]
            if state.points is None and state.disc_var is None:
                points, dvar = induce_cuts(state.method, self.data, var)
                new_state = state._replace(points=points, disc_var=dvar)
                self._set_var_state(i, new_state)

        self.commit()

    def _method_index(self, method):
        """
        Return the position of `method`'s type in the ``METHODS`` sequence
        (defined outside this class).
        """
        return METHODS.index((type(method), ))

    def _current_default_method(self):
        """
        Return the method object selected in the default radio group.

        ``default_method`` is shifted by one to line up with the method
        constants, because the default group has no "Default" button.
        """
        method = self.default_method + 1
        k = self.default_k
        if method == OWDiscretize.Leave:
            def_method = Leave()
        elif method == OWDiscretize.MDL:
            def_method = MDL()
        elif method == OWDiscretize.EqualFreq:
            def_method = EqualFreq(k)
        elif method == OWDiscretize.EqualWidth:
            def_method = EqualWidth(k)
        elif method == OWDiscretize.Remove:
            def_method = Remove()
        else:
            # Explicit raise: `assert False` vanishes under `python -O`,
            # which would turn this into an UnboundLocalError below.
            raise AssertionError(
                "invalid default method index: %r" % (self.default_method,))
        return def_method

    def _current_method(self):
        """
        Return the method object selected in the per-variable radio group.
        """
        if self.method == OWDiscretize.Default:
            method = Default(self._current_default_method())
        elif self.method == OWDiscretize.Leave:
            method = Leave()
        elif self.method == OWDiscretize.MDL:
            method = MDL()
        elif self.method == OWDiscretize.EqualFreq:
            method = EqualFreq(self.k)
        elif self.method == OWDiscretize.EqualWidth:
            method = EqualWidth(self.k)
        elif self.method == OWDiscretize.Remove:
            method = Remove()
        elif self.method == OWDiscretize.Custom:
            # NOTE(review): in the code visible here `self.cutpoints` is
            # only assigned in `_var_selection_changed` -- confirm it is
            # always set before this branch can be reached.
            method = Custom(self.cutpoints)
        else:
            # Explicit raise: `assert False` vanishes under `python -O`,
            # which would turn this into an UnboundLocalError below.
            raise AssertionError(
                "invalid method index: %r" % (self.method,))
        return method

    def _default_disc_changed(self):
        """
        Callback of the default radio group and its slider.

        Resets every variable that follows the default method to a fresh,
        not yet computed state and recomputes the cut points.
        """
        method = self._current_default_method()
        state = DState(Default(method), None, None)
        for i, _ in enumerate(self.varmodel):
            if isinstance(self.var_state[i].method, Default):
                self._set_var_state(i, state)
        self._update_points()

    def _disc_method_changed(self):
        """
        Callback of the per-variable radio group and its slider.

        Assigns the currently chosen method to all selected variables and
        recomputes the cut points.
        """
        indices = self.selected_indices()
        method = self._current_method()
        state = DState(method, None, None)
        for idx in indices:
            self._set_var_state(idx, state)
        self._update_points()

    def _var_selection_changed(self, *args):
        """
        Sync the per-variable controls with the list view selection.

        The control box is enabled only for a non-empty selection.  When
        all selected variables share one method, the radio group (and `k`
        or `cutpoints`) reflect it; otherwise the radio group is cleared.
        """
        indices = self.selected_indices()
        # set of all methods for the current selection
        methods = [self.var_state[i].method for i in indices]
        mset = set(methods)
        self.controlbox.setEnabled(len(mset) > 0)
        if len(mset) == 1:
            method = mset.pop()
            self.method = self._method_index(method)
            if isinstance(method, (EqualFreq, EqualWidth)):
                self.k = method.k
            elif isinstance(method, Custom):
                self.cutpoints = method.points
        else:
            # deselect the current button
            self.method = -1
            bg = self.controlbox.group
            button_group_reset(bg)

    def selected_indices(self):
        """Return the row numbers of the rows selected in the list view."""
        rows = self.varview.selectionModel().selectedRows()
        return [index.row() for index in rows]

    def discretized_var(self, source):
        """
        Return the output variable for the continuous variable `source`.

        Returns None when the variable has no ``disc_var`` or its state has
        an empty list of cut points, `source` itself when it was left
        unchanged, and the stored ``disc_var`` otherwise.
        """
        index = list(self.varmodel).index(source)
        state = self.var_state[index]
        if state.disc_var is None:
            return None
        elif state.disc_var is source:
            return source
        elif state.points == []:
            return None
        else:
            return state.disc_var

    def discretized_domain(self):
        """
        Return the current effective discretized domain.

        Returns None when there is no data.  Attributes that map to None
        are dropped; metas are carried over unchanged.
        """
        if self.data is None:
            return None

        def disc_var(source):
            # Only continuous variables are looked up; everything else
            # (including a missing class variable) passes through.
            if source and source.is_continuous:
                return self.discretized_var(source)
            else:
                return source

        attributes = [disc_var(v) for v in self.data.domain.attributes]
        attributes = [v for v in attributes if v is not None]

        class_var = disc_var(self.data.domain.class_var)

        domain = Orange.data.Domain(
            attributes, class_var,
            metas=self.data.domain.metas
        )
        return domain

    def commit(self):
        """
        Send the input data converted to the discretized domain on the
        "Data" output, or None when there is no input data.
        """
        output = None
        if self.data is not None:
            domain = self.discretized_domain()
            output = self.data.from_table(domain, self.data)
        self.send("Data", output)

    def storeSpecificSettings(self):
        """
        Snapshot the per-variable states into ``saved_var_states``.

        States are keyed by ``variable_key(var)`` and stored with `points`
        and `disc_var` cleared, so only the chosen method is kept.
        Presumably invoked by the settings handler when the context is
        stored -- confirm against the handler.
        """
        super().storeSpecificSettings()
        self.saved_var_states = {
            variable_key(var):
                self.var_state[i]._replace(points=None, disc_var=None)
            for i, var in enumerate(self.varmodel)
        }