class OWDistributions(widget.OWWidget): name = "Distributions" description = "Display value distributions of a data feature in a graph." icon = "icons/Distribution.svg" priority = 100 inputs = [InputSignal("Data", Orange.data.Table, "set_data", doc="Set the input data set")] settingsHandler = settings.DomainContextHandler( match_values=settings.DomainContextHandler.MATCH_VALUES_ALL) #: Selected variable index variable_idx = settings.ContextSetting(-1) #: Selected group variable groupvar_idx = settings.ContextSetting(0) relative_freq = settings.Setting(False) disc_cont = settings.Setting(False) smoothing_index = settings.Setting(5) show_prob = settings.ContextSetting(0) want_graph = True ASH_HIST = 50 bins = [ 2, 3, 4, 5, 8, 10, 12, 15, 20, 30, 50 ] smoothing_facs = list(reversed([ 0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 4, 6, 10 ])) def __init__(self): super().__init__() self.data = None self.distributions = None self.contingencies = None self.var = self.cvar = None varbox = gui.widgetBox(self.controlArea, "Variable") self.varmodel = itemmodels.VariableListModel() self.groupvarmodel = [] self.varview = QtGui.QListView( selectionMode=QtGui.QListView.SingleSelection) self.varview.setSizePolicy( QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding) self.varview.setModel(self.varmodel) self.varview.setSelectionModel( itemmodels.ListSingleSelectionModel(self.varmodel)) self.varview.selectionModel().selectionChanged.connect( self._on_variable_idx_changed) varbox.layout().addWidget(self.varview) box = gui.widgetBox(self.controlArea, "Precision") gui.separator(self.controlArea, 4, 4) box2 = gui.widgetBox(box, orientation="horizontal") self.l_smoothing_l = gui.widgetLabel(box2, "Smooth") gui.hSlider(box2, self, "smoothing_index", minValue=0, maxValue=len(self.smoothing_facs) - 1, callback=self._on_set_smoothing, createLabel=False) self.l_smoothing_r = gui.widgetLabel(box2, "Precise") self.cb_disc_cont = gui.checkBox( gui.indentedBox(box, sep=4), self, "disc_cont", "Bin continuous variables", callback=self._on_groupvar_idx_changed) box = gui.widgetBox(self.controlArea, "Group by") self.icons = gui.attributeIconDict self.groupvarview = gui.comboBox(box, self, "groupvar_idx", callback=self._on_groupvar_idx_changed, valueType=str, contentsLength=12) box2 = gui.indentedBox(box, sep=4) self.cb_rel_freq = gui.checkBox( box2, self, "relative_freq", "Show relative frequencies", callback=self._on_relative_freq_changed) gui.separator(box2) self.cb_prob = gui.comboBox( box2, self, "show_prob", label="Show probabilities", orientation="horizontal", callback=self._on_relative_freq_changed) plotview = pg.PlotWidget(background=None) self.mainArea.layout().addWidget(plotview) w = QtGui.QLabel() w.setSizePolicy(QtGui.QSizePolicy.Expanding, QtGui.QSizePolicy.Fixed) self.mainArea.layout().addWidget(w, Qt.AlignCenter) self.plot = pg.PlotItem() self.plot.hideButtons() plotview.setCentralItem(self.plot) self.plot_prob = pg.ViewBox() self.plot.hideAxis('right') self.plot.scene().addItem(self.plot_prob) self.plot.getAxis("right").linkToView(self.plot_prob) self.plot.getAxis("right").setLabel("Probability") self.plot_prob.setZValue(10) self.plot_prob.setXLink(self.plot) self.update_views() self.plot.vb.sigResized.connect(self.update_views) self.plot_prob.setRange(yRange=[0,1]) def disable_mouse(plot): plot.setMouseEnabled(False, False) plot.setMenuEnabled(False) disable_mouse(self.plot.getViewBox()) disable_mouse(self.plot_prob) pen = QtGui.QPen(self.palette().color(QtGui.QPalette.Text)) for axis in ("left", "bottom"): self.plot.getAxis(axis).setPen(pen) self._legend = LegendItem() self._legend.setParentItem(self.plot.getViewBox()) self._legend.hide() self._legend.anchor((1, 0), (1, 0)) self.graphButton.clicked.connect(self.save_graph) def update_views(self): self.plot_prob.setGeometry(self.plot.vb.sceneBoundingRect()) self.plot_prob.linkedViewChanged(self.plot.vb, self.plot_prob.XAxis) def set_data(self, data): self.closeContext() self.clear() self.data = data if self.data is not None: domain = self.data.domain self.varmodel[:] = list(domain) self.groupvarview.clear() self.groupvarmodel = \ ["(None)"] + [var for var in domain if var.is_discrete] self.groupvarview.addItem("(None)") for var in self.groupvarmodel[1:]: self.groupvarview.addItem(self.icons[var], var.name) if domain.has_discrete_class: self.groupvar_idx = \ self.groupvarmodel.index(domain.class_var) self.openContext(domain) self.variable_idx = min(max(self.variable_idx, 0), len(self.varmodel) - 1) self.groupvar_idx = min(max(self.groupvar_idx, 0), len(self.groupvarmodel) - 1) itemmodels.select_row(self.varview, self.variable_idx) self._setup() def clear(self): self.plot.clear() self.plot_prob.clear() self.varmodel[:] = [] self.groupvarmodel = [] self.variable_idx = -1 self.groupvar_idx = 0 self._legend.clear() self._legend.hide() def _setup_smoothing(self): if not self.disc_cont and self.var and self.var.is_continuous: self.cb_disc_cont.setText("Bin continuous variables") self.l_smoothing_l.setText("Smooth") self.l_smoothing_r.setText("Precise") else: self.cb_disc_cont.setText("Bin continuous variables into {} bins". format(self.bins[self.smoothing_index])) self.l_smoothing_l.setText(" " + str(self.bins[0])) self.l_smoothing_r.setText(" " + str(self.bins[-1])) def _setup(self): self.plot.clear() self.plot_prob.clear() self._legend.clear() self._legend.hide() varidx = self.variable_idx self.var = self.cvar = None if varidx >= 0: self.var = self.varmodel[varidx] if self.groupvar_idx > 0: self.cvar = self.groupvarmodel[self.groupvar_idx] self.cb_prob.clear() self.cb_prob.addItem("(None)") self.cb_prob.addItems(self.cvar.values) self.cb_prob.addItem("(All)") self.show_prob = min(max(self.show_prob, 0), len(self.cvar.values) + 1) data = self.data self._setup_smoothing() if self.var is None: return if self.disc_cont: data = self.data[:, (self.var, self.cvar) if self.cvar else self.var ] disc = Orange.preprocess.discretize.EqualWidth(n=self.bins[self.smoothing_index]) data = Orange.preprocess.Discretize(data, method=disc) self.var = data.domain.variables[0] self.set_left_axis_name() self.enable_disable_rel_freq() if self.cvar: self.contingencies = \ contingency.get_contingency(data, self.var, self.cvar) self.display_contingency() else: self.distributions = \ distribution.get_distribution(data, self.var) self.display_distribution() self.plot.autoRange() def display_distribution(self): dist = self.distributions var = self.var assert len(dist) > 0 self.plot.clear() self.plot_prob.clear() self.plot.hideAxis('right') bottomaxis = self.plot.getAxis("bottom") bottomaxis.setLabel(var.name) self.set_left_axis_name() if var and var.is_continuous: bottomaxis.setTicks(None) edges, curve = ash_curve(dist, None, m=OWDistributions.ASH_HIST, smoothing_factor=self.smoothing_facs[self.smoothing_index]) edges = edges + (edges[1] - edges[0])/2 edges = edges[:-1] item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(Qt.white), 3) pen.setCosmetic(True) item.setData(edges, curve, antialias=True, stepMode=False, fillLevel=0, brush=QtGui.QBrush(Qt.gray), pen=pen) self.plot.addItem(item) else: bottomaxis.setTicks([list(enumerate(var.values))]) for i, w in enumerate(dist): geom = QtCore.QRectF(i - 0.33, 0, 0.66, w) item = DistributionBarItem(geom, [1.0], [QtGui.QColor(128, 128, 128)]) self.plot.addItem(item) def _on_relative_freq_changed(self): self.set_left_axis_name() if self.cvar and self.cvar.is_discrete: self.display_contingency() else: self.display_distribution() self.plot.autoRange() def display_contingency(self): """ Set the contingency to display. """ cont = self.contingencies var, cvar = self.var, self.cvar assert len(cont) > 0 self.plot.clear() self.plot_prob.clear() self._legend.clear() if self.show_prob: self.plot.showAxis('right') else: self.plot.hideAxis('right') bottomaxis = self.plot.getAxis("bottom") bottomaxis.setLabel(var.name) cvar_values = cvar.values palette = colorpalette.ColorPaletteGenerator(len(cvar_values)) colors = [palette[i].lighter() for i in range(len(cvar_values))] if var and var.is_continuous: bottomaxis.setTicks(None) weights, cols, cvar_values, curves = [], [], [], [] for i, dist in enumerate(cont): v, W = dist if len(v): weights.append(numpy.sum(W)) cols.append(colors[i]) cvar_values.append(cvar.values[i]) curves.append(ash_curve(dist, cont, m=OWDistributions.ASH_HIST, smoothing_factor=self.smoothing_facs[self.smoothing_index])) weights = numpy.array(weights) sumw = numpy.sum(weights) weights /= sumw colors = cols curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)] ncval = len(cvar_values) curvesline = [] #from histograms to lines for (X,Y) in curves: X = X + (X[1] - X[0])/2 X = X[:-1] X = numpy.array(X) Y = numpy.array(Y) curvesline.append((X,Y)) for t in [ "fill", "line" ]: for (X, Y), color, w in reversed(list(zip(curvesline, colors, weights))): item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(color), 3) pen.setCosmetic(True) color = QtGui.QColor(color) color.setAlphaF(0.2) item.setData(X, Y/(w if self.relative_freq else 1), antialias=True, stepMode=False, fillLevel=0 if t == "fill" else None, brush=QtGui.QBrush(color), pen=pen) self.plot.addItem(item) if self.show_prob: M_EST = 5 #for M estimate all_X = numpy.array(numpy.unique(numpy.hstack([X for X,_ in curvesline]))) inter_X = numpy.array(numpy.linspace(all_X[0], all_X[-1], len(all_X)*2)) curvesinterp = [ numpy.interp(inter_X, X, Y) for (X,Y) in curvesline ] sumprob = numpy.sum(curvesinterp, axis=0) # allcorrection = M_EST/sumw*numpy.sum(sumprob)/len(inter_X) legal = sumprob > 0.05 * numpy.max(sumprob) i = len(curvesinterp) + 1 show_all = self.show_prob == i for Y, color in reversed(list(zip(curvesinterp, colors))): i -= 1 if show_all or self.show_prob == i: item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(color), 3, style=QtCore.Qt.DotLine) pen.setCosmetic(True) #prob = (Y+allcorrection/ncval)/(sumprob+allcorrection) prob = Y[legal] / sumprob[legal] item.setData(inter_X[legal], prob, antialias=True, stepMode=False, fillLevel=None, brush=None, pen=pen) self.plot_prob.addItem(item) elif var and var.is_discrete: bottomaxis.setTicks([list(enumerate(var.values))]) cont = numpy.array(cont) ncval = len(cvar_values) maxh = 0 #maximal column height maxrh = 0 #maximal relative column height scvar = cont.sum(axis=1) #a cvar with sum=0 with allways have distribution counts 0, #therefore we can divide it by anything scvar[scvar==0] = 1 for i, (value, dist) in enumerate(zip(var.values, cont.T)): maxh = max(maxh, max(dist)) maxrh = max(maxrh, max(dist/scvar)) for i, (value, dist) in enumerate(zip(var.values, cont.T)): dsum = sum(dist) geom = QtCore.QRectF(i - 0.333, 0, 0.666, maxrh if self.relative_freq else maxh) if self.show_prob: prob = dist / dsum ci = 1.96 * numpy.sqrt(prob * (1 - prob) / dsum) else: ci = None item = DistributionBarItem(geom, dist/scvar/maxrh if self.relative_freq else dist/maxh, colors) self.plot.addItem(item) if self.show_prob: for ic, a in enumerate(dist): if self.show_prob - 1 != ic and \ self.show_prob - 1 != len(dist): continue position = -0.333 + ((ic+0.5)*0.666/len(dist)) if dsum < 1e-6: continue prob = a / dsum if not 1e-6 < prob < 1 - 1e-6: continue ci = 1.96 * sqrt(prob * (1 - prob) / dsum) mark = pg.ScatterPlotItem() bar = pg.ErrorBarItem() pen = QtGui.QPen(QtGui.QBrush(QtGui.QColor(0)), 1) pen.setCosmetic(True) bar.setData(x=[i+position], y=[prob], bottom=min(numpy.array([ci]), prob), top=min(numpy.array([ci]), 1 - prob), beam=numpy.array([0.05]), brush=QtGui.QColor(1), pen=pen) mark.setData([i+position], [prob], antialias=True, symbol="o", fillLevel=None, pxMode=True, size=10, brush=QtGui.QColor(colors[ic]), pen=pen) self.plot_prob.addItem(bar) self.plot_prob.addItem(mark) for color, name in zip(colors, cvar_values): self._legend.addItem( ScatterPlotItem(pen=color, brush=color, size=10, shape="s"), escape(name) ) self._legend.show() def set_left_axis_name(self): set_label = self.plot.getAxis("left").setLabel if self.var and self.var.is_continuous: set_label(["Density", "Relative density"] [self.cvar is not None and self.relative_freq]) else: set_label(["Frequency", "Relative frequency"] [self.cvar is not None and self.relative_freq]) def enable_disable_rel_freq(self): self.cb_prob.setDisabled(self.var is None or self.cvar is None) self.cb_rel_freq.setDisabled( self.var is None or self.cvar is None) def _on_variable_idx_changed(self): self.variable_idx = selected_index(self.varview) self._setup() def _on_groupvar_idx_changed(self): self._setup() def _on_set_smoothing(self): self._setup() def onDeleteWidget(self): self.plot.clear() super().onDeleteWidget() def save_graph(self): from Orange.widgets.data.owsave import OWSave save_img = OWSave(data=self.plot, file_formats=FileFormat.img_writers) save_img.exec_()
class OWDistributions(widget.OWWidget): name = "Distributions" description = "Display value distributions of a data feature in a graph." icon = "icons/Distribution.svg" priority = 120 class Inputs: data = Input("Data", Orange.data.Table, doc="Set the input data set") settingsHandler = settings.DomainContextHandler( match_values=settings.DomainContextHandler.MATCH_VALUES_ALL) #: Selected variable index variable_idx = settings.ContextSetting(-1) #: Selected group variable groupvar_idx = settings.ContextSetting(0) relative_freq = settings.Setting(False) disc_cont = settings.Setting(False) smoothing_index = settings.Setting(5) show_prob = settings.ContextSetting(0) graph_name = "plot" ASH_HIST = 50 bins = [2, 3, 4, 5, 8, 10, 12, 15, 20, 30, 50] smoothing_facs = list(reversed([0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 4, 6, 10])) def __init__(self): super().__init__() self.data = None self.distributions = None self.contingencies = None self.var = self.cvar = None varbox = gui.vBox(self.controlArea, "Variable") self.varmodel = itemmodels.VariableListModel() self.groupvarmodel = [] self.varview = QListView( selectionMode=QListView.SingleSelection) self.varview.setSizePolicy( QSizePolicy.Minimum, QSizePolicy.Expanding) self.varview.setModel(self.varmodel) self.varview.setSelectionModel( itemmodels.ListSingleSelectionModel(self.varmodel)) self.varview.selectionModel().selectionChanged.connect( self._on_variable_idx_changed) varbox.layout().addWidget(self.varview) box = gui.vBox(self.controlArea, "Precision") gui.separator(self.controlArea, 4, 4) box2 = gui.hBox(box) self.l_smoothing_l = gui.widgetLabel(box2, "Smooth") gui.hSlider(box2, self, "smoothing_index", minValue=0, maxValue=len(self.smoothing_facs) - 1, callback=self._on_set_smoothing, createLabel=False) self.l_smoothing_r = gui.widgetLabel(box2, "Precise") self.cb_disc_cont = gui.checkBox( gui.indentedBox(box, sep=4), self, "disc_cont", "Bin numeric variables", callback=self._on_groupvar_idx_changed, tooltip="Show numeric variables as categorical.") box = gui.vBox(self.controlArea, "Group by") self.icons = gui.attributeIconDict self.groupvarview = gui.comboBox( box, self, "groupvar_idx", callback=self._on_groupvar_idx_changed, valueType=str, contentsLength=12) box2 = gui.indentedBox(box, sep=4) self.cb_rel_freq = gui.checkBox( box2, self, "relative_freq", "Show relative frequencies", callback=self._on_relative_freq_changed, tooltip="Normalize probabilities so that probabilities " "for each group-by value sum to 1.") gui.separator(box2) self.cb_prob = gui.comboBox( box2, self, "show_prob", label="Show probabilities:", orientation=Qt.Horizontal, callback=self._on_relative_freq_changed, tooltip="Show probabilities for a chosen group-by value " "(at each point probabilities for all group-by values sum to 1).") self.plotview = pg.PlotWidget(background=None) self.plotview.setRenderHint(QPainter.Antialiasing) self.mainArea.layout().addWidget(self.plotview) w = QLabel() w.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Fixed) self.mainArea.layout().addWidget(w, Qt.AlignCenter) self.ploti = pg.PlotItem() self.plot = self.ploti.vb self.ploti.hideButtons() self.plotview.setCentralItem(self.ploti) self.plot_prob = pg.ViewBox() self.ploti.hideAxis('right') self.ploti.scene().addItem(self.plot_prob) self.ploti.getAxis("right").linkToView(self.plot_prob) self.ploti.getAxis("right").setLabel("Probability") self.plot_prob.setZValue(10) self.plot_prob.setXLink(self.ploti) self.update_views() self.ploti.vb.sigResized.connect(self.update_views) self.plot_prob.setRange(yRange=[0, 1]) def disable_mouse(plot): plot.setMouseEnabled(False, False) plot.setMenuEnabled(False) disable_mouse(self.plot) disable_mouse(self.plot_prob) self.tooltip_items = [] self.plot.scene().installEventFilter( HelpEventDelegate(self.help_event, self)) pen = QPen(self.palette().color(QPalette.Text)) for axis in ("left", "bottom"): self.ploti.getAxis(axis).setPen(pen) self._legend = LegendItem() self._legend.setParentItem(self.plot) self._legend.hide() self._legend.anchor((1, 0), (1, 0)) def update_views(self): self.plot_prob.setGeometry(self.plot.sceneBoundingRect()) self.plot_prob.linkedViewChanged(self.plot, self.plot_prob.XAxis) @Inputs.data def set_data(self, data): self.closeContext() self.clear() self.warning() self.data = data self.distributions = None self.contingencies = None if self.data is not None: if not self.data: self.warning("Empty input data cannot be visualized") return domain = self.data.domain self.varmodel[:] = list(domain.variables) + \ [meta for meta in domain.metas if meta.is_continuous or meta.is_discrete] self.groupvarview.clear() self.groupvarmodel = \ ["(None)"] + [var for var in domain.variables if var.is_discrete] + \ [meta for meta in domain.metas if meta.is_discrete] self.groupvarview.addItem("(None)") for var in self.groupvarmodel[1:]: self.groupvarview.addItem(self.icons[var], var.name) if domain.has_discrete_class: self.groupvar_idx = \ self.groupvarmodel[1:].index(domain.class_var) + 1 self.openContext(domain) self.variable_idx = min(max(self.variable_idx, 0), len(self.varmodel) - 1) self.groupvar_idx = min(max(self.groupvar_idx, 0), len(self.groupvarmodel) - 1) itemmodels.select_row(self.varview, self.variable_idx) self._setup() def clear(self): self.plot.clear() self.plot_prob.clear() self.varmodel[:] = [] self.groupvarmodel = [] self.variable_idx = -1 self.groupvar_idx = 0 self._legend.clear() self._legend.hide() self.groupvarview.clear() self.cb_prob.clear() def _setup_smoothing(self): if not self.disc_cont and self.var and self.var.is_continuous: self.cb_disc_cont.setText("Bin numeric variables") self.l_smoothing_l.setText("Smooth") self.l_smoothing_r.setText("Precise") else: self.cb_disc_cont.setText("Bin numeric variables into {} bins". format(self.bins[self.smoothing_index])) self.l_smoothing_l.setText(" " + str(self.bins[0])) self.l_smoothing_r.setText(" " + str(self.bins[-1])) @property def smoothing_factor(self): return self.smoothing_facs[self.smoothing_index] def _setup(self): self.plot.clear() self.plot_prob.clear() self._legend.clear() self._legend.hide() varidx = self.variable_idx self.var = self.cvar = None if varidx >= 0: self.var = self.varmodel[varidx] if self.groupvar_idx > 0: self.cvar = self.groupvarmodel[self.groupvar_idx] self.cb_prob.clear() self.cb_prob.addItem("(None)") self.cb_prob.addItems(self.cvar.values) self.cb_prob.addItem("(All)") self.show_prob = min(max(self.show_prob, 0), len(self.cvar.values) + 1) data = self.data self._setup_smoothing() if self.var is None: return if self.disc_cont: domain = Orange.data.Domain( [self.var, self.cvar] if self.cvar else [self.var]) data = Orange.data.Table(domain, data) disc = EqualWidth(n=self.bins[self.smoothing_index]) data = Discretize(method=disc, remove_const=False)(data) self.var = data.domain[0] self.set_left_axis_name() self.enable_disable_rel_freq() if self.cvar: self.contingencies = \ contingency.get_contingency(data, self.var, self.cvar) self.display_contingency() else: self.distributions = \ distribution.get_distribution(data, self.var) self.display_distribution() self.plot.autoRange() def help_event(self, ev): self.plot.mapSceneToView(ev.scenePos()) ctooltip = [] for vb, item in self.tooltip_items: mouse_over_curve = isinstance(item, pg.PlotCurveItem) \ and item.mouseShape().contains(vb.mapSceneToView(ev.scenePos())) mouse_over_bar = isinstance(item, DistributionBarItem) \ and item.boundingRect().contains(vb.mapSceneToView(ev.scenePos())) if mouse_over_curve or mouse_over_bar: ctooltip.append(item.tooltip) if ctooltip: QToolTip.showText(ev.screenPos(), "\n\n".join(ctooltip), widget=self.plotview) return True return False def display_distribution(self): dist = self.distributions var = self.var if dist is None or not len(dist): return self.plot.clear() self.plot_prob.clear() self.ploti.hideAxis('right') self.tooltip_items = [] bottomaxis = self.ploti.getAxis("bottom") bottomaxis.setLabel(var.name) bottomaxis.resizeEvent() self.set_left_axis_name() if var and var.is_continuous: bottomaxis.setTicks(None) if not len(dist[0]): return edges, curve = ash_curve(dist, None, m=OWDistributions.ASH_HIST, smoothing_factor=self.smoothing_factor) edges = edges + (edges[1] - edges[0])/2 edges = edges[:-1] item = pg.PlotCurveItem() pen = QPen(QBrush(Qt.white), 3) pen.setCosmetic(True) item.setData(edges, curve, antialias=True, stepMode=False, fillLevel=0, brush=QBrush(Qt.gray), pen=pen) self.plot.addItem(item) item.tooltip = "Density" self.tooltip_items.append((self.plot, item)) else: bottomaxis.setTicks([list(enumerate(var.values))]) for i, w in enumerate(dist): geom = QRectF(i - 0.33, 0, 0.66, w) item = DistributionBarItem(geom, [1.0], [QColor(128, 128, 128)]) self.plot.addItem(item) item.tooltip = "Frequency for %s: %r" % (var.values[i], w) self.tooltip_items.append((self.plot, item)) def _on_relative_freq_changed(self): self.set_left_axis_name() if self.cvar and self.cvar.is_discrete: self.display_contingency() else: self.display_distribution() self.plot.autoRange() def display_contingency(self): """ Set the contingency to display. """ cont = self.contingencies var, cvar = self.var, self.cvar if cont is None or not len(cont): return self.plot.clear() self.plot_prob.clear() self._legend.clear() self.tooltip_items = [] if self.show_prob: self.ploti.showAxis('right') else: self.ploti.hideAxis('right') bottomaxis = self.ploti.getAxis("bottom") bottomaxis.setLabel(var.name) bottomaxis.resizeEvent() cvar_values = cvar.values colors = [QColor(*col) for col in cvar.colors] if var and var.is_continuous: bottomaxis.setTicks(None) weights, cols, cvar_values, curves = [], [], [], [] for i, dist in enumerate(cont): v, W = dist if len(v): weights.append(numpy.sum(W)) cols.append(colors[i]) cvar_values.append(cvar.values[i]) curves.append(ash_curve( dist, cont, m=OWDistributions.ASH_HIST, smoothing_factor=self.smoothing_factor)) weights = numpy.array(weights) sumw = numpy.sum(weights) weights /= sumw colors = cols curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)] curvesline = [] #from histograms to lines for X, Y in curves: X = X + (X[1] - X[0])/2 X = X[:-1] X = numpy.array(X) Y = numpy.array(Y) curvesline.append((X, Y)) for t in ["fill", "line"]: curve_data = list(zip(curvesline, colors, weights, cvar_values)) for (X, Y), color, w, cval in reversed(curve_data): item = pg.PlotCurveItem() pen = QPen(QBrush(color), 3) pen.setCosmetic(True) color = QColor(color) color.setAlphaF(0.2) item.setData(X, Y/(w if self.relative_freq else 1), antialias=True, stepMode=False, fillLevel=0 if t == "fill" else None, brush=QBrush(color), pen=pen) self.plot.addItem(item) if t == "line": item.tooltip = "{}\n{}={}".format( "Normalized density " if self.relative_freq else "Density ", cvar.name, cval) self.tooltip_items.append((self.plot, item)) if self.show_prob: all_X = numpy.array(numpy.unique(numpy.hstack([X for X, _ in curvesline]))) inter_X = numpy.array(numpy.linspace(all_X[0], all_X[-1], len(all_X)*2)) curvesinterp = [numpy.interp(inter_X, X, Y) for (X, Y) in curvesline] sumprob = numpy.sum(curvesinterp, axis=0) legal = sumprob > 0.05 * numpy.max(sumprob) i = len(curvesinterp) + 1 show_all = self.show_prob == i for Y, color, cval in reversed(list(zip(curvesinterp, colors, cvar_values))): i -= 1 if show_all or self.show_prob == i: item = pg.PlotCurveItem() pen = QPen(QBrush(color), 3, style=Qt.DotLine) pen.setCosmetic(True) prob = Y[legal] / sumprob[legal] item.setData( inter_X[legal], prob, antialias=True, stepMode=False, fillLevel=None, brush=None, pen=pen) self.plot_prob.addItem(item) item.tooltip = "Probability that \n" + cvar.name + "=" + cval self.tooltip_items.append((self.plot_prob, item)) elif var and var.is_discrete: bottomaxis.setTicks([list(enumerate(var.values))]) cont = numpy.array(cont) maxh = 0 #maximal column height maxrh = 0 #maximal relative column height scvar = cont.sum(axis=1) #a cvar with sum=0 with allways have distribution counts 0, #therefore we can divide it by anything scvar[scvar == 0] = 1 for i, (value, dist) in enumerate(zip(var.values, cont.T)): maxh = max(maxh, max(dist)) maxrh = max(maxrh, max(dist/scvar)) for i, (value, dist) in enumerate(zip(var.values, cont.T)): dsum = sum(dist) geom = QRectF(i - 0.333, 0, 0.666, maxrh if self.relative_freq else maxh) if self.show_prob: prob = dist / dsum ci = 1.96 * numpy.sqrt(prob * (1 - prob) / dsum) else: ci = None item = DistributionBarItem(geom, dist/scvar/maxrh if self.relative_freq else dist/maxh, colors) self.plot.addItem(item) tooltip = "\n".join( "%s: %.*f" % (n, 3 if self.relative_freq else 1, v) for n, v in zip(cvar_values, dist/scvar if self.relative_freq else dist)) item.tooltip = "{} ({}={}):\n{}".format( "Normalized frequency " if self.relative_freq else "Frequency ", cvar.name, value, tooltip) self.tooltip_items.append((self.plot, item)) if self.show_prob: item.tooltip += "\n\nProbabilities:" for ic, a in enumerate(dist): if self.show_prob - 1 != ic and \ self.show_prob - 1 != len(dist): continue position = -0.333 + ((ic+0.5)*0.666/len(dist)) if dsum < 1e-6: continue prob = a / dsum if not 1e-6 < prob < 1 - 1e-6: continue ci = 1.96 * sqrt(prob * (1 - prob) / dsum) item.tooltip += "\n%s: %.3f ± %.3f" % (cvar_values[ic], prob, ci) mark = pg.ScatterPlotItem() errorbar = pg.ErrorBarItem() pen = QPen(QBrush(QColor(0)), 1) pen.setCosmetic(True) errorbar.setData(x=[i+position], y=[prob], bottom=min(numpy.array([ci]), prob), top=min(numpy.array([ci]), 1 - prob), beam=numpy.array([0.05]), brush=QColor(1), pen=pen) mark.setData([i+position], [prob], antialias=True, symbol="o", fillLevel=None, pxMode=True, size=10, brush=QColor(colors[ic]), pen=pen) self.plot_prob.addItem(errorbar) self.plot_prob.addItem(mark) for color, name in zip(colors, cvar_values): self._legend.addItem( ScatterPlotItem(pen=color, brush=color, size=10, shape="s"), escape(name) ) self._legend.show() def set_left_axis_name(self): leftaxis = self.ploti.getAxis("left") set_label = leftaxis.setLabel if self.var and self.var.is_continuous: set_label(["Density", "Relative density"] [self.cvar is not None and self.relative_freq]) else: set_label(["Frequency", "Relative frequency"] [self.cvar is not None and self.relative_freq]) leftaxis.resizeEvent() def enable_disable_rel_freq(self): self.cb_prob.setDisabled(self.var is None or self.cvar is None) self.cb_rel_freq.setDisabled( self.var is None or self.cvar is None) def _on_variable_idx_changed(self): self.variable_idx = selected_index(self.varview) self._setup() def _on_groupvar_idx_changed(self): self._setup() def _on_set_smoothing(self): self._setup() def onDeleteWidget(self): self.plot.clear() super().onDeleteWidget() def get_widget_name_extension(self): if self.variable_idx >= 0: return self.varmodel[self.variable_idx] def send_report(self): self.plotview.scene().setSceneRect(self.plotview.sceneRect()) if self.variable_idx < 0: return self.report_plot() text = "Distribution of '{}'".format( self.varmodel[self.variable_idx]) if self.groupvar_idx: group_var = self.groupvarmodel[self.groupvar_idx] prob = self.cb_prob indiv_probs = 0 < prob.currentIndex() < prob.count() - 1 if not indiv_probs or self.relative_freq: text += " grouped by '{}'".format(group_var) if self.relative_freq: text += " (relative frequencies)" if indiv_probs: text += "; probabilites for '{}={}'".format( group_var, prob.currentText()) self.report_caption(text)
class DistributionTest(OWWidget): name = 'Distribution Test' description = 'Check if data is in given distribution.' icon = 'icons/disttest.svg' want_main_area = True buttons_area_orientation = Qt.Vertical resizing_enabled = True inputs = [('Data', Orange.data.Table, 'set_data')] available_tests = ( KolmogorovSmirnov, AndersonDarling, ShapiroWilk, ChiSquare, ) settingsHandler = settings.DomainContextHandler( match_values=settings.DomainContextHandler.MATCH_VALUES_ALL) #: Selected variable index available_distributions = [d for d in Distribution] test_idx = 0 distribution_idx = 0 column_idx = 0 own_distribution_idx = 0 relative_freq = settings.Setting(False) smoothing_index = settings.Setting(5) show_prob = settings.ContextSetting(0) graph_name = "box_scene" ASH_HIST = 50 bins = [2, 3, 4, 5, 8, 10, 12, 15, 20, 30, 50] def __init__(self): super().__init__() self.varmodel = itemmodels.VariableListModel() self.groupvarmodel = [] self.distributions = [ distribution.value for distribution in self.available_distributions ] box = gui.vBox(self.controlArea, 'Tests') gui.radioButtonsInBox( box, self, 'test_idx', btnLabels=[test.name for test in self.available_tests], callback=self.test_changed, ) box = gui.vBox(self.controlArea, 'Distributions') self.distribution_choose = gui.radioButtonsInBox( box, self, 'distribution_idx', btnLabels=self.distributions, callback=self.distribution_changed, ) self.column_chose = gui.comboBox( self.controlArea, self, 'column_idx', box='Selected column', items=[], orientation=Qt.Horizontal, callback=self.column_changed, ) self.available_columns = itemmodels.VariableListModel(parent=self) self.column_chose.setModel(self.available_columns) self.infolabel = gui.widgetLabel(box, "<center>p-value: </center>") self.mainArea.setMinimumWidth(800) self.own_distribution_choose = gui.comboBox( self.controlArea, self, 'own_distribution_idx', box='Own distribution', items=[], orientation=Qt.Horizontal, callback=self.column_changed, ) self.own_distribution_choose.setModel(self.available_columns) self.data = None self.plotview = pg.PlotWidget(background=None) self.plotview.setRenderHint(QPainter.Antialiasing) self.mainArea.layout().addWidget(self.plotview) w = QLabel() w.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Fixed) self.mainArea.layout().addWidget(w, Qt.AlignCenter) self.ploti = pg.PlotItem() self.box_scene = self.ploti.vb self.ploti.hideButtons() self.plotview.setCentralItem(self.ploti) self.plot_prob = pg.ViewBox() self.ploti.scene().addItem(self.plot_prob) self.ploti.getAxis("right").linkToView(self.plot_prob) self.ploti.getAxis("right").setLabel("Probability") self.plot_prob.setZValue(10) self.plot_prob.setXLink(self.ploti) self.update_views() self.ploti.vb.sigResized.connect(self.update_views) self.plot_prob.setRange(yRange=[0, 1]) def disable_mouse(box_scene): box_scene.setMouseEnabled(False, False) box_scene.setMenuEnabled(False) disable_mouse(self.box_scene) disable_mouse(self.plot_prob) self.tooltip_items = [] pen = QPen(self.palette().color(QPalette.Text)) for axis in ("left", "bottom"): self.ploti.getAxis(axis).setPen(pen) self._legend = LegendItem() self._legend.setParentItem(self.box_scene) self._legend.hide() self._legend.anchor((1, 0), (1, 0)) self.test_changed() def update_views(self): """ resize """ self.plot_prob.setGeometry(self.box_scene.sceneBoundingRect()) self.plot_prob.linkedViewChanged(self.box_scene, self.plot_prob.XAxis) def set_data(self, data): self.closeContext() self.clear() self.warning() self.data = data self.own_distribution_choose.hide() if data is not None: self.available_columns[:] = data.domain domain = self.data.domain self.varmodel[:] = list(domain) + [ meta for meta in domain.metas if meta.is_continuous or meta.is_discrete ] self.groupvarmodel = \ ["(None)"] + [var for var in domain if var.is_discrete] + \ [meta for meta in domain.metas if meta.is_discrete] if domain.has_discrete_class: self.groupvar_idx = \ self.groupvarmodel[1:].index(domain.class_var) + 1 self.openContext(domain) self.column_idx = min(max(self.column_idx, 0), len(self.varmodel) - 1) self.groupvar_idx = min(max(self.groupvar_idx, 0), len(self.groupvarmodel) - 1) self._setup() def test_changed(self): """ Management of buttons, depends from type of distribution test. Own samples are hidden. :return: """ for idx, button in enumerate(self.distribution_choose.buttons): if Distribution(button.text()) in self.test.allowed_distribution: button.show() if self.distribution not in self.test.allowed_distribution: button.toggle() self.distribution_idx = idx else: self.own_distribution_choose.hide() button.hide() self.compute_p_value() def distribution_changed(self): """ Control buttons of allowed distributions - show or hide :return: compute p-value """ if self.distribution == Distribution.OWN: self.own_distribution_choose.show() else: self.own_distribution_choose.hide() self.compute_p_value() def clear(self): self.box_scene.clear() self.plot_prob.clear() self.varmodel[:] = [] self.groupvarmodel = [] self.column_idx = -1 self.groupvar_idx = 0 self._legend.clear() self._legend.hide() def column_changed(self): """ compute p-value if column is changed """ self._setup() self.compute_p_value() def _setup(self): """ set new plot """ self.box_scene.clear() self.plot_prob.clear() self._legend.clear() self._legend.hide() varidx = self.column_idx self.var = self.cvar = None if varidx >= 0: self.var = self.varmodel[varidx] data = self.data if self.var is None: return self.set_left_axis_name() if self.cvar: self.contingencies = \ contingency.get_contingency(data, self.var, self.cvar) self.display_contingency() else: self.distributions = \ distribution.get_distribution(data, self.var) self.display_distribution() self.box_scene.autoRange() def compute_p_value(self): """ :return: p-value """ if self.data is not None: p_value = self.test.compute(self) if isinstance(p_value, float): self.infolabel.setText('\np-value: {}'.format(p_value)) else: self.infolabel.setText('\np-value: {}'.format(round( p_value, 3))) def _on_relative_freq_changed(self): self.set_left_axis_name() if self.cvar and self.cvar.is_discrete: self.display_contingency() else: self.display_distribution() self.box_scene.autoRange() def display_contingency(self): """ Set the contingency to display. """ cont = self.contingencies var, cvar = self.var, self.cvar assert len(cont) > 0 self.box_scene.clear() self.plot_prob.clear() self._legend.clear() self.tooltip_items = [] if self.show_prob: self.ploti.showAxis('right') else: self.ploti.hideAxis('right') bottomaxis = self.ploti.getAxis("bottom") bottomaxis.setLabel(var.name) bottomaxis.resizeEvent() cvar_values = cvar.values colors = [QColor(*col) for col in cvar.colors] if var and var.is_continuous: bottomaxis.setTicks(None) weights, cols, cvar_values, curves = [], [], [], [] for i, dist in enumerate(cont): v, W = dist if len(v): weights.append(np.sum(W)) cols.append(colors[i]) cvar_values.append(cvar.values[i]) curves.append( ash_curve(dist, cont, m=DistributionTest.ASH_HIST)) weights = np.array(weights) sumw = np.sum(weights) weights /= sumw colors = cols curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)] curvesline = [] # from histograms to lines for (X, Y) in curves: X += np.array(((X[1] - X[0]) / 2)[:-1]) Y = np.array(Y) curvesline.append((X, Y)) for t in ["fill", "line"]: for (X, Y), color, w, cval in reversed( list(zip(curvesline, colors, weights, cvar_values))): item = pg.PlotCurveItem() pen = QPen(QBrush(color), 3) pen.setCosmetic(True) color = QColor(color) color.setAlphaF(0.2) item.setData(X, Y / (w if self.relative_freq else 1), antialias=True, stepMode=False, fillLevel=0 if t == "fill" else None, brush=QBrush(color), pen=pen) self.box_scene.addItem(item) if t == "line": if self.relative_freq: density = "Normalized density" else: density = "Density" item.tooltip = "{density}\n{name}={value}".format( value=cval, name=cvar.name, density=density) self.tooltip_items.append((self.box_scene, item)) if self.show_prob: all_X = np.array( np.unique(np.hstack([X for X, _ in curvesline]))) inter_X = np.array( np.linspace(all_X[0], all_X[-1], len(all_X) * 2)) curvesinterp = [ np.interp(inter_X, X, Y) for (X, Y) in curvesline ] sumprob = np.sum(curvesinterp, axis=0) legal = sumprob > 0.05 * np.max(sumprob) i = len(curvesinterp) + 1 show_all = self.show_prob == i for Y, color, cval in reversed( list(zip(curvesinterp, colors, cvar_values))): i -= 1 if show_all or self.show_prob == i: item = pg.PlotCurveItem() pen = QPen(QBrush(color), 3, style=Qt.DotLine) pen.setCosmetic(True) prob = Y[legal] / sumprob[legal] item.setData(inter_X[legal], prob, antialias=True, stepMode=False, fillLevel=None, brush=None, pen=pen) self.plot_prob.addItem(item) item.tooltip = \ "Probability that \n{name}={value}".format( name=cvar.name, value=cval) self.tooltip_items.append((self.plot_prob, item)) elif var and var.is_discrete: bottomaxis.setTicks([list(enumerate(var.values))]) cont = np.array(cont) maxh = 0 # maximal column height maxrh = 0 # maximal relative column height scvar = cont.sum(axis=1) # a cvar with sum=0 with allways have distribution counts 0, # therefore we can divide it by anything scvar[scvar == 0] = 1 for i, (value, dist) in enumerate(zip(var.values, cont.T)): maxh = max(maxh, max(dist)) maxrh = max(maxrh, max(dist / scvar)) for i, (value, dist) in enumerate(zip(var.values, cont.T)): dsum = sum(dist) geom = QRectF(i - 0.333, 0, 0.666, maxrh if self.relative_freq else maxh) item = DistributionBarItem( geom, dist / scvar / maxrh if self.relative_freq else dist / maxh, colors) self.box_scene.addItem(item) tooltip = "\n".join("%s: %.*f" % (n, 3 if self.relative_freq else 1, v) for n, v in zip( cvar_values, dist / scvar if self.relative_freq else dist)) if self.relative_freq: frequency = "Normalized frequency" else: frequency = "Frequency" item.tooltip = \ "{frequency}({name}={value}):\n{tooltip}".format( frequency=frequency, name=cvar.name, value=value, tooltip=tooltip, ) self.tooltip_items.append((self.box_scene, item)) if self.show_prob: item.tooltip += "\n\nProbabilities:" for ic, a in enumerate(dist): if (self.show_prob - 1 != ic and self.show_prob - 1 != len(dist)) \ or dsum < 1e-6: continue position = -0.333 + ((ic + 0.5) * 0.666 / len(dist)) prob = a / dsum if not 1e-6 < prob < 1 - 1e-6: continue ci = 1.96 * sqrt(prob * (1 - prob) / dsum) item.tooltip += "\n%s: %.3f ± %.3f" % (cvar_values[ic], prob, ci) mark = pg.ScatterPlotItem() bar = pg.ErrorBarItem() pen = QPen(QBrush(QColor(0)), 1) pen.setCosmetic(True) bar.setData(x=[i + position], y=[prob], bottom=min(np.array([ci]), prob), top=min(np.array([ci]), 1 - prob), beam=np.array([0.05]), brush=QColor(1), pen=pen) mark.setData([i + position], [prob], antialias=True, symbol="o", fillLevel=None, pxMode=True, size=10, brush=QColor(colors[ic]), pen=pen) self.plot_prob.addItem(bar) self.plot_prob.addItem(mark) for color, name in zip(colors, cvar_values): self._legend.addItem( ScatterPlotItem(pen=color, brush=color, size=10, shape="s"), escape(name)) self._legend.show() def set_left_axis_name(self): leftaxis = self.ploti.getAxis("left") set_label = leftaxis.setLabel if self.var and self.var.is_continuous: set_label(["Density", "Relative density"][self.cvar is not None and self.relative_freq]) else: set_label(["Frequency", "Relative frequency"][self.cvar is not None and self.relative_freq]) leftaxis.resizeEvent() def display_distribution(self): dist = self.distributions var = self.var assert len(dist) > 0 self.box_scene.clear() self.plot_prob.clear() self.ploti.hideAxis('right') self.tooltip_items = [] bottomaxis = self.ploti.getAxis("bottom") bottomaxis.setLabel(var.name) bottomaxis.resizeEvent() self.set_left_axis_name() if var and var.is_continuous: bottomaxis.setTicks(None) if not len(dist[0]): return edges, curve = ash_curve(dist, None, m=DistributionTest.ASH_HIST) edges = edges + (edges[1] - edges[0]) / 2 edges = edges[:-1] item = pg.PlotCurveItem() pen = QPen(QBrush(Qt.black), 3) pen.setCosmetic(True) item.setData(edges, curve, antialias=True, stepMode=False, fillLevel=0, brush=QBrush(Qt.gray), pen=pen) self.box_scene.addItem(item) item.tooltip = "Density" self.tooltip_items.append((self.box_scene, item)) else: bottomaxis.setTicks([list(enumerate(var.values))]) for i, w in enumerate(dist): geom = QRectF(i - 0.33, 0, 0.66, w) item = DistributionBarItem(geom, [1.0], [QColor(128, 128, 128)]) self.box_scene.addItem(item) item.tooltip = "Frequency for %s: %r" % (var.values[i], w) self.tooltip_items.append((self.box_scene, item)) def onDeleteWidget(self): self.box_scene.clear() super().onDeleteWidget() def get_widget_name_extension(self): if self.column_idx >= 0: return self.varmodel[self.column_idx] @property def test(self) -> Test: return self.available_tests[self.test_idx] @property def distribution(self) -> Distribution: return self.available_distributions[self.distribution_idx] @property def column(self): return self.data[:, self.column_idx] @property def own_distribution(self): return self.data[:, self.own_distribution_idx]
class OWDistributions(widget.OWWidget): name = "Distributions" description = "Display value distributions of a data feature in a graph." icon = "icons/Distribution.svg" priority = 100 inputs = [ InputSignal("Data", Orange.data.Table, "set_data", doc="Set the input data set") ] settingsHandler = settings.DomainContextHandler( match_values=settings.DomainContextHandler.MATCH_VALUES_ALL) #: Selected variable index variable_idx = settings.ContextSetting(-1) #: Selected group variable groupvar_idx = settings.ContextSetting(0) relative_freq = settings.Setting(False) disc_cont = settings.Setting(False) smoothing_index = settings.Setting(5) show_prob = settings.ContextSetting(0) graph_name = "plot" ASH_HIST = 50 bins = [2, 3, 4, 5, 8, 10, 12, 15, 20, 30, 50] smoothing_facs = list( reversed([0.1, 0.2, 0.4, 0.6, 0.8, 1, 1.5, 2, 4, 6, 10])) def __init__(self): super().__init__() self.data = None self.distributions = None self.contingencies = None self.var = self.cvar = None varbox = gui.widgetBox(self.controlArea, "Variable") self.varmodel = itemmodels.VariableListModel() self.groupvarmodel = [] self.varview = QtGui.QListView( selectionMode=QtGui.QListView.SingleSelection) self.varview.setSizePolicy(QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding) self.varview.setModel(self.varmodel) self.varview.setSelectionModel( itemmodels.ListSingleSelectionModel(self.varmodel)) self.varview.selectionModel().selectionChanged.connect( self._on_variable_idx_changed) varbox.layout().addWidget(self.varview) box = gui.widgetBox(self.controlArea, "Precision") gui.separator(self.controlArea, 4, 4) box2 = gui.widgetBox(box, orientation="horizontal") self.l_smoothing_l = gui.widgetLabel(box2, "Smooth") gui.hSlider(box2, self, "smoothing_index", minValue=0, maxValue=len(self.smoothing_facs) - 1, callback=self._on_set_smoothing, createLabel=False) self.l_smoothing_r = gui.widgetLabel(box2, "Precise") self.cb_disc_cont = gui.checkBox( gui.indentedBox(box, sep=4), self, "disc_cont", "Bin continuous variables", callback=self._on_groupvar_idx_changed, tooltip="Show continuous variables as discrete.") box = gui.widgetBox(self.controlArea, "Group by") self.icons = gui.attributeIconDict self.groupvarview = gui.comboBox( box, self, "groupvar_idx", callback=self._on_groupvar_idx_changed, valueType=str, contentsLength=12) box2 = gui.indentedBox(box, sep=4) self.cb_rel_freq = gui.checkBox( box2, self, "relative_freq", "Show relative frequencies", callback=self._on_relative_freq_changed, tooltip= "Normalize probabilities so that probabilities for each group-by value sum to 1." ) gui.separator(box2) self.cb_prob = gui.comboBox( box2, self, "show_prob", label="Show probabilities", orientation="horizontal", callback=self._on_relative_freq_changed, tooltip= "Show probabilities for a chosen group-by value (at each point probabilities for all group-by values sum to 1)." ) self.plotview = pg.PlotWidget(background=None) self.plotview.setRenderHint(QtGui.QPainter.Antialiasing) self.mainArea.layout().addWidget(self.plotview) w = QtGui.QLabel() w.setSizePolicy(QtGui.QSizePolicy.Expanding, QtGui.QSizePolicy.Fixed) self.mainArea.layout().addWidget(w, Qt.AlignCenter) self.ploti = pg.PlotItem() self.plot = self.ploti.vb self.ploti.hideButtons() self.plotview.setCentralItem(self.ploti) self.plot_prob = pg.ViewBox() self.ploti.hideAxis('right') self.ploti.scene().addItem(self.plot_prob) self.ploti.getAxis("right").linkToView(self.plot_prob) self.ploti.getAxis("right").setLabel("Probability") self.plot_prob.setZValue(10) self.plot_prob.setXLink(self.ploti) self.update_views() self.ploti.vb.sigResized.connect(self.update_views) self.plot_prob.setRange(yRange=[0, 1]) self.inline_graph_report() def disable_mouse(plot): plot.setMouseEnabled(False, False) plot.setMenuEnabled(False) disable_mouse(self.plot) disable_mouse(self.plot_prob) self.tooltip_items = [] self.plot.scene().installEventFilter( HelpEventDelegate(self.help_event, self)) pen = QtGui.QPen(self.palette().color(QtGui.QPalette.Text)) for axis in ("left", "bottom"): self.ploti.getAxis(axis).setPen(pen) self._legend = LegendItem() self._legend.setParentItem(self.plot) self._legend.hide() self._legend.anchor((1, 0), (1, 0)) def update_views(self): self.plot_prob.setGeometry(self.plot.sceneBoundingRect()) self.plot_prob.linkedViewChanged(self.plot, self.plot_prob.XAxis) def set_data(self, data): self.closeContext() self.clear() self.data = data if self.data is not None: domain = self.data.domain self.varmodel[:] = list(domain) self.groupvarview.clear() self.groupvarmodel = \ ["(None)"] + [var for var in domain if var.is_discrete] self.groupvarview.addItem("(None)") for var in self.groupvarmodel[1:]: self.groupvarview.addItem(self.icons[var], var.name) if domain.has_discrete_class: self.groupvar_idx = \ self.groupvarmodel[1:].index(domain.class_var) + 1 self.openContext(domain) self.variable_idx = min(max(self.variable_idx, 0), len(self.varmodel) - 1) self.groupvar_idx = min(max(self.groupvar_idx, 0), len(self.groupvarmodel) - 1) itemmodels.select_row(self.varview, self.variable_idx) self._setup() def clear(self): self.plot.clear() self.plot_prob.clear() self.varmodel[:] = [] self.groupvarmodel = [] self.variable_idx = -1 self.groupvar_idx = 0 self._legend.clear() self._legend.hide() def _setup_smoothing(self): if not self.disc_cont and self.var and self.var.is_continuous: self.cb_disc_cont.setText("Bin continuous variables") self.l_smoothing_l.setText("Smooth") self.l_smoothing_r.setText("Precise") else: self.cb_disc_cont.setText( "Bin continuous variables into {} bins".format( self.bins[self.smoothing_index])) self.l_smoothing_l.setText(" " + str(self.bins[0])) self.l_smoothing_r.setText(" " + str(self.bins[-1])) def _setup(self): self.plot.clear() self.plot_prob.clear() self._legend.clear() self._legend.hide() varidx = self.variable_idx self.var = self.cvar = None if varidx >= 0: self.var = self.varmodel[varidx] if self.groupvar_idx > 0: self.cvar = self.groupvarmodel[self.groupvar_idx] self.cb_prob.clear() self.cb_prob.addItem("(None)") self.cb_prob.addItems(self.cvar.values) self.cb_prob.addItem("(All)") self.show_prob = min(max(self.show_prob, 0), len(self.cvar.values) + 1) data = self.data self._setup_smoothing() if self.var is None: return if self.disc_cont: data = self.data[:, (self.var, self.cvar) if self.cvar else self.var] disc = Orange.preprocess.discretize.EqualWidth( n=self.bins[self.smoothing_index]) data = Orange.preprocess.Discretize(data, method=disc, remove_const=False) self.var = data.domain[0] self.set_left_axis_name() self.enable_disable_rel_freq() if self.cvar: self.contingencies = \ contingency.get_contingency(data, self.var, self.cvar) self.display_contingency() else: self.distributions = \ distribution.get_distribution(data, self.var) self.display_distribution() self.plot.autoRange() def help_event(self, ev): in_graph_coor = self.plot.mapSceneToView(ev.scenePos()) ctooltip = [] for vb, item in self.tooltip_items: if isinstance(item, pg.PlotCurveItem) and item.mouseShape().contains( vb.mapSceneToView(ev.scenePos())): ctooltip.append(item.tooltip) elif isinstance( item, DistributionBarItem) and item.boundingRect().contains( vb.mapSceneToView(ev.scenePos())): ctooltip.append(item.tooltip) if ctooltip: QToolTip.showText(ev.screenPos(), "\n\n".join(ctooltip), widget=self.plotview) return True return False def display_distribution(self): dist = self.distributions var = self.var assert len(dist) > 0 self.plot.clear() self.plot_prob.clear() self.ploti.hideAxis('right') self.tooltip_items = [] bottomaxis = self.ploti.getAxis("bottom") bottomaxis.setLabel(var.name) bottomaxis.resizeEvent() self.set_left_axis_name() if var and var.is_continuous: bottomaxis.setTicks(None) if not len(dist[0]): return edges, curve = ash_curve( dist, None, m=OWDistributions.ASH_HIST, smoothing_factor=self.smoothing_facs[self.smoothing_index]) edges = edges + (edges[1] - edges[0]) / 2 edges = edges[:-1] item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(Qt.white), 3) pen.setCosmetic(True) item.setData(edges, curve, antialias=True, stepMode=False, fillLevel=0, brush=QtGui.QBrush(Qt.gray), pen=pen) self.plot.addItem(item) item.tooltip = "Density" self.tooltip_items.append((self.plot, item)) else: bottomaxis.setTicks([list(enumerate(var.values))]) for i, w in enumerate(dist): geom = QtCore.QRectF(i - 0.33, 0, 0.66, w) item = DistributionBarItem(geom, [1.0], [QtGui.QColor(128, 128, 128)]) self.plot.addItem(item) item.tooltip = "Frequency for %s: %r" % (var.values[i], w) self.tooltip_items.append((self.plot, item)) def _on_relative_freq_changed(self): self.set_left_axis_name() if self.cvar and self.cvar.is_discrete: self.display_contingency() else: self.display_distribution() self.plot.autoRange() def display_contingency(self): """ Set the contingency to display. """ cont = self.contingencies var, cvar = self.var, self.cvar assert len(cont) > 0 self.plot.clear() self.plot_prob.clear() self._legend.clear() self.tooltip_items = [] if self.show_prob: self.ploti.showAxis('right') else: self.ploti.hideAxis('right') bottomaxis = self.ploti.getAxis("bottom") bottomaxis.setLabel(var.name) bottomaxis.resizeEvent() cvar_values = cvar.values colors = [QtGui.QColor(*col) for col in cvar.colors] if var and var.is_continuous: bottomaxis.setTicks(None) weights, cols, cvar_values, curves = [], [], [], [] for i, dist in enumerate(cont): v, W = dist if len(v): weights.append(numpy.sum(W)) cols.append(colors[i]) cvar_values.append(cvar.values[i]) curves.append( ash_curve(dist, cont, m=OWDistributions.ASH_HIST, smoothing_factor=self.smoothing_facs[ self.smoothing_index])) weights = numpy.array(weights) sumw = numpy.sum(weights) weights /= sumw colors = cols curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)] ncval = len(cvar_values) curvesline = [] #from histograms to lines for (X, Y) in curves: X = X + (X[1] - X[0]) / 2 X = X[:-1] X = numpy.array(X) Y = numpy.array(Y) curvesline.append((X, Y)) for t in ["fill", "line"]: for (X, Y), color, w, cval in reversed( list(zip(curvesline, colors, weights, cvar_values))): item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(color), 3) pen.setCosmetic(True) color = QtGui.QColor(color) color.setAlphaF(0.2) item.setData(X, Y / (w if self.relative_freq else 1), antialias=True, stepMode=False, fillLevel=0 if t == "fill" else None, brush=QtGui.QBrush(color), pen=pen) self.plot.addItem(item) if t == "line": item.tooltip = ("Normalized density " if self.relative_freq else "Density ") \ + "\n"+ cvar.name + "=" + cval self.tooltip_items.append((self.plot, item)) if self.show_prob: M_EST = 5 #for M estimate all_X = numpy.array( numpy.unique(numpy.hstack([X for X, _ in curvesline]))) inter_X = numpy.array( numpy.linspace(all_X[0], all_X[-1], len(all_X) * 2)) curvesinterp = [ numpy.interp(inter_X, X, Y) for (X, Y) in curvesline ] sumprob = numpy.sum(curvesinterp, axis=0) # allcorrection = M_EST/sumw*numpy.sum(sumprob)/len(inter_X) legal = sumprob > 0.05 * numpy.max(sumprob) i = len(curvesinterp) + 1 show_all = self.show_prob == i for Y, color, cval in reversed( list(zip(curvesinterp, colors, cvar_values))): i -= 1 if show_all or self.show_prob == i: item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(color), 3, style=QtCore.Qt.DotLine) pen.setCosmetic(True) #prob = (Y+allcorrection/ncval)/(sumprob+allcorrection) prob = Y[legal] / sumprob[legal] item.setData(inter_X[legal], prob, antialias=True, stepMode=False, fillLevel=None, brush=None, pen=pen) self.plot_prob.addItem(item) item.tooltip = "Probability that \n" + cvar.name + "=" + cval self.tooltip_items.append((self.plot_prob, item)) elif var and var.is_discrete: bottomaxis.setTicks([list(enumerate(var.values))]) cont = numpy.array(cont) ncval = len(cvar_values) maxh = 0 #maximal column height maxrh = 0 #maximal relative column height scvar = cont.sum(axis=1) #a cvar with sum=0 with allways have distribution counts 0, #therefore we can divide it by anything scvar[scvar == 0] = 1 for i, (value, dist) in enumerate(zip(var.values, cont.T)): maxh = max(maxh, max(dist)) maxrh = max(maxrh, max(dist / scvar)) for i, (value, dist) in enumerate(zip(var.values, cont.T)): dsum = sum(dist) geom = QtCore.QRectF(i - 0.333, 0, 0.666, maxrh if self.relative_freq else maxh) if self.show_prob: prob = dist / dsum ci = 1.96 * numpy.sqrt(prob * (1 - prob) / dsum) else: ci = None item = DistributionBarItem( geom, dist / scvar / maxrh if self.relative_freq else dist / maxh, colors) self.plot.addItem(item) tooltip = "\n".join("%s: %.*f" % (n, 3 if self.relative_freq else 1, v) for n, v in zip( cvar_values, dist / scvar if self.relative_freq else dist)) item.tooltip = ("Normalized frequency " if self.relative_freq else "Frequency ") \ + "(" + cvar.name + "=" + value + "):" \ + "\n" + tooltip self.tooltip_items.append((self.plot, item)) if self.show_prob: item.tooltip += "\n\nProbabilities:" for ic, a in enumerate(dist): if self.show_prob - 1 != ic and \ self.show_prob - 1 != len(dist): continue position = -0.333 + ((ic + 0.5) * 0.666 / len(dist)) if dsum < 1e-6: continue prob = a / dsum if not 1e-6 < prob < 1 - 1e-6: continue ci = 1.96 * sqrt(prob * (1 - prob) / dsum) item.tooltip += "\n%s: %.3f ± %.3f" % (cvar_values[ic], prob, ci) mark = pg.ScatterPlotItem() bar = pg.ErrorBarItem() pen = QtGui.QPen(QtGui.QBrush(QtGui.QColor(0)), 1) pen.setCosmetic(True) bar.setData(x=[i + position], y=[prob], bottom=min(numpy.array([ci]), prob), top=min(numpy.array([ci]), 1 - prob), beam=numpy.array([0.05]), brush=QtGui.QColor(1), pen=pen) mark.setData([i + position], [prob], antialias=True, symbol="o", fillLevel=None, pxMode=True, size=10, brush=QtGui.QColor(colors[ic]), pen=pen) self.plot_prob.addItem(bar) self.plot_prob.addItem(mark) for color, name in zip(colors, cvar_values): self._legend.addItem( ScatterPlotItem(pen=color, brush=color, size=10, shape="s"), escape(name)) self._legend.show() def set_left_axis_name(self): leftaxis = self.ploti.getAxis("left") set_label = leftaxis.setLabel if self.var and self.var.is_continuous: set_label(["Density", "Relative density"][self.cvar is not None and self.relative_freq]) else: set_label(["Frequency", "Relative frequency"][self.cvar is not None and self.relative_freq]) leftaxis.resizeEvent() def enable_disable_rel_freq(self): self.cb_prob.setDisabled(self.var is None or self.cvar is None) self.cb_rel_freq.setDisabled(self.var is None or self.cvar is None) def _on_variable_idx_changed(self): self.variable_idx = selected_index(self.varview) self._setup() def _on_groupvar_idx_changed(self): self._setup() def _on_set_smoothing(self): self._setup() def onDeleteWidget(self): self.plot.clear() super().onDeleteWidget() def get_widget_name_extension(self): if self.variable_idx >= 0: return self.varmodel[self.variable_idx] def send_report(self): if self.variable_idx < 0: return self.report_plot() text = "Distribution of '{}'".format(self.varmodel[self.variable_idx]) if self.groupvar_idx: group_var = self.groupvarmodel[self.groupvar_idx] prob = self.cb_prob indiv_probs = 0 < prob.currentIndex() < prob.count() - 1 if not indiv_probs or self.relative_freq: text += " grouped by '{}'".format(group_var) if self.relative_freq: text += " (relative frequencies)" if indiv_probs: text += "; probabilites for '{}={}'".format( group_var, prob.currentText()) self.report_caption(text)
class OWDistributions(widget.OWWidget): name = "Distributions" description = "Display value distributions of a data feature in a graph." icon = "icons/Distribution.svg" priority = 100 inputs = [InputSignal("Data", Orange.data.Table, "set_data", doc="Set the input data set")] settingsHandler = settings.DomainContextHandler() #: Selected variable index variable_idx = settings.ContextSetting(-1) #: Selected group variable groupvar_idx = settings.ContextSetting(0) Hist, ASH, Kernel = 0, 1, 2 #: Continuous variable density estimation method cont_est_type = settings.Setting(ASH) relative_freq = settings.Setting(False) def __init__(self, parent=None): super().__init__(parent) self.data = None self.distributions = None self.contingencies = None self.var = self.cvar = None varbox = gui.widgetBox(self.controlArea, "Variable") self.varmodel = itemmodels.VariableListModel() self.groupvarmodel = itemmodels.VariableListModel() self.varview = QtGui.QListView( selectionMode=QtGui.QListView.SingleSelection) self.varview.setSizePolicy( QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Expanding) self.varview.setModel(self.varmodel) self.varview.setSelectionModel( itemmodels.ListSingleSelectionModel(self.varmodel)) self.varview.selectionModel().selectionChanged.connect( self._on_variable_idx_changed) varbox.layout().addWidget(self.varview) gui.separator(varbox, 8, 8) gui.comboBox( varbox, self, "cont_est_type", label="Show continuous variables by", valueType=int, items=["Histograms", "Average shifted histograms", "Kernel density estimators"], callback=self._on_cont_est_type_changed) box = gui.widgetBox(self.controlArea, "Group by") self.groupvarview = QtGui.QListView( selectionMode=QtGui.QListView.SingleSelection) self.groupvarview.setFixedHeight(100) self.groupvarview.setSizePolicy( QtGui.QSizePolicy.Minimum, QtGui.QSizePolicy.Preferred) self.groupvarview.setModel(self.groupvarmodel) self.groupvarview.selectionModel().selectionChanged.connect( self._on_groupvar_idx_changed) box.layout().addWidget(self.groupvarview) self.cb_rel_freq = gui.checkBox( box, self, "relative_freq", "Show relative frequencies", callback=self._on_relative_freq_changed) plotview = pg.PlotWidget(background=None) self.mainArea.layout().addWidget(plotview) w = QtGui.QLabel() w.setSizePolicy(QtGui.QSizePolicy.Expanding, QtGui.QSizePolicy.Fixed) self.mainArea.layout().addWidget(w, Qt.AlignCenter) self.plot = pg.PlotItem() # self.plot.getViewBox().setMouseEnabled(False, False) self.plot.getViewBox().setMenuEnabled(False) plotview.setCentralItem(self.plot) pen = QtGui.QPen(self.palette().color(QtGui.QPalette.Text)) for axis in ("left", "bottom"): self.plot.getAxis(axis).setPen(pen) self._legend = LegendItem() self._legend.setParentItem(self.plot.getViewBox()) self._legend.hide() self._legend.anchor((1, 0), (1, 0)) def set_data(self, data): self.closeContext() self.clear() self.data = data if self.data is not None: domain = self.data.domain self.varmodel[:] = list(domain) self.groupvarmodel[:] = \ ["(None)"] + [var for var in domain if var.is_discrete] if domain.has_discrete_class: self.groupvar_idx = \ list(self.groupvarmodel).index(domain.class_var) self.openContext(domain) self.variable_idx = min(max(self.variable_idx, 0), len(self.varmodel) - 1) self.groupvar_idx = min(max(self.groupvar_idx, 0), len(self.groupvarmodel) - 1) itemmodels.select_row(self.groupvarview, self.groupvar_idx) itemmodels.select_row(self.varview, self.variable_idx) self._setup() def clear(self): self.plot.clear() self.varmodel[:] = [] self.groupvarmodel[:] = [] self.variable_idx = -1 self.groupvar_idx = 0 self._legend.clear() self._legend.hide() def _setup(self): self.plot.clear() self._legend.clear() self._legend.hide() varidx = self.variable_idx self.var = self.cvar = None if varidx >= 0: self.var = self.varmodel[varidx] if self.groupvar_idx > 0: self.cvar = self.groupvarmodel[self.groupvar_idx] self.set_left_axis_name() self.enable_disable_rel_freq() if self.var is None: return if self.cvar: self.contingencies = \ contingency.get_contingency(self.data, self.var, self.cvar) self.display_contingency() else: self.distributions = \ distribution.get_distribution(self.data, self.var) self.display_distribution() def _density_estimator(self): if self.cont_est_type == OWDistributions.Hist: def hist(dist): h, edges = numpy.histogram(dist[0, :], bins=10, weights=dist[1, :]) return edges, h return hist elif self.cont_est_type == OWDistributions.ASH: return lambda dist: ash_curve(dist, m=5) elif self.cont_est_type == OWDistributions.Kernel: return rect_kernel_curve def display_distribution(self): dist = self.distributions var = self.var assert len(dist) > 0 self.plot.clear() bottomaxis = self.plot.getAxis("bottom") bottomaxis.setLabel(var.name) self.set_left_axis_name() if var and var.is_continuous: bottomaxis.setTicks(None) curve_est = self._density_estimator() edges, curve = curve_est(dist) item = pg.PlotCurveItem() item.setData(edges, curve, antialias=True, stepMode=True, fillLevel=0, brush=QtGui.QBrush(Qt.gray), pen=QtGui.QColor(Qt.white)) self.plot.addItem(item) else: bottomaxis.setTicks([list(enumerate(var.values))]) for i, w in enumerate(dist): geom = QtCore.QRectF(i - 0.33, 0, 0.66, w) item = DistributionBarItem(geom, [1.0], [QtGui.QColor(128, 128, 128)]) self.plot.addItem(item) def _on_relative_freq_changed(self): self.set_left_axis_name() if self.cvar and self.cvar.is_discrete: self.display_contingency() else: self.display_distribution() def display_contingency(self): """ Set the contingency to display. """ cont = self.contingencies var, cvar = self.var, self.cvar assert len(cont) > 0 self.plot.clear() self._legend.clear() bottomaxis = self.plot.getAxis("bottom") bottomaxis.setLabel(var.name) palette = colorpalette.ColorPaletteGenerator(len(cvar.values)) colors = [palette[i] for i in range(len(cvar.values))] if var and var.is_continuous: bottomaxis.setTicks(None) weights = numpy.array([numpy.sum(W) for _, W in cont]) weights /= numpy.sum(weights) curve_est = self._density_estimator() curves = [curve_est(dist) for dist in cont] curves = [(X, Y * w) for (X, Y), w in zip(curves, weights)] cum_curves = [curves[0]] for X, Y in curves[1:]: cum_curves.append(sum_rect_curve(X, Y, *cum_curves[-1])) for (X, Y), color in reversed(list(zip(cum_curves, colors))): item = pg.PlotCurveItem() pen = QtGui.QPen(QtGui.QBrush(Qt.white), 0.5) pen.setCosmetic(True) item.setData(X, Y, antialias=True, stepMode=True, fillLevel=0, brush=QtGui.QBrush(color.lighter()), pen=pen) self.plot.addItem(item) # # XXX: sum the individual curves and not the distributions. # # The conditional distributions might be 'smoother' than # # the cumulative one # cum_dist = [cont[0]] # for dist in cont[1:]: # cum_dist.append(dist_sum(dist, cum_dist[-1])) # # curves = [rect_kernel_curve(dist) for dist in cum_dist] # colors = [Qt.blue, Qt.red, Qt.magenta] # for (X, Y), color in reversed(list(zip(curves, colors))): # item = pg.PlotCurveItem() # item.setData(X, Y, antialias=True, stepMode=True, # fillLevel=0, brush=QtGui.QBrush(color)) # item.setPen(QtGui.QPen(color)) # self.plot.addItem(item) elif var and var.is_discrete: bottomaxis.setTicks([list(enumerate(var.values))]) cont = numpy.array(cont) for i, (value, dist) in enumerate(zip(var.values, cont.T)): dsum = sum(dist) geom = QtCore.QRectF(i - 0.333, 0, 0.666, 100 if self.relative_freq else dsum) item = DistributionBarItem(geom, dist / dsum, colors) self.plot.addItem(item) for color, name in zip(colors, cvar.values): self._legend.addItem( ScatterPlotItem(pen=color, brush=color, size=10, shape="s"), name ) self._legend.show() def set_left_axis_name(self): set_label = self.plot.getAxis("left").setLabel if (self.var and self.var.is_continuous and self.cont_est_type != OWDistributions.Hist): set_label("Density") else: set_label(["Frequency", "Relative frequency"] [self.cvar is not None and self.relative_freq]) def enable_disable_rel_freq(self): self.cb_rel_freq.setDisabled( self.var is None or self.cvar is None or self.var.is_continuous) def _on_variable_idx_changed(self): self.variable_idx = selected_index(self.varview) self._setup() def _on_groupvar_idx_changed(self): self.groupvar_idx = selected_index(self.groupvarview) self._setup() def _on_cont_est_type_changed(self): self.set_left_axis_name() if self.data is not None: self._setup() def onDeleteWidget(self): self.plot.clear() super().onDeleteWidget()