class OWTreeGraph(OWTreeViewer2D):
    """Graphical visualization of tree models.

    The widget receives a tree model on the "Tree" input, draws one
    ``TreeNode`` per tree node, and outputs the instances belonging to the
    selected nodes ("Selected Data") together with an annotated copy of the
    model's training data.
    """

    name = "Tree Viewer"
    icon = "icons/TreeViewer.svg"
    priority = 35
    inputs = [
        widget.InputSignal(
            "Tree", TreeModel, "ctree",
            # Had different input names before merging from
            # Classification/Regression tree variants
            replaces=["Classification Tree", "Regression Tree"])
    ]
    outputs = [
        widget.OutputSignal(
            "Selected Data", Table, widget.Default, id="selected-data",
        ),
        widget.OutputSignal(ANNOTATED_DATA_SIGNAL_NAME, Table,
                            id="annotated-data")
    ]

    settingsHandler = ClassValuesContextHandler()
    # Combo index for classification trees: 0 is "None", i > 0 is the
    # (i - 1)-th class value.
    target_class_index = ContextSetting(0)
    # Combo index for regression trees; one of the COL_* constants below.
    regression_colors = Setting(0)

    replaces = [
        "Orange.widgets.classify.owclassificationtreegraph.OWClassificationTreeGraph",
        "Orange.widgets.classify.owregressiontreegraph.OWRegressionTreeGraph"
    ]

    COL_OPTIONS = ["Default", "Number of instances", "Mean value", "Variance"]
    COL_DEFAULT, COL_INSTANCE, COL_MEAN, COL_VARIANCE = range(4)

    def __init__(self):
        super().__init__()
        self.domain = None
        self.dataset = None
        self.clf_dataset = None
        self.tree_adapter = None

        self.color_label = QLabel("Target class: ")
        combo = self.color_combo = gui.OrangeComboBox()
        combo.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)
        combo.setSizeAdjustPolicy(
            QComboBox.AdjustToMinimumContentsLengthWithIcon)
        combo.setMinimumContentsLength(8)
        combo.activated[int].connect(self.color_changed)
        self.display_box.layout().addRow(self.color_label, combo)

    def set_node_info(self):
        """Refresh the text of every node and give all nodes equal width."""
        for node in self.scene.nodes():
            # Reset the rectangle before the text is regenerated.
            node.set_rect(QRectF())
            self.update_node_info(node)
        # The appended 0 keeps max() valid when the scene has no nodes.
        w = max([n.rect().width() for n in self.scene.nodes()] + [0])
        if w > self.max_node_width:
            w = self.max_node_width
        for node in self.scene.nodes():
            rect = node.rect()
            node.set_rect(QRectF(rect.x(), rect.y(), w, rect.height()))
        self.scene.fix_pos(self.root_node, 10, 10)

    def _update_node_info_attr_name(self, node, text):
        """Return `text` with the node's attribute name appended, if any."""
        attr = self.tree_adapter.attribute(node.node_inst)
        if attr is not None:
            text += "<hr/>{}".format(attr.name)
        return text

    def activate_loaded_settings(self):
        """Apply the stored settings to the combo, node colors and texts."""
        if not self.model:
            return
        super().activate_loaded_settings()
        if self.domain.class_var.is_discrete:
            self.color_combo.setCurrentIndex(self.target_class_index)
            self.toggle_node_color_cls()
        else:
            self.color_combo.setCurrentIndex(self.regression_colors)
            self.toggle_node_color_reg()
        self.set_node_info()

    def color_changed(self, i):
        """Handle activation of item `i` in the color combo box."""
        if self.domain.class_var.is_discrete:
            self.target_class_index = i
            self.toggle_node_color_cls()
            # Node text depends on the target class, so regenerate it too.
            self.set_node_info()
        else:
            self.regression_colors = i
            self.toggle_node_color_reg()

    def toggle_node_size(self):
        self.set_node_info()
        self.scene.update()
        self.scene_view.repaint()

    def toggle_color_cls(self):
        self.toggle_node_color_cls()
        self.set_node_info()
        self.scene.update()

    def toggle_color_reg(self):
        self.toggle_node_color_reg()
        self.set_node_info()
        self.scene.update()

    def ctree(self, model=None):
        """Input signal handler.

        Clears the scene and the color combo, stores `model`, rebuilds the
        node structure (when a model is given) and resets both outputs.
        """
        self.clear_scene()
        self.color_combo.clear()
        self.closeContext()
        self.model = model
        if model is None:
            self.info.setText('No tree.')
            self.root_node = None
            self.dataset = None
            # Drop the converted table as well so that no data of the
            # removed model stays referenced.
            self.clf_dataset = None
            self.tree_adapter = None
        else:
            self.tree_adapter = self._get_tree_adapter(model)
            self.domain = model.domain
            self.dataset = model.instances
            if self.dataset is not None and \
                    self.dataset.domain != self.domain:
                # Selection works on data expressed in the model's domain.
                self.clf_dataset = Table.from_table(model.domain, self.dataset)
            else:
                self.clf_dataset = self.dataset
            class_var = self.domain.class_var
            if class_var.is_discrete:
                self.scene.colors = [QColor(*col) for col in class_var.colors]
                self.color_label.setText("Target class: ")
                self.color_combo.addItem("None")
                self.color_combo.addItems(self.domain.class_vars[0].values)
                self.color_combo.setCurrentIndex(self.target_class_index)
            else:
                self.scene.colors = \
                    ContinuousPaletteGenerator(*model.domain.class_var.colors)
                self.color_label.setText("Color by: ")
                self.color_combo.addItems(self.COL_OPTIONS)
                self.color_combo.setCurrentIndex(self.regression_colors)
            self.openContext(self.domain.class_var)
            self.root_node = self.walkcreate(self.tree_adapter.root)
            self.info.setText('{} nodes, {} leaves'.format(
                self.tree_adapter.num_nodes,
                len(self.tree_adapter.leaves(self.tree_adapter.root))))
        self.setup_scene()
        self.send("Selected Data", None)
        self.send(ANNOTATED_DATA_SIGNAL_NAME,
                  create_annotated_table(self.dataset, []))

    def walkcreate(self, node, parent=None):
        """Create a structure of tree nodes from the given model.

        Adds a `TreeNode` for `node` to the scene, connects it to `parent`
        with a `GraphicsEdge` when a parent is given, and recurses into the
        children reported by the tree adapter. Returns the created node.
        """
        node_obj = TreeNode(self.tree_adapter, node, parent)
        self.scene.addItem(node_obj)
        if parent:
            edge = GraphicsEdge(node1=parent, node2=node_obj)
            self.scene.addItem(edge)
            parent.graph_add_edge(edge)
        for child_inst in self.tree_adapter.children(node):
            if child_inst is not None:
                self.walkcreate(child_inst, node_obj)
        return node_obj

    def node_tooltip(self, node):
        """Return the node's rules as HTML, one rule per line."""
        return "<br>".join(
            to_html(str(rule))
            for rule in self.tree_adapter.rules(node.node_inst))

    def update_selection(self):
        """Send the data for the currently selected tree nodes."""
        if self.model is None:
            return
        nodes = [
            item.node_inst for item in self.scene.selectedItems()
            if isinstance(item, TreeNode)
        ]
        data = self.tree_adapter.get_instances_in_nodes(
            self.clf_dataset, nodes)
        self.send("Selected Data", data)
        self.send(
            ANNOTATED_DATA_SIGNAL_NAME,
            create_annotated_table(self.dataset,
                                   self.tree_adapter.get_indices(nodes)))

    def send_report(self):
        """Report the tree size, edge width mode, coloring and the plot."""
        if not self.model:
            return
        items = [
            ("Tree size", self.info.text()),
            ("Edge widths",
             ("Fixed", "Relative to root", "Relative to parent")[
                 # pylint: disable=invalid-sequence-index
                 self.line_width_method])
        ]
        if self.domain.class_var.is_discrete:
            items.append(("Target class", self.color_combo.currentText()))
        elif self.regression_colors != self.COL_DEFAULT:
            items.append(
                ("Color by", self.COL_OPTIONS[self.regression_colors]))
        self.report_items(items)
        self.report_plot(self.scene)

    def update_node_info(self, node):
        """Dispatch to the classification or regression text updater."""
        if self.domain.class_var.is_discrete:
            self.update_node_info_cls(node)
        else:
            self.update_node_info_reg(node)

    def update_node_info_cls(self, node):
        """Update the printed contents of the node for classification trees"""
        node_inst = node.node_inst
        distr = self.tree_adapter.get_distribution(node_inst)[0]
        total = self.tree_adapter.num_samples(node_inst)
        # Guard against an all-zero distribution; dividing by zero would
        # produce NaNs that cannot be converted to int below.
        distr = distr / (np.sum(distr) or 1)
        if self.target_class_index:
            tabs = distr[self.target_class_index - 1]
            text = ""
        else:
            modus = np.argmax(distr)
            tabs = distr[modus]
            text = self.domain.class_vars[0].values[int(modus)] + "<br/>"
        if tabs > 0.999:
            text += "100%, {}/{}".format(total, total)
        else:
            # Round rather than truncate: total * tabs is a float product,
            # and e.g. 100 * 0.29 evaluates to 28.999999999999996.
            text += "{:2.1f}%, {}/{}".format(
                100 * tabs, int(round(total * tabs)), total)

        text = self._update_node_info_attr_name(node, text)
        node.setHtml('<p style="line-height: 120%; margin-bottom: 0">'
                     '{}</p>'.format(text))

    def update_node_info_reg(self, node):
        """Update the printed contents of the node for regression trees"""
        node_inst = node.node_inst
        mean, var = self.tree_adapter.get_distribution(node_inst)[0]
        insts = self.tree_adapter.num_samples(node_inst)
        text = "{:.1f} ± {:.1f}<br/>".format(mean, var)
        text += "{} instances".format(insts)
        text = self._update_node_info_attr_name(node, text)
        node.setHtml(
            '<p style="line-height: 120%; margin-bottom: 0">{}</p>'.format(
                text))

    def toggle_node_color_cls(self):
        """Update the node color for classification trees"""
        colors = self.scene.colors
        for node in self.scene.nodes():
            distr = node.tree_adapter.get_distribution(node.node_inst)[0]
            # `or 1` protects both branches below from division by zero.
            total = sum(distr) or 1
            # QColor.lighter() takes an integer factor, hence the int().
            if self.target_class_index:
                p = distr[self.target_class_index - 1] / total
                color = colors[self.target_class_index - 1].lighter(
                    int(200 - 100 * p))
            else:
                modus = np.argmax(distr)
                p = distr[modus] / total
                color = colors[int(modus)].lighter(int(300 - 200 * p))
            node.backgroundBrush = QBrush(color)
        self.scene.update()

    def toggle_node_color_reg(self):
        """Update the node color for regression trees"""
        def_color = QColor(192, 192, 255)
        if self.regression_colors == self.COL_DEFAULT:
            brush = QBrush(def_color.lighter(100))
            for node in self.scene.nodes():
                node.backgroundBrush = brush
        elif self.regression_colors == self.COL_INSTANCE:
            # `or 1` avoids dividing by zero when the root holds no rows.
            max_insts = len(
                self.tree_adapter.get_instances_in_nodes(
                    self.dataset, [self.tree_adapter.root])) or 1
            for node in self.scene.nodes():
                node_insts = len(
                    self.tree_adapter.get_instances_in_nodes(
                        self.dataset, [node.node_inst]))
                node.backgroundBrush = QBrush(
                    def_color.lighter(
                        int(120 - 20 * node_insts / max_insts)))
        elif self.regression_colors == self.COL_MEAN:
            minv = np.nanmin(self.dataset.Y)
            maxv = np.nanmax(self.dataset.Y)
            fact = 1 / (maxv - minv) if minv != maxv else 1
            colors = self.scene.colors
            for node in self.scene.nodes():
                node_mean = self.tree_adapter.get_distribution(
                    node.node_inst)[0][0]
                node.backgroundBrush = QBrush(colors[fact * (node_mean - minv)])
        else:
            nodes = list(self.scene.nodes())
            variances = [
                self.tree_adapter.get_distribution(node.node_inst)[0][1]
                for node in nodes
            ]
            # default=0 covers an empty scene; `or 1` covers the case in
            # which every node has zero variance.
            max_var = max(variances, default=0) or 1
            for node, var in zip(nodes, variances):
                node.backgroundBrush = QBrush(
                    def_color.lighter(int(120 - 20 * var / max_var)))
        self.scene.update()

    def _get_tree_adapter(self, model):
        """Return the adapter class instance matching the model type."""
        if isinstance(model, SklModel):
            return SklTreeAdapter(model)
        return TreeAdapter(model)
class OWMergeData(widget.OWWidget):
    """Widget that merges "Data" with "Extra Data" on a chosen key pair.

    The `merging` setting selects one of three operations ("augment",
    "merge", "combine"); each has its own pair of key settings
    (`attr_<operation>_data` / `attr_<operation>_extra`).
    """

    name = "Merge Data"
    description = "Merge data sets based on the values of selected features."
    icon = "icons/MergeData.svg"
    priority = 1110

    inputs = [
        widget.InputSignal("Data", Orange.data.Table, "setData",
                           widget.Default, replaces=["Data A"]),
        widget.InputSignal("Extra Data", Orange.data.Table, "setExtraData",
                           replaces=["Data B"])
    ]
    outputs = [
        widget.OutputSignal(
            "Data", Orange.data.Table,
            replaces=["Merged Data A+B", "Merged Data B+A", "Merged Data"])
    ]

    attr_augment_data = settings.Setting('', schema_only=True)
    attr_augment_extra = settings.Setting('', schema_only=True)
    attr_merge_data = settings.Setting('', schema_only=True)
    attr_merge_extra = settings.Setting('', schema_only=True)
    attr_combine_data = settings.Setting('', schema_only=True)
    attr_combine_extra = settings.Setting('', schema_only=True)
    # Index of the selected operation: 0 augment, 1 merge, 2 combine
    # (see the list in `merge`).
    merging = settings.Setting(0)

    want_main_area = False
    resizing_enabled = False

    class Warning(widget.OWWidget.Warning):
        duplicate_names = widget.Msg("Duplicate variable names in output.")

    def __init__(self):
        super().__init__()
        self.data = None
        self.extra_data = None

        self.model = itemmodels.VariableListModel()
        self.model_unique_with_id = itemmodels.VariableListModel()
        self.extra_model_unique = itemmodels.VariableListModel()
        self.extra_model_unique_with_id = itemmodels.VariableListModel()

        box = gui.hBox(self.controlArea, box=None)
        self.infoBoxData = gui.label(
            box, self, self.dataInfoText(None), box="Data")
        self.infoBoxExtraData = gui.label(
            box, self, self.dataInfoText(None), box="Extra Data")

        grp = gui.radioButtonsInBox(
            self.controlArea, self, "merging", box="Merging",
            callback=self.change_merging)
        self.attr_boxes = []

        radio_width = \
            QApplication.style().pixelMetric(QStyle.PM_ExclusiveIndicatorWidth)

        def add_option(label, pre_label, between_label,
                       merge_type, model, extra_model):
            """Add a radio button plus its row with two key combo boxes."""
            gui.appendRadioButton(grp, label)
            vbox = gui.vBox(grp)
            box = gui.hBox(vbox)
            box.layout().addSpacing(radio_width)
            self.attr_boxes.append(box)
            gui.widgetLabel(box, pre_label)
            # Seed the models with the stored values before the combos
            # are bound to them.
            model[:] = [getattr(self, 'attr_{}_data'.format(merge_type))]
            extra_model[:] = [
                getattr(self, 'attr_{}_extra'.format(merge_type))
            ]
            cb = gui.comboBox(box, self, 'attr_{}_data'.format(merge_type),
                              callback=self._invalidate, model=model)
            cb.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)
            cb.setFixedWidth(190)
            gui.widgetLabel(box, between_label)
            cb = gui.comboBox(box, self, 'attr_{}_extra'.format(merge_type),
                              callback=self._invalidate, model=extra_model)
            cb.setSizePolicy(QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)
            cb.setFixedWidth(190)
            vbox.layout().addSpacing(6)

        add_option("Append columns from Extra Data",
                   "by matching", "with", "augment",
                   self.model, self.extra_model_unique)
        add_option("Find matching rows", "where",
                   "equals", "merge",
                   self.model_unique_with_id, self.extra_model_unique_with_id)
        add_option("Concatenate tables, merge rows",
                   "where", "equals", "combine",
                   self.model_unique_with_id, self.extra_model_unique_with_id)
        self.set_merging()

    def set_merging(self):
        """Show the key row of the selected operation and hide the rest."""
        # pylint: disable=invalid-sequence-index
        # all boxes should be hidden before one is shown, otherwise widget's
        # layout changes height
        for box in self.attr_boxes:
            box.hide()
        self.attr_boxes[self.merging].show()

    def change_merging(self):
        self.set_merging()
        self._invalidate()

    @staticmethod
    def _set_unique_model(data, model):
        """Fill `model` with INDEX and the columns whose known values are
        all distinct; empty the model when `data` is None."""
        if data is None:
            model[:] = []
            return
        m = [INDEX]
        for attr in chain(data.domain.variables, data.domain.metas):
            col = data.get_column_view(attr)[0]
            # Missing values (NaN or empty string) are ignored when
            # testing for uniqueness.
            if attr.is_primitive():
                col = col.astype(float)
                col = col[~np.isnan(col)]
            else:
                col = col[~(col == "")]
            if len(np.unique(col)) == len(col):
                m.append(attr)
        model[:] = m

    @staticmethod
    def _set_model(data, model):
        """Fill `model` with INDEX and all variables and metas of `data`;
        empty the model when `data` is None."""
        if data is None:
            model[:] = []
            return
        model[:] = list(chain([INDEX], data.domain, data.domain.metas))

    def _add_instanceid_to_models(self):
        """Offer INSTANCEID as a key only when both tables share some ids."""
        needs_id = self.data is not None and self.extra_data is not None and \
            len(np.intersect1d(self.data.ids, self.extra_data.ids))
        for model in (self.model_unique_with_id,
                      self.extra_model_unique_with_id):
            has_id = INSTANCEID in model
            if needs_id and not has_id:
                model.insert(0, INSTANCEID)
            elif not needs_id and has_id:
                model.remove(INSTANCEID)

    def _init_combo_current_items(self, variables, models):
        """Reset each named setting to INDEX if its value is not in the
        corresponding (non-empty) model."""
        for var, model in zip(variables, models):
            value = getattr(self, var)
            if len(model) > 0:
                setattr(self, var, value if value in model else INDEX)

    def _find_best_match(self):
        """Pre-select the pair of string columns sharing the most values.

        A pair is applied only to operations whose two keys are both still
        INDEX.
        """
        def get_unique_str_metas_names(model_):
            return [m for m in model_ if isinstance(m, StringVariable)]

        def best_match(model, extra_model):
            attr, extra_attr, n_max_intersect = INDEX, INDEX, 0
            str_metas = get_unique_str_metas_names(model)
            extra_str_metas = get_unique_str_metas_names(extra_model)
            for m_a, m_b in product(str_metas, extra_str_metas):
                n_inter = len(
                    np.intersect1d(self.data[:, m_a].metas,
                                   self.extra_data[:, m_b].metas))
                if n_inter > n_max_intersect:
                    n_max_intersect, attr, extra_attr = n_inter, m_a, m_b
            return attr, extra_attr

        def set_attrs(attr_name, attr_extra_name, attr, extra_attr):
            if getattr(self, attr_name) == INDEX and \
                    getattr(self, attr_extra_name) == INDEX:
                setattr(self, attr_name, attr)
                setattr(self, attr_extra_name, extra_attr)

        if self.data and self.extra_data:
            attrs = best_match(self.model, self.extra_model_unique)
            set_attrs("attr_augment_data", "attr_augment_extra", *attrs)
            attrs = best_match(self.model_unique_with_id,
                               self.extra_model_unique_with_id)
            set_attrs("attr_merge_data", "attr_merge_extra", *attrs)
            set_attrs("attr_combine_data", "attr_combine_extra", *attrs)

    @check_sql_input
    def setData(self, data):
        """Handler for the "Data" input."""
        self.data = data
        self._set_model(data, self.model)
        self._set_unique_model(data, self.model_unique_with_id)
        self._add_instanceid_to_models()
        self._init_combo_current_items(
            ("attr_augment_data", "attr_merge_data", "attr_combine_data"),
            (self.model, self.model_unique_with_id,
             self.model_unique_with_id))
        self.infoBoxData.setText(self.dataInfoText(data))
        self._find_best_match()

    @check_sql_input
    def setExtraData(self, data):
        """Handler for the "Extra Data" input."""
        self.extra_data = data
        self._set_unique_model(data, self.extra_model_unique)
        self._set_unique_model(data, self.extra_model_unique_with_id)
        self._add_instanceid_to_models()
        self._init_combo_current_items(
            ("attr_augment_extra", "attr_merge_extra", "attr_combine_extra"),
            (self.extra_model_unique, self.extra_model_unique_with_id,
             self.extra_model_unique_with_id))
        self.infoBoxExtraData.setText(self.dataInfoText(data))
        self._find_best_match()

    def handleNewSignals(self):
        self._invalidate()

    def dataInfoText(self, data):
        """Return the text shown in an info box for `data` (may be None)."""
        if data is None:
            return "No data."
        else:
            return "{}\n{} instances\n{} variables".format(
                data.name, len(data),
                len(data.domain) + len(data.domain.metas))

    def commit(self):
        """Merge the inputs and send the result, or None if either input
        is missing or empty."""
        self.Warning.duplicate_names.clear()
        if self.data is None or len(self.data) == 0 or \
                self.extra_data is None or len(self.extra_data) == 0:
            merged_data = None
        else:
            merged_data = self.merge()
            if merged_data:
                merged_domain = merged_data.domain
                var_names = [
                    var.name for var in chain(merged_domain.variables,
                                              merged_domain.metas)
                ]
                if len(set(var_names)) != len(var_names):
                    self.Warning.duplicate_names()
        self.send("Data", merged_data)

    def _invalidate(self):
        self.commit()

    def send_report(self):
        # pylint: disable=invalid-sequence-index
        attr = (self.attr_augment_data, self.attr_merge_data,
                self.attr_combine_data)
        extra_attr = (self.attr_augment_extra, self.attr_merge_extra,
                      self.attr_combine_extra)
        merging_types = ("Append columns from Extra Data",
                         "Find matching rows",
                         "Concatenate tables, merge rows")
        self.report_items((("Merging", merging_types[self.merging]),
                           ("Data attribute", attr[self.merging]),
                           ("Extra data attribute",
                            extra_attr[self.merging])))

    def merge(self):
        """Run the selected operation and return the merged table
        (None when no row pairs result)."""
        # pylint: disable=invalid-sequence-index
        operation = ["augment", "merge", "combine"][self.merging]
        var_data = getattr(self, "attr_{}_data".format(operation))
        var_extra_data = getattr(self, "attr_{}_extra".format(operation))
        merge_method = getattr(self, "_{}_indices".format(operation))

        # Keys are compared numerically only if both are continuous.
        as_string = not (isinstance(var_data, ContinuousVariable)
                         and isinstance(var_extra_data, ContinuousVariable))
        extra_map = self._get_keymap(self.extra_data, var_extra_data,
                                     as_string)
        match_indices = merge_method(var_data, extra_map, as_string)
        reduced_extra_data = self._compute_reduced_extra_data(var_extra_data)
        return self._join_table_by_indices(reduced_extra_data, match_indices)

    def _compute_reduced_extra_data(self, var_extra_data):
        """Prepare a table with extra columns that will appear in the merged
        table"""
        domain = self.data.domain
        extra_domain = self.extra_data.domain
        all_vars = set(chain(domain.variables, domain.metas))
        if self.merging != MergeType.OUTER_JOIN:
            all_vars.add(var_extra_data)
        extra_vars = chain(extra_domain.variables, extra_domain.metas)
        return self.extra_data[:, [
            var for var in extra_vars if var not in all_vars
        ]]

    @staticmethod
    def _values(data, var, as_string):
        """Return an iterator over keys for rows of the table."""
        if var == INSTANCEID:
            return (inst.id for inst in data)
        if var == INDEX:
            return range(len(data))
        col = data.get_column_view(var)[0]
        if not as_string:
            return col
        if var.is_primitive():
            return (var.str_val(val) if not np.isnan(val) else np.nan
                    for val in col)
        else:
            return (str(val) if val else np.nan for val in col)

    @classmethod
    def _get_keymap(cls, data, var, as_string):
        """Return a generator of pairs (key, index) by enumerating and
        switching the values for rows (method `_values`).
        """
        return ((val, i)
                for i, val in enumerate(cls._values(data, var, as_string)))

    def _augment_indices(self, var_data, extra_map, as_string):
        """Compute a two-row array of indices:
        - the first row contains indices for the primary table,
        - the second row contains the matching rows in the extra table or -1"""
        data = self.data
        extra_map = dict(extra_map)
        # Don't match nans. This is needed since numpy supports using nan as
        # keys. If numpy fixes this, the below conditions will always be false,
        # so we're OK again.
        if np.nan in extra_map:
            del extra_map[np.nan]
        keys = (extra_map.get(val, -1)
                for val in self._values(data, var_data, as_string))
        return np.vstack((np.arange(len(data), dtype=np.int64),
                          np.fromiter(keys, dtype=np.int64, count=len(data))))

    def _merge_indices(self, var_data, extra_map, as_string):
        """Use _augment_indices to compute the array of indices,
        then remove those with no match in the second table"""
        augmented = self._augment_indices(var_data, extra_map, as_string)
        return augmented[:, augmented[1] != -1]

    def _combine_indices(self, var_data, extra_map, as_string):
        """Use _augment_indices to compute the array of indices,
        then add rows in the second table without a match in the first"""
        # extra_map is a generator; it is needed twice, hence tee.
        to_add, extra_map = tee(extra_map)
        # dict instead of set because we have pairs; we'll need only keys
        key_map = dict(self._get_keymap(self.data, var_data, as_string))
        # _augment indices will skip rows where the key in the left table
        # is nan. See comment in `_augment_indices` wrt numpy and nan in dicts
        if np.nan in key_map:
            del key_map[np.nan]
        keys = np.fromiter((j for key, j in to_add if key not in key_map),
                           dtype=np.int64)
        right_indices = np.vstack((np.full(len(keys), -1, np.int64), keys))
        return np.hstack(
            (self._augment_indices(var_data, extra_map, as_string),
             right_indices))

    def _join_table_by_indices(self, reduced_extra, indices):
        """Join (horizontally) self.data and reduced_extra, taking the pairs
        of rows given in indices.

        Returns None when `indices` contains no pairs.
        """
        # `indices` is a two-row array, so len(indices) is 2 even when it
        # holds no pairs; `size` is what tells whether there are any.
        if not indices.size:
            return None
        domain = Orange.data.Domain(
            *(getattr(self.data.domain, x) + getattr(reduced_extra.domain, x)
              for x in ("attributes", "class_vars", "metas")))
        X = self._join_array_by_indices(self.data.X, reduced_extra.X, indices)
        # np.c_ turns a one-dimensional Y into a column so that it can be
        # joined like the other two-dimensional parts.
        Y = self._join_array_by_indices(
            np.c_[self.data.Y], np.c_[reduced_extra.Y], indices)
        string_cols = [
            i for i, var in enumerate(domain.metas) if var.is_string
        ]
        metas = self._join_array_by_indices(
            self.data.metas, reduced_extra.metas, indices, string_cols)
        return Orange.data.Table.from_numpy(domain, X, Y, metas)

    @staticmethod
    def _join_array_by_indices(left, right, indices, string_cols=None):
        """Join (horizontally) two arrays, taking pairs of rows given in
        indices.

        Cells without a source row (index -1) stay NaN, or "" in the
        columns listed in `string_cols`.
        """
        tpe = object if object in (left.dtype, right.dtype) else left.dtype
        left_width, right_width = left.shape[1], right.shape[1]
        arr = np.full((indices.shape[1], left_width + right_width),
                      np.nan, tpe)
        if string_cols:
            arr[:, string_cols] = ""
        # `to_change` are views into `arr`, so assignments fill `arr`.
        for side_indices, to_change, lookup in (
                (indices[0], arr[:, :left_width], left),
                (indices[1], arr[:, left_width:], right)):
            known = side_indices != -1
            to_change[known] = lookup[side_indices[known]]
        return arr