示例#1
0
class OWItemsets(widget.OWWidget):
    name = 'Frequent Itemsets'
    description = 'Explore sets of items that frequently appear together.'
    icon = 'icons/FrequentItemsets.svg'
    priority = 10

    class Inputs:
        data = Input("Data", Table)

    class Outputs:
        matching_data = Output("Matching Data", Table)

    class Error(widget.OWWidget.Error):
        need_discrete_data = widget.Msg(
            "Need some discrete data to work with.")
        no_disc_features = widget.Msg(
            "Discrete features required but data has none.")

    class Warning(widget.OWWidget.Warning):
        cont_attrs = widget.Msg(
            "Data has continuous attributes which will be skipped.")
        err_reg_expression = widget.Msg("Error in regular expression: {}")

    minSupport = settings.Setting(30)
    maxItemsets = settings.Setting(10000)
    filterSearch = settings.Setting(True)
    autoFind = settings.Setting(False)
    autoSend = settings.Setting(True)
    filterKeywords = settings.Setting('')
    filterMinItems = settings.Setting(1)
    filterMaxItems = settings.Setting(10000)

    UserAdviceMessages = [
        widget.Message(
            'Itemset are listed in item-sorted order, i.e. '
            'an itemset containing A and B is only listed once, as '
            'A > B (and not also B > A).', 'itemsets-order',
            widget.Message.Warning),
        widget.Message(
            'To select all the itemsets that are descendants of '
            '(include) some item X (i.e. the whole subtree), you '
            'can fold the subtree at that item and then select it.',
            'itemsets-order', widget.Message.Information)
    ]

    def __init__(self):
        self.data = None
        self.output = None
        self._is_running = False
        self.isRegexMatch = lambda x: True
        self.tree = QTreeWidget(self.mainArea,
                                columnCount=2,
                                allColumnsShowFocus=True,
                                alternatingRowColors=True,
                                selectionMode=QTreeWidget.ExtendedSelection,
                                uniformRowHeights=True)
        self.tree.setHeaderLabels(["Itemsets", "Support", "%"])
        self.tree.header().setStretchLastSection(True)
        self.tree.itemSelectionChanged.connect(self.selectionChanged)
        self.mainArea.layout().addWidget(self.tree)

        box = gui.widgetBox(self.controlArea, "Info")
        self.nItemsets = self.nSelectedExamples = self.nSelectedItemsets = ''
        gui.label(box, self, "Number of itemsets: %(nItemsets)s")
        gui.label(box, self, "Selected itemsets: %(nSelectedItemsets)s")
        gui.label(box, self, "Selected examples: %(nSelectedExamples)s")
        hbox = gui.widgetBox(box, orientation='horizontal')
        gui.button(hbox, self, "Expand all", callback=self.tree.expandAll)
        gui.button(hbox, self, "Collapse all", callback=self.tree.collapseAll)

        box = gui.widgetBox(self.controlArea, 'Find itemsets')
        gui.valueSlider(box,
                        self,
                        'minSupport',
                        values=[.0001, .0005, .001, .005, .01, .05, .1, .5] +
                        list(range(1, 101)),
                        label='Minimal support:',
                        labelFormat="%g%%",
                        callback=lambda: self.find_itemsets())
        gui.hSlider(box,
                    self,
                    'maxItemsets',
                    minValue=10000,
                    maxValue=100000,
                    step=10000,
                    label='Max. number of itemsets:',
                    labelFormat="%d",
                    callback=lambda: self.find_itemsets())
        self.button = gui.auto_commit(
            box,
            self,
            'autoFind',
            'Find Itemsets',
            commit=self.find_itemsets,
            callback=lambda: self.autoFind and self.find_itemsets())

        box = gui.widgetBox(self.controlArea, 'Filter itemsets')
        gui.lineEdit(box,
                     self,
                     'filterKeywords',
                     'Contains:',
                     callback=self.filter_change,
                     orientation='horizontal',
                     tooltip='A comma or space-separated list of regular '
                     'expressions.')
        hbox = gui.widgetBox(box, orientation='horizontal')
        gui.spin(hbox,
                 self,
                 'filterMinItems',
                 1,
                 998,
                 label='Min. items:',
                 callback=self.filter_change)
        gui.spin(hbox,
                 self,
                 'filterMaxItems',
                 2,
                 999,
                 label='Max. items:',
                 callback=self.filter_change)
        gui.checkBox(box,
                     self,
                     'filterSearch',
                     label='Apply these filters in search',
                     tooltip='If checked, the itemsets are filtered according '
                     'to these filter conditions already in the search '
                     'phase. \nIf unchecked, the only filters applied '
                     'during search are the ones above, '
                     'and the itemsets are \nfiltered afterwards only for '
                     'display, i.e. only the matching itemsets are shown.')

        gui.rubber(hbox)

        gui.rubber(self.controlArea)
        gui.auto_commit(self.controlArea, self, 'autoSend', 'Send selection')

        self.filter_change()

    ITEM_DATA_ROLE = Qt.UserRole + 1

    def selectionChanged(self):
        X = self.X
        mapping = self.onehot_mapping
        instances = set()
        where = np.where

        def whole_subtree(node):
            yield node
            for i in range(node.childCount()):
                yield from whole_subtree(node.child(i))

        def itemset(node):
            while node:
                yield node.data(0, self.ITEM_DATA_ROLE)
                node = node.parent()

        def selection_ranges(node):
            n_children = node.childCount()
            if n_children:
                yield (self.tree.indexFromItem(node.child(0)),
                       self.tree.indexFromItem(node.child(n_children - 1)))
            for i in range(n_children):
                yield from selection_ranges(node.child(i))

        nSelectedItemsets = 0
        item_selection = QItemSelection()
        for node in self.tree.selectedItems():
            nodes = (node, ) if node.isExpanded() else whole_subtree(node)
            if not node.isExpanded():
                for srange in selection_ranges(node):
                    item_selection.select(*srange)
            for node in nodes:
                nSelectedItemsets += 1
                cols, vals = zip(*(mapping[i] for i in itemset(node)))
                if issparse(X):
                    rows = (len(cols) == np.bincount(
                        (X[:, cols] != 0).indices,
                        minlength=X.shape[0])).nonzero()[0]
                else:
                    rows = where((X[:, cols] == vals).all(axis=1))[0]
                instances.update(rows)
        self.tree.itemSelectionChanged.disconnect(self.selectionChanged)
        self.tree.selectionModel().select(
            item_selection,
            QItemSelectionModel.Select | QItemSelectionModel.Rows)
        self.tree.itemSelectionChanged.connect(self.selectionChanged)

        self.nSelectedExamples = len(instances)
        self.nSelectedItemsets = nSelectedItemsets
        self.output = self.data[sorted(instances)] or None
        self.commit()

    def commit(self):
        self.Outputs.matching_data.send(self.output)

    def filter_change(self):
        self.Warning.err_reg_expression.clear()
        try:
            isRegexMatch = self.isRegexMatch = re.compile(
                '|'.join(
                    i.strip()
                    for i in re.split('(,|\s)+', self.filterKeywords.strip())
                    if i.strip()), re.IGNORECASE).search
        except Exception as e:
            self.Warning.err_reg_expression(e.args[0])
            isRegexMatch = self.isRegexMatch = lambda x: True

        def hide(node, depth, has_kw):
            if not has_kw:
                has_kw = isRegexMatch(node.text(0))
            hidden = (
                sum(
                    hide(node.child(i), depth + 1, has_kw)
                    for i in range(node.childCount())) == node.childCount()
                if node.childCount() else
                (not has_kw
                 or not self.filterMinItems <= depth <= self.filterMaxItems))
            node.setHidden(hidden)
            return hidden

        hide(self.tree.invisibleRootItem(), 0, False)

    class TreeWidgetItem(QTreeWidgetItem):
        def data(self, column, role):
            """Construct lazy tooltips"""
            if role != Qt.ToolTipRole:
                return super().data(column, role)
            tooltip = []
            while self:
                tooltip.append(self.text(0))
                self = self.parent()
            return '\n'.join(reversed(tooltip))

    def find_itemsets(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            self._is_running = False
            return
        self._is_running = True

        self.button.button.setText('Cancel')

        data = self.data
        self.tree.clear()
        self.tree.setUpdatesEnabled(False)
        self.tree.blockSignals(True)

        class ItemDict(dict):
            def __init__(self, item):
                self.item = item

        top = ItemDict(self.tree.invisibleRootItem())
        X, mapping = OneHot.encode(data)
        self.Error.need_discrete_data.clear()
        if X is None:
            self.Error.need_discrete_data()

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {
            item: ITEM_FMT.format(var.name, val)
            for item, var, val in OneHot.decode(mapping.keys(), data, mapping)
        }
        nItemsets = 0

        filterSearch = self.filterSearch
        filterMinItems, filterMaxItems = self.filterMinItems, self.filterMaxItems
        isRegexMatch = self.isRegexMatch

        # Find itemsets and populate the TreeView
        with self.progressBar(self.maxItemsets + 1) as progress:
            for itemset, support in frequent_itemsets(X,
                                                      self.minSupport / 100):

                if filterSearch and not filterMinItems <= len(
                        itemset) <= filterMaxItems:
                    continue

                parent = top
                first_new_item = None
                itemset_matches_filter = False

                for item in sorted(itemset):
                    name = names[item]

                    if filterSearch and not itemset_matches_filter:
                        itemset_matches_filter = isRegexMatch(name)

                    child = parent.get(name)
                    if child is None:
                        try:
                            wi = self.TreeWidgetItem(parent.item, [
                                name,
                                str(support), '{:.4g}'.format(
                                    100 * support / len(data))
                            ])
                        except RuntimeError:
                            # FIXME: When autoFind was in effect and the support
                            # slider was moved, this line excepted with:
                            #     RuntimeError: wrapped C/C++ object of type
                            #                   TreeWidgetItem has been deleted
                            return
                        wi.setData(0, self.ITEM_DATA_ROLE, item)
                        child = parent[name] = ItemDict(wi)

                        if first_new_item is None:
                            first_new_item = (parent, name)
                    parent = child

                if filterSearch and not itemset_matches_filter:
                    parent, name = first_new_item
                    parent.item.removeChild(parent[name].item)
                    del parent[name].item
                    del parent[name]
                else:
                    nItemsets += 1
                    progress.advance()

                if not self._is_running or nItemsets >= self.maxItemsets:
                    break

                qApp.processEvents()

        if not filterSearch:
            self.filter_change()
        self.nItemsets = nItemsets
        self.nSelectedItemsets = 0
        self.nSelectedExamples = 0
        self.tree.expandAll()
        for i in range(self.tree.columnCount()):
            self.tree.resizeColumnToContents(i)
        self.tree.setUpdatesEnabled(True)
        self.tree.blockSignals(False)
        self._is_running = False
        self.button.button.setText('Find Itemsets')

    @Inputs.data
    def set_data(self, data):
        self.data = data
        is_error = False
        if data is not None:
            self.Warning.cont_attrs.clear()
            self.Error.no_disc_features.clear()
            self.button.setDisabled(False)
            self.X = data.X
            if issparse(data.X):
                self.X = data.X.tocsc()
            else:
                if not data.domain.has_discrete_attributes():
                    self.Error.no_disc_features()
                    is_error = True
                    self.button.setDisabled(True)
                elif data.domain.has_continuous_attributes():
                    self.Warning.cont_attrs()
        else:
            self.output = None
            self.commit()
        if self.autoFind and not is_error:
            self.find_itemsets()