class OWWordEnrichment(OWWidget): # Basic widget info name = "Word Enrichment" description = "Word enrichment analysis for selected documents." icon = "icons/SetEnrichment.svg" priority = 600 # Input/output class Inputs: selected_data = Input("Selected Data", Table) data = Input("Data", Table) want_main_area = True class Error(OWWidget.Error): no_bow_features = Msg('No bag-of-words features!') no_words_overlap = Msg('No words overlap!') empty_selection = Msg('Selected data is empty!') all_selected = Msg('All examples can not be selected!') # Settings filter_by_p = Setting(False) filter_p_value = Setting(0.01) filter_by_fdr = Setting(True) filter_fdr_value = Setting(0.2) def __init__(self): super().__init__() # Init data self.data = None self.selected_data = None self.selected_data_transformed = None # used for transforming the 'selected data' into the 'data' domain self.words = [] self.p_values = [] self.fdr_values = [] # Info section fbox = gui.widgetBox(self.controlArea, "Info") self.info_all = gui.label(fbox, self, 'Cluster words:') self.info_sel = gui.label(fbox, self, 'Selected words:') self.info_fil = gui.label(fbox, self, 'After filtering:') # Filtering settings fbox = gui.widgetBox(self.controlArea, "Filter") hbox = gui.widgetBox(fbox, orientation=0) self.chb_p = gui.checkBox(hbox, self, "filter_by_p", "p-value", callback=self.filter_and_display, tooltip="Filter by word p-value") self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, callbackOnReturn=True, tooltip="Max p-value for word") self.spin_p.setEnabled(self.filter_by_p) hbox = gui.widgetBox(fbox, orientation=0) self.chb_fdr = gui.checkBox(hbox, self, "filter_by_fdr", "FDR", callback=self.filter_and_display, tooltip="Filter by word FDR") self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, callbackOnReturn=True, tooltip="Max p-value for word") self.spin_fdr.setEnabled(self.filter_by_fdr) gui.rubber(self.controlArea) # Word's list view self.cols = ['Word', 'p-value', 'FDR'] self.sig_words = QTreeWidget() self.sig_words.setColumnCount(len(self.cols)) self.sig_words.setHeaderLabels(self.cols) self.sig_words.setSortingEnabled(True) self.sig_words.setSelectionMode(QTreeView.ExtendedSelection) self.sig_words.sortByColumn(2, 0) # 0 is ascending order for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) self.mainArea.layout().addWidget(self.sig_words) def sizeHint(self): return QSize(450, 240) @Inputs.data def set_data(self, data=None): self.data = data @Inputs.selected_data def set_data_selected(self, data=None): self.selected_data = data def handleNewSignals(self): self.check_data() def get_bow_domain(self): domain = self.data.domain return Domain( attributes=[a for a in domain.attributes if a.attributes.get('bow-feature', False)], class_vars=domain.class_vars, metas=domain.metas, source=domain) def check_data(self): self.Error.clear() if isinstance(self.data, Table) and \ isinstance(self.selected_data, Table): if len(self.selected_data) == 0: self.Error.empty_selection() self.clear() return # keep only BoW features bow_domain = self.get_bow_domain() if len(bow_domain.attributes) == 0: self.Error.no_bow_features() self.clear() return self.data = Corpus.from_table(bow_domain, self.data) self.selected_data_transformed = Corpus.from_table(bow_domain, self.selected_data) if np_sp_sum(self.selected_data_transformed.X) == 0: self.Error.no_words_overlap() self.clear() elif len(self.data) == len(self.selected_data): self.Error.all_selected() self.clear() else: self.apply() else: self.clear() def clear(self): self.sig_words.clear() self.info_all.setText('Cluster words:') self.info_sel.setText('Selected words:') self.info_fil.setText('After filtering:') def filter_enabled(self, b): self.chb_p.setEnabled(b) self.chb_fdr.setEnabled(b) self.spin_p.setEnabled(b) self.spin_fdr.setEnabled(b) def filter_and_display(self): self.spin_p.setEnabled(self.filter_by_p) self.spin_fdr.setEnabled(self.filter_by_fdr) self.sig_words.clear() if self.selected_data_transformed is None: # do nothing when no Data return count = 0 if self.words: for word, pval, fval in zip(self.words, self.p_values, self.fdr_values): if (not self.filter_by_p or pval <= self.filter_p_value) and \ (not self.filter_by_fdr or fval <= self.filter_fdr_value): it = EATreeWidgetItem(word, pval, fval, self.sig_words) self.sig_words.addTopLevelItem(it) count += 1 for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) self.info_all.setText('Cluster words: {}'.format(len(self.selected_data_transformed.domain.attributes))) self.info_sel.setText('Selected words: {}'.format(np.count_nonzero(np_sp_sum(self.selected_data_transformed.X, axis=0)))) if not self.filter_by_p and not self.filter_by_fdr: self.info_fil.setText('After filtering:') self.info_fil.setEnabled(False) else: self.info_fil.setEnabled(True) self.info_fil.setText('After filtering: {}'.format(count)) def progress(self, p): self.progressBarSet(p) def apply(self): self.clear() self.progressBarInit() self.filter_enabled(False) self.words = [i.name for i in self.selected_data_transformed.domain.attributes] self.p_values = hypergeom_p_values(self.data.X, self.selected_data_transformed.X, callback=self.progress) self.fdr_values = false_discovery_rate(self.p_values) self.filter_and_display() self.filter_enabled(True) self.progressBarFinished() def tree_to_table(self): view = [self.cols] items = self.sig_words.topLevelItemCount() for i in range(items): line = [] for j in range(3): line.append(self.sig_words.topLevelItem(i).text(j)) view.append(line) return(view) def send_report(self): if self.words: self.report_table("Enriched words", self.tree_to_table())
class OWWordEnrichment(OWWidget): # Basic widget info name = "Word Enrichment" description = "Word enrichment analysis for selected documents." icon = "icons/SetEnrichment.svg" priority = 600 # Input/output class Inputs: selected_data = Input("Selected Data", Table) data = Input("Data", Table) want_main_area = True class Error(OWWidget.Error): no_bow_features = Msg('No bag-of-words features!') no_words_overlap = Msg('No words overlap!') empty_selection = Msg('Selected data is empty!') all_selected = Msg('All examples can not be selected!') # Settings filter_by_p = Setting(False) filter_p_value = Setting(0.01) filter_by_fdr = Setting(True) filter_fdr_value = Setting(0.2) def __init__(self): super().__init__() # Init data self.data = None self.selected_data = None self.selected_data_transformed = None # used for transforming the 'selected data' into the 'data' domain self.words = [] self.p_values = [] self.fdr_values = [] # Info section fbox = gui.widgetBox(self.controlArea, "Info") self.info_all = gui.label(fbox, self, 'Cluster words:') self.info_sel = gui.label(fbox, self, 'Selected words:') self.info_fil = gui.label(fbox, self, 'After filtering:') # Filtering settings fbox = gui.widgetBox(self.controlArea, "Filter") hbox = gui.widgetBox(fbox, orientation=0) self.chb_p = gui.checkBox(hbox, self, "filter_by_p", "p-value", callback=self.filter_and_display, tooltip="Filter by word p-value") self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, callbackOnReturn=True, tooltip="Max p-value for word") self.spin_p.setEnabled(self.filter_by_p) hbox = gui.widgetBox(fbox, orientation=0) self.chb_fdr = gui.checkBox(hbox, self, "filter_by_fdr", "FDR", callback=self.filter_and_display, tooltip="Filter by word FDR") self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, callbackOnReturn=True, tooltip="Max p-value for word") self.spin_fdr.setEnabled(self.filter_by_fdr) gui.rubber(self.controlArea) # Word's list view self.cols = ['Word', 'p-value', 'FDR'] self.sig_words = QTreeWidget() self.sig_words.setColumnCount(len(self.cols)) self.sig_words.setHeaderLabels(self.cols) self.sig_words.setSortingEnabled(True) self.sig_words.setSelectionMode(QTreeView.ExtendedSelection) self.sig_words.sortByColumn(2, 0) # 0 is ascending order for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) self.mainArea.layout().addWidget(self.sig_words) @Inputs.data def set_data(self, data=None): self.data = data @Inputs.selected_data def set_data_selected(self, data=None): self.selected_data = data def handleNewSignals(self): self.check_data() def get_bow_domain(self): domain = self.data.domain return Domain(attributes=[ a for a in domain.attributes if a.attributes.get('bow-feature', False) ], class_vars=domain.class_vars, metas=domain.metas, source=domain) def check_data(self): self.Error.clear() if isinstance(self.data, Table) and \ isinstance(self.selected_data, Table): if len(self.selected_data) == 0: self.Error.empty_selection() self.clear() return # keep only BoW features bow_domain = self.get_bow_domain() if len(bow_domain.attributes) == 0: self.Error.no_bow_features() self.clear() return self.data = Corpus.from_table(bow_domain, self.data) self.selected_data_transformed = Corpus.from_table( bow_domain, self.selected_data) if np_sp_sum(self.selected_data_transformed.X) == 0: self.Error.no_words_overlap() self.clear() elif len(self.data) == len(self.selected_data): self.Error.all_selected() self.clear() else: self.apply() else: self.clear() def clear(self): self.sig_words.clear() self.info_all.setText('Cluster words:') self.info_sel.setText('Selected words:') self.info_fil.setText('After filtering:') def filter_enabled(self, b): self.chb_p.setEnabled(b) self.chb_fdr.setEnabled(b) self.spin_p.setEnabled(b) self.spin_fdr.setEnabled(b) def filter_and_display(self): self.spin_p.setEnabled(self.filter_by_p) self.spin_fdr.setEnabled(self.filter_by_fdr) self.sig_words.clear() if self.selected_data_transformed is None: # do nothing when no Data return count = 0 if self.words: for word, pval, fval in zip(self.words, self.p_values, self.fdr_values): if (not self.filter_by_p or pval <= self.filter_p_value) and \ (not self.filter_by_fdr or fval <= self.filter_fdr_value): it = EATreeWidgetItem(word, pval, fval, self.sig_words) self.sig_words.addTopLevelItem(it) count += 1 for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) self.info_all.setText('Cluster words: {}'.format( len(self.selected_data_transformed.domain.attributes))) self.info_sel.setText('Selected words: {}'.format( np.count_nonzero( np_sp_sum(self.selected_data_transformed.X, axis=0)))) if not self.filter_by_p and not self.filter_by_fdr: self.info_fil.setText('After filtering:') self.info_fil.setEnabled(False) else: self.info_fil.setEnabled(True) self.info_fil.setText('After filtering: {}'.format(count)) def progress(self, p): self.progressBarSet(p) def apply(self): self.clear() self.progressBarInit() self.filter_enabled(False) self.words = [ i.name for i in self.selected_data_transformed.domain.attributes ] self.p_values = hypergeom_p_values(self.data.X, self.selected_data_transformed.X, callback=self.progress) self.fdr_values = false_discovery_rate(self.p_values) self.filter_and_display() self.filter_enabled(True) self.progressBarFinished()
class OWWordEnrichment(OWWidget, ConcurrentWidgetMixin): # Basic widget info name = "Word Enrichment" description = "Word enrichment analysis for selected documents." icon = "icons/SetEnrichment.svg" priority = 600 # Input/output class Inputs: selected_data = Input("Selected Data", Table) data = Input("Data", Table) want_main_area = True class Error(OWWidget.Error): no_bow_features = Msg('No bag-of-words features!') no_words_overlap = Msg('No words overlap!') empty_selection = Msg('Selected data is empty!') all_selected = Msg('All examples can not be selected!') # Settings filter_by_p: bool = Setting(False) filter_p_value: float = Setting(0.01) filter_by_fdr: bool = Setting(True) filter_fdr_value: float = Setting(0.2) def __init__(self): OWWidget.__init__(self) ConcurrentWidgetMixin.__init__(self) # Init data self.data = None self.selected_data = None # used for transforming the 'selected data' into the 'data' domain self.selected_data_transformed = None self.results = Result() # info box fbox = gui.widgetBox(self.controlArea, "Info") self.info_fil = gui.label(fbox, self, 'Words displayed: 0') # Filtering settings fbox = gui.widgetBox(self.controlArea, "Filter") hbox = gui.widgetBox(fbox, orientation=0) self.chb_p = gui.checkBox(hbox, self, "filter_by_p", "p-value", callback=self.filter_and_display, tooltip="Filter by word p-value") self.spin_p = gui.doubleSpin(hbox, self, 'filter_p_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, tooltip="Max p-value for word") self.spin_p.setEnabled(self.filter_by_p) hbox = gui.widgetBox(fbox, orientation=0) self.chb_fdr = gui.checkBox(hbox, self, "filter_by_fdr", "FDR", callback=self.filter_and_display, tooltip="Filter by word FDR") self.spin_fdr = gui.doubleSpin(hbox, self, 'filter_fdr_value', 1e-4, 1, step=1e-4, labelWidth=15, callback=self.filter_and_display, tooltip="Max p-value for word") self.spin_fdr.setEnabled(self.filter_by_fdr) gui.rubber(self.controlArea) # Word's list view self.cols = ['Word', 'p-value', 'FDR'] self.sig_words = QTreeWidget() self.sig_words.setColumnCount(len(self.cols)) self.sig_words.setHeaderLabels(self.cols) self.sig_words.setSortingEnabled(True) self.sig_words.setSelectionMode(QTreeView.NoSelection) self.sig_words.sortByColumn(2, 0) # 0 is ascending order for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) self.mainArea.layout().addWidget(self.sig_words) def sizeHint(self): return QSize(450, 240) @Inputs.data def set_data(self, data=None): self.data = data # selected data transformed depends on data domain self.selected_data_transformed = None @Inputs.selected_data def set_data_selected(self, data=None): self.selected_data = data def handleNewSignals(self): self.check_data() def get_bow_domain(self): domain = self.data.domain return Domain(attributes=[ a for a in domain.attributes if a.attributes.get('bow-feature', False) ], class_vars=domain.class_vars, metas=domain.metas, source=domain) def check_data(self): self.Error.clear() if isinstance(self.data, Table) and \ isinstance(self.selected_data, Table): if len(self.selected_data) == 0: self.Error.empty_selection() self.clear() return # keep only BoW features bow_domain = self.get_bow_domain() if len(bow_domain.attributes) == 0: self.Error.no_bow_features() self.clear() return self.data = Corpus.from_table(bow_domain, self.data) self.selected_data_transformed = Corpus.from_table( bow_domain, self.selected_data) if np_sp_sum(self.selected_data_transformed.X) == 0: self.Error.no_words_overlap() self.clear() elif len(self.data) == len(self.selected_data): self.Error.all_selected() self.clear() else: self.set_input_info() self.apply() else: self.clear() def clear(self): self.sig_words.clear() self.info.set_input_summary(self.info.NoInput) self.set_displayed_info(0) def filter_enabled(self, b): self.chb_p.setEnabled(b) self.chb_fdr.setEnabled(b) self.spin_p.setEnabled(b) self.spin_fdr.setEnabled(b) def filter_and_display(self): self.spin_p.setEnabled(self.filter_by_p) self.spin_fdr.setEnabled(self.filter_by_fdr) self.sig_words.clear() if self.selected_data_transformed is None: # do nothing when no Data return if self.results.words: count = self.build_tree() else: count = 0 for i in range(len(self.cols)): self.sig_words.resizeColumnToContents(i) self.set_displayed_info(count) def build_tree(self) -> int: count = 0 for word, pval, fval in zip(self.results.words, self.results.p_values, self.results.fdr_values): if ((not self.filter_by_p or pval <= self.filter_p_value) and (not self.filter_by_fdr or fval <= self.filter_fdr_value)): it = EATreeWidgetItem(word, pval, fval, self.sig_words) self.sig_words.addTopLevelItem(it) count += 1 return count def set_input_info(self) -> None: cluster_words = len(self.selected_data_transformed.domain.attributes) selected_words = np.count_nonzero( np_sp_sum(self.selected_data_transformed.X, axis=0)) self.info.set_input_summary( f"{cluster_words}|{selected_words}", f"Total words: {cluster_words}\n" f"Words in subset: {selected_words}") def set_displayed_info(self, count: int) -> None: self.info_fil.setText(f"Words displayed: {count}") def apply(self): self.sig_words.clear() self.filter_enabled(False) self.start(Runner.run, self.selected_data_transformed, self.data, self.results) def on_done(self, result: Result) -> None: self.filter_and_display() self.filter_enabled(True) def on_exception(self, ex: Exception) -> None: self.filter_enabled(True) def tree_to_table(self): view = [self.cols] items = self.sig_words.topLevelItemCount() for i in range(items): line = [] for j in range(3): line.append(self.sig_words.topLevelItem(i).text(j)) view.append(line) return view def send_report(self): if self.results.words: self.report_table("Enriched words", self.tree_to_table())