class TabularCoOccurrenceGUI(GridBox):  # pylint: disable=too-many-ancestors
    """Alternative implementation that uses VectorizedCorpus.

    The widget is a button bar (filters, keyness settings, save/download/compute
    buttons) stacked on top of a table view that displays the current
    co-occurrence frame.
    """

    def __init__(self, *, bundle: Bundle, default_token_filter: str = None, **kwargs):
        """Build all child widgets and wire up the event handlers.

        Args:
            bundle: Source bundle; must carry a `Token2Id` vocabulary and a
                dict of compute options.
            default_token_filter: Initial text of the token filter box.
            **kwargs: Forwarded to the `GridBox` constructor.

        Raises:
            ValueError: If `bundle.token2id` is not a `Token2Id` or
                `bundle.compute_options` is not a dict.
        """
        global CURRENT_BUNDLE
        CURRENT_BUNDLE = bundle

        self.bundle: Bundle = bundle
        self.co_occurrences: pd.DataFrame = None
        self.pivot_column_name: str = 'time_period'

        if not isinstance(bundle.token2id, Token2Id):
            raise ValueError(f"Expected Token2Id, found {type(bundle.token2id)}")

        if not isinstance(bundle.compute_options, dict):
            raise ValueError("Expected Compute Options in bundle but found no such thing.")

        # Current processed corpus
        self.corpus: VectorizedCorpus = bundle.corpus

        # Properties that change current corpus
        self._pivot: Dropdown = Dropdown(
            options=["year", "lustrum", "decade"],
            value="decade",
            placeholder='Group by',
            layout=Layout(width='auto'),
        )

        # Keyness source (concept/weighed sources are only offered when a concept corpus exists)
        self._keyness_source: Dropdown = Dropdown(
            options={
                "Full corpus": KeynessMetricSource.Full,
                "Concept corpus": KeynessMetricSource.Concept,
                "Weighed corpus": KeynessMetricSource.Weighed,
            }
            if bundle.concept_corpus is not None
            else {
                "Full corpus": KeynessMetricSource.Full,
            },
            value=KeynessMetricSource.Weighed if bundle.concept_corpus is not None else KeynessMetricSource.Full,
            layout=Layout(width='auto'),
        )

        # Properties that change current corpus
        self._keyness: Dropdown = Dropdown(
            options={
                "TF": KeynessMetric.TF,
                "TF (norm)": KeynessMetric.TF_normalized,
                "TF-IDF": KeynessMetric.TF_IDF,
                "HAL CWR": KeynessMetric.HAL_cwr,
                "PPMI": KeynessMetric.PPMI,
                "LLR": KeynessMetric.LLR,
                "LLR(Z)": KeynessMetric.LLR_Z,
                "LLR(N)": KeynessMetric.LLR_N,
                "DICE": KeynessMetric.DICE,
            },
            value=KeynessMetric.TF,
            layout=Layout(width='auto'),
        )

        # Properties that don't change current corpus
        self._token_filter: Text = Text(
            # Fall back to an empty string so the text trait never receives None.
            value=default_token_filter or '',
            placeholder='token match',
            layout=Layout(width='auto'),
        )
        self._global_threshold_filter: Dropdown = Dropdown(
            options={f'>= {i}': i for i in (1, 2, 3, 4, 5, 10, 25, 50, 100, 250, 500)},
            value=5,
            layout=Layout(width='auto'),
        )
        self.concepts: Set[str] = set(self.bundle.context_opts.concept or [])
        self._largest: Dropdown = Dropdown(
            options=[10**i for i in range(0, 7)],
            value=10000,
            layout=Layout(width='auto'),
        )
        self._show_concept = ToggleButton(
            description='Show concept',
            value=False,
            icon='',
            layout=Layout(width='auto'),
        )
        self._message: HTML = HTML()
        self._compute: Button = Button(description="Compute", button_style='success', layout=Layout(width='auto'))
        self._save = Button(description='Save', layout=Layout(width='auto'))
        self._download = Button(description='Download', layout=Layout(width='auto'))
        self._download_output: Output = Output()
        self._table_view = TableViewerClass(data=empty_data())

        self._button_bar = HBox(
            children=[
                VBox([HTML("<b>Token match</b>"), self._token_filter]),
                VBox([HTML("<b>Source</b>"), self._keyness_source]),
                VBox([HTML("<b>Keyness</b>"), self._keyness]),
                VBox([HTML("🙂"), self._show_concept]),
                VBox([HTML("<b>Group by</b>"), self._pivot]),
                VBox([HTML("<b>Threshold</b>"), self._global_threshold_filter]),
                VBox([HTML("<b>Group limit</b>"), self._largest]),
                VBox([self._save, self._download]),
                VBox([self._compute, self._message]),
                self._download_output,
            ],
            layout=Layout(width='auto'),
        )

        super().__init__(
            children=[self._button_bar, self._table_view.container],
            layout=Layout(width='auto'),
            **kwargs,
        )

        self._save.on_click(self.save)
        self._download.on_click(self.download)

        self.start_observe()

    def _compute_handler(self, *_):
        """Click handler for the Compute button: rebuild corpus and table.

        A `ValueError` is reported in the message area only; any other
        exception is logged, reported and re-raised.
        """
        try:
            self.set_buzy(True, "Computing...")
            self.update_corpus()
            self.update_co_occurrences()
            self.set_buzy(False, "✔")
        except ValueError as ex:
            self.alert(str(ex))
        except Exception as ex:
            logger.exception(ex)
            self.alert(str(ex))
            raise
        finally:
            # Always re-enable the controls, also when the error is re-raised.
            self.set_buzy(False)

    def set_buzy(self, is_buzy: bool = True, message: str = None):
        """Enable or disable the input widgets, optionally showing `message`."""
        if message:
            self.alert(message)

        self._keyness.disabled = is_buzy
        self._keyness_source.disabled = is_buzy
        # The concept toggle stays disabled when the bundle has no concept corpus.
        self._show_concept.disabled = is_buzy or self.bundle.concept_corpus is None
        self._pivot.disabled = is_buzy
        self._global_threshold_filter.disabled = is_buzy
        self._token_filter.disabled = is_buzy
        self._save.disabled = is_buzy
        self._download.disabled = is_buzy
        self._largest.disabled = is_buzy

    def start_observe(self):
        """(Re-)attach the widget event handlers and return `self`."""
        self.stop_observe()

        self._compute.on_click(self._compute_handler)

        self._show_concept.observe(self.update_co_occurrences, 'value')
        self._largest.observe(self.update_co_occurrences, 'value')
        self._show_concept.observe(self._update_toggle_icon, 'value')
        self._token_filter.observe(self._filter_co_occurrences, 'value')

        return self

    def stop_observe(self):
        """Detach the 'value' observers registered by `start_observe`."""
        observers = (
            (self._show_concept, self.update_co_occurrences),
            (self._largest, self.update_co_occurrences),
            (self._token_filter, self._filter_co_occurrences),
            (self._show_concept, self._update_toggle_icon),
        )
        for widget, handler in observers:
            # Suppress per handler: one handler that is not registered must not
            # prevent the remaining handlers from being detached.
            with contextlib.suppress(Exception):
                widget.unobserve(handler, 'value')

    def alert(self, message: str) -> None:
        """Show `message` in bold red in the message area."""
        self._message.value = f"<span style='color: red; font-weight: bold;'>{message}</span>"

    def info(self, message: str) -> None:
        """Show `message` in bold green in the message area."""
        self._message.value = f"<span style='color: green; font-weight: bold;'>{message}</span>"

    def update_corpus(self, *_):
        """Recompute `self.corpus` from the current compute options."""
        self.set_buzy(True, "⌛ Computing...")
        self.corpus = self.to_corpus()
        self.set_buzy(False, "✔")

    def update_co_occurrences(self, *_) -> None:
        """Recompute `self.co_occurrences` and load it into the table view."""
        self.set_buzy(True, "⌛ Preparing data...")
        self.co_occurrences = self.to_co_occurrences()
        self.set_buzy(False, "✔")

        self.set_buzy(True, "⌛ Loading table...")
        self._table_view.update(self.co_occurrences[DISPLAY_COLUMNS])
        self.set_buzy(False, "✔")

        self.info(f"Data size: {len(self.co_occurrences)}")

    def _filter_co_occurrences(self, *_) -> None:
        """Observer for the token filter: show only matching rows in the table."""
        if self.co_occurrences is None:
            # Nothing computed yet, so there is nothing to filter or display.
            return

        co_occurrences: pd.DataFrame = self.to_filtered_co_occurrences()
        self._table_view.update(co_occurrences[DISPLAY_COLUMNS])
        self.info(f"Data size: {len(co_occurrences)}")

    def _update_toggle_icon(self, event: dict) -> None:
        """Show a check icon on the toggled widget while its value is truthy."""
        with contextlib.suppress(Exception):
            event['owner'].icon = 'check' if event['new'] else ''

    @property
    def ignores(self) -> Set[str]:
        """Concepts to exclude; empty when concepts are shown or none are set."""
        if self.show_concept or not self.concepts:
            return set()

        if isinstance(self.concepts, Iterable):
            return set(self.concepts)

        return {self.concepts}

    @property
    def show_concept(self) -> bool:
        return self._show_concept.value

    @show_concept.setter
    def show_concept(self, value: bool):
        self._show_concept.value = value

    @property
    def keyness(self) -> KeynessMetric:
        return self._keyness.value

    @keyness.setter
    def keyness(self, value: KeynessMetric):
        self._keyness.value = value

    @property
    def keyness_source(self) -> KeynessMetricSource:
        return self._keyness_source.value

    @keyness_source.setter
    def keyness_source(self, value: KeynessMetricSource):
        self._keyness_source.value = value

    @property
    def global_threshold(self) -> int:
        return self._global_threshold_filter.value

    @global_threshold.setter
    def global_threshold(self, value: int):
        self._global_threshold_filter.value = value

    @property
    def largest(self) -> int:
        return self._largest.value

    @largest.setter
    def largest(self, value: int):
        self._largest.value = value

    @property
    def token_filter(self) -> List[str]:
        """Whitespace-separated patterns currently typed in the filter box."""
        return self._token_filter.value.strip().split()

    @token_filter.setter
    def token_filter(self, value: List[str]):
        self._token_filter.value = ' '.join(value) if isinstance(value, list) else value

    @property
    def pivot(self) -> str:
        return self._pivot.value

    @pivot.setter
    def pivot(self, value: str):
        self._pivot.value = value

    def save(self, *_b):
        """Store the current co-occurrences in a timestamped CSV file."""
        if self.co_occurrences is None:
            self.alert("Nothing to save, please compute first.")
            return

        store_co_occurrences(
            filename=path_add_timestamp('co_occurrence_data.csv'),
            co_occurrences=self.co_occurrences,
            store_feather=False,
        )

    def download(self, *_):
        """Render a download for the current co-occurrences in the output area.

        Failures are logged and shown in the message area, never raised.
        """
        self._button_bar.disabled = True
        try:
            js_download = create_js_download(self.co_occurrences, index=True)
            if js_download is not None:
                with self._download_output:
                    IPython_display.display(js_download)
        except Exception as ex:  # pylint: disable=broad-except
            # Best effort: report the failure instead of silently dropping it.
            logger.exception(ex)
            self.alert(f"😢 {str(ex)}")
        finally:
            self._button_bar.disabled = False

    def to_co_occurrences(self) -> pd.DataFrame:
        """Build the co-occurrence frame from the current corpus.

        Raises:
            ValueError: If the pivot column is missing from the document index.
        """
        self.set_buzy(True, "⌛ Preparing co-occurrences...")

        try:
            if self.pivot_column_name not in self.corpus.document_index.columns:
                raise ValueError(
                    f"expected '{self.pivot_column_name}' but not found in {', '.join(self.corpus.document_index.columns)}"
                )

            co_occurrences: pd.DataFrame = (
                CoOccurrenceHelper(
                    corpus=self.corpus,
                    source_token2id=self.bundle.token2id,
                    pivot_keys=self.pivot_column_name,
                )
                .exclude(self.ignores)
                .largest(self.largest)
            ).value

            self.set_buzy(False, None)
            self.alert("✔")
        except Exception as ex:
            self.set_buzy(False)
            self.alert(f"😢 {str(ex)}")
            raise

        return co_occurrences

    def to_filtered_co_occurrences(self) -> pd.DataFrame:
        """Return the rows whose `token` matches every pattern in the token filter.

        Patterns are shell-style wildcards translated with `fnmatch.translate`
        and matched case-insensitively. Returns `self.co_occurrences` unchanged
        (possibly `None`) when nothing has been computed or no filter is set.
        """
        patterns: List[str] = self.token_filter

        if self.co_occurrences is None or not patterns:
            return self.co_occurrences

        co_occurrences: pd.DataFrame = self.co_occurrences
        for re_filter in (fnmatch.translate(s) for s in patterns):
            # `astype(str)` removes missing values, so `na` never applies;
            # a bool is passed because `na` is documented as a fill value
            # for a boolean mask.
            co_occurrences = co_occurrences[
                co_occurrences.token.astype(str).str.contains(pat=re_filter, case=False, na=False)
            ]

        return co_occurrences

    def to_corpus(self) -> VectorizedCorpus:
        """Returns a grouped, optionally TF-IDF, corpus filtered by token & threshold."""
        self.set_buzy(True, "⌛ updating corpus...")

        try:
            corpus: VectorizedCorpus = self.bundle.keyness_transform(opts=self.compute_opts())
            self.set_buzy(False, None)
            self.alert("✔")
        except Exception as ex:
            self.set_buzy(False)
            self.alert(f"😢 {str(ex)}")
            raise

        return corpus

    def setup(self) -> "TabularCoOccurrenceGUI":
        """Compute the initial corpus and return `self`."""
        self.update_corpus()
        return self

    def compute_opts(self) -> ComputeKeynessOpts:
        """Collect the current widget values into a `ComputeKeynessOpts`."""
        return ComputeKeynessOpts(
            period_pivot=self.pivot,
            keyness_source=self.keyness_source,
            keyness=self.keyness,
            tf_threshold=self.global_threshold,
            pivot_column_name=self.pivot_column_name,
            normalize=False,
        )