def preprocess_corpus(self, ents_corpus):
    """Spell-correct the corpus entities, count them and apply the count threshold.

    Args:
        ents_corpus: Mapping indexed by article key; every value must expose
            an ``'ents_proc1'`` entry holding that article's entity strings.

    Returns:
        The result of ``self._update_article_dict`` called with the input
        corpus, the original-to-corrected entity mapping and the
        corrected-to-final entity mapping.
    """
    per_article_ents = [
        ents_corpus[article_key]['ents_proc1'] for article_key in ents_corpus
    ]
    all_mentions = flatten_nested_list(per_article_ents)

    # Distinct entity strings are collected before counting, mirroring the
    # order in which the flattened mentions are consumed.
    distinct_ents = set(all_mentions)
    mention_counts = Counter(all_mentions)

    # Only entities seen fewer than n_spell_check_thres times are sent
    # through the spell checker; the rest are kept verbatim.
    spell_checker = self._create_spell_checker_dict(
        mention_counts, self.n_spell_check_thres)
    orig_to_proc = {}
    for ent_string in distinct_ents:
        if mention_counts[ent_string] < self.n_spell_check_thres:
            orig_to_proc[ent_string] = self._spell_check(ent_string, spell_checker)
        else:
            orig_to_proc[ent_string] = ent_string

    # Swap every article's entities for their spell-corrected form.
    corrected_articles = self._replace_ents(per_article_ents, orig_to_proc)

    if self.ignore_article_counts:
        # Each article contributes at most one count per distinct entity.
        # NOTE(review): only this branch lowercases the counted strings; the
        # other branch counts them exactly as _replace_ents returned them.
        final_counts = Counter(
            ent.lower()
            for article in corrected_articles
            for ent in set(article)
        )
    else:
        final_counts = Counter(flatten_nested_list(corrected_articles))

    # Entities whose count does not exceed n_thres are mapped to None.
    proc_to_final = {}
    for ent, count in final_counts.items():
        if count > self.n_thres:
            proc_to_final[ent] = ent
        else:
            proc_to_final[ent] = None

    # Rewrite the per-article entities using the two mappings built above.
    return self._update_article_dict(ents_corpus, orig_to_proc, proc_to_final)
def _list_luns(self):
    """Return the set of ``luns`` gathered from every item in ``self.tpgs``.

    ``self._check_self()`` is invoked first; the per-item ``luns``
    collections are then flattened with ``flatten_nested_list`` and
    de-duplicated.
    """
    self._check_self()
    luns_per_tpg = [tpg.luns for tpg in self.tpgs]
    flattened = flatten_nested_list(luns_per_tpg)
    return set(flattened)
def _list_network_portals(self):
    """Return the set of ``network_portals`` gathered from every item in ``self.tpgs``.

    ``self._check_self()`` is invoked first; the per-item
    ``network_portals`` collections are then flattened with
    ``flatten_nested_list`` and de-duplicated.
    """
    self._check_self()
    portals_per_tpg = [tpg.network_portals for tpg in self.tpgs]
    flattened = flatten_nested_list(portals_per_tpg)
    return set(flattened)
def _list_node_acls(self):
    """Return the set of ``node_acls`` gathered from every item in ``self.tpgs``.

    ``self._check_self()`` is invoked first; the per-item ``node_acls``
    collections are then flattened with ``flatten_nested_list`` and
    de-duplicated.
    """
    self._check_self()
    acls_per_tpg = [tpg.node_acls for tpg in self.tpgs]
    flattened = flatten_nested_list(acls_per_tpg)
    return set(flattened)
def _list_tpgs(self):
    """Return the set of ``tpgs`` gathered from every item in ``self.targets``.

    ``self._check_self()`` is invoked first; the per-target ``tpgs``
    collections are then flattened with ``flatten_nested_list`` and
    de-duplicated.
    """
    self._check_self()
    tpgs_per_target = [target.tpgs for target in self.targets]
    flattened = flatten_nested_list(tpgs_per_target)
    return set(flattened)
def _list_storage_objects(self):
    """Return the set of ``storage_objects`` gathered from every item in ``self.backstores``.

    ``self._check_self()`` is invoked first; the per-backstore
    ``storage_objects`` collections are then flattened with
    ``flatten_nested_list`` and de-duplicated.
    """
    self._check_self()
    objects_per_backstore = [store.storage_objects for store in self.backstores]
    flattened = flatten_nested_list(objects_per_backstore)
    return set(flattened)