def generate_gibbs_states_plots(self, states_path: str, cat: str = "likelihood") -> None:
    """Plot one column of every ``state.log`` found under *states_path*.

    Each entry yielded by ``Tools.scan_directory(states_path)`` is expected
    to hold a whitespace-delimited ``state.log`` with an ``iter`` column.
    For every such entry a line plot of *cat* against ``iter`` is written
    to ``<states_path>/<cat>_plots/<entry name>.png``.

    If the ``<cat>_plots`` directory already exists the method prints a
    notice and returns without plotting anything.

    Args:
        states_path: Directory whose entries are scanned for ``state.log``.
        cat: Name of the ``state.log`` column to plot on the y axis; also
            used as the prefix of the output directory name.
    """
    new_dir = Tools.get_path(states_path, f"{cat}_plots")
    if Tools.path_exists(new_dir):
        print("Plots found, skipping..")
        return

    Tools.initialise_directory(new_dir)
    with Tools.scan_directory(states_path) as outputs:
        for i, output in enumerate(outputs):
            try:
                state_file = Tools.get_path(output.path, "state.log")
                # sep=r"\s+" is the documented replacement for the
                # deprecated delim_whitespace=True.
                df = pd.read_csv(
                    filepath_or_buffer=state_file,
                    sep=r"\s+",
                    index_col="iter",
                )
                ax = sns.lineplot(x=df.index, y=cat, data=df)
                ax.margins(x=0)
                name = output.name
                fig = ax.get_figure()
                # The keyword was previously misspelled ("bbox_incehs"), so
                # the tight bounding box was never applied.
                fig.savefig(
                    Tools.get_path(states_path, f"{cat}_plots", f"{name}.png"),
                    dpi=300,
                    bbox_inches="tight",
                    format="png",
                )
                # Clear the figure so the next entry starts from a blank one.
                fig.clf()
                print(f"{i}")
            except FileNotFoundError:
                # Entries without a state.log (the plots directory itself is
                # one of them) are skipped rather than treated as errors.
                print(f"→ Skipping {output.name}")
def _generate_lda_c_corpus(self):
    """Convert the input documents into an LDA-C corpus and store it on disk.

    The bag-of-words corpus produced by ``_convert_corpus_to_bow`` is
    serialised with ``bleicorpus.BleiCorpus.serialize`` to
    ``<lda_c_fname>.dat`` inside a directory under ``input_docs_path``
    whose name is built from ``hdp_eta``, ``hdp_gamma_s``, ``hdp_alpha_s``
    and ``drop_uncommon``.

    Returns:
        A ``(plain_docs, bow_corpus)`` tuple as obtained from
        ``_convert_corpus_to_bow``.
    """
    bow, vocabulary, documents = self._convert_corpus_to_bow()

    # The two directory levels are named after the hyper-parameters; this is
    # the same layout _invoke_gibbs_hdp uses to locate the .dat file.
    hyper_segment = f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}"
    variant_segment = f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}"
    target_dir = Tools.get_path(self.input_docs_path, hyper_segment, variant_segment)
    Tools.initialise_directory(target_dir)

    # Serialise into LDA-C format and store on disk.
    corpus_file = Tools.get_path(target_dir, f"{self.lda_c_fname}.dat")
    bleicorpus.BleiCorpus.serialize(
        fname=corpus_file,
        corpus=bow,
        id2word=vocabulary,
    )

    return documents, bow
def _invoke_gibbs_hdp(self):
    """Invoke Gibbs HDP posterior inference on the corpus.

    Runs ``hdp.exe`` from ``hdp_path`` in ``train`` mode on the LDA-C
    ``.dat`` file laid out by ``_generate_lda_c_corpus`` and writes results
    to ``<input_docs_path>/<hdp_output_directory>``. ``--random_seed`` is
    only passed when ``hdp_seed`` is set and strictly positive.

    Returns:
        The captured standard output of the process, as text.

    Raises:
        subprocess.CalledProcessError: If the executable exits with a
            non-zero status (``check=True``) — assuming ``s`` is the
            ``subprocess`` module; confirm against the file's imports.
    """
    path_executable = Tools.get_path(self.hdp_path, "hdp.exe")
    param_data = Tools.get_path(
        self.input_docs_path,
        f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
        f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}",
        f"{self.lda_c_fname}.dat",
    )
    param_directory = Tools.get_path(self.input_docs_path, self.hdp_output_directory)

    # Prepare the output directory
    Tools.initialise_directory(param_directory)

    # The seed flag is the only difference between a seeded and an unseeded
    # run, so build it separately and splice it into a single command.
    seed_args = []
    if self.hdp_seed is not None and self.hdp_seed > 0:
        seed_args = ["--random_seed", str(self.hdp_seed)]

    command = [
        path_executable,
        "--algorithm", "train",
        "--data", param_data,
        "--directory", param_directory,
        "--max_iter", str(self.hdp_iterations),
        "--sample_hyper", "yes" if self.hdp_hyper_sampling else "no",
        "--save_lag", "-1",
        "--eta", str(self.hdp_eta),
        # Kept between --eta and --gamma_a so the command line is identical
        # to the one issued before the two branches were merged.
        *seed_args,
        "--gamma_a", str(self.hdp_gamma_s),
        "--alpha_a", str(self.hdp_alpha_s),
    ]

    ret = s.run(command, check=True, capture_output=True, text=True)
    return ret.stdout