示例#1
0
    def generate_gibbs_states_plots(self,
                                    states_path: str,
                                    cat: str = "likelihood") -> None:
        """Plot one column of every ``state.log`` found under *states_path*.

        For each entry yielded by ``Tools.scan_directory(states_path)`` the
        file ``state.log`` inside that entry is read as a whitespace-delimited
        table indexed by its ``iter`` column. The *cat* column is drawn as a
        line plot against that index and saved as ``<entry name>.png`` in the
        ``<cat>_plots`` directory under *states_path*.

        If the plots directory already exists nothing is generated. Entries
        for which a ``FileNotFoundError`` is raised are reported and skipped.

        Args:
            states_path: Directory whose entries are scanned for state logs.
            cat: Name of the ``state.log`` column to plot.
        """
        new_dir = Tools.get_path(states_path, f"{cat}_plots")
        if Tools.path_exists(new_dir):
            print("Plots found, skipping..")
            return

        Tools.initialise_directory(new_dir)
        with Tools.scan_directory(states_path) as outputs:
            for i, output in enumerate(outputs):
                try:
                    state_file = Tools.get_path(output.path, "state.log")
                    # sep=r"\s+" is the documented equivalent of the
                    # deprecated delim_whitespace=True.
                    df = pd.read_csv(filepath_or_buffer=state_file,
                                     sep=r"\s+",
                                     index_col="iter")
                    ax = sns.lineplot(x=df.index, y=cat, data=df)
                    ax.margins(x=0)
                    fig = ax.get_figure()
                    # The keyword was previously misspelt ("bbox_incehs"), so
                    # tight cropping was never actually requested.
                    fig.savefig(Tools.get_path(states_path, f"{cat}_plots",
                                               f"{output.name}.png"),
                                dpi=300,
                                bbox_inches="tight",
                                format="png")
                    # Clear the figure so the next plot starts from scratch.
                    fig.clf()
                    print(f"{i}")
                except FileNotFoundError:
                    print(f"→ Skipping {output.name}")
示例#2
0
    def _generate_lda_c_corpus(self):
        """Serialise the bag-of-words corpus in LDA-C format and store it.

        The corpus, the id-to-word map and the plain documents are obtained
        from ``self._convert_corpus_to_bow()``. The corpus is written to
        ``<self.lda_c_fname>.dat`` inside a directory under
        ``self.input_docs_path`` whose name components embed ``hdp_eta``,
        ``hdp_gamma_s``, ``hdp_alpha_s`` and ``drop_uncommon``.

        Returns:
            A ``(plain_docs, bow_corpus)`` tuple.
        """
        bow, vocabulary, raw_docs = self._convert_corpus_to_bow()

        # Path components of the directory that receives the LDA-C file.
        dir_parts = (
            self.input_docs_path,
            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
            f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}",
        )
        target_dir = Tools.get_path(*dir_parts)
        Tools.initialise_directory(target_dir)

        dat_file = Tools.get_path(target_dir, f"{self.lda_c_fname}.dat")
        bleicorpus.BleiCorpus.serialize(
            fname=dat_file,
            corpus=bow,
            id2word=vocabulary,
        )
        return raw_docs, bow
示例#3
0
    def _invoke_gibbs_hdp(self):
        """Invoke Gibbs HDP posterior inference on the corpus.

        Runs ``hdp.exe`` from ``self.hdp_path`` with ``--algorithm train``
        on the ``<self.lda_c_fname>.dat`` file located under
        ``self.input_docs_path``. Results go to ``self.hdp_output_directory``
        (also under ``self.input_docs_path``), which is initialised first.
        ``--random_seed`` is passed only when ``self.hdp_seed`` is not
        ``None`` and greater than zero; otherwise the option is omitted.

        Returns:
            The text captured from the executable's standard output.

        Raises:
            Whatever ``s.run(..., check=True)`` raises on a non-zero exit
            status. NOTE(review): ``s`` looks like an alias for the standard
            ``subprocess`` module (then ``CalledProcessError``) -- confirm
            against the file's imports.
        """
        path_executable = Tools.get_path(self.hdp_path, "hdp.exe")

        param_data = Tools.get_path(
            self.input_docs_path,
            f"lda_c_format_{self.hdp_eta:0.1f}_{self.hdp_gamma_s:0.1f}",
            f"_{self.hdp_alpha_s:0.1f}_common_{self.drop_uncommon}",
            f"{self.lda_c_fname}.dat")

        param_directory = Tools.get_path(self.input_docs_path,
                                         self.hdp_output_directory)

        # Prepare the output directory
        Tools.initialise_directory(param_directory)

        # Build the command once; only the optional seed differs between
        # runs, and it keeps its original position after --eta.
        command = [
            path_executable,
            "--algorithm", "train",
            "--data", param_data,
            "--directory", param_directory,
            "--max_iter", str(self.hdp_iterations),
            "--sample_hyper", "yes" if self.hdp_hyper_sampling else "no",
            "--save_lag", "-1",
            "--eta", str(self.hdp_eta),
        ]
        if self.hdp_seed is not None and self.hdp_seed > 0:
            command += ["--random_seed", str(self.hdp_seed)]
        command += [
            "--gamma_a", str(self.hdp_gamma_s),
            "--alpha_a", str(self.hdp_alpha_s),
        ]

        completed = s.run(command,
                          check=True,
                          capture_output=True,
                          text=True)
        return completed.stdout