Exemplo n.º 1
0
    def test_lda(self):
        """Train a 3-topic LDA model and check the artifacts it writes.

        Builds an ``LDA`` from the ``test.mm`` / ``test.dict`` fixtures, saves
        it, checks the shape of ``get_top_words(15)`` (3 entries, the one at
        index 1 holding 15 items), then generates the termite data and verifies
        that both output files exist under ``module_path``.
        """
        model_path = os.path.join(module_path, 'test.model')
        termite_path = os.path.join(module_path, 'test_termite.csv')

        my_lda = LDA("test.mm", "test.dict", ntopics=3)
        my_lda.save(model_path)

        top_words = my_lda.get_top_words(15)
        self.assertEqual(len(top_words), 3)
        self.assertEqual(len(top_words[1]), 15)

        # Pass the destination path itself. The previous code passed
        # os.path.isfile(path) -- a bool -- as the output location.
        my_lda.termite_data(termite_path)

        self.assertTrue(os.path.isfile(model_path))
        # Check the file this test actually asks termite_data to write
        # (the CSV), not a 'test_termite.model' that nothing here produces.
        self.assertTrue(os.path.isfile(termite_path))
def run_topic_model(output_dir,
                    n_topics,
                    content_fields,
                    field_filters=None,
                    field_filter_vals=None,
                    seed=42):
    """Run the LDA topic-modelling pipeline and write its artifacts.

    Steps, in order:

    1. Seed NumPy's global RNG with ``seed``.
    2. Read documents with ``read_bulk_index`` and tokenize them with
       ``EntitiesTokenizer``.
    3. Build a ``CorpusBOW`` and store its dictionary (``corpus.dict``) and
       serialized corpus (``corpus.mm``).
    4. Fit an ``LDA`` model on those two files and save it (``model.gensim``).
    5. Write the termite data (``termite.csv``) and plot (``termite.html``).
    6. Call ``generate_csv_output_file`` and ``to_r_ldavis`` (the latter
       targets the ``ldavis`` sub-directory).
    7. Export ``LDAVIS_DIR`` into ``os.environ`` and invoke
       ``Rscript runLDAvis.R`` on a best-effort basis.

    Args:
        output_dir: Directory under which every artifact path is built.
            NOTE(review): this function does not create it; presumably it
            must already exist -- confirm against the helpers.
        n_topics: Number of topics handed to ``LDA``.
        content_fields: Forwarded to ``read_bulk_index``.
        field_filters: Forwarded to ``read_bulk_index``.
        field_filter_vals: Forwarded to ``read_bulk_index``.
        seed: Value passed to ``np.random.seed``.

    Returns:
        None. Failures of the final R step are logged as warnings instead of
        being raised.
    """
    np.random.seed(seed)

    dict_path = os.path.join(output_dir, 'corpus.dict')
    corpus_path = os.path.join(output_dir, 'corpus.mm')
    termite_csv = os.path.join(output_dir, 'termite.csv')
    ldavis_dir = os.path.join(output_dir, 'ldavis')

    documents = read_bulk_index(elastic + "original/", content_fields,
                                field_filters, field_filter_vals)

    # The tokenizer receives a generator of strings (content for each doc).
    corpus = EntitiesTokenizer(documents)
    corpus_bow = CorpusBOW(corpus)

    # Persist the dictionary and the serialized corpus; the LDA model below
    # is built from these two files.
    corpus_bow.save_dict(dict_path)
    corpus_bow.serialize(corpus_path)

    topik_lda = LDA(corpus_path,
                    dict_path,
                    n_topics,
                    update_every=1,
                    passes=5)
    topik_lda.save(os.path.join(output_dir, 'model.gensim'))

    # Generate the input for the termite plot, then render the plot itself.
    topik_lda.termite_data(termite_csv)
    termite = Termite(termite_csv, "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))

    # The return value is not used here.
    # NOTE(review): `documents` was already handed to the tokenizer above; if
    # it is a one-shot generator it may be exhausted by now -- confirm.
    generate_csv_output_file(documents, corpus, corpus_bow, topik_lda.model)

    to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=topik_lda)

    # The R script is started with this process's environment, so the
    # variable is exported before the call.
    os.environ["LDAVIS_DIR"] = ldavis_dir
    try:
        return_code = subprocess.call(
            ['Rscript',
             os.path.join(BASEDIR, 'topic-space/R/runLDAvis.R')])
    except (OSError, ValueError):
        # OSError (e.g. FileNotFoundError) is what subprocess raises when the
        # Rscript executable cannot be started; catching only ValueError let
        # that case crash the whole pipeline.
        logging.warning("Unable to run runLDAvis.R")
    else:
        if return_code != 0:
            # subprocess.call does not raise on a failing child process.
            logging.warning("runLDAvis.R exited with status %s", return_code)
Exemplo n.º 3
0
    def test_lda(self):
        """Train a 3-topic LDA model and check the artifacts it writes.

        Builds an ``LDA`` from the ``test.mm`` / ``test.dict`` fixtures, saves
        it, checks the shape of ``get_top_words(15)`` (3 entries, the one at
        index 1 holding 15 items), then generates the termite data and verifies
        that both output files exist under ``module_path``.
        """
        model_path = os.path.join(module_path, 'test.model')
        termite_path = os.path.join(module_path, 'test_termite.csv')

        my_lda = LDA("test.mm", "test.dict", ntopics=3)
        my_lda.save(model_path)

        top_words = my_lda.get_top_words(15)
        self.assertEqual(len(top_words), 3)
        self.assertEqual(len(top_words[1]), 15)

        # Pass the destination path itself. The previous code passed
        # os.path.isfile(path) -- a bool -- as the output location.
        my_lda.termite_data(termite_path)

        self.assertTrue(os.path.isfile(model_path))
        # Check the file this test actually asks termite_data to write
        # (the CSV), not a 'test_termite.model' that nothing here produces.
        self.assertTrue(os.path.isfile(termite_path))