def test_align_to_target(self):
    """Check that priors must be aligned to the target corpus before scoring.

    Priors are built from the full test corpus, while the term counts being
    scored come from a copy with the 'swift' category removed. The test
    expects ``ValueError`` when the unaligned priors are used, and expects
    scoring to complete once ``align_to_target`` has been applied.
    """
    source_corpus = get_test_corpus()
    target_corpus = source_corpus.remove_categories(['swift'])

    # Unaligned priors: presumably their term index no longer matches the
    # reduced corpus (TODO confirm); the test only requires a ValueError.
    unaligned = PriorFactory(source_corpus).use_all_categories().get_priors()
    with self.assertRaises(ValueError):
        # Everything that may raise stays inside the context manager.
        scorer = LogOddsRatioInformativeDirichletPrior(unaligned)
        term_counts = target_corpus.get_term_freq_df().values.T
        scorer.get_scores(*term_counts)

    # Aligned priors: the same scoring call must not raise.
    factory = PriorFactory(source_corpus).use_all_categories()
    aligned = factory.align_to_target(target_corpus).get_priors()
    scorer = LogOddsRatioInformativeDirichletPrior(aligned)
    term_counts = target_corpus.get_term_freq_df().values.T
    scorer.get_scores(*term_counts)
# Example #2
0
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Demo script: build a corpus from the Rotten Tomatoes sample data, derive
# term priors, and write a "fightin' words" explorer page to `fn`.
fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences).build())
# Priors combine general term frequencies with counts from all categories.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1).use_general_term_frequencies().
          use_all_categories().get_priors())
# Render the page before opening the output file so that a failure while
# building the visualization does not leave a truncated file behind.
html = st.produce_fightin_words_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
)
# The context manager guarantees the handle is flushed and closed.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print(fn)
from scattertext.termcompaction.CompactTerms import CompactTerms

import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Demo script: build a corpus from the Rotten Tomatoes sample data, derive
# term priors, and write a frequency explorer page to `fn`.
fn = 'demo_log_odds_ratio_prior.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = (st.CorpusFromPandas(df,
                              category_col='category',
                              text_col='text',
                              nlp=st.whitespace_nlp_with_sentences)
          .build())
# NOTE: use_general_term_frequencies() is intentionally not applied here (it
# was commented out in the original); priors come from the corpus categories.
priors = (st.PriorFactory(corpus,
                          category='fresh',
                          not_categories=['rotten'],
                          starting_count=1)
          .use_all_categories()
          .get_priors())
# Render the page before opening the output file so that a failure while
# building the visualization does not leave a truncated file behind.
# NOTE(review): the scorer's second argument is passed positionally as 1;
# confirm which parameter that maps to in the installed scattertext version.
html = st.produce_frequency_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=LogOddsRatioInformativeDirichletPrior(priors, 1),
)
# The context manager guarantees the handle is flushed and closed.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print(fn)