def test_align_to_target(self):
    """Check that priors must be aligned to the target corpus before scoring.

    Priors are built from the complete test corpus, while the term
    frequencies come from a copy of that corpus with the 'swift' category
    removed. Scoring with the unaligned priors is expected to raise
    ``ValueError``; after ``align_to_target`` the same scoring call is
    expected to complete without raising.
    """
    complete_corpus = get_test_corpus()
    reduced_corpus = complete_corpus.remove_categories(['swift'])

    # Priors built via use_all_categories() on the complete corpus, with no
    # alignment step. Presumably the mismatch with the reduced corpus' terms
    # is what triggers the error -- TODO confirm against get_scores.
    unaligned_factory = PriorFactory(complete_corpus).use_all_categories()
    unaligned_priors = unaligned_factory.get_priors()
    with self.assertRaises(ValueError):
        unaligned_scorer = LogOddsRatioInformativeDirichletPrior(unaligned_priors)
        unaligned_scorer.get_scores(
            *reduced_corpus.get_term_freq_df().values.T
        )

    # Same priors, but aligned to the reduced corpus: scoring must not raise.
    aligned_factory = PriorFactory(complete_corpus).use_all_categories()
    aligned_priors = aligned_factory.align_to_target(reduced_corpus).get_priors()
    aligned_scorer = LogOddsRatioInformativeDirichletPrior(aligned_priors)
    aligned_scorer.get_scores(
        *reduced_corpus.get_term_freq_df().values.T
    )
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Demo script: builds a corpus from the Rotten Tomatoes sample data, derives
# priors with PriorFactory, renders a "fightin' words" explorer comparing the
# 'fresh' category against 'rotten', and writes the resulting HTML to `fn`.

fn = 'rotten_fresh2.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build()
priors = (
    st.PriorFactory(
        corpus,
        category='fresh',
        not_categories=['rotten'],
        starting_count=1,
    )
    .use_general_term_frequencies()
    .use_all_categories()
    .get_priors()
)

# Render the page before touching the output file, so a failure while
# building the explorer does not leave a truncated/empty file behind.
html = st.produce_fightin_words_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=LogOddsRatioInformativeDirichletPrior(priors, alpha_w=10),
)

# The context manager guarantees the handle is flushed and closed; the
# previous `open(...).write(...)` form never closed the file explicitly.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print(fn)
from scattertext.termcompaction.CompactTerms import CompactTerms
import scattertext as st
from scattertext import LogOddsRatioInformativeDirichletPrior

# Demo script: builds a corpus from the Rotten Tomatoes sample data, derives
# priors with PriorFactory, renders a frequency explorer comparing the
# 'fresh' category against 'rotten', and writes the resulting HTML to `fn`.

fn = 'demo_log_odds_ratio_prior.html'
df = st.SampleCorpora.RottenTomatoes.get_data()
corpus = st.CorpusFromPandas(
    df,
    category_col='category',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences,
).build()
priors = (
    st.PriorFactory(
        corpus,
        category='fresh',
        not_categories=['rotten'],
        starting_count=1,
    )
    # .use_general_term_frequencies()  (step left disabled in this demo)
    .use_all_categories()
    .get_priors()
)

# Render the page before touching the output file, so a failure while
# building the explorer does not leave a truncated/empty file behind.
html = st.produce_frequency_explorer(
    corpus,
    category='fresh',
    not_categories=['rotten'],
    metadata=df['movie_name'],
    term_scorer=LogOddsRatioInformativeDirichletPrior(priors, 1),
)

# The context manager guarantees the handle is flushed and closed; the
# previous `open(...).write(...)` form never closed the file explicitly.
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print(fn)