def __init__(self, priors, sigma=10, scale_type='none', prior_power=1):
    '''
    Parameters
    ----------
    priors : pd.Series
        term -> prior count
    sigma : np.float
        prior scale
    scale_type : str
        'none': Don't scale prior. Jurafsky approach.
        'class-size': Scale prior st the sum of the priors is the same as the word count
         in the document-class being scaled
        'corpus-size': Scale prior to the size of the corpus
        'word': Original formulation from MCQ. Sum of priors will be sigma.
        'background-corpus-size': Scale corpus size to multiple of background-corpus.
    prior_power : numeric
        Exponent to apply to prior
        > 1 will shrink frequent words

    Raises
    ------
    ValueError
        If scale_type is not one of the recognized scaling strategies.
    '''
    valid_scale_types = ('none', 'class-size', 'corpus-size',
                         'background-corpus-size', 'word')
    # Raise instead of assert so the validation survives `python -O`,
    # which strips assert statements.
    if scale_type not in valid_scale_types:
        raise ValueError('scale_type must be one of %s, got %r'
                         % (', '.join(valid_scale_types), scale_type))
    self._priors = priors
    self._scale_type = scale_type
    self._prior_power = prior_power
    self._scale = sigma
    LogOddsRatioUninformativeDirichletPrior.__init__(self, sigma)
def __init__(self, priors, sigma=10, scale_type='none', prior_power=1):
    '''
    Parameters
    ----------
    priors : pd.Series
        term -> prior count
    sigma : np.float
        prior scale
    scale_type : str
        'none': Don't scale prior. Jurafsky approach.
        'class-size': Scale prior st the sum of the priors is the same as the word count
         in the document-class being scaled
        'corpus-size': Scale prior to the size of the corpus
        'word': Original formulation from MCQ. Sum of priors will be sigma.
        'background-corpus-size': Scale corpus size to multiple of background-corpus.
    prior_power : numeric
        Exponent to apply to prior
        > 1 will shrink frequent words

    Raises
    ------
    ValueError
        If scale_type is not one of the recognized scaling strategies.
    '''
    valid_scale_types = ('none', 'class-size', 'corpus-size',
                         'background-corpus-size', 'word')
    # Raise instead of assert so the validation survives `python -O`,
    # which strips assert statements.
    if scale_type not in valid_scale_types:
        raise ValueError('scale_type must be one of %s, got %r'
                         % (', '.join(valid_scale_types), scale_type))
    self._priors = priors
    self._scale_type = scale_type
    self._prior_power = prior_power
    self._scale = sigma
    LogOddsRatioUninformativeDirichletPrior.__init__(self, sigma)
def __init__(self, priors, alpha_w=10):
    '''
    Parameters
    ----------
    priors : pd.Series
        term -> prior count
    alpha_w : np.float
        The constant prior.
    '''
    # Initialize the base Dirichlet-prior machinery first, then attach
    # the per-term prior counts.
    LogOddsRatioUninformativeDirichletPrior.__init__(self, alpha_w)
    self._priors = priors
def test_get_p_vals(self):
    '''Every p-value produced by the uninformative Dirichlet prior must lie in [0, 1].'''
    tdm = build_hamlet_jz_term_doc_mat()
    df = tdm.get_term_freq_df()
    X = df[['hamlet freq', 'jay-z/r. kelly freq']].values
    pvals = LogOddsRatioUninformativeDirichletPrior().get_p_vals(X)
    self.assertGreaterEqual(min(pvals), 0)
    # Bug fix: the upper bound must be checked against the largest p-value.
    # The original asserted min(pvals) <= 1, which holds even when some
    # p-values exceed 1, so the test could never catch an out-of-range value.
    self.assertLessEqual(max(pvals), 1)
def get_thresholded_score(cat_word_counts, not_cat_word_counts, alpha_w=0.01, threshold=0.1):
    '''
    Map Dirichlet-prior p-values onto [-1, 1] scores, zeroing out scores
    whose underlying p-value is not within `threshold` of either tail.

    Parameters
    ----------
    cat_word_counts : pd.Series
        term -> count in the category  (assumed; matches sibling methods — confirm)
    not_cat_word_counts : pd.Series
        term -> count outside the category
    alpha_w : np.float
        The constant prior.
    threshold : float
        Two-tailed significance cutoff on the p-values; terms with
        threshold < p-value < 1 - threshold receive a score of 0.

    Returns
    -------
    Elementwise scores in [-1, 1], with insignificant terms zeroed.
    '''
    p_vals = (LogOddsRatioUninformativeDirichletPrior(alpha_w)
              .get_p_values_from_counts(cat_word_counts, not_cat_word_counts))
    # Rescale p-values from [0, 1] onto [-1, 1].
    scores = p_vals * 2 - 1
    # A p-value within `threshold` of 0 or 1 corresponds to |score| > 1 - 2*threshold.
    # Hoisted: this cutoff was computed twice in the original; dead
    # commented-out alternative implementation removed.
    cutoff = 1. - (threshold * 2)
    return scores * ((scores < -cutoff) | (scores > cutoff))
def get_score(cat_word_counts, not_cat_word_counts, alpha_w=0.01):
    '''
    Score terms by comparing in-category and out-of-category counts.

    Builds the count matrix, computes p-values under an uninformative
    Dirichlet prior with constant alpha_w, and converts those p-values
    into scores.

    Parameters
    ----------
    cat_word_counts : pd.Series
        term -> count in the category  (assumed; matches sibling methods — confirm)
    not_cat_word_counts : pd.Series
        term -> count outside the category
    alpha_w : np.float
        The constant prior.
    '''
    count_matrix = LogOddsUninformativePriorScore._turn_counts_into_matrix(
        cat_word_counts, not_cat_word_counts)
    prior = LogOddsRatioUninformativeDirichletPrior(alpha_w)
    return LogOddsUninformativePriorScore._turn_pvals_into_scores(
        prior.get_p_vals(count_matrix))
def get_delta_hats(cat_word_counts, not_cat_word_counts, alpha_w=0.01):
    '''
    Return the prior-smoothed log-odds estimates (delta-hats) for each term.

    Parameters
    ----------
    cat_word_counts : pd.Series
        term -> count in the category  (assumed; matches sibling methods — confirm)
    not_cat_word_counts : pd.Series
        term -> count outside the category
    alpha_w : np.float
        The constant prior.
    '''
    count_matrix = LogOddsUninformativePriorScore._turn_counts_into_matrix(
        cat_word_counts, not_cat_word_counts)
    prior = LogOddsRatioUninformativeDirichletPrior(alpha_w)
    return prior.get_log_odds_with_prior(count_matrix)