예제 #1
0
    def __init__(self, priors, sigma=10, scale_type='none', prior_power=1):
        '''
        Store the prior configuration and initialize the base scorer.

        Parameters
        ----------
        priors : pd.Series
            term -> prior count

        sigma : np.float
            Prior scale. Kept on the instance as ``_scale`` and also handed
            to ``LogOddsRatioUninformativeDirichletPrior.__init__``.

        scale_type : str
            Must be one of the following (checked with an ``assert``):
            'none': Don't scale prior. Jurafsky approach.
            'class-size': Scale prior so that the sum of the priors is the
              same as the word count in the document-class being scaled
            'corpus-size': Scale prior to the size of the corpus
            'word': Original formulation from MCQ. Sum of priors will be sigma.
            'background-corpus-size': Scale corpus size to multiple of
              background-corpus.

        prior_power : numeric
            Exponent to apply to prior
            > 1 will shrink frequent words
        '''
        supported_scale_types = (
            'none',
            'class-size',
            'corpus-size',
            'background-corpus-size',
            'word',
        )
        assert scale_type in supported_scale_types

        # Record the configuration first, then delegate to the base
        # initializer with the same sigma.
        self._scale = sigma
        self._prior_power = prior_power
        self._scale_type = scale_type
        self._priors = priors
        LogOddsRatioUninformativeDirichletPrior.__init__(self, sigma)
	def __init__(self, priors, sigma=10, scale_type='none', prior_power=1):
		'''
		Configure an informative-prior scorer from per-term prior counts.

		Parameters
		----------
		priors : pd.Series
			term -> prior count

		sigma : np.float
			Prior scale; saved as ``_scale`` and passed on to the
			``LogOddsRatioUninformativeDirichletPrior`` initializer.

		scale_type : str
			One of the values below; anything else trips the assertion.
			'none': Don't scale prior. Jurafsky approach.
			'class-size': Scale prior so that the sum of the priors equals
			  the word count in the document-class being scaled
			'corpus-size': Scale prior to the size of the corpus
			'word': Original formulation from MCQ. Sum of priors will be sigma.
			'background-corpus-size': Scale corpus size to multiple of
			  background-corpus.

		prior_power : numeric
			Exponent to apply to prior
			> 1 will shrink frequent words
		'''
		accepted = ('none', 'class-size', 'corpus-size',
		            'background-corpus-size', 'word')
		assert scale_type in accepted

		# Keep the settings on the instance before running the base
		# initializer.
		self._priors, self._scale_type = priors, scale_type
		self._prior_power, self._scale = prior_power, sigma
		LogOddsRatioUninformativeDirichletPrior.__init__(self, sigma)
    def __init__(self, priors, alpha_w=10):
        '''
        Keep the supplied priors and initialize the base scorer.

        Parameters
        ----------
        priors : object
            Stored unchanged as ``self._priors``.
            NOTE(review): presumably a term -> prior count mapping; confirm
            against callers.
        alpha_w : np.float
            The constant prior, forwarded to
            ``LogOddsRatioUninformativeDirichletPrior.__init__``.
        '''
        # The priors are attached before the base initializer runs.
        self._priors = priors
        LogOddsRatioUninformativeDirichletPrior.__init__(self, alpha_w)
예제 #4
0
	def test_get_p_vals(self):
		'''Check that every p-value returned by get_p_vals lies in [0, 1].'''
		tdm = build_hamlet_jz_term_doc_mat()
		df = tdm.get_term_freq_df()
		X = df[['hamlet freq', 'jay-z/r. kelly freq']].values
		pvals = LogOddsRatioUninformativeDirichletPrior().get_p_vals(X)
		# Lower bound: the smallest p-value must not be negative.
		self.assertGreaterEqual(min(pvals), 0)
		# Upper bound: compare the LARGEST p-value against 1. Checking min()
		# here would pass even when some p-values exceed 1.
		self.assertLessEqual(max(pvals), 1)
 def get_thresholded_score(cat_word_counts,
                           not_cat_word_counts,
                           alpha_w=0.01,
                           threshold=0.1):
     '''
     Rescale p-values and blank out those that are not extreme enough.

     The values returned by ``get_p_values_from_counts`` are transformed
     with ``p * 2 - 1`` and then multiplied by a boolean mask that is True
     only where the transformed value lies strictly outside
     ``[-(1 - 2 * threshold), 1 - 2 * threshold]``.

     Parameters
     ----------
     cat_word_counts, not_cat_word_counts
         Counts passed straight through to ``get_p_values_from_counts``.
     alpha_w : float
         Argument for the ``LogOddsRatioUninformativeDirichletPrior``
         constructor.
     threshold : float
         Controls the width of the band that is masked out.

     Returns
     -------
     The transformed scores multiplied by the mask (masked entries become
     zero for numeric arrays).
     '''
     p_values = LogOddsRatioUninformativeDirichletPrior(
         alpha_w).get_p_values_from_counts(cat_word_counts,
                                           not_cat_word_counts)
     centered = p_values * 2 - 1
     cutoff = 1. - (threshold * 2)
     is_extreme = (centered < -cutoff) | (centered > cutoff)
     return centered * is_extreme
	def get_score(cat_word_counts, not_cat_word_counts, alpha_w=0.01):
		'''
		Score terms from category and non-category word counts.

		The counts are combined by
		``LogOddsUninformativePriorScore._turn_counts_into_matrix``, run through
		``LogOddsRatioUninformativeDirichletPrior(alpha_w).get_p_vals``, and the
		resulting p-values are converted by
		``LogOddsUninformativePriorScore._turn_pvals_into_scores``.

		Parameters
		----------
		cat_word_counts, not_cat_word_counts
			Counts handed to ``_turn_counts_into_matrix``.
		alpha_w : float
			Argument for the ``LogOddsRatioUninformativeDirichletPrior``
			constructor.

		Returns
		-------
		Whatever ``_turn_pvals_into_scores`` produces for the p-values.
		'''
		count_matrix = LogOddsUninformativePriorScore._turn_counts_into_matrix(
			cat_word_counts, not_cat_word_counts)
		prior_model = LogOddsRatioUninformativeDirichletPrior(alpha_w)
		term_p_vals = prior_model.get_p_vals(count_matrix)
		return LogOddsUninformativePriorScore._turn_pvals_into_scores(term_p_vals)
	def get_delta_hats(cat_word_counts, not_cat_word_counts, alpha_w=0.01):
		'''
		Return the log-odds-with-prior values for the given word counts.

		Parameters
		----------
		cat_word_counts, not_cat_word_counts
			Counts handed to
			``LogOddsUninformativePriorScore._turn_counts_into_matrix``.
		alpha_w : float
			Argument for the ``LogOddsRatioUninformativeDirichletPrior``
			constructor.

		Returns
		-------
		The result of ``get_log_odds_with_prior`` on the combined count matrix.
		'''
		# Build the model first, then the matrix, mirroring the evaluation
		# order of a single chained expression.
		prior_model = LogOddsRatioUninformativeDirichletPrior(alpha_w)
		count_matrix = LogOddsUninformativePriorScore._turn_counts_into_matrix(
			cat_word_counts, not_cat_word_counts)
		return prior_model.get_log_odds_with_prior(count_matrix)