Exemplo n.º 1
0
 def _get_default_scores(self, category, df):
     '''
     Scores terms for one category using `ScaledFScore.get_scores`.

     Parameters
     ----------
     category : str
         Category name.  Its counts are read from the `category + ' freq'`
         column of `df`.
     df : column-indexable table of term counts
         Presumably a pandas DataFrame with one `'<name> freq'` column per
         category -- confirm against the caller.

     Returns
     -------
     The result of `ScaledFScore.get_scores` applied to the in-category counts
     and the out-of-category counts.
     '''
     freq_col = category + ' freq'
     in_category = df[freq_col]
     # The out-of-category counts are delegated to a helper on this object,
     # which receives the column name (not the bare category name).
     out_of_category = self._get_not_category_term_frequency(freq_col, df)
     return ScaledFScore.get_scores(in_category, out_of_category)
Exemplo n.º 2
0
 def _get_default_scores(self, category, other_categories, df):
     '''
     Scores terms for one category against an explicit set of other categories.

     Parameters
     ----------
     category : str
         Category name.  Its counts are read from the `category + ' freq'`
         column of `df`.
     other_categories : iterable of str
         Names of the categories that make up the contrast set.  Each one is
         mapped to its `'<name> freq'` column.
     df : column-indexable table of term counts
         Presumably a pandas DataFrame -- confirm against the caller.

     Returns
     -------
     The result of `ScaledFScore.get_scores` applied to the in-category counts
     and the row-wise sum of the other categories' counts.
     '''
     in_category = df[category + ' freq']
     contrast_cols = [other + ' freq' for other in other_categories]
     # Collapse the contrast columns into a single per-row total.
     out_of_category = df[contrast_cols].sum(axis=1)
     return ScaledFScore.get_scores(in_category, out_of_category)
Exemplo n.º 3
0
 def _get_default_scores(self, category, df):
     '''
     Scores terms for one category against all remaining columns of `df`.

     Parameters
     ----------
     category : str
         Category name.  Its counts are read from the `category + ' freq'`
         column of `df`.
     df : column-indexable table of term counts
         Presumably a pandas DataFrame -- confirm against the caller.

     Returns
     -------
     The result of `ScaledFScore.get_scores` applied to the in-category counts
     and the row-wise sum of every other column.

     Notes
     -----
     Every column whose name differs from the category column is summed, not
     only those ending in `' freq'`.
     NOTE(review): this assumes `df` holds nothing but frequency columns --
     confirm against the caller.
     '''
     target_col = category + ' freq'
     in_category = df[target_col]
     remaining_cols = [col for col in df.columns if col != target_col]
     out_of_category = df[remaining_cols].sum(axis=1)
     return ScaledFScore.get_scores(in_category, out_of_category)
	def get_p_vals(self, X):
		'''
		Derives p-values by treating `ScaledFScore` scores as if they were normally
		distributed.  That assumption is known to be incorrect; the values are an
		imputation, not an exact test.

		Parameters
		----------
		X : np.array
			Word counts of shape (N, 2), N being the vocabulary size.  Column 0 holds
			the positive-class counts and column 1 the negative-class counts.

		Returns
		-------
		np.array of p-values, one per row of X

		Notes
		-----
		Each score is centered on the mean score and divided by std/sqrt(N), i.e.
		the standard error of the mean rather than the plain standard deviation.
		The result is passed through `norm.cdf` (presumably `scipy.stats.norm` --
		confirm against the file's imports).
		'''
		scaled_f = ScaledFScore.get_scores(X[:,0], X[:,1], self.scaler_algo, self.beta)
		centered = scaled_f - np.mean(scaled_f)
		standard_error = np.std(scaled_f)/np.sqrt(len(scaled_f))
		return norm.cdf(centered/standard_error)