Exemplo n.º 1
0
 def _get_default_scores(self, category, df):
     '''
     Score terms for `category` against everything outside of it.

     Parameters
     ----------
     category : str
         Category name.  Its counts are read from the column named
         `category + ' freq'`.
     df : object indexable by column name (presumably a pd.DataFrame)
         Term-frequency table holding the per-category ' freq' columns.

     Returns
     -------
     Whatever ScaledFScore.get_scores returns for the in-category counts
     versus the counts supplied by _get_not_category_term_frequency.
     '''
     freq_col = category + ' freq'
     in_category = df[freq_col]
     # The out-of-category counts are delegated to a helper on this object.
     out_of_category = self._get_not_category_term_frequency(freq_col, df)
     return ScaledFScore.get_scores(in_category, out_of_category)
Exemplo n.º 2
0
 def _get_default_scores(self, category, other_categories, df):
     '''
     Score terms for `category` against an explicit set of other categories.

     Parameters
     ----------
     category : str
         Category name.  Its counts are read from `category + ' freq'`.
     other_categories : iterable of str
         Names of the categories to contrast against; each is mapped to its
         own `name + ' freq'` column.
     df : object indexable by column name (presumably a pd.DataFrame)
         Term-frequency table holding the ' freq' columns.

     Returns
     -------
     Whatever ScaledFScore.get_scores returns for the in-category counts
     versus the row-wise sum of the other categories' counts.
     '''
     in_category = df[category + ' freq']
     contrast_cols = [name + ' freq' for name in other_categories]
     # Collapse the contrast categories into one count per row.
     out_of_category = df[contrast_cols].sum(axis=1)
     return ScaledFScore.get_scores(in_category, out_of_category)
Exemplo n.º 3
0
 def get_scaled_f_scores_vs_background(self,
                                       scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
                                       beta=DEFAULT_BACKGROUND_BETA):
     '''
     Rank terms by their scaled f-score of corpus counts versus background
     counts.

     Parameters
     ----------
     scaler_algo
         Forwarded to ScaledFScore.get_scores_for_category; defaults to
         DEFAULT_BACKGROUND_SCALER_ALGO.
     beta
         Forwarded to ScaledFScore.get_scores_for_category; defaults to
         DEFAULT_BACKGROUND_BETA.

     Returns
     -------
     The term/background count table with an added 'Scaled f-score' column,
     sorted by that column in descending order.
     '''
     counts = self.get_term_and_background_counts()
     corpus_counts = counts['corpus']
     background_counts = counts['background']
     # The score column is written onto the table in place before sorting.
     counts['Scaled f-score'] = ScaledFScore.get_scores_for_category(
         corpus_counts, background_counts, scaler_algo, beta)
     return counts.sort_values(by='Scaled f-score', ascending=False)
Exemplo n.º 4
0
 def _get_default_scores(self, category, df):
     '''
     Score terms for `category` against the sum of every other column.

     Parameters
     ----------
     category : str
         Category name.  Its counts are read from `category + ' freq'`.
     df : object exposing `.columns` and column indexing (presumably a
          pd.DataFrame)
         Term-frequency table.  Every column other than the category's own
         frequency column contributes to the contrast counts.

     Returns
     -------
     Whatever ScaledFScore.get_scores returns for the in-category counts
     versus the row-wise sum of all remaining columns.
     '''
     target_col = category + ' freq'
     in_category = df[target_col]
     remaining_cols = [col for col in df.columns if col != target_col]
     out_of_category = df[remaining_cols].sum(axis=1)
     return ScaledFScore.get_scores(in_category, out_of_category)
	def get_scaled_f_scores_vs_background(
			self,
			scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
			beta=DEFAULT_BACKGROUND_BETA):
		'''
		Score each term's corpus count against its background count and
		return the table ordered from highest to lowest score.

		Parameters
		----------
		scaler_algo
			Passed through to ScaledFScore.get_scores_for_category; defaults
			to DEFAULT_BACKGROUND_SCALER_ALGO.
		beta
			Passed through to ScaledFScore.get_scores_for_category; defaults
			to DEFAULT_BACKGROUND_BETA.

		Returns
		-------
		The table from get_term_and_background_counts with a
		'Scaled f-score' column added, sorted descending by that column.
		'''
		score_col = 'Scaled f-score'
		table = self.get_term_and_background_counts()
		# Written in place onto the table obtained above.
		table[score_col] = ScaledFScore.get_scores_for_category(
			table['corpus'], table['background'], scaler_algo, beta)
		return table.sort_values(by=score_col, ascending=False)
 def get_scaled_f_scores_vs_background(
         self,
         scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
         beta=DEFAULT_BACKGROUND_BETA):
     '''
     Compare corpus term counts to background counts using scaled f-scores.

     Parameters
     ----------
     scaler_algo : str
         Scaling algorithm handed to ScaledFScore.get_scores_for_category.
         Defaults to DEFAULT_BACKGROUND_SCALER_ALGO.
     beta : float
         Beta handed to ScaledFScore.get_scores_for_category.
         Defaults to DEFAULT_BACKGROUND_BETA.

     Returns
     -------
     pd.DataFrame
         Term and background counts plus a 'Scaled f-score' column, ordered
         from the highest score to the lowest.
     '''
     counts = self.get_term_and_background_counts()
     corpus_counts = counts['corpus']
     background_counts = counts['background']
     scores = ScaledFScore.get_scores_for_category(
         corpus_counts, background_counts, scaler_algo, beta)
     # Stored on the counts table itself, then used as the sort key.
     counts['Scaled f-score'] = scores
     return counts.sort_values(by='Scaled f-score', ascending=False)
	def get_p_vals(self, X):
		'''
		Derive p-values from z-scored `ScaledFScore` scores.

		The scores are treated as if they were normally distributed, which
		is acknowledged to be an incorrect assumption.

		Parameters
		----------
		X : np.array
			Word counts of shape (N, 2), N being the vocabulary size.
			Column 0 holds the positive class counts and column 1 the
			negative class counts.

		Returns
		-------
		np.array of p-values, i.e. the normal CDF evaluated at each z-score.

		'''
		positive_counts = X[:, 0]
		negative_counts = X[:, 1]
		scores = ScaledFScore.get_scores(
			positive_counts, negative_counts, self.scaler_algo, self.beta)
		centered = scores - np.mean(scores)
		# The denominator is the standard deviation divided by sqrt(N),
		# not the plain standard deviation.
		scale = np.std(scores) / np.sqrt(len(scores))
		return norm.cdf(centered / scale)
 def get_scaled_f_scores_vs_background(
         self,
         scaler_algo=DEFAULT_BACKGROUND_SCALER_ALGO,
         beta=DEFAULT_BACKGROUND_BETA):
     '''
     Produce scaled f-scores of the corpus relative to the background corpus.

     Parameters
     ----------
     scaler_algo : str
         Scaler name given to ScaledFScore.get_scores_for_category.
         Defaults to DEFAULT_BACKGROUND_SCALER_ALGO.
     beta : float
         Beta value given to ScaledFScore.get_scores_for_category.
         Defaults to DEFAULT_BACKGROUND_BETA.

     Returns
     -------
     pd.DataFrame
         The term/background count table extended with a 'Scaled f-score'
         column and sorted by it in descending order.
     '''
     score_col = 'Scaled f-score'
     table = self.get_term_and_background_counts()
     # The new column is added to the table in place.
     table[score_col] = ScaledFScore.get_scores_for_category(
         table['corpus'], table['background'], scaler_algo, beta)
     ranked = table.sort_values(by=score_col, ascending=False)
     return ranked