def get_scores(self, corpus):
    '''
    Parameters
    ----------
    corpus

    Returns
    -------
    float, pd.Series
        float: point on x-axis at even characteristicness
        pd.Series: term -> value between 0 and 1, sorted by score in a
        descending manner

    Background scores from corpus
    '''
    # Total term counts across all categories, joined with the background
    # frequencies; terms missing from either side are dropped.
    corpus_counts = self.term_ranker(corpus).get_ranks().sum(axis=1)
    background_counts = self.background_frequencies.get_background_frequency_df()['background']
    joint = pd.DataFrame({'corpus': corpus_counts, 'bg': background_counts}).dropna()

    raw_scores = RankDifference().get_scores(joint['corpus'], joint['bg']).sort_values()

    # Locate where zero falls within the (possibly one-sided) score range.
    lo, hi = raw_scores.min(), raw_scores.max()
    if lo < 0 and hi > 0:
        zero_marker = -lo / (hi - lo)
    elif lo > 0:
        zero_marker = 0
    else:
        zero_marker = 1

    joint['score'] = scale(raw_scores)
    return zero_marker, joint.sort_values(by='score', ascending=False)['score']
def get_scores(self, corpus):
    '''
    Parameters
    ----------
    corpus

    Returns
    -------
    float, pd.Series
        float: point on x-axis at even characteristicness
        pd.Series: term -> value between 0 and 1, sorted by score in a
        descending manner

    Background scores from corpus
    '''
    counts = self.term_ranker(corpus).get_ranks().sum(axis=1)
    background = self.background_frequencies.get_background_frequency_df()['background']
    freq_df = pd.DataFrame({'corpus': counts, 'standard': background})
    # Keep only terms seen in the corpus; terms missing from the background
    # get a zero standard frequency instead of being dropped.
    freq_df = freq_df.loc[freq_df['corpus'].dropna().index].fillna(0)

    # Dense ranks normalized to (0, 1]; the score is the rank difference.
    dense_corpus = rankdata(freq_df.corpus, 'dense')
    dense_standard = rankdata(freq_df.standard, 'dense')
    scores = dense_corpus / dense_corpus.max() - dense_standard / dense_standard.max()

    if self.rerank_ranks:
        rank_scores, zero_marker = self._rerank_scores(scores)
        freq_df['score'] = pd.Series(rank_scores, index=freq_df.index)
    else:
        # Locate where zero falls within the (possibly one-sided) score range.
        smin, smax = scores.min(), scores.max()
        if smin < 0 and smax > 0:
            zero_marker = -smin / (smax - smin)
        elif smin > 0:
            zero_marker = 0
        else:
            zero_marker = 1
        freq_df['score'] = scale(scores)

    return zero_marker, freq_df.sort_values(by='score', ascending=False)['score']
def get_scores(self, corpus):
    '''
    Parameters
    ----------
    corpus

    Returns
    -------
    float, pd.Series
        float: point on x-axis at even characteristicness
        pd.Series: term -> value between 0 and 1, sorted by score in a
        descending manner

    Background scores from corpus
    '''
    term_ranks = self.term_ranker(corpus).get_ranks()
    # Join corpus term counts with background frequencies; drop terms
    # absent from either side.
    freq_df = pd.DataFrame({
        'corpus': term_ranks.sum(axis=1),
        'standard': self.background_frequencies.get_background_frequency_df()['background']
    }).dropna()
    # Dense ranks normalized to (0, 1]; the score is the rank difference.
    corpus_rank = rankdata(freq_df.corpus, 'dense')
    standard_rank = rankdata(freq_df.standard, 'dense')
    scores = corpus_rank / corpus_rank.max() - standard_rank / standard_rank.max()
    if self.rerank_ranks:
        rank_scores, zero_marker = self._rerank_scores(scores)
        freq_df['score'] = pd.Series(rank_scores, index=freq_df.index)
    else:
        # Locate where zero falls within the (possibly one-sided) score range.
        if scores.min() < 0 and scores.max() > 0:
            zero_marker = -scores.min() / (scores.max() - scores.min())
        elif scores.min() > 0:
            zero_marker = 0
        else:
            zero_marker = 1
        freq_df['score'] = scale(scores)
    return zero_marker, freq_df.sort_values(by='score', ascending=False)['score']
def produce_fightin_words_explorer(corpus,
                                   category,
                                   category_name=None,
                                   not_category_name=None,
                                   term_ranker=termranking.AbsoluteFrequencyRanker,
                                   alpha=0.01,
                                   use_term_significance=False,
                                   term_scorer=None,
                                   not_categories=None,
                                   grey_threshold=1.96,
                                   y_axis_values=None,
                                   **kwargs):
    '''
    Produces a Monroe et al. style visualization.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str or None
        Name of category to use. E.g., "5-star reviews."
        Defaults to category
    not_category_name : str or None
        Name of everything that isn't in category. E.g., "Below 5-star reviews".
        Defaults to "Not " + category_name
    term_ranker : TermRanker
        TermRanker class for determining term frequency ranks.
    alpha : float, default = 0.01
        Uniform dirichlet prior for p-value calculation
    use_term_significance : bool, False by default
        Use term scorer as the term-significance object.
    term_scorer : TermSignificance
        Subclass of TermSignificance to use as for scores and significance
    not_categories : list
        All categories other than category by default. Documents labeled
        with remaining category.
    grey_threshold : float
        Score to grey points. Default is 1.96
    y_axis_values : list
        Custom y-axis values. Defaults to linspace

    Remaining arguments are from `produce_scattertext_explorer`.

    Returns
    -------
    str, html of visualization
    '''
    if not_categories is None:
        not_categories = [c for c in corpus.get_categories() if c != category]
    if term_scorer is None:
        term_scorer = LogOddsRatioUninformativeDirichletPrior(alpha)
    # Add-one smoothing so log-frequency and the scorer never see zero counts.
    term_freq_df = term_ranker(corpus).get_ranks() + 1
    freqs = term_freq_df[[c + ' freq' for c in [category] + not_categories]].sum(axis=1).values
    # Five x-axis tick marks, log-spaced from 1 to the maximum frequency.
    x_axis_values = [round_downer(10 ** x)
                     for x in np.linspace(0, np.log(freqs.max()) / np.log(10), 5)]
    frequencies_log_scaled = scale(np.log(freqs))
    if 'scores' not in kwargs:
        kwargs['scores'] = term_scorer.get_scores(
            term_freq_df[category + ' freq'],
            term_freq_df[[c + ' freq' for c in not_categories]].sum(axis=1))

    def y_axis_rescale(coords):
        # Re-center around 0.5 and stretch symmetrically to fill [0, 1].
        return ((coords - 0.5) / (np.abs(coords - 0.5).max()) + 1) / 2

    # from https://stackoverflow.com/questions/3410976/how-to-round-a-number-to-significant-figures-in-python
    def round_to_1(x):
        if x == 0:
            return 0
        return round(x, -int(np.floor(np.log10(abs(x)))))

    if y_axis_values is None:
        # Symmetric y-axis ticks around zero, rounded to one significant figure.
        max_abs_score = np.max(np.abs(kwargs['scores']))
        y_axis_values = [round_to_1(x)
                         for x in sorted(set(-np.linspace(0, max_abs_score, 4))
                                         | set(np.linspace(0, max_abs_score, 4))
                                         | {0})]
    scores_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(kwargs['scores'])
    if use_term_significance:
        kwargs['term_significance'] = term_scorer
    # Set axis-label defaults inside kwargs instead of as explicit keyword
    # arguments, so a caller-supplied x_label/y_label doesn't raise a
    # duplicate-keyword TypeError when **kwargs is expanded below.
    kwargs['x_label'] = kwargs.get('x_label', 'Log Frequency')
    kwargs['y_label'] = kwargs.get('y_label', term_scorer.get_name())
    # Points with |score| below grey_threshold are drawn in grey; the rest
    # use the RdYlBu diverging palette.
    color_func = '''(function(d) {
	return (Math.abs(d.os) < %s)
	 ? d3.interpolate(d3.rgb(230, 230, 230), d3.rgb(130, 130, 130))(Math.abs(d.os)/%s)
	 : d3.interpolateRdYlBu(d.y);
	})''' % (grey_threshold, grey_threshold)
    return produce_scattertext_explorer(corpus,
                                        category=category,
                                        category_name=category_name,
                                        not_category_name=not_category_name,
                                        x_coords=frequencies_log_scaled,
                                        y_coords=scores_scaled_for_charting,
                                        original_x=freqs,
                                        original_y=kwargs['scores'],
                                        x_axis_values=x_axis_values,
                                        y_axis_values=y_axis_values,
                                        rescale_x=scale,
                                        rescale_y=y_axis_rescale,
                                        sort_by_dist=False,
                                        term_ranker=term_ranker,
                                        color_func=color_func,
                                        not_categories=not_categories,
                                        **kwargs)
def produce_fightin_words_explorer(corpus,
                                   category,
                                   category_name=None,
                                   not_category_name=None,
                                   term_ranker=termranking.AbsoluteFrequencyRanker,
                                   alpha=0.01,
                                   use_term_significance=True,
                                   **kwargs):
    '''
    Produces a Monroe et al. style visualization.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str or None
        Name of category to use. E.g., "5-star reviews."
        Defaults to category
    not_category_name : str or None
        Name of everything that isn't in category. E.g., "Below 5-star reviews".
        Defaults to "Not " + category_name
    term_ranker : TermRanker
        TermRanker class for determining term frequency ranks.
    alpha : float, default = 0.01
        Uniform dirichlet prior for p-value calculation
    use_term_significance : bool, True by default
        Use Log Odds Ratio w/ Uninformative Prior or specified values for
        significance.

    Remaining arguments are from `produce_scattertext_explorer`.

    Returns
    -------
    str, html of visualization
    '''
    if category_name is None:
        category_name = category
    if not_category_name is None:
        not_category_name = "Not " + category_name
    term_freq_df = term_ranker(corpus).get_ranks()
    frequencies_log_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
    if 'scores' not in kwargs:
        # Z-scores from a log-odds-ratio with an uninformative Dirichlet
        # prior: category counts vs. the sum of all other categories.
        kwargs['scores'] = (LogOddsRatioUninformativeDirichletPrior(alpha)
                            .get_zeta_i_j_given_separate_counts(
                                term_freq_df[category + ' freq'],
                                term_freq_df[[c + ' freq'
                                              for c in corpus.get_categories()
                                              if c != category]].sum(axis=1)))

    def y_axis_rescale(coords):
        # Re-center around 0.5 and stretch symmetrically to fill [0, 1].
        return ((coords - 0.5) / (np.abs(coords - 0.5).max()) + 1) / 2

    scores_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(kwargs['scores'])
    if use_term_significance:
        kwargs['term_significance'] = LogOddsRatioUninformativeDirichletPrior(alpha)
    return produce_scattertext_explorer(corpus,
                                        category=category,
                                        category_name=category_name,
                                        not_category_name=not_category_name,
                                        x_coords=frequencies_log_scaled,
                                        y_coords=scores_scaled_for_charting,
                                        rescale_x=scale,
                                        rescale_y=y_axis_rescale,
                                        sort_by_dist=False,
                                        term_ranker=term_ranker,
                                        p_value_colors=True,
                                        **kwargs)
def zero_centered_scale(ar):
    '''
    Scale scores into [0, 1] with 0.5 as the fixed point for zero.

    Positive and negative values are min-max scaled independently (via
    ``scale``), preserving sign, then the whole range is shifted from
    [-1, 1] into [0, 1].

    Parameters
    ----------
    ar : np.array
        Scores; may contain positive and negative values.

    Returns
    -------
    np.array
        Scaled scores in [0, 1]; zero maps to 0.5.
    '''
    # Work on a copy: the previous version scaled the caller's array
    # in place as a side effect.
    scaled = ar.copy()
    scaled[scaled > 0] = scale(scaled[scaled > 0])
    scaled[scaled < 0] = -scale(-scaled[scaled < 0])
    return (scaled + 1) / 2.
convention_df = SampleCorpora.ConventionData2012.get_data() corpus = CorpusFromPandas(convention_df, category_col='party', text_col='text', nlp=nlp).build() term_freq_df = corpus.get_term_freq_df() def zero_centered_scale(ar): ar[ar > 0] = scale(ar[ar > 0]) ar[ar < 0] = -scale(-ar[ar < 0]) return (ar + 1) / 2. frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values)) scores = corpus.get_logreg_coefs( 'democrat', LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1)) scores_scaled = zero_centered_scale(scores) html = produce_scattertext_explorer( corpus, category='democrat', category_name='Democratic', not_category_name='Republican', minimum_term_frequency=5, pmi_threshold_coefficient=4, width_in_pixels=1000, x_coords=frequencies_scaled, y_coords=scores_scaled,