def get_scores(self, corpus):
        '''
        Parameters
        ----------
        corpus : Corpus
            Corpus from which to compute term scores.

        Returns
        -------
        float, pd.Series
        float: point on the x-axis at even characteristicness
        pd.Series: term -> score between 0 and 1, sorted in descending order.
            Scores are computed relative to the background corpus frequencies.
        '''
        term_ranks = self.term_ranker(corpus).get_ranks()

        bg = pd.DataFrame({
            'corpus': term_ranks.sum(axis=1),
            'bg': self.background_frequencies.get_background_frequency_df()['background']
        }).dropna()
        scores = RankDifference().get_scores(bg['corpus'], bg['bg']).sort_values()
        if scores.min() < 0 and scores.max() > 0:
            # Fraction of the scaled axis that lies below the zero score
            zero_marker = -scores.min() / (scores.max() - scores.min())
        elif scores.min() > 0:
            zero_marker = 0
        else:
            zero_marker = 1
        bg['score'] = scale(scores)
        return zero_marker, bg.sort_values(by='score', ascending=False)['score']
Example #2
def get_scores(self, corpus):
        '''
        Parameters
        ----------
        corpus : Corpus
            Corpus from which to compute term scores.

        Returns
        -------
        float, pd.Series
        float: point on the x-axis at even characteristicness
        pd.Series: term -> score between 0 and 1, sorted in descending order.
            Scores are computed relative to the background corpus frequencies.
        '''
        term_ranks = self.term_ranker(corpus).get_ranks()

        freq_df = pd.DataFrame({
            'corpus': term_ranks.sum(axis=1),
            'standard': self.background_frequencies.get_background_frequency_df()['background']
        })

        freq_df = freq_df.loc[freq_df['corpus'].dropna().index].fillna(0)

        corpus_rank = rankdata(freq_df.corpus, 'dense')
        standard_rank = rankdata(freq_df.standard, 'dense')
        scores = corpus_rank / corpus_rank.max() - standard_rank / standard_rank.max()

        if self.rerank_ranks:
            rank_scores, zero_marker = self._rerank_scores(scores)
            freq_df['score'] = pd.Series(rank_scores, index=freq_df.index)
        else:
            if scores.min() < 0 and scores.max() > 0:
                zero_marker = -scores.min() / (scores.max() - scores.min())
            elif scores.min() > 0:
                zero_marker = 0
            else:
                zero_marker = 1
            freq_df['score'] = scale(scores)
        return zero_marker, freq_df.sort_values(by='score', ascending=False)['score']
Example #3
	def get_scores(self, corpus):
		'''
		Parameters
		----------
		corpus : Corpus
			Corpus from which to compute term scores.

		Returns
		-------
		float, pd.Series
		float: point on the x-axis at even characteristicness
		pd.Series: term -> score between 0 and 1, sorted in descending order.
			Scores are computed relative to the background corpus frequencies.
		'''
		term_ranks = self.term_ranker(corpus).get_ranks()

		freq_df = pd.DataFrame({
			'corpus': term_ranks.sum(axis=1),
			'standard': self.background_frequencies.get_background_frequency_df()['background']}
		).dropna()
		corpus_rank = rankdata(freq_df.corpus, 'dense')
		standard_rank = rankdata(freq_df.standard, 'dense')
		scores = corpus_rank / corpus_rank.max() - standard_rank / standard_rank.max()

		if self.rerank_ranks:
			rank_scores, zero_marker = self._rerank_scores(scores)
			freq_df['score'] = pd.Series(rank_scores, index=freq_df.index)
		else:
			if scores.min() < 0 and scores.max() > 0:
				zero_marker = -scores.min() / (scores.max() - scores.min())
			elif scores.min() > 0:
				zero_marker = 0
			else:
				zero_marker = 1
			freq_df['score'] = scale(scores)
		return zero_marker, freq_df.sort_values(by='score', ascending=False)['score']
Example #4
def produce_fightin_words_explorer(
        corpus,
        category,
        category_name=None,
        not_category_name=None,
        term_ranker=termranking.AbsoluteFrequencyRanker,
        alpha=0.01,
        use_term_significance=False,
        term_scorer=None,
        not_categories=None,
        grey_threshold=1.96,
        y_axis_values=None,
        **kwargs):
    '''
	Produces a Monroe et al. style visualization.

	Parameters
	----------
	corpus : Corpus
		Corpus to use.
	category : str
		Name of category column as it appears in original data frame.
	category_name : str or None
		Name of category to use.  E.g., "5-star reviews."
		Defaults to category
	not_category_name : str or None
		Name of everything that isn't in category.  E.g., "Below 5-star reviews".
		Defaults to "Not " + category_name
	term_ranker : TermRanker
		TermRanker class for determining term frequency ranks.
	alpha : float, default = 0.01
		Uniform Dirichlet prior for p-value calculation.
	use_term_significance : bool, False by default
		Use the term scorer to compute term significance.
	term_scorer : TermSignificance
		Subclass of TermSignificance to use for scores and significance.
	not_categories : list
		Categories whose documents count as out-of-category. Defaults to all
		categories other than `category`.
	grey_threshold : float
		Absolute score below which points are greyed out. Defaults to 1.96.
	y_axis_values : list
		Custom y-axis tick values. Defaults to evenly spaced values over the
		score range.
	Remaining arguments are from `produce_scattertext_explorer`.
	Returns
	-------
		str, html of visualization
	'''

    if not_categories is None:
        not_categories = [c for c in corpus.get_categories() if c != category]
    if term_scorer is None:
        term_scorer = LogOddsRatioUninformativeDirichletPrior(alpha)

    term_freq_df = term_ranker(corpus).get_ranks() + 1
    freqs = term_freq_df[[c + ' freq' for c in [category] + not_categories]].sum(axis=1).values
    x_axis_values = [round_downer(10 ** x)
                     for x in np.linspace(0, np.log(freqs.max()) / np.log(10), 5)]
    frequencies_log_scaled = scale(np.log(freqs))

    if 'scores' not in kwargs:
        kwargs['scores'] = term_scorer.get_scores(
            term_freq_df[category + ' freq'],
            term_freq_df[[c + ' freq' for c in not_categories]].sum(axis=1))

    def y_axis_rescale(coords):
        return ((coords - 0.5) / (np.abs(coords - 0.5).max()) + 1) / 2

    # from https://stackoverflow.com/questions/3410976/how-to-round-a-number-to-significant-figures-in-python
    def round_to_1(x):
        if x == 0:
            return 0
        return round(x, -int(np.floor(np.log10(abs(x)))))

    if y_axis_values is None:
        max_abs_score = np.max(np.abs(kwargs['scores']))
        y_axis_values = [round_to_1(x)
                         for x in sorted(set(-np.linspace(0, max_abs_score, 4))
                                         | set(np.linspace(0, max_abs_score, 4))
                                         | {0})]
    scores_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(kwargs['scores'])
    if use_term_significance:
        kwargs['term_significance'] = term_scorer

    color_func = '''(function(d) {
	return (Math.abs(d.os) < %s) 
	 ? d3.interpolate(d3.rgb(230, 230, 230), d3.rgb(130, 130, 130))(Math.abs(d.os)/%s) 
	 : d3.interpolateRdYlBu(d.y);
	})''' % (grey_threshold, grey_threshold)

    return produce_scattertext_explorer(
        corpus,
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        x_coords=frequencies_log_scaled,
        y_coords=scores_scaled_for_charting,
        original_x=freqs,
        original_y=kwargs['scores'],
        x_axis_values=x_axis_values,
        y_axis_values=y_axis_values,
        rescale_x=scale,
        rescale_y=y_axis_rescale,
        sort_by_dist=False,
        term_ranker=term_ranker,
        color_func=color_func,
        not_categories=not_categories,
        x_label=kwargs.get('x_label', 'Log Frequency'),
        y_label=kwargs.get('y_label', term_scorer.get_name()),
        **kwargs)
Example #5
def produce_fightin_words_explorer(
        corpus,
        category,
        category_name=None,
        not_category_name=None,
        term_ranker=termranking.AbsoluteFrequencyRanker,
        alpha=0.01,
        use_term_significance=True,
        **kwargs):
    '''
	Produces a Monroe et al. style visualization.

	Parameters
	----------
	corpus : Corpus
		Corpus to use.
	category : str
		Name of category column as it appears in original data frame.
	category_name : str or None
		Name of category to use.  E.g., "5-star reviews."
		Defaults to category
	not_category_name : str or None
		Name of everything that isn't in category.  E.g., "Below 5-star reviews".
		Defaults to "Not " + category_name
	term_ranker : TermRanker
		TermRanker class for determining term frequency ranks.
	alpha : float, default = 0.01
		Uniform Dirichlet prior for p-value calculation.
	use_term_significance : bool, True by default
		Use Log Odds Ratio w/ Uninformative Prior or specified values for significance.
	Remaining arguments are from `produce_scattertext_explorer`.
	Returns
	-------
		str, html of visualization
	'''
    if category_name is None:
        category_name = category
    if not_category_name is None:
        not_category_name = "Not " + category_name

    term_freq_df = term_ranker(corpus).get_ranks()
    frequencies_log_scaled = scale(np.log(term_freq_df.sum(axis=1).values))

    if 'scores' not in kwargs:
        kwargs['scores'] = LogOddsRatioUninformativeDirichletPrior(alpha).get_zeta_i_j_given_separate_counts(
            term_freq_df[category + ' freq'],
            term_freq_df[[c + ' freq' for c in corpus.get_categories()
                          if c != category]].sum(axis=1))

    def y_axis_rescale(coords):
        return ((coords - 0.5) / (np.abs(coords - 0.5).max()) + 1) / 2

    scores_scaled_for_charting = scale_neg_1_to_1_with_zero_mean_abs_max(kwargs['scores'])
    if use_term_significance:
        kwargs['term_significance'] = LogOddsRatioUninformativeDirichletPrior(
            alpha)

    return produce_scattertext_explorer(
        corpus,
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        x_coords=frequencies_log_scaled,
        y_coords=scores_scaled_for_charting,
        rescale_x=scale,
        rescale_y=y_axis_rescale,
        sort_by_dist=False,
        term_ranker=term_ranker,
        p_value_colors=True,
        **kwargs)
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(convention_df,
                          category_col='party',
                          text_col='text',
                          nlp=nlp).build()

term_freq_df = corpus.get_term_freq_df()


def zero_centered_scale(ar):
    # Scale positive and negative values separately onto [0, 1] and [-1, 0],
    # then shift so the output lies in [0, 1] with zero mapping to 0.5.
    ar[ar > 0] = scale(ar[ar > 0])
    ar[ar < 0] = -scale(-ar[ar < 0])
    return (ar + 1) / 2.


frequencies_scaled = scale(np.log(term_freq_df.sum(axis=1).values))
scores = corpus.get_logreg_coefs(
    'democrat',
    LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1))
scores_scaled = zero_centered_scale(scores)

html = produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=5,
    pmi_threshold_coefficient=4,
    width_in_pixels=1000,
    x_coords=frequencies_scaled,
    y_coords=scores_scaled,