Example #1
	def __init__(self,
	             term_doc_matrix,
	             left_categories,
	             right_categories,
	             top_categories,
	             bottom_categories,
	             left_category_name=None,
	             right_category_name=None,
	             top_category_name=None,
	             bottom_category_name=None,
	             x_scorer=RankDifference(),
	             y_scorer=RankDifference(),
	             term_ranker=AbsoluteFrequencyRanker,
	             labels=None):
		for param in [left_categories, right_categories, top_categories, bottom_categories]:
			assert type(param) == list
			assert set(param) - set(term_doc_matrix.get_categories()) == set()
			assert len(param) > 0
		self.term_doc_matrix_ = term_doc_matrix
		self._labels = labels
		self.left_category_name_ = left_category_name if left_category_name is not None else left_categories[0]
		self.right_category_name_ = right_category_name if right_category_name is not None else right_categories[0]
		self.top_category_name_ = top_category_name if top_category_name is not None else top_categories[0]
		self.bottom_category_name_ = bottom_category_name if bottom_category_name is not None else bottom_categories[0]
		self.x_scorer_ = x_scorer
		self.y_scorer_ = y_scorer
		self.term_ranker_ = term_ranker
		self.left_categories_, self.right_categories_, self.top_categories_, self.bottom_categories_ \
			= left_categories, right_categories, top_categories, bottom_categories
		self.axes = self._build_axes()
		self.lexicons = self._build_lexicons()
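
The x_scorer and y_scorer passed above only need to expose get_scores(category_counts, not_category_counts). A minimal sketch of RankDifference doing that on invented per-term counts (the term names and numbers are illustrative, not from the original code):

import pandas as pd
from scattertext.termscoring.RankDifference import RankDifference

# Invented counts: how often each term appears in the category vs. everywhere else.
cat_counts = pd.Series({'economy': 12, 'jobs': 9, 'freedom': 2, 'faith': 1})
not_cat_counts = pd.Series({'economy': 3, 'jobs': 4, 'freedom': 10, 'faith': 8})

# Positive scores mark terms that rank higher inside the category than outside it.
scores = RankDifference().get_scores(cat_counts, not_cat_counts)
print(pd.Series(scores, index=cat_counts.index).sort_values(ascending=False))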
Example #2
    def _build_square(self, term_doc_matrix, term_ranker, labels, scorer):
        self.term_doc_matrix_ = term_doc_matrix
        self.term_ranker = term_ranker(term_doc_matrix)
        self.scorer = RankDifference() \
            if scorer is None else scorer
        self.axes = self._build_axes(scorer)
        self.lexicons = self._build_lexicons()
        self._labels = labels
Example #3
    def get_scores(self, corpus):
        '''
        Return background scores for the terms in a corpus.

        Parameters
        ----------
        corpus

        Returns
        -------
        float, pd.Series
            float: point on the x-axis where characteristicness is even
            pd.Series: term -> score scaled to between 0 and 1, sorted in descending order
        '''
        term_ranks = self.term_ranker(corpus).get_ranks()

        bg = pd.DataFrame({
            'corpus': term_ranks.sum(axis=1),
            'bg': self.background_frequencies.get_background_frequency_df()['background']
        }).dropna()
        scores = RankDifference().get_scores(bg['corpus'],
                                             bg['bg']).sort_values()
        if scores.min() < 0 and scores.max() > 0:
            zero_marker = -scores.min() / (scores.max() - scores.min())
        elif scores.min() > 0:
            zero_marker = 0
        else:
            zero_marker = 1
        bg['score'] = scale(scores)
        return zero_marker, bg.sort_values(by='score',
                                           ascending=False)['score']
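
A standalone sketch of the zero_marker logic above, using invented scores and assuming scale() from scattertext.Scalers is the min-max rescaler written out inline below: the marker is where a raw score of zero lands once the scores are rescaled to [0, 1].

import pandas as pd

scores = pd.Series([-0.4, -0.1, 0.2, 0.6], index=['a', 'b', 'c', 'd'])

# Same branching as in get_scores above.
if scores.min() < 0 and scores.max() > 0:
    zero_marker = -scores.min() / (scores.max() - scores.min())
elif scores.min() > 0:
    zero_marker = 0
else:
    zero_marker = 1

scaled = (scores - scores.min()) / (scores.max() - scores.min())
print(zero_marker)                            # 0.4
print(scaled.sort_values(ascending=False))    # d=1.0, c=0.6, b=0.3, a=0.0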
Example #4
    def _get_default_scores(self, category, other_categories, df):
        category_column_name = category + ' freq'
        cat_word_counts = df[category_column_name]
        not_cat_word_counts = df[[c + ' freq' for c in other_categories]].sum(axis=1)
        # scores = ScaledFScore.get_scores(cat_word_counts, not_cat_word_counts)
        scores = RankDifference().get_scores(cat_word_counts, not_cat_word_counts)
        return scores
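
A hedged sketch of the category-versus-rest pattern used by _get_default_scores, on a toy frequency table whose columns follow the '<category> freq' naming convention; all counts below are invented:

import pandas as pd
from scattertext.termscoring.RankDifference import RankDifference

df = pd.DataFrame({
    'democrat freq': [10, 2, 5],
    'republican freq': [1, 8, 5],
    'libertarian freq': [0, 3, 4],
}, index=['jobs', 'taxes', 'vote'])

# Score one category's counts against the summed counts of all other categories.
cat_word_counts = df['democrat freq']
not_cat_word_counts = df[[c + ' freq' for c in ['republican', 'libertarian']]].sum(axis=1)
scores = RankDifference().get_scores(cat_word_counts, not_cat_word_counts)
print(pd.Series(scores, index=df.index).sort_values(ascending=False))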
Example #5
    def __init__(self,
                 corpus,
                 scorer=RankDifference(),
                 ranker=AbsoluteFrequencyRanker,
                 use_non_text_features=False,
                 color_palette=QUALITATIVE_COLORS):
        '''
        Assigns a color to each category and associates each term with its highest-scoring category.

        :param corpus: TermDocMatrix
        :param scorer: term scorer implementing get_scores; defaults to RankDifference()
        :param ranker: TermRanker class; defaults to AbsoluteFrequencyRanker
        :param use_non_text_features: bool, rank metadata features instead of terms
        :param color_palette: list of colors [[red, green, blue], ...]
        '''
        self.corpus = corpus
        self.scorer = scorer
        self.color_palette = color_palette
        my_ranker = ranker(corpus)
        if use_non_text_features:
            my_ranker.use_non_text_features()
        tdf = my_ranker.get_ranks()
        tdf_sum = tdf.sum(axis=1)
        term_scores = {}
        for cat in tdf.columns:
            # cat[:-5] strips the ' freq' suffix from the rank column name
            term_scores[cat[:-5]] = pd.Series(
                self.scorer.get_scores(tdf[cat], tdf_sum - tdf[cat]),
                index=tdf.index)
        self.term_cat = pd.DataFrame(term_scores).idxmax(axis=1)
        ranked_list_categories = pd.Series(
            corpus.get_category_names_by_row()).value_counts().index
        self.category_colors = pd.Series(
            self.color_palette[:len(ranked_list_categories)],
            index=ranked_list_categories)
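
A toy illustration of the two assignments at the end of the constructor above: idxmax gives each term the category where it scores highest, and colors are handed out to categories in order of how many documents they label. The scores, documents, and palette here are invented.

import pandas as pd

term_scores = pd.DataFrame({
    'democrat': [0.7, -0.2, 0.1],
    'republican': [-0.7, 0.5, 0.0],
}, index=['jobs', 'taxes', 'vote'])
term_cat = term_scores.idxmax(axis=1)            # term -> best-scoring category

doc_categories = pd.Series(['democrat', 'democrat', 'republican'])
palette = [[31, 119, 180], [255, 127, 14]]       # [[red, green, blue], ...]
ranked_categories = doc_categories.value_counts().index
category_colors = pd.Series(palette[:len(ranked_categories)], index=ranked_categories)

print(term_cat)          # jobs -> democrat, taxes -> republican, vote -> democrat
print(category_colors)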
Example #6
    def get_topics_from_terms(self,
                              terms=None,
                              num_terms_per_topic=10,
                              scorer=RankDifference()):
        '''
        Parameters
        ----------
        terms : list or None
            If terms is a list, use these as the seed terms for the topics.
            If None, use the first 30 terms from get_scaled_f_scores_vs_background.
        num_terms_per_topic : int, default 10
            Use this many terms per topic
        scorer : TermScorer
            Implements get_scores; default is RankDifference, which tends to work best

        Returns
        -------
        dict: {term: [term1, ...], ...}
        '''
        topic_model = {}

        if terms is None:
            terms = self.corpus.get_scaled_f_scores_vs_background().index[:30]

        for term in terms:
            termidx = self.termidxstore.getidxstrict(term)
            labels = self.sentX[:, termidx].astype(bool).todense().A1
            poscnts = self.sentX[labels, :].astype(bool).sum(axis=0).A1
            negcnts = self.sentX[~labels, :].astype(bool).sum(axis=0).A1
            scores = scorer.get_scores(poscnts, negcnts)
            topic_model[term] = [
                self.termidxstore.getval(i)
                for i in np.argsort(-scores)[:num_terms_per_topic]
            ]
        return topic_model
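
A small numpy-only sketch of the top-term selection inside the loop above, with invented scores: np.argsort(-scores) orders indices from highest to lowest score, and the first num_terms_per_topic of them are kept.

import numpy as np

terms = ['god', 'atheism', 'hockey', 'belief']
scores = np.array([0.1, 0.9, -0.3, 0.5])
num_terms_per_topic = 2

# Negating the scores makes argsort return indices in descending score order.
top_terms = [terms[i] for i in np.argsort(-scores)[:num_terms_per_topic]]
print(top_terms)  # ['atheism', 'belief']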
Example #7
    def get_category_association(self, ranker=None, scorer=None):
        if scorer is None:
            scorer = RankDifference()
        if ranker is None:
            ranker = AbsoluteFrequencyRanker(self.corpus)
        if self.use_metadata:
            ranker = ranker.use_non_text_features()
        term_freq_df = ranker.get_ranks('')
        global_freq = term_freq_df.sum(axis=1)
        data = []
        for cat in self.corpus.get_categories():
            cat_freq = term_freq_df[cat]
            for term_rank, (term, score) in enumerate(scorer.get_scores(
                    cat_freq, global_freq - cat_freq
            ).sort_values(ascending=False).items()):
                data.append({'Category': cat, 'Term': term, 'Rank': term_rank, 'Score': score})

        return pd.DataFrame(data).groupby('Rank')
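
A sketch of how the grouped result above can be consumed, using a hand-built DataFrame of the same shape; each group collects the term occupying one rank position in every category, so get_group(0) gives the top term per category. The rows below are invented.

import pandas as pd

data = [
    {'Category': 'democrat', 'Term': 'jobs', 'Rank': 0, 'Score': 0.9},
    {'Category': 'democrat', 'Term': 'taxes', 'Rank': 1, 'Score': 0.4},
    {'Category': 'republican', 'Term': 'freedom', 'Rank': 0, 'Score': 0.8},
    {'Category': 'republican', 'Term': 'wall', 'Rank': 1, 'Score': 0.3},
]
grouped = pd.DataFrame(data).groupby('Rank')
print(grouped.get_group(0))  # the highest-ranked term in each category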
Example #8
    def __init__(self,
                 term_doc_matrix,
                 category_to_timestep_func,
                 is_gap_between_sequences_func,
                 timesteps_to_lag=4,
                 num_top_terms_each_timestep=10,
                 num_terms_to_include=40,
                 starting_time_step=None,
                 term_ranker=AbsoluteFrequencyRanker,
                 term_scorer=RankDifference()):
        '''
        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
        category_to_timestep_func : lambda
        is_gap_between_sequences_func : lambda
        timesteps_to_lag : int
        num_top_terms_each_timestep : int
        num_terms_to_include : int
        starting_time_step : object
        term_ranker : TermRanker
        term_scorer : TermScorer
        '''
        self.corpus = term_doc_matrix
        self.timesteps_to_lag = timesteps_to_lag
        self.num_top_terms_each_timestep = num_top_terms_each_timestep
        self.num_terms_to_include = num_terms_to_include
        self.is_gap_between_sequences_func = is_gap_between_sequences_func
        self.category_to_timestep_func = category_to_timestep_func
        self.term_ranker = term_ranker
        self.term_scorer = term_scorer
        categories = list(sorted(self.corpus.get_categories()))
        if len(categories) <= timesteps_to_lag:
            raise Exception(
                "The number of categories in the term doc matrix is <= " +
                str(timesteps_to_lag))
        if starting_time_step is None:
            starting_time_step = categories[timesteps_to_lag + 1]
        self.starting_time_step = starting_time_step
Example #9
def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
            term_projection = category_projector
        else:
            category_projection = category_projector.project(corpus,
                                                             x_dim=x_dim,
                                                             y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )

    category_tooltip_func = '(function(d) {return d.term})'

    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func=
        '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot')

    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)

    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(
            topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(
            metadata_descriptions)

    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[[c for c in corpus.get_categories()
             if c != initial_category]].sum(axis=1))

    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs)

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(category_scatterplot_structure,
                                            term_scatterplot_structure,
                                            category_projection,
                                            category_width_in_pixels,
                                            category_height_in_pixels,
                                            num_terms=num_terms_in_halo,
                                            show_halo=show_halo,
                                            d3_url_struct=d3_url_struct,
                                            x_dim=x_dim,
                                            y_dim=y_dim,
                                            protocol=protocol).to_html()
Example #10
def produce_pairplot(
        corpus,
        asian_mode=False,
        category_width_in_pixels=500,
        category_height_in_pixels=700,
        term_width_in_pixels=500,
        term_height_in_pixels=700,
        terms_to_show=3000,
        scaler=scale_neg_1_to_1_with_zero_mean,
        term_ranker=AbsoluteFrequencyRanker,
        use_metadata=False,
        category_projector=CategoryProjector(),
        category_projection=None,
        topic_model_term_lists=None,
        topic_model_preview_size=10,
        metadata_descriptions=None,
        initial_category=None,
        x_dim=0,
        y_dim=1,
        show_halo=True,
        num_terms_in_halo=5,
        category_color_func='(function(x) {return "#5555FF"})',
        protocol='https',
        d3_url_struct=D3URLs(),
        category_focused=False,
        verbose=False,
        use_full_doc=True,
        default_to_term_comparison=True,
        category_x_label='',
        category_y_label='',
        category_show_axes_and_cross_hairs=False,
        highlight_selected_category=True,
        term_x_label=None,  # used if default_to_term_comparison
        term_y_label=None,  # used if default_to_term_comparison
        wordfish_style=False,
        **kwargs):
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(corpus,
                                                             x_dim=x_dim,
                                                             y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]
    category_scatter_chart_explorer = _get_category_scatter_chart_explorer(
        category_projection, scaler, term_ranker, verbose)
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )

    category_tooltip_func = '(function(d) {return d.term})'

    initial_category_idx = corpus.get_categories().index(initial_category)
    term_plot_change_func = _get_term_plot_change_js_func(
        wordfish_style, category_focused, initial_category_idx)

    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_characteristic=False,
        x_label=category_x_label,
        y_label=category_y_label,
        show_axes_and_cross_hairs=category_show_axes_and_cross_hairs,
        full_data='getCategoryDataAndInfo()',
        show_top_terms=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        div_name='cat-plot',
        alternative_term_func=term_plot_change_func,
        highlight_selected_category=highlight_selected_category)
    compacted_corpus = AssociationCompactor(
        terms_to_show, use_non_text_features=use_metadata).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    if verbose:
        print('num terms to hide', len(terms_to_hide))
        print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.get_corpus(),
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
        verbose=verbose).hide_terms(terms_to_hide)

    if default_to_term_comparison:
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)

        if use_metadata:
            tdf = corpus.get_metadata_freq_df('')
        else:
            tdf = corpus.get_term_freq_df('')

        scores = RankDifference().get_scores(
            tdf[initial_category],
            tdf[[c for c in corpus.get_categories()
                 if c != initial_category]].sum(axis=1))

        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            scores=scores,
            include_term_category_counts=True,
            transform=dense_rank,
            **kwargs)
        y_label = initial_category
        x_label = 'Not ' + initial_category
        color_func = None
        show_top_terms = True
        show_axes = False
    else:
        term_projection = category_projection.get_term_projection()
        original_x = term_projection['x']
        original_y = term_projection['y']
        x_coords = scaler(term_projection['x'])
        y_coords = scaler(term_projection['y'])
        x_label = term_x_label if term_x_label is not None else ''
        y_label = term_y_label if term_y_label is not None else ''
        show_axes = True
        horizontal_line_y_position = 0
        vertical_line_x_position = 0
        term_scatter_chart_explorer.inject_coordinates(x_coords,
                                                       y_coords,
                                                       original_x=original_x,
                                                       original_y=original_y)

        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)
        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            category_name=initial_category,
            include_term_category_counts=True,
            # transform=dense_rank,
        )
        color_func = '(function(x) {return "#5555FF"})'
        show_top_terms = False

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        use_full_doc=use_metadata or use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_characteristic=False,
        x_label=x_label,
        y_label=y_label,
        full_data='getTermDataAndInfo()',
        show_top_terms=show_top_terms,
        get_tooltip_content=None,
        color_func=color_func,
        # horizontal_line_y_position=0,
        # vertical_line_x_position=0,
        show_axes=show_axes,
        topic_model_preview_size=topic_model_preview_size,
        show_category_headings=False,
        div_name='d3-div-1',
        unified_context=True,
        highlight_selected_category=highlight_selected_category)
    return PairPlotFromScatterplotStructure(category_scatterplot_structure,
                                            term_scatterplot_structure,
                                            category_projection,
                                            category_width_in_pixels,
                                            category_height_in_pixels,
                                            num_terms=num_terms_in_halo,
                                            show_halo=show_halo,
                                            d3_url_struct=d3_url_struct,
                                            x_dim=x_dim,
                                            y_dim=y_dim,
                                            protocol=protocol).to_html()
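
A hedged usage sketch of produce_pairplot as defined above, reusing the corpus-building pattern from Example #11; treating each speaker as a category, the compaction size, and the output file name are illustrative choices, not part of the original code.

from scattertext import SampleCorpora, whitespace_nlp_with_sentences
from scattertext.CorpusFromPandas import CorpusFromPandas

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='speaker',              # one category per speaker
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus()

html = produce_pairplot(corpus, terms_to_show=2000, verbose=True)
open('./demo_pairplot.html', 'wb').write(html.encode('utf-8'))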
Example #11
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.Scalers import dense_rank
from scattertext.termcompaction.AssociationCompactor import AssociationCompactor
from scattertext.termscoring.RankDifference import RankDifference

convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences).build().get_unigram_corpus().compact(
        AssociationCompactor(4000))

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=0,
                                    pmi_threshold_coefficient=0,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'],
                                    term_scorer=RankDifference(),
                                    transform=dense_rank)

open('./demo_dense_rank.html', 'wb').write(html.encode('utf-8'))
print('Open ./demo_dense_rank.html in Chrome or Firefox.')
class SemioticSquare(SemioticSquareBase):
    '''
    Create a visualization of a semiotic square.  Requires Corpus to have
    at least three categories.
    >>> newsgroups_train = fetch_20newsgroups(subset='train',
    ...   remove=('headers', 'footers', 'quotes'))
    >>> vectorizer = CountVectorizer()
    >>> X = vectorizer.fit_transform(newsgroups_train.data)
    >>> corpus = st.CorpusFromScikit(
    ... 	X=X,
    ... 	y=newsgroups_train.target,
    ... 	feature_vocabulary=vectorizer.vocabulary_,
    ... 	category_names=newsgroups_train.target_names,
    ... 	raw_texts=newsgroups_train.data
    ... 	).build()
    >>> semseq = SemioticSquare(corpus,
    ... 	category_a = 'alt.atheism',
    ... 	category_b = 'soc.religion.christian',
    ... 	neutral_categories = ['talk.religion.misc']
    ... )
    >>> # A simple HTML table
    >>> html = SemioticSquareViz(semseq).to_html()
    >>> # The table with an interactive scatterplot below it
    >>> html = st.produce_semiotic_square_explorer(semseq,
    ...                                            x_label='More Atheism, Less Xtnity',
    ...                                            y_label='General Religious Talk')
    '''

    def __init__(self,
                 term_doc_matrix,
                 category_a,
                 category_b,
                 neutral_categories,
                 labels=None,
                 term_ranker=AbsoluteFrequencyRanker,
                 scorer=None):
        '''
        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
            TermDocMatrix (or descendant) which will be used in constructing square.
        category_a : str
            Category name for term A
        category_b : str
            Category name for term B (in opposition to A)
        neutral_categories : list[str]
            List of category names that A and B will be contrasted to.  Should be in same domain.
        labels : dict
            None by default. Labels are dictionary of {'a_and_b': 'A and B', ...} to be shown
            above each category.
        term_ranker : TermRanker
            Class for returning a term-frequency DataFrame
        scorer : termscoring class, optional
            Term scoring class for lexicon mining. Default: `scattertext.termscoring.RankDifference`
        '''
        assert category_a in term_doc_matrix.get_categories()
        assert category_b in term_doc_matrix.get_categories()
        for category in neutral_categories:
            assert category in term_doc_matrix.get_categories()
        if len(neutral_categories) == 0:
            raise EmptyNeutralCategoriesError()
        self.category_a_ = category_a
        self.category_b_ = category_b
        self.neutral_categories_ = neutral_categories
        self._build_square(term_doc_matrix, term_ranker, labels, scorer)

    def _build_square(self, term_doc_matrix, term_ranker, labels, scorer):
        self.term_doc_matrix_ = term_doc_matrix
        self.term_ranker = term_ranker(term_doc_matrix)
        self.scorer = RankDifference() \
            if scorer is None else scorer
        self.axes = self._build_axes(scorer)
        self.lexicons = self._build_lexicons()
        self._labels = labels

    def get_axes(self, scorer=None):
        '''
        Returns
        -------
        pd.DataFrame
        '''
        if scorer:
            return self._build_axes(scorer)
        return self.axes

    def get_lexicons(self, num_terms=10):
        '''
        Parameters
        ----------
        num_terms, int

        Returns
        -------
        dict
        '''
        return {k: v.index[:num_terms]
                for k, v in self.lexicons.items()}

    def get_labels(self):
        a = self._get_default_a_label()
        b = self._get_default_b_label()
        default_labels = {'a': a,
                          'not_a': 'Not ' + a,
                          'b': b,
                          'not_b': 'Not ' + b,
                          'a_and_b': a + ' + ' + b,
                          'not_a_and_not_b': 'Not ' + a + ' + Not ' + b,
                          'a_and_not_b': a + ' + Not ' + b,
                          'b_and_not_a': 'Not ' + a + ' + ' + b}
        labels = self._labels
        if labels is None:
            labels = {}
        return {name + '_label': labels.get(name, default_labels[name])
                for name in default_labels}

    def _get_default_b_label(self):
        return self.category_b_

    def _get_default_a_label(self):
        return self.category_a_

    def _build_axes(self, scorer):
        if scorer is None:
            scorer = self.scorer
        tdf = self._get_term_doc_count_df()
        counts = tdf.sum(axis=1)
        tdf['x'] = self._get_x_axis(scorer, tdf)
        tdf['x'] = tdf['x'].fillna(self.scorer.get_default_score())
        tdf['y'] = self._get_y_axis(scorer, tdf)
        tdf['y'] = tdf['y'].fillna(self.scorer.get_default_score())
        tdf['counts'] = counts
        return tdf[['x', 'y', 'counts']]

    def _get_x_axis(self, scorer, tdf):
        return scorer.get_scores(
            tdf[self.category_a_ + ' freq'],
            tdf[self.category_b_ + ' freq']
        )

    def _get_y_axis(self, scorer, tdf):
        return scorer.get_scores(
            tdf[[t + ' freq' for t in [self.category_a_, self.category_b_]]].sum(axis=1),
            tdf[[t + ' freq' for t in self.neutral_categories_]].sum(axis=1)
        )

    def _get_term_doc_count_df(self):
        return self.term_ranker.get_ranks()[
            [t + ' freq' for t in self._get_all_categories()]]

    def _get_all_categories(self):
        return [self.category_a_, self.category_b_] + self.neutral_categories_

    def _build_lexicons(self):
        self.lexicons = {}
        ax = self.axes
        x_max = ax['x'].max()
        y_max = ax['y'].max()
        x_min = ax['x'].min()
        y_min = ax['y'].min()
        x_baseline = self._get_x_baseline()
        y_baseline = self._get_y_baseline()

        def dist(candidates, x_bound, y_bound):
            return ((x_bound - candidates['x']) ** 2 + (y_bound - candidates['y']) ** 2).sort_values()

        self.lexicons['a'] = dist(ax[(ax['x'] > x_baseline) & (ax['y'] > y_baseline)], x_max, y_max)
        self.lexicons['not_a'] = dist(ax[(ax['x'] < x_baseline) & (ax['y'] < y_baseline)], x_min, y_min)

        self.lexicons['b'] = dist(ax[(ax['x'] < x_baseline) & (ax['y'] > y_baseline)], x_min, y_max)
        self.lexicons['not_b'] = dist(ax[(ax['x'] > x_baseline) & (ax['y'] < y_baseline)], x_max, y_min)

        self.lexicons['a_and_b'] = dist(ax[(ax['y'] > y_baseline)], x_baseline, y_max)
        self.lexicons['not_a_and_not_b'] = dist(ax[(ax['y'] < y_baseline)], x_baseline, y_min)

        self.lexicons['a_and_not_b'] = dist(ax[(ax['x'] > x_baseline)], x_max, y_baseline)

        self.lexicons['b_and_not_a'] = dist(ax[(ax['x'] < x_baseline)], x_min, y_baseline)

        return self.lexicons

    def _get_y_baseline(self):
        return self.scorer.get_default_score()

    def _get_x_baseline(self):
        return self.scorer.get_default_score()