Пример #1
0
    def _term_rank_score_and_frequency_df(self, all_categories, category,
                                          other_categories, scores):
        '''
        Assemble the per-term frame of frequencies, scores and ranks.

        The frame starts as the output of the configured term ranker.  Any
        coordinates stored on the instance are attached, corner scores and
        colour scores are added, the configured filters are applied, the
        surviving terms are ranked and, if requested, capped in number.

        Parameters
        ----------
        all_categories : list
            Forwarded to ``self._filter_by_minimum_term_frequency``.
        category : str
            Category whose ``category + ' freq'`` column is scored.
        other_categories : list
            Categories whose ``' freq'`` columns are summed as the contrast.
        scores : optional
            Values stored in the ``color_scores`` column.  When None they are
            obtained from ``self._get_default_scores``.

        Returns
        -------
        The resulting frame after ``reset_index()``.

        Raises
        ------
        NoWordMeetsTermFrequencyRequirementsError
            If no rows are left after filtering.
        '''
        chart_data = self.scatterchartdata
        ranker = chart_data.term_ranker(self.term_doc_matrix)
        if chart_data.use_non_text_features:
            ranker.use_non_text_features()
        df = ranker.get_ranks()

        if self.x_coords is not None:
            df['x'] = self.x_coords
            df['y'] = self.y_coords

        # Original coordinates may or may not expose ``.values``; fall back to
        # the object itself when the attribute is missing.
        for column, attribute in (('ox', 'original_x'), ('oy', 'original_y')):
            original = getattr(self, attribute)
            if original is None:
                continue
            try:
                df[column] = original.values
            except AttributeError:
                df[column] = original

        if scores is None:
            scores = self._get_default_scores(category, other_categories, df)

        category_column_name = category + ' freq'
        other_columns = [name + ' freq' for name in other_categories]

        df['category score'] = CornerScore.get_scores_for_category(
            df[category_column_name],
            df[other_columns].sum(axis=1))
        significance = chart_data.term_significance
        if significance is not None:
            df['p'] = get_p_vals(df, category_column_name, significance)
        # Same scorer with the two frequency arguments swapped.
        df['not category score'] = CornerScore.get_scores_for_category(
            df[other_columns].sum(axis=1),
            df[category_column_name])
        df['color_scores'] = scores

        # Frequency / PMI filtering only runs when no explicit term list was
        # configured.
        if chart_data.terms_to_include is None:
            df = self._filter_bigrams_by_minimum_not_category_term_freq(
                category_column_name, other_categories, df)
            frequent_terms = self._filter_by_minimum_term_frequency(
                all_categories, df)
            df = filter_bigrams_by_pmis(
                frequent_terms,
                threshold_coef=chart_data.pmi_threshold_coefficient)

        if chart_data.filter_unigrams:
            df = filter_out_unigrams_that_only_occur_in_one_bigram(df)
        if len(df) == 0:
            raise NoWordMeetsTermFrequencyRequirementsError()

        df['category score rank'] = rankdata(
            df['category score'], method='ordinal')
        df['not category score rank'] = rankdata(
            df['not category score'], method='ordinal')

        term_limit = chart_data.max_terms
        if term_limit and term_limit < len(df):
            assert term_limit > 0
            df = self._limit_max_terms(category, df)
        return df.reset_index()
Пример #2
0
    def _term_rank_score_and_frequency_df(self, all_categories, category,
                                          other_categories, scores):
        '''
        Assemble the per-term frame of frequencies, scores and ranks.

        The frame comes from ``self._get_term_category_frequencies()`` and is
        handed to ``self._add_x_and_y_coords_to_term_df_if_injected``.  Corner
        scores and colour scores are then added, the configured filters are
        applied, the surviving terms are ranked and, if requested, capped in
        number.

        Parameters
        ----------
        all_categories : list
            Forwarded to ``self._filter_by_minimum_term_frequency``.
        category : str
            Category whose ``category + ' freq'`` column is scored.
        other_categories : list
            Categories whose ``' freq'`` columns are summed as the contrast.
        scores : optional
            Values stored in the ``color_scores`` column.  When None they are
            obtained from ``self._get_default_scores``.

        Returns
        -------
        The resulting frame after ``reset_index()``.

        Raises
        ------
        NoWordMeetsTermFrequencyRequirementsError
            If no rows are left after filtering.
        '''
        chart_data = self.scatterchartdata
        df = self._get_term_category_frequencies()
        self._add_x_and_y_coords_to_term_df_if_injected(df)

        if scores is None:
            scores = self._get_default_scores(category, other_categories, df)

        category_column_name = category + ' freq'
        other_columns = [name + ' freq' for name in other_categories]

        df['category score'] = CornerScore.get_scores_for_category(
            df[category_column_name],
            df[other_columns].sum(axis=1))
        significance = chart_data.term_significance
        if significance is not None:
            df['p'] = get_p_vals(df, category_column_name, significance)
        # Same scorer with the two frequency arguments swapped.
        df['not category score'] = CornerScore.get_scores_for_category(
            df[other_columns].sum(axis=1),
            df[category_column_name])
        df['color_scores'] = scores

        # Frequency / PMI filtering only runs when no explicit term list was
        # configured.
        if chart_data.terms_to_include is None:
            df = self._filter_bigrams_by_minimum_not_category_term_freq(
                category_column_name, other_categories, df)
            frequent_terms = self._filter_by_minimum_term_frequency(
                all_categories, df)
            df = filter_bigrams_by_pmis(
                frequent_terms,
                threshold_coef=chart_data.pmi_threshold_coefficient)

        if chart_data.filter_unigrams:
            df = filter_out_unigrams_that_only_occur_in_one_bigram(df)
        if len(df) == 0:
            raise NoWordMeetsTermFrequencyRequirementsError()

        df['category score rank'] = rankdata(
            df['category score'], method='ordinal')
        df['not category score rank'] = rankdata(
            df['not category score'], method='ordinal')

        term_limit = chart_data.max_terms
        if term_limit and term_limit < len(df):
            assert term_limit > 0
            df = self._limit_max_terms(category, df)
        return df.reset_index()
Пример #3
0
 def _term_rank_score_and_frequency_df(self, all_categories, category,
                                       scores):
     '''
     Assemble the per-term frame of frequencies, scores and ranks.

     The frame starts as the output of ``self.term_ranker``.  A category
     score is computed with ``RudderScore.get_score`` from the category's
     frequency column and the row-wise sum of every other column; the
     not-category score is ``sqrt(2)`` minus that value.  Rows are then
     filtered, ranked and, if requested, capped in number.

     Parameters
     ----------
     all_categories : list
         Column selection whose row-wise sum must exceed
         ``self.minimum_term_frequency`` for a term to be kept.
     category : str
         Category whose ``category + ' freq'`` column is scored.
     scores : optional
         Values stored in the ``color_scores`` column.  When None they are
         obtained from ``self._get_default_scores``.

     Returns
     -------
     The resulting frame after ``reset_index()``.

     Raises
     ------
     NoWordMeetsTermFrequencyRequirementsError
         If no rows are left after filtering.
     '''
     ranker = self.term_ranker(self.term_doc_matrix)
     if self.use_non_text_features:
         ranker.use_non_text_features()
     df = ranker.get_ranks()

     if scores is None:
         scores = self._get_default_scores(category, df)

     category_column_name = category + ' freq'
     remaining_columns = [
         column for column in df.columns if column != category_column_name
     ]
     df['category score'] = RudderScore.get_score(
         df[category_column_name],
         df[remaining_columns].sum(axis=1))
     if self.term_significance is not None:
         df['p'] = get_p_vals(self.term_doc_matrix, category_column_name,
                              self.term_significance)
     df['not category score'] = np.sqrt(2) - df['category score']
     df['color_scores'] = scores

     # Keep only rows whose summed frequency is strictly above the minimum,
     # then apply the PMI-based bigram filter.
     frequent_enough = (
         df[all_categories].sum(axis=1) > self.minimum_term_frequency)
     df = filter_bigrams_by_pmis(
         df[frequent_enough],
         threshold_coef=self.pmi_threshold_coefficient)

     if self.filter_unigrams:
         # A setting equal to 2 removes every unigram; any other truthy
         # setting only removes unigrams occurring in a single bigram.
         unigram_filter = (
             filter_out_all_unigrams
             if self.filter_unigrams == 2
             else filter_out_unigrams_that_only_occur_in_one_bigram)
         df = unigram_filter(df)
     if len(df) == 0:
         raise NoWordMeetsTermFrequencyRequirementsError()

     df['category score rank'] = rankdata(
         df['category score'], method='ordinal')
     df['not category score rank'] = rankdata(
         df['not category score'], method='ordinal')

     term_limit = self.max_terms
     if term_limit and term_limit < len(df):
         assert term_limit > 0
         df = self._limit_max_terms(category, df)
     return df.reset_index()
Пример #4
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                transform=percentile_alphabetical,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                background_scorer=None):
        '''
        Build the dictionary that describes the scatter chart.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site. Default None is same as category.
        not_category_name : str, optional
            Name of ~category which will appear on web site. Default None is same as
            "Not " + category_name.
        scores : np.array, optional
            Scores to use for coloring.  Defaults to None, in which case they are taken
            from self._get_default_scores.
        transform : function, optional
            Function for ranking terms.  Defaults to percentile_alphabetical.  Only used
            when no coordinates are already stored on the instance (self.x_coords is None).
        title_case_names : bool, default False
            Title case category name and no-category name?
        not_categories : list, optional
            List of categories to use as "not category".  Defaults to all others; in that
            case neutral_categories and extra_categories are reset to [].
        neutral_categories : list, optional
            List of categories to use as neutral.  When not_categories is given and this is
            None, it becomes every remaining category and extra_categories is set to [].
        extra_categories : list, optional
            List of categories to use as extra.  When None (and the two lists above are
            given), it becomes every remaining category.
        background_scorer : CharacteristicScorer, optional
            Used for bg scores

        Returns
        -------
        Dictionary that encodes the scatter chart
        information. The dictionary can be dumped as a json document, and
        used in scattertext.html
         {info: {category_name: ..., not_category_name: ..., ...},
          data: [{term:,
                  x:frequency [0-1],
                  y:frequency [0-1],
                  ox: score,
                  oy: score,
                  s: score,
                  os: original score,
                  p: p-val,
                  bg: background score,
                  cat25k: freq per 25k in category,
                  cat: count in category,
                  ncat: count in non-category,
                  catdocs: [docnum, ...],
                  ncatdocs: [docnum, ...]
                  ncat25k: freq per 25k in non-category}, ...]}

        Raises
        ------
        Exception
            If self.used is set.
        AssertionError
            If category is not one of the term-document matrix's categories.
        NoWordMeetsTermFrequencyRequirementsError
            If no terms are left after filtering.
        '''
        if self.used:
            raise Exception("Cannot reuse a ScatterChart constructor")

        all_categories = self.term_doc_matrix.get_categories()
        assert category in all_categories

        # Fill in whichever category groups were not supplied; the first
        # missing group absorbs every category not already assigned.
        if not_categories is None:
            not_categories = [c for c in all_categories if c != category]
            neutral_categories = []
            extra_categories = []
        elif neutral_categories is None:
            neutral_categories = [
                c for c in all_categories
                if c not in [category] + not_categories
            ]
            extra_categories = []
        elif extra_categories is None:
            extra_categories = [
                c for c in all_categories
                if c not in [category] + not_categories + neutral_categories
            ]
        all_categories = (
            [category] + not_categories + neutral_categories + extra_categories)

        df = self._get_term_category_frequencies()

        self._add_x_and_y_coords_to_term_df_if_injected(df)

        if scores is None:
            scores = self._get_default_scores(category, not_categories, df)
        category_column_name = category + ' freq'
        not_category_columns = [c + ' freq' for c in not_categories]
        df['category score'] = CornerScore.get_scores_for_category(
            df[category_column_name],
            df[not_category_columns].sum(axis=1))
        if self.scatterchartdata.term_significance is not None:
            df['p'] = get_p_vals(df, category_column_name,
                                 self.scatterchartdata.term_significance)
        # Same scorer with the two frequency arguments swapped.
        df['not category score'] = CornerScore.get_scores_for_category(
            df[not_category_columns].sum(axis=1),
            df[category_column_name])
        df['color_scores'] = scores
        # Frequency / PMI filtering only runs when no explicit term list was
        # configured.
        if self.scatterchartdata.terms_to_include is None:
            df = self._filter_bigrams_by_minimum_not_category_term_freq(
                category_column_name, not_categories, df)
            df = filter_bigrams_by_pmis(
                self._filter_by_minimum_term_frequency(all_categories, df),
                threshold_coef=self.scatterchartdata.pmi_threshold_coefficient)

        if self.scatterchartdata.filter_unigrams:
            df = filter_out_unigrams_that_only_occur_in_one_bigram(df)
        if len(df) == 0:
            raise NoWordMeetsTermFrequencyRequirementsError()
        df['category score rank'] = rankdata(df['category score'],
                                             method='ordinal')
        df['not category score rank'] = rankdata(df['not category score'],
                                                 method='ordinal')
        max_terms = self.scatterchartdata.max_terms
        if max_terms and max_terms < len(df):
            assert max_terms > 0
            df = self._limit_max_terms(category, df)
        df = df.reset_index()

        # No coordinates stored yet: derive them from the frequencies and
        # remember them on the instance.
        if self.x_coords is None:
            self.x_coords, self.y_coords = (
                self._get_coordinates_from_transform_and_jitter_frequencies(
                    category, df, not_categories, transform))
            df['x'], df['y'] = self.x_coords, self.y_coords
            df['ox'], df['oy'] = self.x_coords, self.y_coords

        # Recomputed here because rows may have been filtered since the
        # scores above were calculated.
        df['not cat freq'] = df[not_category_columns].sum(axis=1)
        if neutral_categories:
            df['neut cat freq'] = df[
                [x + ' freq' for x in neutral_categories]
            ].sum(axis=1).fillna(0)
        if extra_categories:
            df['extra cat freq'] = df[
                [x + ' freq' for x in extra_categories]
            ].sum(axis=1).fillna(0)

        # Explicit copy: columns are added to json_df below, and assigning
        # onto a bare column selection triggers SettingWithCopyWarning.
        json_df = df[['x', 'y', 'ox', 'oy', 'term']].copy()

        # Mirrors the condition under which df['p'] was created above.
        if self.scatterchartdata.term_significance is not None:
            json_df['p'] = df['p']
        self._add_term_freq_to_json_df(json_df, df, category)
        json_df['s'] = percentile_min(df['color_scores'])
        json_df['os'] = df['color_scores']
        if background_scorer:
            bg_scores = background_scorer.get_scores(self.term_doc_matrix)
            json_df['bg'] = bg_scores[1].loc[json_df.term].values
        elif not self.scatterchartdata.use_non_text_features:
            json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

        self._preform_axis_rescale(json_df, self._rescale_x, 'x')
        self._preform_axis_rescale(json_df, self._rescale_y, 'y')

        if self.scatterchartdata.terms_to_include is not None:
            json_df = self._use_only_selected_terms(json_df)

        # The ten highest-scoring terms headline the category and the ten
        # lowest-scoring ones the contrast (assumes higher colour scores mean
        # stronger association with `category` -- confirm against the
        # scorer).  Both lists used to be sorted ascending and so were
        # identical.
        category_terms = list(
            json_df.sort_values('s', ascending=False)['term'][:10])
        not_category_terms = list(
            json_df.sort_values('s', ascending=True)['term'][:10])
        if category_name is None:
            category_name = category
        if not_category_name is None:
            not_category_name = 'Not ' + category_name

        def better_title(x):
            # Capitalise the first letter of each whitespace-separated word
            # and lower-case the rest, but only when requested.
            if title_case_names:
                return ' '.join(
                    [t[0].upper() + t[1:].lower() for t in x.split()])
            return x

        j = {
            'info': {
                'category_name': better_title(category_name),
                'not_category_name': better_title(not_category_name),
                'category_terms': category_terms,
                'not_category_terms': not_category_terms,
                'category_internal_name': category,
                'not_category_internal_names': not_categories,
                'categories': self.term_doc_matrix.get_categories(),
                'neutral_category_internal_names': neutral_categories,
                'extra_category_internal_names': extra_categories
            }
        }
        if self.metadata_term_lists is not None:
            j['metalists'] = self.metadata_term_lists
        if self.metadata_descriptions is not None:
            j['metadescriptions'] = self.metadata_descriptions
        if self.term_colors is not None:
            j['info']['term_colors'] = self.term_colors
        j['data'] = json_df.to_dict(orient='records')

        return j