예제 #1
0
    def graph(self, data, output_folder, parent_folder):
        """Save a grid of horizontal bar charts, one subplot per term.

        The input is grouped with ``util.group_words_by_term(data,
        get_tfidf=True)``; the result must expose the columns ``word``,
        ``type``, ``term`` and ``tf_idf``. Only rows whose ``type`` is
        ``'word'`` and whose ``word`` is longer than one character are kept.
        For each term (in sorted order) the first ten remaining rows are
        plotted.

        Args:
            data: Raw data handed to ``util.group_words_by_term``.
            output_folder: Directory in which the PNG is written. The file
                name is the slugified figure title.
            parent_folder: Unused by this method.

        Raises:
            ValueError: If ``self.type`` is ``None``.
        """
        if self.type is None:
            raise ValueError("Grapher type must be set to a string")

        data = util.group_words_by_term(data, get_tfidf=True)
        data = data[data['word'].str.len() > 1]
        data = data[data['type'] == 'word']
        terms = sorted(data['term'].unique().tolist())
        rows, cols = util.get_rows_cols(len(terms))

        # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
        fig, axes = plt.subplots(
            figsize=(cols * 2, rows * 3),
            ncols=cols,
            nrows=rows,
            squeeze=False,
        )
        try:
            # NOTE(review): right=2 pushes the subplots past the nominal
            # figure width; this presumably relies on bbox_inches='tight'
            # at save time to keep them in the output -- confirm.
            fig.subplots_adjust(
                left=0.2,
                bottom=0.1,
                right=2,
                top=0.9,
                wspace=0.5,
                hspace=1.1,
            )

            for i, term in enumerate(terms):
                row, col = divmod(i, cols)
                axis = axes[row][col]
                axis.set_title(term, y=1)
                # NOTE(review): head(10) takes the first ten rows as they
                # come; presumably the grouped data is already ordered by
                # tf_idf -- verify in util.group_words_by_term.
                to_plot = data[data['term'] == term].head(10)
                sns.barplot(
                    y='word',
                    x='tf_idf',
                    data=to_plot,
                    palette=config.PALETTE,
                    orient="h",
                    ax=axis,
                )
                axis.set(ylabel="", xlabel="Distinctiveness Score")

            title = "Each Term's Most Distinguishing {}".format(self.type)
            fig.suptitle(title, y=1.09, fontsize=20)
            fig.savefig(
                "{}/{}.png".format(output_folder, slugify(title)),
                bbox_inches='tight',
                pad_inches=config.PAD_INCHES,
            )
        finally:
            # Figures created through pyplot stay registered until they are
            # explicitly closed; clearing alone does not release them.
            plt.close(fig)
예제 #2
0
    def graph(self, data, output_folder, parent_folder):
        """Save a grid of horizontal bar charts, one subplot per sender.

        The input is grouped with ``util.group_words_by_sender(data,
        get_tfidf=True)``; the result must expose the columns ``word``,
        ``type``, ``tf_idf`` and the column named by
        ``config.SENDER_COLUMN_NAME``. Only rows whose ``type`` is ``'word'``
        and whose ``word`` is longer than one character are kept. For each
        sender (in order of first appearance) the first ten remaining rows
        are plotted.

        Args:
            data: Raw data handed to ``util.group_words_by_sender``.
            output_folder: Directory in which the PNG is written. The file
                name is the slugified figure title.
            parent_folder: Unused by this method.
        """
        sender_col = config.SENDER_COLUMN_NAME

        data = util.group_words_by_sender(data, get_tfidf=True)
        data = data[data['word'].str.len() > 1]
        data = data[data['type'] == 'word']
        senders = data[sender_col].unique().tolist()
        rows, cols = util.get_rows_cols(len(senders))

        # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
        fig, axes = plt.subplots(
            figsize=(cols * 2, rows * 3),
            ncols=cols,
            nrows=rows,
            squeeze=False,
        )
        try:
            # NOTE(review): right=2 pushes the subplots past the nominal
            # figure width; this presumably relies on bbox_inches='tight'
            # at save time to keep them in the output -- confirm.
            fig.subplots_adjust(
                left=0.2,
                bottom=0.1,
                right=2,
                top=0.9,
                wspace=0.5,
                hspace=1.1,
            )

            for i, sender in enumerate(senders):
                row, col = divmod(i, cols)
                axis = axes[row][col]
                axis.set_title(sender, y=1)
                # NOTE(review): head(10) takes the first ten rows as they
                # come; presumably the grouped data is already ordered by
                # tf_idf -- verify in util.group_words_by_sender.
                to_plot = data[data[sender_col] == sender].head(10)
                sns.barplot(
                    y='word',
                    x='tf_idf',
                    data=to_plot,
                    palette=config.PALETTE,
                    orient="h",
                    ax=axis,
                )
                axis.set(ylabel="", xlabel="Distinctiveness Score")

            title = "Our Most Distinguishing Words"
            fig.suptitle(title, y=1.09, fontsize=20)
            fig.savefig(
                "{}/{}.png".format(output_folder, slugify(title)),
                bbox_inches='tight',
                pad_inches=config.PAD_INCHES,
            )
        finally:
            # Figures created through pyplot stay registered until they are
            # explicitly closed; clearing alone does not release them.
            plt.close(fig)