Exemplo n.º 1
0
class KernelDensity(Debuggable):
    """Command-line front end that plots term distributions for text files.

    The constructor parses the command line with docopt (using the module
    docstring as the usage text) and copies the relevant options onto the
    instance. ``run`` then walks ``self.in_dir`` and calls ``plot`` for
    every ``.txt`` file found there.

    Attributes set by ``__init__``:
        args: the dictionary returned by docopt.
        debug: the ``Debug`` instance used for all logging.
        in_dir: value of ``<directory>``.
        terms: list of terms to plot (possibly empty).
        max: number of correlated terms listed by the ``search`` action.
        caption: plot caption, ``'Term Plot'`` unless ``--caption`` is given.
        nostem: value of ``--nostem`` or ``None``; ``run`` opens it as a
            file containing one word per line.
        action: one of ``'single'``, ``'group'``, ``'hist'``, ``'rawcount'``,
            ``'overlap'``, ``'search'``, or ``None`` if no command matched.
        words: value of ``--words`` as an int, 5000 by default.
    """

    # Used when <count> is absent; matches the "Top twenty" wording of the
    # message printed by the search action.
    DEFAULT_MAX_RESULTS = 20

    # Used when --words is absent.
    DEFAULT_WORDS = 5000

    def __init__(self):
        """Parse the command line and initialise the instance state."""
        # read command line arguments
        self.args = self.read_command_line()

        # absolute first priority is to initialize debugger so that anything
        # triggered here can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']

        # Always define self.terms so later code never hits AttributeError
        # when none of the term arguments was supplied.
        self.terms = []

        if self.args['<term_file>']:
            self.term_file = self.args['<term_file>']
            self.terms = self._read_terms(self.term_file)
        elif self.args["<first_term>"] and self.args["<second_term>"]:
            self.terms = [self.args["<first_term>"], self.args["<second_term>"]]
        elif self.args["<term>"]:
            self.terms = [self.args["<term>"]]

        # self.max is read by the search action; give it a default so that
        # a missing <count> does not surface later as AttributeError.
        if self.args["<count>"]:
            self.max = int(self.args["<count>"])
        else:
            self.max = self.DEFAULT_MAX_RESULTS

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

        self.caption = self.args['--caption'] if self.args['--caption'] else 'Term Plot'
        self.nostem = self.args['--nostem'] if self.args['--nostem'] else None

        # None means "no command matched"; plot() rejects it explicitly.
        self.action = None

        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            self.second_terms = self._read_terms(self.second_term_file)
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'
        elif self.args['overlap']:
            self.action = 'overlap'
        elif self.args['search']:
            self.action = 'search'

        if self.args['--words']:
            self.words = int(self.args['--words'])
        else:
            self.words = self.DEFAULT_WORDS

    @staticmethod
    def read_command_line():
        """Return the docopt argument dictionary built from the module docstring."""
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    @staticmethod
    def _read_terms(path):
        """Return the stripped, lower-cased lines of the file at ``path``.

        The file is closed before returning.
        """
        with open(path) as handle:
            return [line.strip().lower() for line in handle]

    def _report_stemming(self, terms, nostem_words):
        """Log, for each term, whether and how it will be stemmed.

        A term is reported as stemmed only when it is not listed in
        ``nostem_words`` and ``Text.show_stem`` returns something different
        from the term itself.
        """
        for term in terms:
            stemmed = Text.show_stem(term)

            if term not in nostem_words and term != stemmed:
                self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, stemmed))
            else:
                self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

    def run(self):
        """Plot every ``.txt`` file found directly inside ``self.in_dir``.

        When ``--debug`` is set, the stemming outcome of every term is
        logged first.
        """
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = set()

            self._report_stemming(self.terms, nostem_words)

            if self.action == 'group':
                # Same rule as for the first group, so a term whose stem is
                # identical to itself is not reported as "stemmed".
                self._report_stemming(self.second_terms, nostem_words)

        for file_name in listdir(self.in_dir):
            if file_name.endswith(".txt"):
                self.plot(file_name)

    @staticmethod
    def _image_name(file_name):
        """Return ``file_name`` with its final extension replaced by ``.png``."""
        # splitext only touches the last extension, unlike str.replace which
        # would also rewrite any '.txt' occurring earlier in the name.
        return os.path.splitext(file_name)[0] + '.png'

    def _print_correlated_terms(self, textplot):
        """Print up to ``self.max`` terms returned by ``anchored_scores``.

        Only items with more than one entry in ``textplot.terms`` are
        listed, and the stem of the anchor term itself is skipped.
        """
        anchor = self.terms[0]
        newterms = textplot.anchored_scores(anchor)

        # NOTE(review): the wording says "twenty" regardless of self.max;
        # kept unchanged because it is user-facing output.
        self.debug.print_(self, u'Top twenty correlated terms (with more than one occurrence) for {0}: '.format(anchor))

        # Loop-invariant, so computed once.
        anchor_stem = textplot.stem(anchor)
        count = 0

        for item in newterms:
            if len(textplot.terms[item]) > 1 and item != anchor_stem:
                # '>=' so that exactly self.max items are printed; the
                # previous '>' comparison let one extra item through.
                if count >= self.max:
                    break

                self.debug.print_(self, item)
                count += 1

    def plot(self, file_name):
        """Load ``file_name`` from ``self.in_dir`` and run the selected action.

        For every action except ``'search'`` the resulting graph is saved
        next to the input file with a ``.png`` extension and then closed.
        ``'search'`` only prints correlated terms and saves nothing.

        Raises:
            ValueError: if ``self.action`` is not one of the known actions.
        """
        self.debug.print_debug(self, u'Loading ' + file_name)

        textplot = Text.from_file(join(self.in_dir, file_name), self.debug, nostem=self.nostem)

        self.debug.print_debug(self, u'Plotting ' + file_name)

        if self.action == 'search':
            self._print_correlated_terms(textplot)
            return

        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)
        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(self.terms, self.term_name, self.second_terms,
                                                   self.second_term_name, self.caption)
        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption, self.words)
        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption, self.words)
        elif self.action == 'overlap':
            graph = textplot.plot_kde_overlap(self.terms)
        else:
            # Previously this fell through to an unbound 'graph' variable.
            raise ValueError(u'Unknown action: {0}'.format(self.action))

        image_name = self._image_name(file_name)

        self.debug.print_debug(self, u'Saving ' + image_name)

        graph.savefig(join(self.in_dir, image_name))
        graph.close()