Exemplo n.º 1
0
class CassiusImport(Debuggable):
    """Command-line driver that runs the Saxon transform (JATS -> CaSSius).

    The input file, output file and debug flag are taken from the command
    line (parsed by docopt against the module docstring). The jars and the
    stylesheet are located relative to the directory containing this script.
    """

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # initialize the debugger as early as possible so that anything
        # triggered below can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'cassius-import')

        self.in_file = self.args['<in-file>']
        self.out_file = self.args['<out-file>']

        # directory of this script; used as the anchor for all bundled resources
        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        """Parse the command line with docopt using the module docstring."""
        return docopt(__doc__, version='cassius-import v0.1')

    def run(self):
        """Invoke ``java net.sf.saxon.Transform`` on the input file.

        The command is passed to ``subprocess`` as an argument list (no
        shell), so input/output paths containing quotes, spaces or shell
        metacharacters are handed to Java verbatim instead of being
        re-interpreted by a shell.

        Returns:
            The exit status of the ``java`` process (0 on success).
        """
        runtime_dir = os.path.join(self.dir, os.pardir, 'runtime')

        # same three classpath entries as before: saxon, the XML resolver jar,
        # and the runtime directory itself (trailing separator preserved)
        classpath = os.pathsep.join([
            os.path.join(self.dir, 'saxon9.jar'),
            os.path.join(runtime_dir, 'xml-resolver-1.1.jar'),
            os.path.join(runtime_dir, ''),
        ])

        stylesheet = os.path.join(self.dir, os.pardir, 'transform', 'xsl',
                                  'cassius-main.xsl')

        command = [
            'java',
            '-cp', classpath,
            'net.sf.saxon.Transform',
            '-r:org.apache.xml.resolver.tools.CatalogResolver',
            '-y:org.apache.xml.resolver.tools.ResolvingXMLReader',
            '-x:org.apache.xml.resolver.tools.ResolvingXMLReader',
            '-u',
            '-o', self.out_file,
            self.in_file,
            stylesheet,
        ]

        self.debug.print_debug(self,
                               u'Running saxon transform (JATS -> CaSSius)')

        status = subprocess.call(command, stdin=None)

        if status != 0:
            # surface failures that were previously ignored silently
            self.debug.print_debug(
                self, u'Saxon transform exited with status {0}'.format(status))

        return status
Exemplo n.º 2
0
class CassiusImport(Debuggable):
    """Command-line driver that runs the Saxon transform (JATS -> CaSSius).

    The input file, output file and debug flag are taken from the command
    line (parsed by docopt against the module docstring). The jars and the
    stylesheet are located relative to the directory containing this script.
    """

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # initialize the debugger as early as possible so that anything
        # triggered below can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'cassius-import')

        self.in_file = self.args['<in-file>']
        self.out_file = self.args['<out-file>']

        # directory of this script; used as the anchor for all bundled resources
        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        """Parse the command line with docopt using the module docstring."""
        return docopt(__doc__, version='cassius-import v0.1')

    def _build_command(self):
        """Return the ``java`` argument list for the Saxon transform."""
        runtime_dir = os.path.join(self.dir, os.pardir, 'runtime')

        # classpath entries: saxon, the XML resolver jar, and the runtime
        # directory itself (joined with '' to keep the trailing separator)
        classpath_entries = [
            os.path.join(self.dir, 'saxon9.jar'),
            os.path.join(runtime_dir, 'xml-resolver-1.1.jar'),
            os.path.join(runtime_dir, ''),
        ]

        stylesheet = os.path.join(self.dir, os.pardir, 'transform', 'xsl',
                                  'cassius-main.xsl')

        return [
            'java',
            '-cp', os.pathsep.join(classpath_entries),
            'net.sf.saxon.Transform',
            '-r:org.apache.xml.resolver.tools.CatalogResolver',
            '-y:org.apache.xml.resolver.tools.ResolvingXMLReader',
            '-x:org.apache.xml.resolver.tools.ResolvingXMLReader',
            '-u',
            '-o', self.out_file,
            self.in_file,
            stylesheet,
        ]

    def run(self):
        """Invoke ``java net.sf.saxon.Transform`` on the input file.

        The command is executed from an argument list without a shell, so
        file names containing quotes, spaces or shell metacharacters cannot
        break or alter the command line.

        Returns:
            The exit status of the ``java`` process (0 on success).
        """
        command = self._build_command()

        self.debug.print_debug(self, u'Running saxon transform (JATS -> CaSSius)')

        status = subprocess.call(command, stdin=None)

        if status != 0:
            # the exit status used to be discarded; report failures
            self.debug.print_debug(
                self, u'Saxon transform exited with status {0}'.format(status))

        return status
class ChronicWordFreq(Debuggable):
    """Report, per year, the percentage of corpus files that contain each word.

    The year of a file is the first run of four digits in its file name.
    Matching is a case-insensitive substring test on the whole file content.
    The result is written as CSV: one header row of years and one row per
    word that was found at least once.
    """

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # initialize the debugger as early as possible so that anything
        # triggered below can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'CWF')

        self.corpus = self.args['<corpus_directory>']
        self.words = self.args['<word_list>'].split(",")
        self.output = self.args['<output_csv>']
        self.terms = {}        # word -> {year -> number of files containing the word}
        self.years = []        # distinct years seen, in order of first appearance
        self.year_count = {}   # year -> number of files processed for that year

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

    @staticmethod
    def read_command_line():
        """Parse the command line with docopt using the module docstring."""
        return docopt(__doc__, version='chronicWordFreq 0.1')

    def read_file(self, file):
        """Update the per-year counters from one corpus file.

        Args:
            file: File name relative to the corpus directory. Files whose
                name contains no four-digit run are skipped.
        """
        # raw string: '\d' in a plain literal is an invalid escape sequence
        match = re.search(r'\d{4}', file)

        if match is None:
            self.debug.print_debug(self, u'No date detected in filename: {0}. Ignoring.'.format(file))
            return

        year = match.group(0)

        self.debug.print_debug(self, u'Processing {0} for year {1}.'.format(file, year))

        if year not in self.years:
            self.years.append(year)

        self.year_count[year] = self.year_count.get(year, 0) + 1

        with open(join(self.corpus, file)) as f:
            content = f.read().upper()

        for word in self.words:
            if word.upper() in content:
                # each file counts at most once per word, however often it occurs
                per_year = self.terms.setdefault(word, {})
                per_year[year] = per_year.get(year, 0) + 1
                self.debug.print_debug(self, u'Found {0} in {1}.'.format(word, file))

    def read_dir(self):
        """Return the names of the regular files directly inside the corpus directory."""
        return [f for f in listdir(self.corpus) if isfile(join(self.corpus, f))]

    def write_output(self):
        """Write the CSV report to the output path.

        Each cell is the percentage of that year's files containing the word.
        Words that were never found in any file get no row at all.
        """
        self.years.sort()

        output_list = [u'{0},{1}\n'.format('Word', ",".join(self.years))]

        for word in self.words:
            if word not in self.terms:
                continue

            counts = self.terms[word]
            line = word

            for year in self.years:
                if year in counts:
                    percent = (float(counts[year]) / float(self.year_count[year])) * 100
                    line += u',{0}'.format(percent)
                else:
                    line += u',0'

            output_list.append(line + '\n')

        with open(self.output, 'w') as f:
            f.writelines(output_list)

    def run(self):
        """Process every file in the corpus directory, then write the report."""
        for file in self.read_dir():
            self.read_file(file)

        self.write_output()
Exemplo n.º 4
0
class KernelDensity(Debuggable):
    """Plot term distributions for every ``.txt`` file in a directory.

    The plot type is selected by the ``single`` / ``group`` / ``hist`` /
    ``rawcount`` command; each plot is saved as a ``.png`` next to its
    source text file.
    """

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # initialize the debugger as early as possible so that anything
        # triggered below can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']
        self.term_file = self.args['<term_file>']

        self.terms = self._read_terms(self.term_file)

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

        self.caption = self.args['--caption'] or 'Term Plot'

        # path of a file listing words that must not be stemmed, or None
        self.nostem = self.args['--nostem'] or None

        # stays None when no known command was given, so plot() can report it
        self.action = None

        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            self.second_terms = self._read_terms(self.second_term_file)
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'

    @staticmethod
    def read_command_line():
        """Parse the command line with docopt using the module docstring."""
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    @staticmethod
    def _read_terms(path):
        """Return the stripped, lower-cased lines of *path*, closing the file."""
        with open(path) as handle:
            return [line.strip().lower() for line in handle]

    def _log_stemming(self, terms, nostem_words):
        """Log, for each term, whether it will be stemmed and to what."""
        for term in terms:
            if term not in nostem_words:
                self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, Text.show_stem(term)))
            else:
                self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

    def run(self):
        """Plot every ``.txt`` file found in the input directory."""
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = set()

            self._log_stemming(self.terms, nostem_words)

            if self.action == 'group':
                self._log_stemming(self.second_terms, nostem_words)

        for file_name in listdir(self.in_dir):
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        """Build the configured plot for one text file and save it as PNG.

        Args:
            file_name: Name of the text file inside the input directory.

        Raises:
            ValueError: If no known plot command was selected.
        """
        self.debug.print_debug(self, u'Loading ' + file_name)
        textplot = Text.from_file(join(self.in_dir, file_name), self.debug, nostem=self.nostem)

        self.debug.print_debug(self, u'Plotting ' + file_name)
        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)

        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(self.terms, self.term_name, self.second_terms,
                                                   self.second_term_name, self.caption)

        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption, 5000)

        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption, 5000)

        else:
            # previously fell through and failed on the unbound name `graph`
            raise ValueError('Unknown plot action: {0!r}'.format(self.action))

        # swap only the extension; str.replace would also rewrite a '.txt'
        # occurring in the middle of the file name
        image_name = os.path.splitext(file_name)[0] + '.png'

        self.debug.print_debug(self, u'Saving ' + image_name)

        try:
            graph.savefig(join(self.in_dir, image_name))
        finally:
            # release the figure even when saving fails
            graph.close()
Exemplo n.º 5
0
class KernelDensity(Debuggable):
    """Plot or search term distributions for every ``.txt`` file in a directory.

    The command (``single``, ``group``, ``hist``, ``rawcount``, ``overlap``
    or ``search``) selects what is done per file. All plot commands save a
    ``.png`` next to the text file; ``search`` prints correlated terms
    instead of plotting.
    """

    # number of results printed by `search` when <count> is not given;
    # inferred from the original "Top twenty" message
    DEFAULT_MAX_RESULTS = 20

    def __init__(self):
        # read command line arguments
        self.args = self.read_command_line()

        # initialize the debugger as early as possible so that anything
        # triggered below can be logged
        self.debug = Debug()

        Debuggable.__init__(self, 'plotsummary')

        self.in_dir = self.args['<directory>']

        # terms come from a file, from two positional terms, or from one term
        if self.args['<term_file>']:
            self.term_file = self.args['<term_file>']
            self.terms = self._read_terms(self.term_file)

        elif self.args["<first_term>"] and self.args["<second_term>"]:
            self.terms = [self.args["<first_term>"], self.args["<second_term>"]]

        elif self.args["<term>"]:
            self.terms = [self.args["<term>"]]

        # always define self.max: `search` reads it unconditionally
        if self.args["<count>"]:
            self.max = int(self.args["<count>"])
        else:
            self.max = self.DEFAULT_MAX_RESULTS

        self.dir = os.path.dirname(os.path.abspath(__file__))

        if self.args['--debug']:
            self.debug.enable_debug()

        self.debug.enable_prompt(Interactive(self.args['--debug']))

        self.caption = self.args['--caption'] or 'Term Plot'

        # path of a file listing words that must not be stemmed, or None
        self.nostem = self.args['--nostem'] or None

        # stays None when no known command was given, so plot() can report it
        self.action = None

        if self.args['single']:
            self.action = 'single'
        elif self.args['group']:
            self.second_term_file = self.args['<second_term_file>']
            self.term_name = self.args['<term_name>']
            self.second_term_name = self.args['<second_term_name>']
            self.second_terms = self._read_terms(self.second_term_file)
            self.action = 'group'
        elif self.args['hist']:
            self.action = 'hist'
        elif self.args['rawcount']:
            self.action = 'rawcount'
        elif self.args['overlap']:
            self.action = 'overlap'
        elif self.args['search']:
            self.action = 'search'

        # value forwarded as the third argument to the histogram / raw-count plots
        if self.args['--words']:
            self.words = int(self.args['--words'])
        else:
            self.words = 5000

    @staticmethod
    def read_command_line():
        """Parse the command line with docopt using the module docstring."""
        return docopt(__doc__, version='kernel-density-estimation v0.1')

    @staticmethod
    def _read_terms(path):
        """Return the stripped, lower-cased lines of *path*, closing the file."""
        with open(path) as handle:
            return [line.strip().lower() for line in handle]

    def _log_stemming(self, terms, nostem_words):
        """Log, for each term, whether stemming will change it and to what.

        A term is reported as "not stemmed" when it is in *nostem_words* or
        when its stem equals the term itself.
        """
        for term in terms:
            stem = None if term in nostem_words else Text.show_stem(term)

            if stem is not None and stem != term:
                self.debug.print_debug(self, u'{0} will be stemmed to {1}'.format(term, stem))
            else:
                self.debug.print_debug(self, u'{0} will not be stemmed'.format(term))

    def _print_correlated_terms(self, textplot):
        """Print up to ``self.max`` terms correlated with the first term.

        Terms with a single occurrence, and the stem of the anchor term
        itself, are skipped.
        """
        anchor = self.terms[0]
        newterms = textplot.anchored_scores(anchor)

        self.debug.print_(self, u'Top {0} correlated terms (with more than one occurrence) for {1}: '.format(
            self.max, anchor))

        shown = 0

        for item in newterms:
            # `>=` so that exactly self.max items are printed (was max + 1)
            if shown >= self.max:
                break

            if len(textplot.terms[item]) > 1 and item != textplot.stem(anchor):
                self.debug.print_(self, item)
                shown += 1

    def run(self):
        """Process every ``.txt`` file found in the input directory."""
        if self.args['--debug']:
            if self.nostem:
                with open(self.nostem) as f:
                    nostem_words = set(f.read().splitlines())
            else:
                nostem_words = set()

            self._log_stemming(self.terms, nostem_words)

            if self.action == 'group':
                self._log_stemming(self.second_terms, nostem_words)

        for file_name in listdir(self.in_dir):
            if file_name.endswith(".txt"):
                self.plot(file_name)

    def plot(self, file_name):
        """Run the configured action on one text file.

        Plot actions save a PNG next to the text file; ``search`` only prints
        its results.

        Args:
            file_name: Name of the text file inside the input directory.

        Raises:
            ValueError: If no known action was selected.
        """
        self.debug.print_debug(self, u'Loading ' + file_name)

        textplot = Text.from_file(join(self.in_dir, file_name), self.debug, nostem=self.nostem)

        self.debug.print_debug(self, u'Plotting ' + file_name)

        if self.action == 'search':
            # nothing to save for a search
            self._print_correlated_terms(textplot)
            return

        if self.action == 'single':
            graph = textplot.plot_terms(self.terms, self.caption)

        elif self.action == 'group':
            graph = textplot.plot_terms_two_groups(self.terms, self.term_name, self.second_terms,
                                                   self.second_term_name, self.caption)

        elif self.action == 'hist':
            graph = textplot.plot_terms_histogram(self.terms, self.caption, self.words)

        elif self.action == 'rawcount':
            graph = textplot.plot_terms_raw_count(self.terms, self.caption, self.words)

        elif self.action == 'overlap':
            graph = textplot.plot_kde_overlap(self.terms)

        else:
            # previously fell through and failed on the unbound name `graph`
            raise ValueError('Unknown plot action: {0!r}'.format(self.action))

        # swap only the extension; str.replace would also rewrite a '.txt'
        # occurring in the middle of the file name
        image_name = os.path.splitext(file_name)[0] + '.png'

        self.debug.print_debug(self, u'Saving ' + image_name)

        try:
            graph.savefig(join(self.in_dir, image_name))
        finally:
            # release the figure even when saving fails
            graph.close()