Exemplo n.º 1
0
    def test_file_prefix_option_works(self):
        """Check that ``file_prefix`` is prepended to the result file name.

        Runs ``ImplTestFileExtractor`` with ``file_prefix='prefix.'`` and
        expects a file named ``prefix.test.txt`` in the results directory
        whose content is ``'test test'``.
        """
        runner = ExtractionRunner()
        runner.add_runnable(ImplTestFileExtractor)
        runner.run('whatever',
                   output_dir=self.results_dir,
                   file_prefix='prefix.')

        result_file_path = os.path.join(self.results_dir, 'prefix.test.txt')
        self.assertTrue(os.path.isfile(result_file_path))
        # Read through a context manager so the handle is closed
        # deterministically instead of leaking until garbage collection.
        with open(result_file_path, 'r') as result_file:
            content = result_file.read()
        self.assertEqual(content, 'test test')
Exemplo n.º 2
0
def get_extraction_runner():
    """Create an ExtractionRunner with logging enabled and runnables added.

    Returns:
        The configured runner. Runnables are registered in the order in
        which they are listed in the tuple below.
    """
    extraction_runner = ExtractionRunner()
    extraction_runner.enable_logging('~/logs/results', '~/logs/runnables')

    # Registration order is kept exactly as listed here.
    runnables = (
        pdfbox.PDFBoxPlainTextExtractor,
        filters.AcademicPaperFilter,
        grobid.GrobidHeaderTEIExtractor,
        tei.TEItoHeaderExtractor,
        parscit.ParsCitCitationExtractor,
        figures.PDFFiguresExtractor,
        algorithms.AlgorithmsExtractor,
    )
    for runnable in runnables:
        extraction_runner.add_runnable(runnable)

    return extraction_runner
Exemplo n.º 3
0
    def test_run_batch(self):
        """Run three inputs as one batch and verify every result file.

        For each input a file named ``<prefix>SelfExtractor.xml`` must exist
        in the results directory, and the text of its root element must
        equal the input string.
        """
        inputs = ['test 0', 'test 1', 'test 2']
        file_prefixes = ['1', '2', '3']
        target_dirs = [self.results_dir] * 3

        runner = ExtractionRunner()
        runner.add_runnable(SelfExtractor)
        runner.run_batch(inputs, target_dirs, file_prefixes=file_prefixes)

        for expected_text, file_prefix in zip(inputs, file_prefixes):
            file_name = '{0}SelfExtractor.xml'.format(file_prefix)
            xml_path = os.path.join(self.results_dir, file_name)
            self.assertTrue(os.path.isfile(xml_path))
            root = ET.parse(xml_path).getroot()
            self.assertEqual(root.text, expected_text)
Exemplo n.º 4
0
    def test_disable_logs_works(self):
        """Verify that no ``*.log`` files appear after logging is disabled.

        Logging is enabled and then disabled again before the run; neither
        the results log prefix nor the runnables log prefix may match any
        log file afterwards.
        """
        runner = ExtractionRunner()
        log_prefixes = (
            os.path.join(self.results_dir, 'results'),
            os.path.join(self.results_dir, 'runnables'),
        )

        runner.enable_logging(*log_prefixes)
        runner.disable_logging()
        runner.add_runnable(SelfLogExtractor)
        runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

        # Results prefix is checked first, then the runnables prefix.
        for log_prefix in log_prefixes:
            matching_logs = glob.glob(log_prefix + "*.log")
            self.assertFalse(matching_logs)
Exemplo n.º 5
0
    def test_extractor_errors_cascade_no_write_dep_errors(self):
        """Check what gets written when an extractor errors out.

        ``ErrorExtractor`` must produce a result file whose root tag is
        ``error``, while neither of the two ``DepsOnErrorExtractor*``
        runnables may leave a result file behind.
        """
        runner = ExtractionRunner()
        runner.add_runnable(ErrorExtractor)
        runner.add_runnable(DepsOnErrorExtractor)
        runner.add_runnable(DepsOnErrorExtractor2)

        runner.run('Test', output_dir=self.results_dir)
        ee_path = os.path.join(self.results_dir, 'ErrorExtractor.xml')
        self.assertTrue(os.path.isfile(ee_path))
        self.assertEqual(ET.parse(ee_path).getroot().tag, 'error')

        doee_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor.xml')
        self.assertFalse(os.path.isfile(doee_path))
        doee2_path = os.path.join(self.results_dir,
                                  'DepsOnErrorExtractor2.xml')
        # The path was previously computed but never checked; assert on it
        # the same way as for the first dependent extractor.
        self.assertFalse(os.path.isfile(doee2_path))
Exemplo n.º 6
0
def get_extraction_runner():
    """Create an ExtractionRunner with logging enabled and runnables added.

    Two alternative sets of text runnables exist; Option 1 is the active
    one. ``filters.AcademicPaperFilter`` is always registered last.

    Returns:
        The configured runner.
    """
    extraction_runner = ExtractionRunner()
    extraction_runner.enable_logging('~/logs/results', '~/logs/runnables')

    # Option 1
    text_runnables = (
        grobid.GrobidTEIExtractor,
        extractors.TEItoPlainTextExtractor,
        extractors.TEItoHeaderExtractor,
    )
    # OR
    # Option 2
    # text_runnables = (pdfbox.PDFBoxPlainTextExtractor,)

    for runnable in text_runnables:
        extraction_runner.add_runnable(runnable)

    extraction_runner.add_runnable(filters.AcademicPaperFilter)

    return extraction_runner
Exemplo n.º 7
0
    def test_run_from_file_batch(self):
        """Run three input files as one batch and verify every result file.

        For each prefix ``p`` a file named ``<p>SelfExtractor.xml`` must
        exist in the results directory, and the text of its root element
        must be ``'file <p>'``.
        """
        runner = ExtractionRunner()
        runner.add_runnable(SelfExtractor)
        input_paths = [self.f1_path, self.f2_path, self.f3_path]
        file_prefixes = ['1', '2', '3']
        target_dirs = [self.results_dir] * 3

        runner.run_from_file_batch(
            input_paths, target_dirs, file_prefixes=file_prefixes)

        for file_prefix in file_prefixes:
            file_name = '{0}SelfExtractor.xml'.format(file_prefix)
            xml_path = os.path.join(self.results_dir, file_name)
            expected_text = 'file {0}'.format(file_prefix)

            self.assertTrue(os.path.isfile(xml_path))
            root = ET.parse(xml_path).getroot()
            self.assertEqual(root.text, expected_text)
Exemplo n.º 8
0
    def test_logs_work(self):
        """Verify that a run writes both the results and the runnables log.

        The results log must contain ``[SUCCESS]`` and the run name; the
        runnables log must contain the input data, the runnable's name and
        the run name.
        """
        runner = ExtractionRunner()
        results_log_path = os.path.join(self.results_dir, 'results')
        runnables_log_path = os.path.join(self.results_dir, 'runnables')

        runner.enable_logging(results_log_path, runnables_log_path)
        runner.add_runnable(SelfLogExtractor)
        runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

        results_logs = glob.glob(results_log_path + "*.log")
        # Fail with a clear assertion instead of an IndexError when no log
        # file was produced.
        self.assertTrue(results_logs, 'no results log file was written')
        # Context managers close the log files deterministically.
        with open(results_logs[0], 'r') as log_file:
            log_data = log_file.read()
        self.assertTrue('[SUCCESS]' in log_data)
        self.assertTrue('RUN!' in log_data)

        runnables_logs = glob.glob(runnables_log_path + "*.log")
        self.assertTrue(runnables_logs, 'no runnables log file was written')
        with open(runnables_logs[0], 'r') as log_file:
            log_data = log_file.read()
        self.assertTrue('abc' in log_data)
        self.assertTrue('SelfLogExtractor' in log_data)
        self.assertTrue('RUN!' in log_data)
Exemplo n.º 9
0
            (status, stdout, stderr) = utils.external_process(
                ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
        except subprocess.TimeoutExpired:
            raise RunnableError('awk timed out')

        lines = [line for line in stdout.split("\n") if line]

        root = ET.Element('extraction')
        for line in lines:
            ele = ET.SubElement(root, 'line')
            ele.text = line

        return ExtractorResult(xml_result=root)


# Set up and run extraction
# A fresh runner gets three runnables, registered in the order below.
extraction_runner = ExtractionRunner()
extraction_runner.add_runnable(HasNumbersFilter)
extraction_runner.add_runnable(EmailExtractor)
extraction_runner.add_runnable(LinesStartWithNumberExtractor)

# Run the runner on an inline sample text under the run name 'Sample Data'.
# The second positional argument is presumably the output directory --
# confirm against ExtractionRunner.run.
# NOTE(review): the "[email protected]" tokens in the sample text look like
# e-mail obfuscation residue rather than real addresses -- confirm the
# intended sample data before relying on the output.
extraction_runner.run(u'''Random data that contains some emails [email protected]
Test lines with some @ signs now and then. Meet you@[email protected].
Line with another email embedded [email protected] in the line.
[email protected] [email protected]
123 Some lines even start with numbers
Some lines don't start with numbers
004 The final line in the test data''',
                      'extraction/test/sample_output',
                      run_name='Sample Data')
Exemplo n.º 10
0
 def test_nothing_run(self):
     """A run with no runnables registered must leave the output dir empty."""
     empty_runner = ExtractionRunner()
     empty_runner.run(u'data!', output_dir=self.results_dir)

     # Nothing was registered, so nothing should have been written.
     produced_files = os.listdir(self.results_dir)
     self.assertFalse(produced_files)
Exemplo n.º 11
0
    def test_no_extraction_result_works(self):
        """Running ``NothingExtractor`` must leave the output dir empty."""
        nothing_runner = ExtractionRunner()
        nothing_runner.add_runnable(NothingExtractor)
        nothing_runner.run('pizza', output_dir=self.results_dir)

        produced_files = os.listdir(self.results_dir)
        self.assertFalse(produced_files)
Exemplo n.º 12
0
 def get_extraction_runner(self):
     """Return a new ExtractionRunner, emitting debug messages via ``web``.

     No runnables are registered here and file logging is not enabled.
     """
     extraction_runner = ExtractionRunner()
     web.debug('getting runner')
     # File logging is currently switched off; the previous call was:
     #extraction_runner.enable_logging('/home/huy138/logs/service/results', '/home/huy138/logs/service/runnables')
     web.debug('runner gotten')
     return extraction_runner