示例#1
0
   def test_output_results_option_when_false(self):
      # Registering SelfExtractor with output_results=False must leave no
      # SelfExtractor.xml in the output directory after a run.
      extraction = ExtractionRunner()
      extraction.add_runnable(SelfExtractor, output_results=False)
      extraction.run('test', output_dir=self.results_dir)

      xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
      wrote_file = os.path.isfile(xml_path)
      self.assertFalse(wrote_file)
示例#2
0
   def test_output_results_option_defaults_to_true(self):
      # Registering SelfExtractor without an output_results argument must
      # leave SelfExtractor.xml in the output directory after a run.
      extraction = ExtractionRunner()
      extraction.add_runnable(SelfExtractor)
      extraction.run('test', output_dir=self.results_dir)

      xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
      wrote_file = os.path.isfile(xml_path)
      self.assertTrue(wrote_file)
示例#3
0
   def test_file_prefix_option_works(self):
      # Running with file_prefix='prefix.' must produce 'prefix.test.txt'
      # in the output directory, containing 'test test'.
      runner = ExtractionRunner()
      runner.add_runnable(ImplTestFileExtractor)
      runner.run('whatever', output_dir=self.results_dir, file_prefix='prefix.')

      result_file_path = os.path.join(self.results_dir, 'prefix.test.txt')
      self.assertTrue(os.path.isfile(result_file_path))

      # Read through a context manager so the handle is closed
      # deterministically instead of being left to the garbage collector.
      with open(result_file_path, 'r') as result_file:
         contents = result_file.read()
      self.assertEqual(contents, 'test test')
示例#4
0
   def test_file_name_result_works(self):
      # The result must be written under the name given by
      # SelfChangeNameExtractor.result_file_name, and the root element's
      # text must equal the data passed to run().
      extraction = ExtractionRunner()
      extraction.add_runnable(SelfChangeNameExtractor)
      extraction.run('pizza', output_dir=self.results_dir)

      custom_name = SelfChangeNameExtractor.result_file_name
      xml_path = os.path.join(self.results_dir, custom_name)
      self.assertTrue(os.path.isfile(xml_path))

      root = ET.parse(xml_path).getroot()
      self.assertEqual(root.text, 'pizza')
示例#5
0
   def test_run_from_file(self):
      # run_from_file on self.f1_path must produce SelfExtractor.xml whose
      # root element text is 'file 1'.
      extraction = ExtractionRunner()
      extraction.add_runnable(SelfExtractor)
      extraction.run_from_file(self.f1_path, output_dir=self.results_dir)

      xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
      self.assertTrue(os.path.isfile(xml_path))
      self.assertEqual(ET.parse(xml_path).getroot().text, 'file 1')
示例#6
0
   def test_dependency_results_work(self):
      # Case 1: the dependency is registered ahead of its dependent. A broken
      # dependency lookup would raise here, so no explicit assertion is made.
      with_dependency = ExtractionRunner()
      for runnable in (ImplTestFileExtractor, DepsOnTestFileExtractor):
         with_dependency.add_runnable(runnable)
      with_dependency.run('whatever', output_dir=self.results_dir)

      # Case 2: the dependent is registered alone; running must raise
      # LookupError.
      without_dependency = ExtractionRunner()
      without_dependency.add_runnable(DepsOnTestFileExtractor)
      with self.assertRaises(LookupError):
         without_dependency.run('whatever', output_dir=self.results_dir)
示例#7
0
   def test_run_batch(self):
      # Each batch item must yield '<prefix>SelfExtractor.xml' whose root
      # text equals the item at the same index.
      inputs = ['test 0', 'test 1', 'test 2']
      name_prefixes = ['1', '2', '3']
      target_dirs = [self.results_dir] * 3

      extraction = ExtractionRunner()
      extraction.add_runnable(SelfExtractor)
      extraction.run_batch(inputs, target_dirs, file_prefixes=name_prefixes)

      for name_prefix, expected_text in zip(name_prefixes, inputs):
         file_name = '{0}SelfExtractor.xml'.format(name_prefix)
         xml_path = os.path.join(self.results_dir, file_name)
         self.assertTrue(os.path.isfile(xml_path))
         self.assertEqual(ET.parse(xml_path).getroot().text, expected_text)
示例#8
0
    def test_disable_logs_works(self):
        # Logging is enabled and then disabled before the run; afterwards no
        # '<prefix>*.log' file may exist for either log prefix.
        results_prefix = os.path.join(self.results_dir, 'results')
        runnables_prefix = os.path.join(self.results_dir, 'runnables')

        extraction = ExtractionRunner()
        extraction.enable_logging(results_prefix, runnables_prefix)
        extraction.disable_logging()
        extraction.add_runnable(SelfLogExtractor)
        extraction.run('abc', output_dir=self.results_dir, run_name='RUN!')

        for log_prefix in (results_prefix, runnables_prefix):
            self.assertFalse(glob.glob(log_prefix + "*.log"))
示例#9
0
   def test_run_from_file_batch(self):
      # The three input files are run as one batch with prefixes '1'..'3';
      # each '<prefix>SelfExtractor.xml' must hold the text 'file <prefix>'.
      extraction = ExtractionRunner()
      extraction.add_runnable(SelfExtractor)
      input_paths = [self.f1_path, self.f2_path, self.f3_path]
      name_prefixes = ['1', '2', '3']
      target_dirs = [self.results_dir] * 3

      extraction.run_from_file_batch(input_paths, target_dirs, file_prefixes=name_prefixes)

      for name_prefix in name_prefixes:
         file_name = '{0}SelfExtractor.xml'.format(name_prefix)
         xml_path = os.path.join(self.results_dir, file_name)
         self.assertTrue(os.path.isfile(xml_path))
         root = ET.parse(xml_path).getroot()
         self.assertEqual(root.text, 'file {0}'.format(name_prefix))
示例#10
0
    def test_extractor_errors_cascade_no_write_dep_errors(self):
        # ErrorExtractor must write an <error> document, while the extractors
        # registered after it must not write any result file when
        # write_dep_errors is left at its default.
        runner = ExtractionRunner()
        runner.add_runnable(ErrorExtractor)
        runner.add_runnable(DepsOnErrorExtractor)
        runner.add_runnable(DepsOnErrorExtractor2)

        runner.run('Test', output_dir=self.results_dir)
        ee_path = os.path.join(self.results_dir, 'ErrorExtractor.xml')
        self.assertTrue(os.path.isfile(ee_path))
        self.assertEqual(ET.parse(ee_path).getroot().tag, 'error')

        doee_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor.xml')
        self.assertFalse(os.path.isfile(doee_path))
        doee2_path = os.path.join(self.results_dir,
                                  'DepsOnErrorExtractor2.xml')
        # NOTE(review): this path was previously computed but never checked.
        # Asserting absence mirrors the first dependent and the test name;
        # confirm DepsOnErrorExtractor2 is also expected to be skipped.
        self.assertFalse(os.path.isfile(doee2_path))
示例#11
0
   def test_extractor_errors_cascade_no_write_dep_errors(self):
      # ErrorExtractor must write an <error> document, while the extractors
      # registered after it must not write any result file when
      # write_dep_errors is left at its default.
      runner = ExtractionRunner()
      runner.add_runnable(ErrorExtractor)
      runner.add_runnable(DepsOnErrorExtractor)
      runner.add_runnable(DepsOnErrorExtractor2)

      runner.run('Test', output_dir=self.results_dir)
      ee_path = os.path.join(self.results_dir, 'ErrorExtractor.xml')
      self.assertTrue(os.path.isfile(ee_path))
      self.assertEqual(ET.parse(ee_path).getroot().tag, 'error')

      doee_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor.xml')
      self.assertFalse(os.path.isfile(doee_path))
      doee2_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor2.xml')
      # NOTE(review): this path was previously computed but never checked.
      # Asserting absence mirrors the first dependent and the test name;
      # confirm DepsOnErrorExtractor2 is also expected to be skipped.
      self.assertFalse(os.path.isfile(doee2_path))
示例#12
0
   def test_disable_logs_works(self):
      # Logging is enabled and then disabled before the run; afterwards no
      # '<prefix>*.log' file may exist for either log prefix.
      results_prefix = os.path.join(self.results_dir, 'results')
      runnables_prefix = os.path.join(self.results_dir, 'runnables')

      extraction = ExtractionRunner()
      extraction.enable_logging(results_prefix, runnables_prefix)
      extraction.disable_logging()
      extraction.add_runnable(SelfLogExtractor)
      extraction.run('abc', output_dir=self.results_dir, run_name='RUN!')

      for log_prefix in (results_prefix, runnables_prefix):
         self.assertFalse(glob.glob(log_prefix + "*.log"))
示例#13
0
    def test_output_results_option_when_false(self):
        # Registering SelfExtractor with output_results=False must leave no
        # SelfExtractor.xml in the output directory after a run.
        extraction = ExtractionRunner()
        extraction.add_runnable(SelfExtractor, output_results=False)
        extraction.run('test', output_dir=self.results_dir)

        xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
        wrote_file = os.path.isfile(xml_path)
        self.assertFalse(wrote_file)
示例#14
0
    def test_output_results_option_defaults_to_true(self):
        # Registering SelfExtractor without an output_results argument must
        # leave SelfExtractor.xml in the output directory after a run.
        extraction = ExtractionRunner()
        extraction.add_runnable(SelfExtractor)
        extraction.run('test', output_dir=self.results_dir)

        xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
        wrote_file = os.path.isfile(xml_path)
        self.assertTrue(wrote_file)
示例#15
0
    def test_files_get_written(self):
        # Running ImplTestFileExtractor must produce 'test.txt' in the
        # output directory, containing 'test test'.
        runner = ExtractionRunner()
        runner.add_runnable(ImplTestFileExtractor)
        runner.run('whatever', output_dir=self.results_dir)

        result_file_path = os.path.join(self.results_dir, 'test.txt')
        self.assertTrue(os.path.isfile(result_file_path))

        # Read through a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(result_file_path, 'r') as result_file:
            contents = result_file.read()
        self.assertEqual(contents, 'test test')
示例#16
0
    def test_file_name_result_works(self):
        # The result must be written under the name given by
        # SelfChangeNameExtractor.result_file_name, and the root element's
        # text must equal the data passed to run().
        extraction = ExtractionRunner()
        extraction.add_runnable(SelfChangeNameExtractor)
        extraction.run('pizza', output_dir=self.results_dir)

        custom_name = SelfChangeNameExtractor.result_file_name
        xml_path = os.path.join(self.results_dir, custom_name)
        self.assertTrue(os.path.isfile(xml_path))

        root = ET.parse(xml_path).getroot()
        self.assertEqual(root.text, 'pizza')
示例#17
0
    def test_run_from_file(self):
        # run_from_file on self.f1_path must produce SelfExtractor.xml whose
        # root element text is 'file 1'.
        extraction = ExtractionRunner()
        extraction.add_runnable(SelfExtractor)
        extraction.run_from_file(self.f1_path, output_dir=self.results_dir)

        xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
        self.assertTrue(os.path.isfile(xml_path))
        self.assertEqual(ET.parse(xml_path).getroot().text, 'file 1')
示例#18
0
    def test_logs_work(self):
        # With logging enabled, a run must leave a results log and a
        # runnables log (matched as '<prefix>*.log') that contain the
        # markers asserted below.
        runner = ExtractionRunner()
        results_log_path = os.path.join(self.results_dir, 'results')
        runnables_log_path = os.path.join(self.results_dir, 'runnables')

        runner.enable_logging(results_log_path, runnables_log_path)
        runner.add_runnable(SelfLogExtractor)
        runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

        results_logs = glob.glob(results_log_path + "*.log")
        # Fail with a readable message rather than an IndexError when the
        # log file is missing.
        self.assertTrue(results_logs, 'no results log file was written')
        # Context managers close the log handles deterministically.
        with open(results_logs[0], 'r') as log_file:
            log_data = log_file.read()
        self.assertIn('[SUCCESS]', log_data)
        self.assertIn('RUN!', log_data)

        runnables_logs = glob.glob(runnables_log_path + "*.log")
        self.assertTrue(runnables_logs, 'no runnables log file was written')
        with open(runnables_logs[0], 'r') as log_file:
            log_data = log_file.read()
        self.assertIn('abc', log_data)
        self.assertIn('SelfLogExtractor', log_data)
        self.assertIn('RUN!', log_data)
示例#19
0
    def test_run_batch(self):
        # Each batch item must yield '<prefix>SelfExtractor.xml' whose root
        # text equals the item at the same index.
        inputs = ['test 0', 'test 1', 'test 2']
        name_prefixes = ['1', '2', '3']
        target_dirs = [self.results_dir] * 3

        extraction = ExtractionRunner()
        extraction.add_runnable(SelfExtractor)
        extraction.run_batch(inputs, target_dirs, file_prefixes=name_prefixes)

        for name_prefix, expected_text in zip(name_prefixes, inputs):
            file_name = '{0}SelfExtractor.xml'.format(name_prefix)
            xml_path = os.path.join(self.results_dir, file_name)
            self.assertTrue(os.path.isfile(xml_path))
            self.assertEqual(ET.parse(xml_path).getroot().text, expected_text)
示例#20
0
   def test_logs_work(self):
      # With logging enabled, a run must leave a results log and a
      # runnables log (matched as '<prefix>*.log') that contain the
      # markers asserted below.
      runner = ExtractionRunner()
      results_log_path = os.path.join(self.results_dir, 'results')
      runnables_log_path = os.path.join(self.results_dir, 'runnables')

      runner.enable_logging(results_log_path, runnables_log_path)
      runner.add_runnable(SelfLogExtractor)
      runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

      results_logs = glob.glob(results_log_path + "*.log")
      # Fail with a readable message rather than an IndexError when the
      # log file is missing.
      self.assertTrue(results_logs, 'no results log file was written')
      # Context managers close the log handles deterministically.
      with open(results_logs[0], 'r') as log_file:
         log_data = log_file.read()
      self.assertIn('[SUCCESS]', log_data)
      self.assertIn('RUN!', log_data)

      runnables_logs = glob.glob(runnables_log_path + "*.log")
      self.assertTrue(runnables_logs, 'no runnables log file was written')
      with open(runnables_logs[0], 'r') as log_file:
         log_data = log_file.read()
      self.assertIn('abc', log_data)
      self.assertIn('SelfLogExtractor', log_data)
      self.assertIn('RUN!', log_data)
示例#21
0
    def test_run_from_file_batch(self):
        # The three input files are run as one batch with prefixes '1'..'3';
        # each '<prefix>SelfExtractor.xml' must hold the text 'file <prefix>'.
        extraction = ExtractionRunner()
        extraction.add_runnable(SelfExtractor)
        input_paths = [self.f1_path, self.f2_path, self.f3_path]
        name_prefixes = ['1', '2', '3']
        target_dirs = [self.results_dir] * 3

        extraction.run_from_file_batch(input_paths, target_dirs, file_prefixes=name_prefixes)

        for name_prefix in name_prefixes:
            file_name = '{0}SelfExtractor.xml'.format(name_prefix)
            xml_path = os.path.join(self.results_dir, file_name)
            self.assertTrue(os.path.isfile(xml_path))
            root = ET.parse(xml_path).getroot()
            self.assertEqual(root.text, 'file {0}'.format(name_prefix))
示例#22
0
文件: main.py 项目: afcarl/pdfmef
def get_extraction_runner(modules):
    """Build an ExtractionRunner from a mapping of module switches.

    A switch counts as enabled only when ``modules[name]`` equals the string
    'True'. Sub-switches are looked up only when their parent switch is
    enabled. Runnables are registered in the order the checks appear below.

    Returns the configured runner.
    """
    def enabled(switch):
        return modules[switch] == 'True'

    pipeline = ExtractionRunner()
    register = pipeline.add_runnable

    # The academic filter is registered between the PDFBox text extractor
    # and the remaining full-text runnables, so 'fulltext' is checked twice.
    if enabled('fulltext') and enabled('fulltext_pdfbox'):
        register(pdfbox.PDFBoxPlainTextExtractor)
    if enabled('academicfilter'):
        register(filters.AcademicPaperFilter)
    if enabled('fulltext'):
        if enabled('fulltext_grobid'):
            register(grobid.GrobidTEIExtractor)
        if enabled('fulltext_tei_to_csx'):
            register(tei.TEItoPlainTextExtractor)

    if enabled('header'):
        if enabled('header_grobid'):
            register(grobid.GrobidHeaderTEIExtractor)
        if enabled('header_tei_to_csx'):
            register(tei.TEItoHeaderExtractor)

    if enabled('citation'):
        if enabled('citation_parscit'):
            register(parscit.ParsCitCitationExtractor)
        if enabled('citation_grobid'):
            register(grobid.GrobidCitationTEIExtractor)

    if enabled('figures'):
        register(figures.PDFFiguresExtractor)
    if enabled('algorithms'):
        register(algorithms.AlgorithmsExtractor)

    return pipeline
示例#23
0
   def test_no_extraction_result_works(self):
      # Running NothingExtractor must leave the output directory empty.
      extraction = ExtractionRunner()
      extraction.add_runnable(NothingExtractor)
      extraction.run('pizza', output_dir=self.results_dir)

      written = os.listdir(self.results_dir)
      self.assertFalse(written)
示例#24
0
    def test_filter_results_cascade(self):
        # Part 1: FailFilter + FailingDepsExtractor. By default no result
        # file is written; with write_dep_errors=True an <error> document
        # appears instead.
        failing = ExtractionRunner()
        failing.add_runnable(FailFilter)
        failing.add_runnable(FailingDepsExtractor)
        error_xml = os.path.join(self.results_dir, 'FailingDepsExtractor.xml')

        failing.run('Test', output_dir=self.results_dir)
        self.assertFalse(os.path.isfile(error_xml))

        failing.run('Test', output_dir=self.results_dir, write_dep_errors=True)
        self.assertTrue(os.path.isfile(error_xml))
        error_root = ET.parse(error_xml).getroot()
        self.assertEqual(error_root.tag, 'error')

        # Part 2: PassFilter + PassingDepsExtractor. The result is written
        # with or without write_dep_errors.
        passing = ExtractionRunner()
        passing.add_runnable(PassFilter)
        passing.add_runnable(PassingDepsExtractor)
        result_xml = os.path.join(self.results_dir, 'PassingDepsExtractor.xml')

        passing.run('Test', output_dir=self.results_dir)
        self.assertTrue(os.path.isfile(result_xml))
        self.assertEqual(ET.parse(result_xml).getroot().text, 'Test')
        # Delete the file so the next run has to recreate it.
        os.remove(result_xml)

        passing.run('Test', output_dir=self.results_dir, write_dep_errors=True)
        self.assertTrue(os.path.isfile(result_xml))
        self.assertEqual(ET.parse(result_xml).getroot().text, 'Test')
示例#25
0
 def test_nothing_run(self):
     # A runner with no runnables registered must leave output_dir empty.
     empty_runner = ExtractionRunner()
     empty_runner.run(u'data!', output_dir=self.results_dir)
     leftovers = os.listdir(self.results_dir)
     self.assertFalse(leftovers)
示例#26
0
 def get_extraction_runner(self):
     """Create and return a new ExtractionRunner, tracing via web.debug."""
     extraction = ExtractionRunner()
     # Note: the runner already exists when this first message is emitted.
     web.debug('getting runner')
     # File logging is currently not enabled here. The disabled call was:
     # enable_logging('/home/huy138/logs/service/results', '/home/huy138/logs/service/runnables')
     web.debug('runner gotten')
     return extraction
示例#27
0
    def test_dependency_results_work(self):
        # Case 1: the dependency is registered ahead of its dependent. A
        # broken dependency lookup would raise here, so no explicit
        # assertion is made.
        with_dependency = ExtractionRunner()
        for runnable in (ImplTestFileExtractor, DepsOnTestFileExtractor):
            with_dependency.add_runnable(runnable)
        with_dependency.run('whatever', output_dir=self.results_dir)

        # Case 2: the dependent is registered alone; running must raise
        # LookupError.
        without_dependency = ExtractionRunner()
        without_dependency.add_runnable(DepsOnTestFileExtractor)
        with self.assertRaises(LookupError):
            without_dependency.run('whatever', output_dir=self.results_dir)
示例#28
0
    def test_no_extraction_result_works(self):
        # Running NothingExtractor must leave the output directory empty.
        extraction = ExtractionRunner()
        extraction.add_runnable(NothingExtractor)
        extraction.run('pizza', output_dir=self.results_dir)

        written = os.listdir(self.results_dir)
        self.assertFalse(written)
示例#29
0
文件: main.py 项目: anukat2015/PDFMEF
def get_extraction_runner(modules):
    """Build an ExtractionRunner from a mapping of module switches.

    A switch counts as enabled only when ``modules[name]`` equals the string
    'True'. Sub-switches are looked up only when their parent switch is
    enabled. Runnables are registered in the order the checks appear below.

    Returns the configured runner.
    """
    def enabled(switch):
        return modules[switch] == 'True'

    pipeline = ExtractionRunner()
    register = pipeline.add_runnable

    # Full-text runnables come first, followed by the academic filter.
    if enabled('fulltext'):
        if enabled('fulltext_pdfbox'):
            register(pdfbox.PDFBoxPlainTextExtractor)
        if enabled('fulltext_grobid'):
            register(grobid.GrobidTEIExtractor)
        if enabled('fulltext_tei_to_csx'):
            register(tei.TEItoPlainTextExtractor)
    if enabled('academicfilter'):
        register(filters.AcademicPaperFilter)

    if enabled('header'):
        if enabled('header_grobid'):
            register(grobid.GrobidHeaderTEIExtractor)
        if enabled('header_tei_to_csx'):
            register(tei.TEItoHeaderExtractor)

    if enabled('citation'):
        if enabled('citation_parscit'):
            register(parscit.ParsCitCitationExtractor)
        if enabled('citation_grobid'):
            register(grobid.GrobidCitationTEIExtractor)

    if enabled('figures'):
        register(figures.PDFFiguresExtractor)
    if enabled('algorithms'):
        register(algorithms.AlgorithmsExtractor)

    return pipeline
示例#30
0
文件: sample.py 项目: SeerLabs/pdfmef
         (status, stdout, stderr) = utils.external_process(['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
      except subprocess.TimeoutExpired:
         raise RunnableError('awk timed out')

      lines = [line for line in stdout.split("\n") if line]

      root = ET.Element('extraction')
      for line in lines:
         ele = ET.SubElement(root, 'line')
         ele.text = line

      return ExtractorResult(xml_result=root)


# Set up and run extraction.
# Runnables are registered in this order: the filter first, then the two
# extractors.
extraction_runner = ExtractionRunner()
extraction_runner.add_runnable(HasNumbersFilter)
extraction_runner.add_runnable(EmailExtractor)
extraction_runner.add_runnable(LinesStartWithNumberExtractor)

# Run over an inline sample text. The second positional argument is
# presumably the output directory (confirm against ExtractionRunner.run).
# NOTE(review): the '[email protected]' tokens in the sample look like
# addresses redacted by a scraper; restore real sample addresses if this
# script is meant to exercise EmailExtractor.
extraction_runner.run(u'''Random data that contains some emails [email protected]
Test lines with some @ signs now and then. Meet you@[email protected].
Line with another email embedded [email protected] in the line.
[email protected] [email protected]
123 Some lines even start with numbers
Some lines don't start with numbers
004 The final line in the test data''', 'extraction/test/sample_output', run_name = 'Sample Data')


      
      
示例#31
0
def get_extraction_runner():
    """Return an ExtractionRunner with logging enabled and a fixed pipeline.

    Logging is pointed at '~/logs/results' and '~/logs/runnables', and the
    runnables are registered in the order listed below.
    """
    pipeline = ExtractionRunner()
    # NOTE(review): whether enable_logging expands '~' is not visible here.
    pipeline.enable_logging('~/logs/results', '~/logs/runnables')

    register = pipeline.add_runnable
    register(pdfbox.PDFBoxPlainTextExtractor)
    register(filters.AcademicPaperFilter)
    register(grobid.GrobidHeaderTEIExtractor)
    register(tei.TEItoHeaderExtractor)
    register(parscit.ParsCitCitationExtractor)
    register(figures.PDFFiguresExtractor)
    register(algorithms.AlgorithmsExtractor)

    return pipeline
示例#32
0
 def test_nothing_run(self):
    # A runner with no runnables registered must leave output_dir empty.
    empty_runner = ExtractionRunner()
    empty_runner.run(u'data!', output_dir=self.results_dir)
    leftovers = os.listdir(self.results_dir)
    self.assertFalse(leftovers)
示例#33
0
def get_extraction_runner():
   """Return an ExtractionRunner with logging enabled and a fixed pipeline.

   Logging is pointed at '~/logs/results' and '~/logs/runnables', and the
   runnables are registered in the order listed below.
   """
   pipeline = ExtractionRunner()
   # NOTE(review): whether enable_logging expands '~' is not visible here.
   pipeline.enable_logging('~/logs/results', '~/logs/runnables')

   register = pipeline.add_runnable
   register(pdfbox.PDFBoxPlainTextExtractor)
   register(filters.AcademicPaperFilter)
   register(grobid.GrobidHeaderTEIExtractor)
   register(tei.TEItoHeaderExtractor)
   register(parscit.ParsCitCitationExtractor)
   register(figures.PDFFiguresExtractor)
   register(algorithms.AlgorithmsExtractor)

   return pipeline
示例#34
0
            (status, stdout, stderr) = utils.external_process(
                ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5)
        except subprocess.TimeoutExpired:
            raise RunnableError('awk timed out')

        lines = [line for line in stdout.split("\n") if line]

        root = ET.Element('extraction')
        for line in lines:
            ele = ET.SubElement(root, 'line')
            ele.text = line

        return ExtractorResult(xml_result=root)


# Set up and run extraction.
# Runnables are registered in this order: the filter first, then the two
# extractors.
extraction_runner = ExtractionRunner()
extraction_runner.add_runnable(HasNumbersFilter)
extraction_runner.add_runnable(EmailExtractor)
extraction_runner.add_runnable(LinesStartWithNumberExtractor)

# Run over an inline sample text. The second positional argument is
# presumably the output directory (confirm against ExtractionRunner.run).
# NOTE(review): the '[email protected]' tokens in the sample look like
# addresses redacted by a scraper; restore real sample addresses if this
# script is meant to exercise EmailExtractor.
extraction_runner.run(u'''Random data that contains some emails [email protected]
Test lines with some @ signs now and then. Meet you@[email protected].
Line with another email embedded [email protected] in the line.
[email protected] [email protected]
123 Some lines even start with numbers
Some lines don't start with numbers
004 The final line in the test data''',
                      'extraction/test/sample_output',
                      run_name='Sample Data')
示例#35
0
   def test_filter_results_cascade(self):
      # Part 1: FailFilter + FailingDepsExtractor. By default no result
      # file is written; with write_dep_errors=True an <error> document
      # appears instead.
      failing = ExtractionRunner()
      failing.add_runnable(FailFilter)
      failing.add_runnable(FailingDepsExtractor)
      error_xml = os.path.join(self.results_dir, 'FailingDepsExtractor.xml')

      failing.run('Test', output_dir=self.results_dir)
      self.assertFalse(os.path.isfile(error_xml))

      failing.run('Test', output_dir=self.results_dir, write_dep_errors=True)
      self.assertTrue(os.path.isfile(error_xml))
      error_root = ET.parse(error_xml).getroot()
      self.assertEqual(error_root.tag, 'error')

      # Part 2: PassFilter + PassingDepsExtractor. The result is written
      # with or without write_dep_errors.
      passing = ExtractionRunner()
      passing.add_runnable(PassFilter)
      passing.add_runnable(PassingDepsExtractor)
      result_xml = os.path.join(self.results_dir, 'PassingDepsExtractor.xml')

      passing.run('Test', output_dir=self.results_dir)
      self.assertTrue(os.path.isfile(result_xml))
      self.assertEqual(ET.parse(result_xml).getroot().text, 'Test')
      # Delete the file so the next run has to recreate it.
      os.remove(result_xml)

      passing.run('Test', output_dir=self.results_dir, write_dep_errors=True)
      self.assertTrue(os.path.isfile(result_xml))
      self.assertEqual(ET.parse(result_xml).getroot().text, 'Test')
示例#36
0
def get_extraction_runner():
   """Return an ExtractionRunner with logging enabled and the GROBID pipeline.

   Logging is pointed at '~/logs/results' and '~/logs/runnables'. The
   GROBID TEI extractor and the two TEI converters are registered first,
   followed by the academic paper filter.
   """
   pipeline = ExtractionRunner()
   # NOTE(review): whether enable_logging expands '~' is not visible here.
   pipeline.enable_logging('~/logs/results', '~/logs/runnables')
   register = pipeline.add_runnable

   # Option 1 (active): GROBID TEI output converted to plain text and header.
   register(grobid.GrobidTEIExtractor)
   register(extractors.TEItoPlainTextExtractor)
   register(extractors.TEItoHeaderExtractor)
   # Option 2 (inactive alternative): register
   # pdfbox.PDFBoxPlainTextExtractor instead of the three runnables above.

   register(filters.AcademicPaperFilter)

   return pipeline
示例#37
0
文件: main.py 项目: SeerLabs/pdfmef
def get_extraction_runner(modules):
    """Build an ExtractionRunner from a mapping of module switches.

    A switch counts as enabled only when ``modules[name]`` equals the string
    "True". Sub-switches are looked up only when their parent switch is
    enabled. Runnables are registered in the order the checks appear below.

    Returns the configured runner.
    """
    def enabled(switch):
        return modules[switch] == "True"

    pipeline = ExtractionRunner()
    register = pipeline.add_runnable

    # The academic filter is registered between the PDFBox text extractor
    # and the remaining full-text runnables, so "fulltext" is checked twice.
    if enabled("fulltext") and enabled("fulltext_pdfbox"):
        register(pdfbox.PDFBoxPlainTextExtractor)
    if enabled("academicfilter"):
        register(filters.AcademicPaperFilter)
    if enabled("fulltext"):
        if enabled("fulltext_grobid"):
            register(grobid.GrobidTEIExtractor)
        if enabled("fulltext_tei_to_csx"):
            register(tei.TEItoPlainTextExtractor)

    if enabled("header"):
        if enabled("header_grobid"):
            register(grobid.GrobidHeaderTEIExtractor)
        if enabled("header_tei_to_csx"):
            register(tei.TEItoHeaderExtractor)

    if enabled("citation"):
        if enabled("citation_parscit"):
            register(parscit.ParsCitCitationExtractor)
        if enabled("citation_grobid"):
            register(grobid.GrobidCitationTEIExtractor)

    if enabled("figures"):
        register(figures.PDFFiguresExtractor)
    if enabled("algorithms"):
        register(algorithms.AlgorithmsExtractor)

    return pipeline