def test_output_results_option_when_false(self):
    """No SelfExtractor.xml is written when the runnable is added with output_results=False."""
    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(SelfExtractor, output_results=False)
    extraction_runner.run('test', output_dir=self.results_dir)

    expected_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
    was_written = os.path.isfile(expected_path)
    self.assertFalse(was_written)
def test_output_results_option_defaults_to_true(self):
    """SelfExtractor.xml is written when output_results is not passed to add_runnable."""
    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(SelfExtractor)
    extraction_runner.run('test', output_dir=self.results_dir)

    expected_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
    was_written = os.path.isfile(expected_path)
    self.assertTrue(was_written)
def test_file_prefix_option_works(self):
    """The file_prefix passed to run() is prepended to the written file's name.

    Runs ImplTestFileExtractor with file_prefix='prefix.' and checks that
    'prefix.test.txt' exists in the results directory with content 'test test'.
    """
    runner = ExtractionRunner()
    runner.add_runnable(ImplTestFileExtractor)
    runner.run('whatever', output_dir=self.results_dir, file_prefix='prefix.')

    result_file_path = os.path.join(self.results_dir, 'prefix.test.txt')
    self.assertTrue(os.path.isfile(result_file_path))

    # Read through a context manager so the handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(result_file_path, 'r') as result_file:
        content = result_file.read()
    self.assertEqual(content, 'test test')
def test_file_name_result_works(self):
    """The result lands in the file named by SelfChangeNameExtractor.result_file_name."""
    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(SelfChangeNameExtractor)
    extraction_runner.run('pizza', output_dir=self.results_dir)

    file_name = SelfChangeNameExtractor.result_file_name
    expected_path = os.path.join(self.results_dir, file_name)
    self.assertTrue(os.path.isfile(expected_path))

    root = ET.parse(expected_path).getroot()
    self.assertEqual(root.text, 'pizza')
def test_run_from_file(self):
    """run_from_file() on self.f1_path yields SelfExtractor.xml whose root text is 'file 1'."""
    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(SelfExtractor)
    extraction_runner.run_from_file(self.f1_path, output_dir=self.results_dir)

    expected_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
    self.assertTrue(os.path.isfile(expected_path))

    root_text = ET.parse(expected_path).getroot().text
    self.assertEqual(root_text, 'file 1')
def test_dependency_results_work(self):
    """A dependent extractor runs when its dependency is registered and raises LookupError otherwise."""
    with_dependency = ExtractionRunner()
    with_dependency.add_runnable(ImplTestFileExtractor)
    with_dependency.add_runnable(DepsOnTestFileExtractor)
    # An error is thrown if the dependency doesn't work, so there is
    # nothing to assert for this first run.
    with_dependency.run('whatever', output_dir=self.results_dir)

    without_dependency = ExtractionRunner()
    without_dependency.add_runnable(DepsOnTestFileExtractor)
    with self.assertRaises(LookupError):
        without_dependency.run('whatever', output_dir=self.results_dir)
def test_run_batch(self):
    """run_batch() writes one prefixed SelfExtractor.xml per input, each echoing its input text."""
    inputs = ['test 0', 'test 1', 'test 2']
    prefixes = ['1', '2', '3']
    target_dirs = [self.results_dir] * 3

    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(SelfExtractor)
    extraction_runner.run_batch(inputs, target_dirs, file_prefixes=prefixes)

    for index, prefix in enumerate(prefixes):
        file_name = '{0}SelfExtractor.xml'.format(prefix)
        expected_path = os.path.join(self.results_dir, file_name)
        self.assertTrue(os.path.isfile(expected_path))
        root = ET.parse(expected_path).getroot()
        self.assertEqual(root.text, inputs[index])
def test_disable_logs_works(self):
    """After enable_logging() then disable_logging(), a run leaves no *.log files at either log path."""
    results_log_path = os.path.join(self.results_dir, 'results')
    runnables_log_path = os.path.join(self.results_dir, 'runnables')

    extraction_runner = ExtractionRunner()
    extraction_runner.enable_logging(results_log_path, runnables_log_path)
    extraction_runner.disable_logging()
    extraction_runner.add_runnable(SelfLogExtractor)
    extraction_runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

    # Check the results log first, then the runnables log.
    for log_base in (results_log_path, runnables_log_path):
        matching_logs = glob.glob(log_base + "*.log")
        self.assertFalse(matching_logs)
def test_run_from_file_batch(self):
    """run_from_file_batch() writes one prefixed SelfExtractor.xml per input file with text 'file <prefix>'."""
    input_paths = [self.f1_path, self.f2_path, self.f3_path]
    prefixes = ['1', '2', '3']
    target_dirs = [self.results_dir] * 3

    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(SelfExtractor)
    extraction_runner.run_from_file_batch(input_paths, target_dirs, file_prefixes=prefixes)

    for prefix in prefixes:
        expected_path = os.path.join(
            self.results_dir, '{0}SelfExtractor.xml'.format(prefix))
        expected_text = 'file {0}'.format(prefix)
        self.assertTrue(os.path.isfile(expected_path))
        root = ET.parse(expected_path).getroot()
        self.assertEqual(root.text, expected_text)
def test_extractor_errors_cascade_no_write_dep_errors(self):
    """An extractor error is written out, but its dependents write nothing by default.

    ErrorExtractor.xml must exist with an 'error' root tag, while neither
    DepsOnErrorExtractor.xml nor DepsOnErrorExtractor2.xml may be created
    (run() is called without write_dep_errors).
    """
    runner = ExtractionRunner()
    runner.add_runnable(ErrorExtractor)
    runner.add_runnable(DepsOnErrorExtractor)
    runner.add_runnable(DepsOnErrorExtractor2)
    runner.run('Test', output_dir=self.results_dir)

    ee_path = os.path.join(self.results_dir, 'ErrorExtractor.xml')
    self.assertTrue(os.path.isfile(ee_path))
    self.assertEqual(ET.parse(ee_path).getroot().tag, 'error')

    doee_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor.xml')
    self.assertFalse(os.path.isfile(doee_path))

    # The second dependent's path was computed but never checked; assert it
    # too so the cascade is verified for both dependents.
    doee2_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor2.xml')
    self.assertFalse(os.path.isfile(doee2_path))
def test_extractor_errors_cascade_no_write_dep_errors(self):
    """A failing extractor writes an error file; extractors depending on it write no file.

    Checks that ErrorExtractor.xml exists with root tag 'error' and that no
    result file appears for DepsOnErrorExtractor or DepsOnErrorExtractor2
    when run() is called without write_dep_errors.
    """
    runner = ExtractionRunner()
    runner.add_runnable(ErrorExtractor)
    runner.add_runnable(DepsOnErrorExtractor)
    runner.add_runnable(DepsOnErrorExtractor2)
    runner.run('Test', output_dir=self.results_dir)

    ee_path = os.path.join(self.results_dir, 'ErrorExtractor.xml')
    self.assertTrue(os.path.isfile(ee_path))
    self.assertEqual(ET.parse(ee_path).getroot().tag, 'error')

    doee_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor.xml')
    self.assertFalse(os.path.isfile(doee_path))

    # Previously this path was built and then dropped without an assertion,
    # leaving the second dependent unverified.
    doee2_path = os.path.join(self.results_dir, 'DepsOnErrorExtractor2.xml')
    self.assertFalse(os.path.isfile(doee2_path))
def test_disable_logs_works(self):
    """No *.log files appear at either log path when logging is disabled before the run."""
    results_log_path = os.path.join(self.results_dir, 'results')
    runnables_log_path = os.path.join(self.results_dir, 'runnables')

    log_runner = ExtractionRunner()
    log_runner.enable_logging(results_log_path, runnables_log_path)
    log_runner.disable_logging()
    log_runner.add_runnable(SelfLogExtractor)
    log_runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

    results_logs = glob.glob(results_log_path + "*.log")
    self.assertFalse(results_logs)

    runnables_logs = glob.glob(runnables_log_path + "*.log")
    self.assertFalse(runnables_logs)
def test_output_results_option_when_false(self):
    """Adding SelfExtractor with output_results=False must not produce SelfExtractor.xml."""
    silent_runner = ExtractionRunner()
    silent_runner.add_runnable(SelfExtractor, output_results=False)
    silent_runner.run('test', output_dir=self.results_dir)

    xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
    file_exists = os.path.isfile(xml_path)
    self.assertFalse(file_exists)
def test_output_results_option_defaults_to_true(self):
    """Adding SelfExtractor without output_results still produces SelfExtractor.xml."""
    default_runner = ExtractionRunner()
    default_runner.add_runnable(SelfExtractor)
    default_runner.run('test', output_dir=self.results_dir)

    xml_path = os.path.join(self.results_dir, 'SelfExtractor.xml')
    file_exists = os.path.isfile(xml_path)
    self.assertTrue(file_exists)
def test_files_get_written(self):
    """Running ImplTestFileExtractor writes 'test.txt' containing 'test test'."""
    runner = ExtractionRunner()
    runner.add_runnable(ImplTestFileExtractor)
    runner.run('whatever', output_dir=self.results_dir)

    result_file_path = os.path.join(self.results_dir, 'test.txt')
    self.assertTrue(os.path.isfile(result_file_path))

    # Close the file explicitly via a context manager rather than leaking
    # the handle returned by a bare open(...).read().
    with open(result_file_path, 'r') as result_file:
        content = result_file.read()
    self.assertEqual(content, 'test test')
def test_file_name_result_works(self):
    """Output is written under SelfChangeNameExtractor.result_file_name and echoes the input 'pizza'."""
    name_runner = ExtractionRunner()
    name_runner.add_runnable(SelfChangeNameExtractor)
    name_runner.run('pizza', output_dir=self.results_dir)

    custom_name = SelfChangeNameExtractor.result_file_name
    xml_path = os.path.join(self.results_dir, custom_name)
    self.assertTrue(os.path.isfile(xml_path))

    parsed_root = ET.parse(xml_path).getroot()
    self.assertEqual(parsed_root.text, 'pizza')
def test_logs_work(self):
    """With logging enabled, a run produces a results log and a runnables log.

    The results log must mention '[SUCCESS]' and the run name; the runnables
    log must mention the input data, the extractor name and the run name.
    """
    runner = ExtractionRunner()
    results_log_path = os.path.join(self.results_dir, 'results')
    runnables_log_path = os.path.join(self.results_dir, 'runnables')
    runner.enable_logging(results_log_path, runnables_log_path)
    runner.add_runnable(SelfLogExtractor)
    runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

    results_logs = glob.glob(results_log_path + "*.log")
    # Fail with a clear assertion rather than an IndexError if no log exists.
    self.assertTrue(results_logs)
    with open(results_logs[0], 'r') as log_file:
        log_data = log_file.read()
    self.assertIn('[SUCCESS]', log_data)
    self.assertIn('RUN!', log_data)

    runnables_logs = glob.glob(runnables_log_path + "*.log")
    self.assertTrue(runnables_logs)
    with open(runnables_logs[0], 'r') as log_file:
        log_data = log_file.read()
    self.assertIn('abc', log_data)
    self.assertIn('SelfLogExtractor', log_data)
    self.assertIn('RUN!', log_data)
def test_run_batch(self):
    """Each batch item is written to '<prefix>SelfExtractor.xml' with the item as root text."""
    texts = ['test 0', 'test 1', 'test 2']
    prefixes = ['1', '2', '3']
    destinations = [self.results_dir] * 3

    batch_runner = ExtractionRunner()
    batch_runner.add_runnable(SelfExtractor)
    batch_runner.run_batch(texts, destinations, file_prefixes=prefixes)

    for position, prefix in enumerate(prefixes):
        xml_name = '{0}SelfExtractor.xml'.format(prefix)
        xml_path = os.path.join(self.results_dir, xml_name)
        self.assertTrue(os.path.isfile(xml_path))
        parsed_root = ET.parse(xml_path).getroot()
        self.assertEqual(parsed_root.text, texts[position])
def test_logs_work(self):
    """enable_logging() makes a run write both log files with the expected content.

    Looks for '[SUCCESS]' and 'RUN!' in the results log, and for 'abc',
    'SelfLogExtractor' and 'RUN!' in the runnables log.
    """
    runner = ExtractionRunner()
    results_log_path = os.path.join(self.results_dir, 'results')
    runnables_log_path = os.path.join(self.results_dir, 'runnables')
    runner.enable_logging(results_log_path, runnables_log_path)
    runner.add_runnable(SelfLogExtractor)
    runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

    results_matches = glob.glob(results_log_path + "*.log")
    # Guard the [0] index so a missing log is reported as a test failure.
    self.assertTrue(results_matches)
    with open(results_matches[0], 'r') as handle:
        log_data = handle.read()
    self.assertIn('[SUCCESS]', log_data)
    self.assertIn('RUN!', log_data)

    runnables_matches = glob.glob(runnables_log_path + "*.log")
    self.assertTrue(runnables_matches)
    with open(runnables_matches[0], 'r') as handle:
        log_data = handle.read()
    self.assertIn('abc', log_data)
    self.assertIn('SelfLogExtractor', log_data)
    self.assertIn('RUN!', log_data)
def test_run_from_file_batch(self):
    """Each input file yields '<prefix>SelfExtractor.xml' whose root text is 'file <prefix>'."""
    batch_runner = ExtractionRunner()
    batch_runner.add_runnable(SelfExtractor)

    source_paths = [self.f1_path, self.f2_path, self.f3_path]
    prefixes = ['1', '2', '3']
    destinations = [self.results_dir] * 3
    batch_runner.run_from_file_batch(source_paths, destinations, file_prefixes=prefixes)

    for prefix in prefixes:
        xml_name = '{0}SelfExtractor.xml'.format(prefix)
        xml_path = os.path.join(self.results_dir, xml_name)
        self.assertTrue(os.path.isfile(xml_path))
        parsed_root = ET.parse(xml_path).getroot()
        self.assertEqual(parsed_root.text, 'file {0}'.format(prefix))
def get_extraction_runner(modules):
    """Build an ExtractionRunner whose runnables are selected by ``modules``.

    ``modules`` is indexed by option name; an option counts as enabled only
    when its value equals the string 'True'. Runnables are added in a fixed
    order: PDFBox text, academic filter, Grobid/TEI full text, header,
    citation, figures, algorithms.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line; sub-options are assumed to sit under their group switch - confirm.
    runner = ExtractionRunner()
    add = runner.add_runnable

    def enabled(option):
        return modules[option] == 'True'

    if enabled('fulltext') and enabled('fulltext_pdfbox'):
        add(pdfbox.PDFBoxPlainTextExtractor)

    if enabled('academicfilter'):
        add(filters.AcademicPaperFilter)

    if enabled('fulltext'):
        if enabled('fulltext_grobid'):
            add(grobid.GrobidTEIExtractor)
        if enabled('fulltext_tei_to_csx'):
            add(tei.TEItoPlainTextExtractor)

    if enabled('header'):
        if enabled('header_grobid'):
            add(grobid.GrobidHeaderTEIExtractor)
        if enabled('header_tei_to_csx'):
            add(tei.TEItoHeaderExtractor)

    if enabled('citation'):
        if enabled('citation_parscit'):
            add(parscit.ParsCitCitationExtractor)
        if enabled('citation_grobid'):
            add(grobid.GrobidCitationTEIExtractor)

    if enabled('figures'):
        add(figures.PDFFiguresExtractor)

    if enabled('algorithms'):
        add(algorithms.AlgorithmsExtractor)

    return runner
def test_no_extraction_result_works(self):
    """Running NothingExtractor leaves the results directory empty."""
    extraction_runner = ExtractionRunner()
    extraction_runner.add_runnable(NothingExtractor)
    extraction_runner.run('pizza', output_dir=self.results_dir)

    written_entries = os.listdir(self.results_dir)
    self.assertFalse(written_entries)
def test_filter_results_cascade(self):
    """Filter outcomes decide whether dependent extractors write results.

    With FailFilter, FailingDepsExtractor.xml is absent by default and holds
    an 'error' root when write_dep_errors=True. With PassFilter,
    PassingDepsExtractor.xml holds 'Test' with or without write_dep_errors.
    """
    # Failing filter: dependent output only appears as an error on request.
    failing_runner = ExtractionRunner()
    failing_runner.add_runnable(FailFilter)
    failing_runner.add_runnable(FailingDepsExtractor)
    failing_runner.run('Test', output_dir=self.results_dir)

    fde_path = os.path.join(self.results_dir, 'FailingDepsExtractor.xml')
    self.assertFalse(os.path.isfile(fde_path))

    failing_runner.run('Test', output_dir=self.results_dir, write_dep_errors=True)
    self.assertTrue(os.path.isfile(fde_path))
    failing_root = ET.parse(fde_path).getroot()
    self.assertEqual(failing_root.tag, 'error')

    # Passing filter: dependent output is written in both modes.
    passing_runner = ExtractionRunner()
    passing_runner.add_runnable(PassFilter)
    passing_runner.add_runnable(PassingDepsExtractor)
    passing_runner.run('Test', output_dir=self.results_dir)

    pde_path = os.path.join(self.results_dir, 'PassingDepsExtractor.xml')
    self.assertTrue(os.path.isfile(pde_path))
    passing_root = ET.parse(pde_path).getroot()
    self.assertEqual(passing_root.text, 'Test')

    # Remove the file so the second run has to recreate it.
    os.remove(pde_path)
    passing_runner.run('Test', output_dir=self.results_dir, write_dep_errors=True)
    self.assertTrue(os.path.isfile(pde_path))
    passing_root = ET.parse(pde_path).getroot()
    self.assertEqual(passing_root.text, 'Test')
def test_nothing_run(self):
    """A runner with no runnables writes nothing to the output directory."""
    empty_runner = ExtractionRunner()
    empty_runner.run(u'data!', output_dir=self.results_dir)

    # There should be no files in output_dir.
    leftover_entries = os.listdir(self.results_dir)
    self.assertFalse(leftover_entries)
def get_extraction_runner(self):
    """Create and return a new ExtractionRunner, emitting two web.debug messages on the way."""
    extraction_runner = ExtractionRunner()
    web.debug('getting runner')
    # Logging is currently switched off; the original call is kept for reference:
    # extraction_runner.enable_logging('/home/huy138/logs/service/results', '/home/huy138/logs/service/runnables')
    web.debug('runner gotten')
    return extraction_runner
def get_extraction_runner(modules):
    """Assemble an ExtractionRunner from the switches in ``modules``.

    A switch is on only when ``modules[name]`` equals the string 'True'.
    Order of registration: full-text extractors (PDFBox, Grobid TEI, TEI to
    plain text), academic filter, header, citation, figures, algorithms.
    """
    # NOTE(review): nesting reconstructed from a flattened source line; the
    # three fulltext_* options are assumed to sit under 'fulltext' - confirm.
    runner = ExtractionRunner()
    register = runner.add_runnable

    def is_on(name):
        return modules[name] == 'True'

    if is_on('fulltext'):
        if is_on('fulltext_pdfbox'):
            register(pdfbox.PDFBoxPlainTextExtractor)
        if is_on('fulltext_grobid'):
            register(grobid.GrobidTEIExtractor)
        if is_on('fulltext_tei_to_csx'):
            register(tei.TEItoPlainTextExtractor)

    if is_on('academicfilter'):
        register(filters.AcademicPaperFilter)

    if is_on('header'):
        if is_on('header_grobid'):
            register(grobid.GrobidHeaderTEIExtractor)
        if is_on('header_tei_to_csx'):
            register(tei.TEItoHeaderExtractor)

    if is_on('citation'):
        if is_on('citation_parscit'):
            register(parscit.ParsCitCitationExtractor)
        if is_on('citation_grobid'):
            register(grobid.GrobidCitationTEIExtractor)

    if is_on('figures'):
        register(figures.PDFFiguresExtractor)

    if is_on('algorithms'):
        register(algorithms.AlgorithmsExtractor)

    return runner
(status, stdout, stderr) = utils.external_process(['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5) except subprocess.TimeoutExpired: raise RunnableError('awk timed out') lines = [line for line in stdout.split("\n") if line] root = ET.Element('extraction') for line in lines: ele = ET.SubElement(root, 'line') ele.text = line return ExtractorResult(xml_result=root) # Set up and run extraction extraction_runner = ExtractionRunner() extraction_runner.add_runnable(HasNumbersFilter) extraction_runner.add_runnable(EmailExtractor) extraction_runner.add_runnable(LinesStartWithNumberExtractor) extraction_runner.run(u'''Random data that contains some emails [email protected] Test lines with some @ signs now and then. Meet you@[email protected]. Line with another email embedded [email protected] in the line. [email protected] [email protected] 123 Some lines even start with numbers Some lines don't start with numbers 004 The final line in the test data''', 'extraction/test/sample_output', run_name = 'Sample Data')
def get_extraction_runner():
    """Return an ExtractionRunner with logging enabled and a fixed list of runnables registered."""
    runner = ExtractionRunner()
    runner.enable_logging('~/logs/results', '~/logs/runnables')

    # Registration order is kept exactly as listed.
    pipeline = (
        pdfbox.PDFBoxPlainTextExtractor,
        filters.AcademicPaperFilter,
        grobid.GrobidHeaderTEIExtractor,
        tei.TEItoHeaderExtractor,
        parscit.ParsCitCitationExtractor,
        figures.PDFFiguresExtractor,
        algorithms.AlgorithmsExtractor,
    )
    for runnable in pipeline:
        runner.add_runnable(runnable)

    return runner
(status, stdout, stderr) = utils.external_process( ['awk', '/^[0-9]/ {print;}', '-'], input_data=data, timeout=5) except subprocess.TimeoutExpired: raise RunnableError('awk timed out') lines = [line for line in stdout.split("\n") if line] root = ET.Element('extraction') for line in lines: ele = ET.SubElement(root, 'line') ele.text = line return ExtractorResult(xml_result=root) # Set up and run extraction extraction_runner = ExtractionRunner() extraction_runner.add_runnable(HasNumbersFilter) extraction_runner.add_runnable(EmailExtractor) extraction_runner.add_runnable(LinesStartWithNumberExtractor) extraction_runner.run(u'''Random data that contains some emails [email protected] Test lines with some @ signs now and then. Meet you@[email protected]. Line with another email embedded [email protected] in the line. [email protected] [email protected] 123 Some lines even start with numbers Some lines don't start with numbers 004 The final line in the test data''', 'extraction/test/sample_output', run_name='Sample Data')
def test_filter_results_cascade(self):
    """Dependents of a failing filter write only on request; dependents of a passing filter always write.

    FailingDepsExtractor.xml is missing after a default run and carries an
    'error' root after a write_dep_errors=True run. PassingDepsExtractor.xml
    carries 'Test' after both kinds of run.
    """
    out_dir = self.results_dir

    # Case 1: the filter fails.
    fail_runner = ExtractionRunner()
    fail_runner.add_runnable(FailFilter)
    fail_runner.add_runnable(FailingDepsExtractor)
    fail_runner.run('Test', output_dir=out_dir)

    fde_path = os.path.join(out_dir, 'FailingDepsExtractor.xml')
    self.assertFalse(os.path.isfile(fde_path))

    fail_runner.run('Test', output_dir=out_dir, write_dep_errors=True)
    self.assertTrue(os.path.isfile(fde_path))
    self.assertEqual(ET.parse(fde_path).getroot().tag, 'error')

    # Case 2: the filter passes.
    pass_runner = ExtractionRunner()
    pass_runner.add_runnable(PassFilter)
    pass_runner.add_runnable(PassingDepsExtractor)
    pass_runner.run('Test', output_dir=out_dir)

    pde_path = os.path.join(out_dir, 'PassingDepsExtractor.xml')
    self.assertTrue(os.path.isfile(pde_path))
    self.assertEqual(ET.parse(pde_path).getroot().text, 'Test')

    # Delete the first result so the next run must write it again.
    os.remove(pde_path)
    pass_runner.run('Test', output_dir=out_dir, write_dep_errors=True)
    self.assertTrue(os.path.isfile(pde_path))
    self.assertEqual(ET.parse(pde_path).getroot().text, 'Test')
def get_extraction_runner():
    """Return an ExtractionRunner with logging enabled, the Grobid/TEI runnables and the academic filter."""
    runner = ExtractionRunner()
    runner.enable_logging('~/logs/results', '~/logs/runnables')
    register = runner.add_runnable

    # Option 1 (active): Grobid TEI extraction plus the TEI converters.
    register(grobid.GrobidTEIExtractor)
    register(extractors.TEItoPlainTextExtractor)
    register(extractors.TEItoHeaderExtractor)

    # Option 2 (inactive alternative to option 1):
    # register(pdfbox.PDFBoxPlainTextExtractor)

    register(filters.AcademicPaperFilter)
    return runner
def get_extraction_runner(modules):
    """Create an ExtractionRunner and register the runnables switched on in ``modules``.

    Every switch is read as ``modules[name]`` and is active only when it
    equals the string "True". Registration order: PDFBox text, academic
    filter, Grobid/TEI full text, header, citation, figures, algorithms.
    """
    # NOTE(review): nesting reconstructed from a flattened source line;
    # sub-options are assumed to sit under their group switch - confirm.
    runner = ExtractionRunner()
    attach = runner.add_runnable

    def active(name):
        return modules[name] == "True"

    if active("fulltext") and active("fulltext_pdfbox"):
        attach(pdfbox.PDFBoxPlainTextExtractor)

    if active("academicfilter"):
        attach(filters.AcademicPaperFilter)

    if active("fulltext"):
        if active("fulltext_grobid"):
            attach(grobid.GrobidTEIExtractor)
        if active("fulltext_tei_to_csx"):
            attach(tei.TEItoPlainTextExtractor)

    if active("header"):
        if active("header_grobid"):
            attach(grobid.GrobidHeaderTEIExtractor)
        if active("header_tei_to_csx"):
            attach(tei.TEItoHeaderExtractor)

    if active("citation"):
        if active("citation_parscit"):
            attach(parscit.ParsCitCitationExtractor)
        if active("citation_grobid"):
            attach(grobid.GrobidCitationTEIExtractor)

    if active("figures"):
        attach(figures.PDFFiguresExtractor)

    if active("algorithms"):
        attach(algorithms.AlgorithmsExtractor)

    return runner