def test_should_use_grobid_only_if_crf_or_cv_model_are_not_enabled(self):
    """Check that GROBID alone converts the PDF when no model is configured.

    ``crf_model`` and ``cv_model_export_dir`` are both set to ``None`` and
    ``use_grobid`` is switched on. The test then expects the GROBID service
    factory to receive the configured URL and action, and the TEI XML
    returned by the (stubbed) service to be written to the output file.
    """
    with patch_conversion_pipeline() as mocks:
        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX
        # neither model is available, only GROBID
        args.crf_model = None
        args.cv_model_export_dir = None
        args.use_grobid = True
        args.grobid_url = 'http://test/api'

        grobid_service = mocks['grobid_service']
        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            # the service callable yields a (file name, TEI XML) pair
            grobid_service.return_value = lambda x: (
                PDF_FILE_1, TEI_XML_CONTENT_1
            )
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        grobid_service.assert_called_with(
            args.grobid_url,
            args.grobid_action,
            start_service=args.start_grobid_service
        )
        mocks['save_file_content'].assert_called_with(
            OUTPUT_XML_FILE_1,
            TEI_XML_CONTENT_1
        )
def test_should_use_grobid_if_enabled(self):
    """Check that the GROBID XML enhancer post-processes the extracted XML.

    With ``use_grobid`` enabled on otherwise default arguments, the test
    expects ``GrobidXmlEnhancer`` to be constructed with the GROBID URL,
    to be called with the XML produced by
    ``extract_annotated_structured_document_to_xml``, and its result to be
    the content that gets saved.
    """
    with patch_conversion_pipeline() as mocks:
        enhancer_factory = mocks['GrobidXmlEnhancer']
        enhancer = enhancer_factory.return_value

        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX
        args.use_grobid = True
        args.grobid_url = 'http://test/api'

        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            mocks['convert_pdf_bytes_to_lxml'].return_value = LXML_CONTENT_1
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        extracted_xml = (
            mocks['extract_annotated_structured_document_to_xml'].return_value
        )
        enhancer_factory.assert_called_with(
            args.grobid_url,
            start_service=args.start_grobid_service
        )
        enhancer.assert_called_with(extracted_xml)
        mocks['save_file_content'].assert_called_with(
            OUTPUT_XML_FILE_1,
            enhancer.return_value
        )
def test_should_pass_around_values_with_default_pipeline(self):
    """Check that each default pipeline step receives the previous step's output.

    Expected chain, as asserted below: PDF bytes -> structured document ->
    CRF prediction/annotation -> XML extraction -> saved file content.
    """
    with patch_conversion_pipeline() as mocks:
        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX

        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        convert_step = mocks['convert_pdf_bytes_to_structured_document']
        annotate_step = mocks['predict_and_annotate_structured_document']
        extract_step = mocks['extract_annotated_structured_document_to_xml']

        convert_step.assert_called_with(
            PDF_CONTENT_1,
            page_range=None,
            path=PDF_FILE_1
        )
        annotate_step.assert_called_with(
            convert_step.return_value,
            mocks['load_crf_model'].return_value
        )
        extract_step.assert_called_with(
            annotate_step.return_value,
            tag_scope=CRF_TAG_SCOPE
        )
        mocks['save_file_content'].assert_called_with(
            OUTPUT_XML_FILE_1,
            extract_step.return_value
        )
def test_should_pass_pdf_pattern_to_find_files_and_read_pdf_file(self):
    """Check that ``pdf_path`` is resolved against the base path and read.

    When ``pdf_path`` is given (and no file list), ``FindFiles`` is expected
    to receive ``base_data_path`` joined with the pattern, and every found
    file is expected to be read via ``read_all_from_path``.
    """
    with patch_conversion_pipeline() as mocks:
        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = PDF_PATH
        args.pdf_file_list = None

        find_files = mocks['FindFiles']
        with TestPipeline() as pipeline:
            find_files.return_value = beam.Create([PDF_FILE_1])
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        find_files.assert_called_with(BASE_DATA_PATH + '/' + PDF_PATH)
        mocks['read_all_from_path'].assert_called_with(PDF_FILE_1)
def test_should_pass_pdf_file_list_and_limit_to_read_file_list_and_read_pdf_file(
        self):
    """Check that the PDF file list and ``limit`` are forwarded to ``ReadFileList``.

    ``ReadFileList`` is expected to be called with the file list path, the
    ``pdf_url`` column and the configured limit; the listed file is then
    expected to be read via ``read_all_from_path``.
    """
    with patch_conversion_pipeline() as mocks:
        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.limit = 100

        read_file_list = mocks['ReadFileList']
        with TestPipeline() as pipeline:
            read_file_list.return_value = beam.Create([PDF_FILE_1])
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        read_file_list.assert_called_with(
            args.pdf_file_list,
            column='pdf_url',
            limit=args.limit
        )
        mocks['read_all_from_path'].assert_called_with(PDF_FILE_1)
def test_should_save_annotated_lxml_if_enabled(self):
    """Check that the annotated document is saved when ``save_annot_lxml`` is set.

    The expected target is the output path plus the relative PDF name
    (without extension) plus ``OutputExt.CRF_ANNOT_LXML``; the saved value
    is the result of ``predict_and_annotate_structured_document``.
    """
    with patch_conversion_pipeline() as mocks:
        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX
        args.save_annot_lxml = True

        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        expected_output_file = (
            OUTPUT_PATH + '/' + REL_PDF_FILE_WITHOUT_EXT_1 +
            OutputExt.CRF_ANNOT_LXML
        )
        annotated_document = (
            mocks['predict_and_annotate_structured_document'].return_value
        )
        mocks['save_structured_document'].assert_called_with(
            expected_output_file,
            annotated_document
        )
def test_should_use_cv_model_only_if_enabled(self):
    """Check that only the CV model runs when the CRF model is disabled.

    With ``crf_model`` set to ``None`` and a CV model export directory
    configured, the test expects the inference model wrapper to be called
    with the page images, the structured document to be annotated from the
    predicted image data, the XML to be extracted with the CV tag scope,
    and the CRF prediction step not to be called at all.
    """
    with patch_conversion_pipeline() as mocks:
        cv_model = mocks['InferenceModelWrapper'].return_value

        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX
        args.crf_model = None
        args.cv_model_export_dir = CV_MODEL_EXPORT_DIR

        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            _setup_mocks_for_pages(mocks, [1, 2])
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        convert_step = mocks['convert_pdf_bytes_to_structured_document']
        cv_annotate_step = mocks[
            'annotate_structured_document_using_predicted_image_data'
        ]
        extract_step = mocks['extract_annotated_structured_document_to_xml']

        convert_step.assert_called_with(
            PDF_CONTENT_1,
            page_range=None,
            path=PDF_FILE_1
        )

        # cv model
        cv_model.assert_called_with(
            [fake_pdf_png_page(i) for i in [1, 2]]
        )
        cv_annotate_step.assert_called_with(
            convert_step.return_value,
            cv_model.return_value,
            cv_model.get_color_map.return_value,
            tag_scope=CV_TAG_SCOPE
        )
        extract_step.assert_called_with(
            cv_annotate_step.return_value,
            tag_scope=CV_TAG_SCOPE
        )

        # crf model not be called
        mocks['predict_and_annotate_structured_document'].assert_not_called()
def test_should_use_lxml_file_list_if_provided_and_load_structured_documents(
        self):
    """Check that an LXML file list is loaded as structured documents.

    With only ``lxml_file_list`` set (no PDF path or PDF file list), the
    XML extraction is expected to receive the result of
    ``load_structured_document`` with ``tag_scope=None``, and the extracted
    XML is expected to be saved.
    """
    with patch_conversion_pipeline() as mocks:
        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = None
        args.lxml_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX

        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([LXML_FILE_1])
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        extract_step = mocks['extract_annotated_structured_document_to_xml']
        extract_step.assert_called_with(
            mocks['load_structured_document'].return_value,
            tag_scope=None
        )
        mocks['save_file_content'].assert_called_with(
            OUTPUT_XML_FILE_1,
            extract_step.return_value
        )
def test_should_use_grobid_with_lxml_file_list_if_enabled(self):
    """Check that GROBID enhancement also applies to LXML file list input.

    With ``lxml_file_list`` as the only input, both models set to ``None``
    and ``use_grobid`` enabled, the XML extraction is expected to receive
    the loaded structured document with ``tag_scope=None``, and the saved
    content is expected to be the ``GrobidXmlEnhancer`` result.
    """
    with patch_conversion_pipeline() as mocks:
        enhancer = mocks['GrobidXmlEnhancer'].return_value

        args = get_default_args()
        args.base_data_path = BASE_DATA_PATH
        args.pdf_path = None
        args.pdf_file_list = None
        args.lxml_file_list = BASE_DATA_PATH + '/file-list.tsv'
        args.output_path = OUTPUT_PATH
        args.output_suffix = OUTPUT_SUFFIX
        args.crf_model = None
        args.cv_model_export_dir = None
        args.use_grobid = True
        args.grobid_url = 'http://test/api'

        with TestPipeline() as pipeline:
            mocks['ReadFileList'].return_value = beam.Create([LXML_FILE_1])
            configure_pipeline(pipeline, args)

        # NOTE(review): assertions are placed after the pipeline context,
        # presumably because the pipeline only runs when the context exits.
        extract_step = mocks['extract_annotated_structured_document_to_xml']
        extract_step.assert_called_with(
            mocks['load_structured_document'].return_value,
            tag_scope=None
        )
        mocks['save_file_content'].assert_called_with(
            OUTPUT_XML_FILE_1,
            enhancer.return_value
        )