示例#1
0
    def test_should_use_grobid_only_if_crf_or_cv_model_are_not_enabled(self):
        """Grobid output is saved directly when CRF and CV models are off.

        Verifies that the Grobid service factory is invoked with the
        configured URL, action and ``start_service`` flag, and that the TEI
        XML returned by the service is what gets written to the output file.
        """
        with patch_conversion_pipeline() as patched:
            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            # Input comes from a file list; no pdf_path is given.
            args.pdf_path = None
            args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX
            # Both models disabled, Grobid enabled.
            args.crf_model = None
            args.cv_model_export_dir = None
            args.use_grobid = True
            args.grobid_url = 'http://test/api'

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [PDF_FILE_1]
                )
                patched['read_all_from_path'].return_value = PDF_CONTENT_1
                # The faked service maps any input to (file, TEI XML).
                patched['grobid_service'].return_value = (
                    lambda x: (PDF_FILE_1, TEI_XML_CONTENT_1)
                )
                configure_pipeline(pipeline, args)

            grobid_service_factory = patched['grobid_service']
            grobid_service_factory.assert_called_with(
                args.grobid_url,
                args.grobid_action,
                start_service=args.start_grobid_service,
            )

            save_file_content = patched['save_file_content']
            save_file_content.assert_called_with(
                OUTPUT_XML_FILE_1, TEI_XML_CONTENT_1
            )
示例#2
0
    def test_should_use_grobid_if_enabled(self):
        """The Grobid XML enhancer post-processes the extracted XML.

        Verifies that ``GrobidXmlEnhancer`` is constructed with the configured
        URL and ``start_service`` flag, that the enhancer instance is called
        with the extracted XML, and that the enhancer's result is saved.
        """
        with patch_conversion_pipeline() as patched:
            enhancer_cls = patched['GrobidXmlEnhancer']
            enhancer = enhancer_cls.return_value

            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = None
            args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX
            args.use_grobid = True
            args.grobid_url = 'http://test/api'

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [PDF_FILE_1]
                )
                patched['read_all_from_path'].return_value = PDF_CONTENT_1
                patched['convert_pdf_bytes_to_lxml'].return_value = (
                    LXML_CONTENT_1
                )
                configure_pipeline(pipeline, args)

            enhancer_cls.assert_called_with(
                args.grobid_url, start_service=args.start_grobid_service
            )

            # The enhancer must receive the XML produced by the extractor.
            extract_to_xml = patched[
                'extract_annotated_structured_document_to_xml'
            ]
            enhancer.assert_called_with(extract_to_xml.return_value)

            patched['save_file_content'].assert_called_with(
                OUTPUT_XML_FILE_1, enhancer.return_value
            )
示例#3
0
    def test_should_pass_around_values_with_default_pipeline(self):
        """Each default-pipeline step receives the previous step's result.

        Checks the chain: PDF bytes -> structured document -> CRF annotation
        -> XML extraction -> saved output file.
        """
        with patch_conversion_pipeline() as patched:
            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = None
            args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [PDF_FILE_1]
                )
                patched['read_all_from_path'].return_value = PDF_CONTENT_1
                configure_pipeline(pipeline, args)

            # Name the mocked steps once so the chain below reads top-down.
            convert = patched['convert_pdf_bytes_to_structured_document']
            annotate = patched['predict_and_annotate_structured_document']
            extract = patched['extract_annotated_structured_document_to_xml']
            crf_model = patched['load_crf_model'].return_value

            convert.assert_called_with(
                PDF_CONTENT_1, page_range=None, path=PDF_FILE_1
            )
            annotate.assert_called_with(convert.return_value, crf_model)
            extract.assert_called_with(
                annotate.return_value, tag_scope=CRF_TAG_SCOPE
            )
            patched['save_file_content'].assert_called_with(
                OUTPUT_XML_FILE_1, extract.return_value
            )
示例#4
0
    def test_should_pass_pdf_pattern_to_find_files_and_read_pdf_file(self):
        """``pdf_path`` is joined to the base path and passed to FindFiles.

        Also checks that the file yielded by ``FindFiles`` is the one read.
        """
        with patch_conversion_pipeline() as patched:
            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = PDF_PATH
            args.pdf_file_list = None

            find_files = patched['FindFiles']
            with TestPipeline() as pipeline:
                find_files.return_value = beam.Create([PDF_FILE_1])
                configure_pipeline(pipeline, args)

            expected_pattern = BASE_DATA_PATH + '/' + PDF_PATH
            find_files.assert_called_with(expected_pattern)
            patched['read_all_from_path'].assert_called_with(PDF_FILE_1)
示例#5
0
    def test_should_pass_pdf_file_list_and_limit_to_read_file_list_and_read_pdf_file(
            self):
        """The file list path and limit are forwarded to ReadFileList.

        Also checks that ``ReadFileList`` is asked for the ``pdf_url`` column
        and that the file it yields is the one read.
        """
        with patch_conversion_pipeline() as patched:
            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = None
            args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.limit = 100

            read_file_list = patched['ReadFileList']
            with TestPipeline() as pipeline:
                read_file_list.return_value = beam.Create([PDF_FILE_1])
                configure_pipeline(pipeline, args)

            read_file_list.assert_called_with(
                args.pdf_file_list, column='pdf_url', limit=args.limit
            )
            patched['read_all_from_path'].assert_called_with(PDF_FILE_1)
示例#6
0
    def test_should_save_annotated_lxml_if_enabled(self):
        """With ``save_annot_lxml`` set, the annotated document is saved.

        The expected target is the output path plus the relative PDF name
        (without extension) plus the CRF-annotated LXML extension.
        """
        with patch_conversion_pipeline() as patched:
            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = None
            args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX
            args.save_annot_lxml = True

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [PDF_FILE_1]
                )
                configure_pipeline(pipeline, args)

            expected_lxml_path = (
                OUTPUT_PATH + '/' + REL_PDF_FILE_WITHOUT_EXT_1 +
                OutputExt.CRF_ANNOT_LXML
            )
            annotated_document = patched[
                'predict_and_annotate_structured_document'
            ].return_value
            patched['save_structured_document'].assert_called_with(
                expected_lxml_path, annotated_document
            )
示例#7
0
    def test_should_use_cv_model_only_if_enabled(self):
        """Only the CV model annotates when the CRF model is disabled.

        Checks that the inference model wrapper receives the fake page
        images for pages 1 and 2, that its output and colour map are used to
        annotate the structured document with the CV tag scope, that XML is
        extracted from that annotation, and that the CRF step never runs.
        """
        with patch_conversion_pipeline() as patched:
            cv_model = patched['InferenceModelWrapper'].return_value

            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = None
            args.pdf_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX
            args.crf_model = None
            args.cv_model_export_dir = CV_MODEL_EXPORT_DIR

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [PDF_FILE_1]
                )
                patched['read_all_from_path'].return_value = PDF_CONTENT_1
                _setup_mocks_for_pages(patched, [1, 2])
                configure_pipeline(pipeline, args)

            convert = patched['convert_pdf_bytes_to_structured_document']
            cv_annotate = patched[
                'annotate_structured_document_using_predicted_image_data'
            ]
            extract = patched['extract_annotated_structured_document_to_xml']

            convert.assert_called_with(
                PDF_CONTENT_1, page_range=None, path=PDF_FILE_1
            )

            # The CV model is fed the fake page images for pages 1 and 2.
            expected_page_images = [
                fake_pdf_png_page(page_no) for page_no in (1, 2)
            ]
            cv_model.assert_called_with(expected_page_images)
            cv_annotate.assert_called_with(
                convert.return_value,
                cv_model.return_value,
                cv_model.get_color_map.return_value,
                tag_scope=CV_TAG_SCOPE,
            )
            extract.assert_called_with(
                cv_annotate.return_value, tag_scope=CV_TAG_SCOPE
            )

            # The CRF model must not have been called.
            crf_annotate = patched['predict_and_annotate_structured_document']
            crf_annotate.assert_not_called()
示例#8
0
    def test_should_use_lxml_file_list_if_provided_and_load_structured_documents(
            self):
        """An LXML file list feeds loaded documents straight to extraction.

        Checks that XML is extracted from the loaded structured document with
        ``tag_scope=None`` and that the extracted XML is saved.
        """
        with patch_conversion_pipeline() as patched:
            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            # No PDF input at all; only the LXML file list is set.
            args.pdf_path = None
            args.pdf_file_list = None
            args.lxml_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [LXML_FILE_1]
                )
                configure_pipeline(pipeline, args)

            loaded_document = patched['load_structured_document'].return_value
            extract = patched['extract_annotated_structured_document_to_xml']

            extract.assert_called_with(loaded_document, tag_scope=None)
            patched['save_file_content'].assert_called_with(
                OUTPUT_XML_FILE_1, extract.return_value
            )
示例#9
0
    def test_should_use_grobid_with_lxml_file_list_if_enabled(self):
        """With an LXML file list and Grobid on, the enhanced XML is saved.

        Checks that XML is extracted from the loaded structured document with
        ``tag_scope=None`` and that the value written to the output file is
        the Grobid XML enhancer's return value.
        """
        with patch_conversion_pipeline() as patched:
            enhancer = patched['GrobidXmlEnhancer'].return_value

            args = get_default_args()
            args.base_data_path = BASE_DATA_PATH
            args.pdf_path = None
            args.pdf_file_list = None
            args.lxml_file_list = BASE_DATA_PATH + '/file-list.tsv'
            args.output_path = OUTPUT_PATH
            args.output_suffix = OUTPUT_SUFFIX
            # Both models disabled, Grobid enabled.
            args.crf_model = None
            args.cv_model_export_dir = None
            args.use_grobid = True
            args.grobid_url = 'http://test/api'

            with TestPipeline() as pipeline:
                patched['ReadFileList'].return_value = beam.Create(
                    [LXML_FILE_1]
                )
                configure_pipeline(pipeline, args)

            loaded_document = patched['load_structured_document'].return_value
            extract = patched['extract_annotated_structured_document_to_xml']

            extract.assert_called_with(loaded_document, tag_scope=None)
            patched['save_file_content'].assert_called_with(
                OUTPUT_XML_FILE_1, enhancer.return_value
            )