def test_should_write_multiple_tfrecords_from_pdf_xml_file_list(self):
        """Each row of a PDF/XML file list should get its own tfrecords call.

        ``ReadDictCsv`` is mocked to produce two source/xml url rows. After
        configuring the pipeline the test expects ``ReadDictCsv`` to have been
        called with the file list path and no limit, and the ``tfrecords``
        mock to have been called exactly twice: once with the expected
        props of each PDF.
        """
        file_list_rows = [
            {'source_url': PDF_FILE_1, 'xml_url': XML_FILE_1},
            {'source_url': PDF_FILE_2, 'xml_url': XML_FILE_2},
        ]
        with patch_preprocessing_pipeline() as mocks:
            opt = get_default_args()
            # Use the file list as the only input source.
            opt.pdf_path = None
            opt.xml_path = None
            opt.pdf_xml_file_list = '.temp/file-list.tsv'
            opt.save_tfrecords = True
            read_dict_csv = mocks['ReadDictCsv']
            with TestPipeline() as p:
                read_dict_csv.return_value = beam.Create(file_list_rows)
                _setup_mocks_for_pages(mocks, [1], file_count=2)
                configure_pipeline(p, opt)

            read_dict_csv.assert_called_with(
                opt.pdf_xml_file_list, limit=None)

            tfrecords = mocks['tfrecords']
            expected_output = opt.output_path + '/data'
            for pdf_file in (PDF_FILE_1, PDF_FILE_2):
                tfrecords.assert_any_call(
                    expected_output, [_expected_tfrecord_props(pdf_file)])
            assert tfrecords.call_count == 2
    def test_should_pass_pdf_and_xml_patterns_to_find_file_pairs_grouped_by_parent_directory(
            self):
        """The pdf and xml paths should be passed on joined to the base path.

        With ``base_data_path='base'``, ``pdf_path='pdf'`` and
        ``xml_path='xml'`` the file pair finder mock is expected to be called
        with ``['base/pdf', 'base/xml']``.
        """
        with patch_preprocessing_pipeline() as mocks:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            opt = get_default_args()
            opt.base_data_path = 'base'
            opt.pdf_path = 'pdf'
            opt.xml_path = 'xml'
            with TestPipeline() as p:
                # No file pairs are needed; only the call arguments matter.
                find_file_pairs.return_value = []
                configure_pipeline(p, opt)

            find_file_pairs.assert_called_with(['base/pdf', 'base/xml'])
    def test_should_write_tfrecords_from_pdf_xml_path(self):
        """A single PDF/XML pair with one page should yield one tfrecord entry.

        The file pair finder mock returns one pair; the ``tfrecords`` mock is
        expected to be called with the ``/data`` output path and the expected
        props for ``PDF_FILE_1``.
        """
        with patch_preprocessing_pipeline() as mocks:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            opt = get_default_args()
            opt.save_tfrecords = True
            with TestPipeline() as p:
                find_file_pairs.return_value = [(PDF_FILE_1, XML_FILE_1)]
                _setup_mocks_for_pages(mocks, [1])
                configure_pipeline(p, opt)

            expected_output = opt.output_path + '/data'
            expected_props = [_expected_tfrecord_props(PDF_FILE_1)]
            mocks['tfrecords'].assert_called_with(
                expected_output, expected_props)
    def test_should_pass_limit_to_find_file_pairs_grouped_by_parent_directory_or_name(
            self):
        """Configure the pipeline with ``limit=1`` and two available file pairs.

        The file pair finder mock returns two pairs; the test asserts that
        the ``tfrecords`` mock was called exactly once.
        """
        available_pairs = [
            (PDF_FILE_1, XML_FILE_1),
            (PDF_FILE_2, XML_FILE_2),
        ]
        with patch_preprocessing_pipeline() as mocks:
            opt = get_default_args()
            opt.base_data_path = 'base'
            opt.pdf_path = 'pdf'
            opt.lxml_path = ''
            opt.xml_path = 'xml'
            opt.save_tfrecords = True
            opt.limit = 1
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            with TestPipeline() as p:
                find_file_pairs.return_value = available_pairs
                configure_pipeline(p, opt)

            tfrecords = mocks['tfrecords']
            assert tfrecords.call_count == 1
    def test_should_only_process_selected_pages(self):
        """The configured page range should reach both PDF converter mocks.

        With ``opt.pages = (1, 3)`` the ``convert_pdf_bytes_to_lxml`` and
        ``pdf_bytes_to_png_pages`` mocks are each expected to have been
        called, with ``page_range`` equal to ``opt.pages`` among the keyword
        arguments of their most recent call.
        """
        with patch_preprocessing_pipeline() as mocks:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            opt = get_default_args()
            opt.save_tfrecords = True
            opt.save_png = True
            opt.pages = (1, 3)
            with TestPipeline() as p:
                find_file_pairs.return_value = [(PDF_FILE_1, XML_FILE_1)]
                _setup_mocks_for_pages(mocks, [1, 2])
                configure_pipeline(p, opt)

            # Same checks, in the same order, for both converters.
            converter_names = (
                'convert_pdf_bytes_to_lxml',
                'pdf_bytes_to_png_pages',
            )
            for converter_name in converter_names:
                converter = mocks[converter_name]
                assert converter.called
                # call_args[1] holds the keyword arguments of the last call.
                last_call_kwargs = converter.call_args[1]
                assert last_call_kwargs.get('page_range') == opt.pages
    def test_should_pass_limit_to_read_dict_csv(self):
        """The configured limit should be forwarded to ``ReadDictCsv``.

        Using a file list as input with ``limit=1``, the ``ReadDictCsv`` mock
        is expected to be called with the file list path and
        ``limit=opt.limit``, and the ``tfrecords`` mock exactly once.
        """
        file_list_rows = [
            {'source_url': PDF_FILE_1, 'xml_url': XML_FILE_1},
        ]
        with patch_preprocessing_pipeline() as mocks:
            opt = get_default_args()
            # Use the file list as the only input source.
            opt.pdf_path = None
            opt.xml_path = None
            opt.pdf_xml_file_list = '.temp/file-list.tsv'
            opt.limit = 1
            opt.save_tfrecords = True
            read_dict_csv = mocks['ReadDictCsv']
            with TestPipeline() as p:
                read_dict_csv.return_value = beam.Create(file_list_rows)
                _setup_mocks_for_pages(mocks, [1])
                configure_pipeline(p, opt)

            read_dict_csv.assert_called_with(
                opt.pdf_xml_file_list, limit=opt.limit)
            tfrecords = mocks['tfrecords']
            assert tfrecords.call_count == 1
    def test_should_not_write_tfrecord_below_annotation_threshold_and_count_pages(
            self):
        """Pages failing the annotation threshold should be filtered and counted.

        ``evaluate_document_by_page`` is replaced by a stub reporting two
        pages with different ``None`` percentages. With
        ``min_annotation_percentage = 0.5`` the test expects one file pair,
        two pages and one filtered page in the counters, and only page 1 in
        the ``tfrecords`` call.
        """
        def _evaluate_document_by_page(_):
            # Build a fresh result list on every call.
            return [
                # low percentage of None (no annotation): page is included
                {'percentage': {None: 0.1}},
                # high percentage of None (no annotation): page is excluded
                {'percentage': {None: 0.9}},
            ]

        with patch_preprocessing_pipeline(
                evaluate_document_by_page=_evaluate_document_by_page) as mocks:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            opt = get_default_args()
            opt.save_tfrecords = True
            opt.min_annotation_percentage = 0.5
            with TestPipeline() as p:
                find_file_pairs.return_value = [(PDF_FILE_1, XML_FILE_1)]
                _setup_mocks_for_pages(mocks, [1, 2])
                configure_pipeline(p, opt)

                # Run explicitly to get hold of the result with the counters.
                p_result = p.run()
                expected_counters = (
                    (MetricCounters.FILE_PAIR, 1),
                    (MetricCounters.PAGE, 2),
                    (MetricCounters.FILTERED_PAGE, 1),
                )
                for counter, expected_value in expected_counters:
                    assert get_counter_value(
                        p_result, counter) == expected_value

            expected_output = opt.output_path + '/data'
            mocks['tfrecords'].assert_called_with(
                expected_output,
                [_expected_tfrecord_props(PDF_FILE_1, page_no=1)])
    def test_should_write_multiple_tfrecords_and_count_pages(self):
        """Both pages of a single file pair should be written and counted.

        After running the pipeline the counters are expected to report one
        file pair, two pages and no filtered page value (``None``); the
        ``tfrecords`` mock is expected to be called with the props of
        pages 1 and 2.
        """
        page_numbers = [1, 2]
        with patch_preprocessing_pipeline() as mocks:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            opt = get_default_args()
            opt.save_tfrecords = True
            with TestPipeline() as p:
                find_file_pairs.return_value = [(PDF_FILE_1, XML_FILE_1)]
                _setup_mocks_for_pages(mocks, [1, 2])
                configure_pipeline(p, opt)

                # Run explicitly to get hold of the result with the counters.
                p_result = p.run()
                file_pair_count = get_counter_value(
                    p_result, MetricCounters.FILE_PAIR)
                assert file_pair_count == 1
                page_count = get_counter_value(
                    p_result, MetricCounters.PAGE)
                assert page_count == 2
                filtered_page_count = get_counter_value(
                    p_result, MetricCounters.FILTERED_PAGE)
                assert filtered_page_count is None

            expected_props = [
                _expected_tfrecord_props(PDF_FILE_1, page_no=page_no)
                for page_no in page_numbers
            ]
            mocks['tfrecords'].assert_called_with(
                opt.output_path + '/data', expected_props)