# Example #1
    def test_should_pass_around_values_with_single_step(
            self, pipeline, app_config, file_list_args, mocks):
        """A single pipeline step should receive the loaded file props and
        have its returned content saved via save_file_content."""
        args = file_list_args
        pdf_step = _pdf_step(response={'content': XML_CONTENT_1})
        pipeline.get_steps.return_value = [pdf_step]

        with TestPipeline() as p:
            mocks['get_remaining_file_list_for_args'].return_value = [
                PDF_FILE_1
            ]
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            configure_pipeline(p, args, pipeline, app_config)
            # exactly one item should have been processed by the step
            result = p.run()
            assert get_counter_value(
                result, get_step_processed_counter(pdf_step)) == 1

        # the step sees the raw pdf content plus the file metadata
        expected_step_input = {
            'content': PDF_CONTENT_1,
            'source_filename': PDF_FILE_1,
            'filename': PDF_FILE_1,
            'type': MimeTypes.PDF
        }
        pdf_step.assert_called_with(expected_step_input)
        mocks['save_file_content'].assert_called_with(
            OUTPUT_XML_FILE_1, XML_CONTENT_1)
# Example #2
 def test_should_increase_error_metric_counter_if_exception_was_raised(
         self):
     """The error counter should read 1 after MapOrLog swallows the
     exception raised by the mapped function."""
     with TestPipeline() as p:
         source = p | beam.Create([SOME_VALUE_1])
         _ = source | MapOrLog(  # noqa: F841
             FN_RAISING_EXCEPTION, error_count=ERROR_COUNT_METRIC_NAME)
         assert get_counter_value(p.run(), ERROR_COUNT_METRIC_NAME) == 1
# Example #3
 def test_should_increase_count_per_item_using_function(self):
     """The counter should accumulate len(item) for every element that
     passes through TransformAndCount."""
     with TestPipeline() as p:
         source = p | beam.Create([SOME_VALUE_1, SOME_VALUE_2])
         _ = source | TransformAndCount(  # noqa: F841
             beam.Map(lambda x: x), COUNT_METRIC_NAME_1, len)
         expected_total = len(SOME_VALUE_1) + len(SOME_VALUE_2)
         assert get_counter_value(
             p.run(), COUNT_METRIC_NAME_1) == expected_total
    def test_should_not_write_tfrecord_below_annotation_threshold_and_count_pages(
            self):
        """Pages whose un-annotated (None) percentage exceeds the threshold
        should be excluded from the tfrecord output and counted as filtered."""
        # page 1: mostly annotated -> kept; page 2: mostly unannotated -> dropped
        page_evaluations = [
            {'percentage': {None: 0.1}},
            {'percentage': {None: 0.9}}
        ]
        with patch_preprocessing_pipeline(
                evaluate_document_by_page=lambda _: page_evaluations) as mocks:
            opt = get_default_args()
            opt.save_tfrecords = True
            opt.min_annotation_percentage = 0.5
            with TestPipeline() as p:
                find_pairs_mock = mocks[
                    'find_file_pairs_grouped_by_parent_directory_or_name']
                find_pairs_mock.return_value = [(PDF_FILE_1, XML_FILE_1)]
                _setup_mocks_for_pages(mocks, [1, 2])
                configure_pipeline(p, opt)

                p_result = p.run()
                assert get_counter_value(
                    p_result, MetricCounters.FILE_PAIR) == 1
                assert get_counter_value(p_result, MetricCounters.PAGE) == 2
                # exactly one page fell below the annotation threshold
                assert get_counter_value(
                    p_result, MetricCounters.FILTERED_PAGE) == 1

            # only page 1 should make it into the tfrecord output
            mocks['tfrecords'].assert_called_with(
                opt.output_path + '/data',
                [_expected_tfrecord_props(PDF_FILE_1, page_no=1)])
    def test_should_write_multiple_tfrecords_and_count_pages(self):
        """With no annotation filtering, both pages of a file pair should be
        written to the tfrecord output and counted."""
        with patch_preprocessing_pipeline() as mocks:
            opt = get_default_args()
            opt.save_tfrecords = True
            with TestPipeline() as p:
                find_pairs_mock = mocks[
                    'find_file_pairs_grouped_by_parent_directory_or_name']
                find_pairs_mock.return_value = [(PDF_FILE_1, XML_FILE_1)]
                _setup_mocks_for_pages(mocks, [1, 2])
                configure_pipeline(p, opt)

                p_result = p.run()
                assert get_counter_value(
                    p_result, MetricCounters.FILE_PAIR) == 1
                assert get_counter_value(p_result, MetricCounters.PAGE) == 2
                # nothing was filtered, so the counter was never incremented
                assert get_counter_value(
                    p_result, MetricCounters.FILTERED_PAGE) is None

            expected_records = [
                _expected_tfrecord_props(PDF_FILE_1, page_no=page)
                for page in (1, 2)
            ]
            mocks['tfrecords'].assert_called_with(
                opt.output_path + '/data', expected_records)
    def test_should_skip_step_if_data_type_doesnt_match_and_increase_ignored_count(
            self, pipeline, app_config, file_list_args, mocks):
        """A step whose supported_types do not include the item's type should
        be skipped and its 'ignored' counter incremented."""
        args = file_list_args
        # the step only supports 'other', while the incoming item is a pdf
        mismatched_step = _convert_step(name='step1', supported_types={'other'})
        pipeline.get_steps.return_value = [mismatched_step]

        with TestPipeline() as p:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            configure_pipeline(p, args, pipeline, app_config)
            result = p.run()
            assert get_counter_value(
                result, get_step_ignored_counter(mismatched_step)) == 1
    def test_should_skip_item_causing_exception_and_increase_error_count(
            self, pipeline, app_config, file_list_args, mocks):
        """An item that makes a step raise should be dropped, the step's
        error counter incremented, and no output written."""
        args = file_list_args
        failing_step = _pdf_step()
        failing_step.side_effect = RuntimeError('doh1')
        pipeline.get_steps.return_value = [failing_step]

        with TestPipeline() as p:
            mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
            mocks['read_all_from_path'].return_value = PDF_CONTENT_1
            configure_pipeline(p, args, pipeline, app_config)
            result = p.run()
            assert get_counter_value(
                result, get_step_error_counter(failing_step)) == 1

        # the failing item must never reach the output stage
        mocks['save_file_content'].assert_not_called()