def test_should_pass_around_values_with_single_step(
        self, pipeline, app_config, file_list_args, mocks):
    """Run a single PDF step and verify its input, counter and saved output.

    The step is configured to respond with ``XML_CONTENT_1``. After the
    pipeline ran, the step's processed counter must be 1, the step must
    have received the PDF item, and the step's response content must have
    been passed to ``save_file_content`` for ``OUTPUT_XML_FILE_1``.
    """
    pdf_step = _pdf_step(response={'content': XML_CONTENT_1})
    pipeline.get_steps.return_value = [pdf_step]

    with TestPipeline() as p:
        mocks['read_all_from_path'].return_value = PDF_CONTENT_1
        mocks['get_remaining_file_list_for_args'].return_value = [PDF_FILE_1]
        configure_pipeline(p, file_list_args, pipeline, app_config)

        pipeline_result = p.run()
        processed_count = get_counter_value(
            pipeline_result, get_step_processed_counter(pdf_step))
        assert processed_count == 1

    # The item handed to the step carries the file content plus its metadata.
    expected_step_input = {
        'filename': PDF_FILE_1,
        'source_filename': PDF_FILE_1,
        'content': PDF_CONTENT_1,
        'type': MimeTypes.PDF
    }
    pdf_step.assert_called_with(expected_step_input)
    mocks['save_file_content'].assert_called_with(
        OUTPUT_XML_FILE_1, XML_CONTENT_1)
def test_should_increase_error_metric_counter_if_exception_was_raised(
        self):
    """Check that ``MapOrLog`` reports one error for one failing element.

    A single value is fed through ``MapOrLog`` using
    ``FN_RAISING_EXCEPTION`` (defined elsewhere; presumably always raises),
    and the counter named ``ERROR_COUNT_METRIC_NAME`` is expected to be 1.
    """
    with TestPipeline() as p:
        source_values = p | beam.Create([SOME_VALUE_1])
        _ = source_values | MapOrLog(  # noqa: F841
            FN_RAISING_EXCEPTION, error_count=ERROR_COUNT_METRIC_NAME)

        pipeline_result = p.run()
        error_count = get_counter_value(
            pipeline_result, ERROR_COUNT_METRIC_NAME)
        assert error_count == 1
def test_should_increase_count_per_item_using_function(self):
    """Check that ``TransformAndCount`` adds ``len(item)`` for every item.

    Two values pass through an identity ``beam.Map`` wrapped in
    ``TransformAndCount`` with ``len`` as the counting function; the
    counter must equal the sum of both values' lengths.
    """
    input_values = [SOME_VALUE_1, SOME_VALUE_2]

    with TestPipeline() as p:
        source_values = p | beam.Create(input_values)
        _ = source_values | TransformAndCount(  # noqa: F841
            beam.Map(lambda x: x), COUNT_METRIC_NAME_1, len)

        actual_count = get_counter_value(p.run(), COUNT_METRIC_NAME_1)
        assert actual_count == sum(map(len, input_values))
def test_should_not_write_tfrecord_below_annotation_threshold_and_count_pages(
        self):
    """Check that only the sufficiently annotated page is written out.

    The document evaluation is replaced by a stub returning two pages with
    a ``None`` (no annotation) share of 0.1 and 0.9. With
    ``min_annotation_percentage`` set to 0.5 the test expects one file
    pair, two pages, one filtered page, and a tfrecords call containing
    page 1 only.
    """
    custom_mocks = {
        'evaluate_document_by_page': lambda _: [
            # low percentage of None (no annotation, include)
            {'percentage': {None: 0.1}},
            # high percentage of None (no annotation, exclude)
            {'percentage': {None: 0.9}}
        ]
    }

    with patch_preprocessing_pipeline(**custom_mocks) as mocks:
        opt = get_default_args()
        opt.save_tfrecords = True
        opt.min_annotation_percentage = 0.5

        with TestPipeline() as p:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            find_file_pairs.return_value = [(PDF_FILE_1, XML_FILE_1)]
            _setup_mocks_for_pages(mocks, [1, 2])
            configure_pipeline(p, opt)

            p_result = p.run()
            assert get_counter_value(
                p_result, MetricCounters.FILE_PAIR) == 1
            assert get_counter_value(p_result, MetricCounters.PAGE) == 2
            assert get_counter_value(
                p_result, MetricCounters.FILTERED_PAGE) == 1

        # Only the first page is expected in the written records.
        expected_records = [_expected_tfrecord_props(PDF_FILE_1, page_no=1)]
        mocks['tfrecords'].assert_called_with(
            opt.output_path + '/data', expected_records)
def test_should_write_multiple_tfrecords_and_count_pages(self):
    """Check that both pages of one file pair are written as tfrecords.

    With ``save_tfrecords`` enabled and mocks set up for pages 1 and 2,
    the test expects one file pair, two pages, no filtered-page counter
    value (``None``), and a tfrecords call containing both pages.
    """
    page_numbers = [1, 2]

    with patch_preprocessing_pipeline() as mocks:
        opt = get_default_args()
        opt.save_tfrecords = True

        with TestPipeline() as p:
            find_file_pairs = mocks[
                'find_file_pairs_grouped_by_parent_directory_or_name']
            find_file_pairs.return_value = [(PDF_FILE_1, XML_FILE_1)]
            _setup_mocks_for_pages(mocks, [1, 2])
            configure_pipeline(p, opt)

            p_result = p.run()
            assert get_counter_value(
                p_result, MetricCounters.FILE_PAIR) == 1
            assert get_counter_value(p_result, MetricCounters.PAGE) == 2
            assert get_counter_value(
                p_result, MetricCounters.FILTERED_PAGE) is None

        expected_records = [
            _expected_tfrecord_props(PDF_FILE_1, page_no=page_no)
            for page_no in page_numbers
        ]
        mocks['tfrecords'].assert_called_with(
            opt.output_path + '/data', expected_records)
def test_should_skip_step_if_data_type_doesnt_match_and_increase_ignored_count(
        self, pipeline, app_config, file_list_args, mocks):
    """Check that a step with non-matching supported types is counted as ignored.

    The only step declares ``{'other'}`` as its supported types while a
    single PDF file is fed in; the step's ignored counter must be 1.
    """
    unsupported_step = _convert_step(
        name='step1', supported_types={'other'})
    pipeline.get_steps.return_value = [unsupported_step]

    with TestPipeline() as p:
        mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
        mocks['read_all_from_path'].return_value = PDF_CONTENT_1
        configure_pipeline(p, file_list_args, pipeline, app_config)

        pipeline_result = p.run()
        ignored_count = get_counter_value(
            pipeline_result, get_step_ignored_counter(unsupported_step))
        assert ignored_count == 1
def test_should_skip_item_causing_exception_and_increase_error_count(
        self, pipeline, app_config, file_list_args, mocks):
    """Check that a raising step bumps the error counter and saves nothing.

    The only step raises ``RuntimeError('doh1')`` when called; the step's
    error counter must be 1 and ``save_file_content`` must never be called.
    """
    failing_step = _pdf_step()
    failing_step.side_effect = RuntimeError('doh1')
    pipeline.get_steps.return_value = [failing_step]

    with TestPipeline() as p:
        mocks['ReadFileList'].return_value = beam.Create([PDF_FILE_1])
        mocks['read_all_from_path'].return_value = PDF_CONTENT_1
        configure_pipeline(p, file_list_args, pipeline, app_config)

        pipeline_result = p.run()
        error_count = get_counter_value(
            pipeline_result, get_step_error_counter(failing_step))
        assert error_count == 1

    # The failed item must not reach the output stage.
    mocks['save_file_content'].assert_not_called()