Example #1
    def test_pipeline_ignore_headerless_columns_false(self):

        filepath = os.path.join(self.data_dir, 'headerless_columns.csv')
        validator = Pipeline(filepath, processors=('structure',))
        result, report = validator.run()

        self.assertFalse(result)
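The snippet above constructs a Pipeline over a CSV file, restricts it to the 'structure' processor, and unpacks run() into a (result, report) pair. A minimal standalone sketch of the same call outside a test class follows; the import path and the file location are assumptions, since the excerpts never show them.

    import os

    # Assumed import path; the test excerpts above do not show it.
    from tabular_validator.pipeline import Pipeline

    # Hypothetical CSV whose trailing columns have no header.
    filepath = os.path.join('data', 'headerless_columns.csv')

    # Run only the structure processor, as in the test above.
    validator = Pipeline(filepath, processors=('structure',))
    result, report = validator.run()

    # result is falsy when the structure checks fail.
    print(result)
    print(report.generate()['results'])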
Example #2
    def test_register_processor_append(self):

        pipeline = Pipeline(self.data_string)
        self.assertEqual(len(pipeline.pipeline), 1)

        pipeline.register_processor('schema')
        self.assertEqual(len(pipeline.pipeline), 2)
Example #3
    def test_pipeline_infer_schema(self):
        filepath = os.path.join(self.data_dir, 'valid.csv')
        options = {'schema': {'infer_schema': True}}
        validator = Pipeline(filepath, processors=('schema',), options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 0)
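Example #3 shows the options convention: a dict keyed by processor name, whose value is that processor's own settings. A hedged sketch of the same pattern, again assuming the import path and a local valid.csv:

    import os

    from tabular_validator.pipeline import Pipeline  # assumed import path

    # Hypothetical well-formed data file.
    filepath = os.path.join('data', 'valid.csv')

    # Per-processor options are nested under the processor name, here 'schema'.
    options = {'schema': {'infer_schema': True}}

    validator = Pipeline(filepath, processors=('schema',), options=options)
    result, report = validator.run()

    # With an inferred schema and valid data the report carries no results.
    print(len(report.generate()['results']))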
Example #4
    def test_multilingual_xlsx(self):

        data = os.path.join(self.data_dir, 'jungle', 'multilingual.xlsx')
        pipeline = Pipeline(data, format='excel')
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #5
    def test_messytables_source_six(self):

        data = os.path.join(self.data_dir, 'jungle', 'messytables-excel_properties.xls')
        pipeline = Pipeline(data, format='excel')
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #6
    def test_messytables_source_five(self):

        data = 'https://raw.githubusercontent.com/okfn/messytables/master/horror/characters.csv'
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #8
    def test_pipeline_ignore_defective_rows_false(self):

        filepath = os.path.join(self.data_dir, 'defective_rows.csv')
        validator = Pipeline(filepath, processors=('structure',))
        result, report = validator.run()

        self.assertFalse(result)
Example #9
    def test_rm_workspace(self):

        pipeline = Pipeline(self.data_string, dry_run=False)
        self.assertTrue(pipeline.workspace)
        pipeline.rm_workspace()

        self.assertFalse(os.path.exists(pipeline.workspace))
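Example #9 (together with Example #39 below) suggests that constructing a Pipeline with dry_run=False materialises a workspace directory, which can hold generated files and is removed with rm_workspace(). A sketch of that lifecycle, with the import path assumed and self.data_string replaced by an inline CSV string:

    import os

    from tabular_validator.pipeline import Pipeline  # assumed import path

    # Inline CSV data; the tests pass self.data_string here.
    data_string = 'name,age\nalice,30\nbob,25\n'

    pipeline = Pipeline(data_string, dry_run=False)
    print(pipeline.workspace)                   # path of the workspace directory

    # ... work with the workspace, e.g. via pipeline.create_file(...) ...

    pipeline.rm_workspace()
    print(os.path.exists(pipeline.workspace))   # False once it has been removed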
Example #10
    def test_register_processor_insert(self):

        pipeline = Pipeline(self.data_string)
        self.assertEqual(len(pipeline.pipeline), 1)

        pipeline.register_processor('schema', position=0)
        self.assertEqual(len(pipeline.pipeline), 2)
Example #12
    def test_pipeline_ignore_duplicate_rows_false(self):

        filepath = os.path.join(self.data_dir, 'duplicate_rows.csv')
        validator = Pipeline(filepath, processors=('structure',))
        result, report = validator.run()

        self.assertFalse(result)
Example #13
    def test_messytables_source_two(self):

        data = 'https://raw.githubusercontent.com/okfn/messytables/master/horror/utf-16le_encoded.csv'
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #14
    def test_messytables_source_three(self):

        data = 'https://raw.githubusercontent.com/okfn/messytables/master/horror/sparse_with_column_errors.csv'
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #17
    def test_pipeline_empty_rows_are_not_duplicatable(self):

        filepath = os.path.join(self.data_dir, 'empty_rows_multiple.csv')
        validator = Pipeline(filepath, processors=('structure',), fail_fast=False)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 11)
Example #18
    def test_pipeline_case_insensitive_headers(self):
        filepath = os.path.join(self.data_dir, 'case_insensitive_headers.csv')
        schema = os.path.join(self.data_dir, 'test_schema.json')
        options = {'schema': {'schema': schema, 'case_insensitive_headers': True}}
        validator = Pipeline(filepath, processors=('schema',), options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 0)
Example #19
    def test_gla_source_clean(self):

        data = 'https://raw.githubusercontent.com/rgrp/dataset-gla/master/data/all.csv'
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertTrue(result)
        self.assertTrue(pipeline.data)
Example #20
    def test_pipeline_hmt_bbsrc(self):

        data = os.path.join(self.data_dir, 'hmt', '1011-bbsrc-25k-spend-return.csv')
        encoding = 'ISO-8859-2'
        pipeline = Pipeline(data, encoding=encoding)
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #21
    def test_gla_source_three(self):

        data = os.path.join(self.data_dir, 'jungle', 'gla-250-report-2014-15-P08.csv')
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertFalse(result)
        self.assertTrue(pipeline.data)
Example #22
    def test_pipeline_field_unique(self):
        filepath = os.path.join(self.data_dir, 'unique_field.csv')
        schema = os.path.join(self.data_dir, 'unique_field.json')
        options = {'schema': {'schema': schema}}
        validator = Pipeline(filepath, processors=('schema',), options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 1)
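Examples #18 and #22 (and #28 further down) all pass the path of a schema file through options['schema']['schema']. A sketch of wiring that up with a small schema written to a temporary file; the import path and the schema shape (assumed here to be a minimal JSON Table Schema with a 'fields' list) are both assumptions, since the referenced test_schema.json and unique_field.json are not shown:

    import json
    import os
    import tempfile

    from tabular_validator.pipeline import Pipeline  # assumed import path

    # Minimal, assumed schema shape for illustration only.
    schema = {
        'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
        ]
    }

    tmpdir = tempfile.mkdtemp()
    schema_path = os.path.join(tmpdir, 'schema.json')
    with open(schema_path, 'w') as stream:
        json.dump(schema, stream)

    # Hypothetical data file to validate against the schema above.
    filepath = os.path.join('data', 'unique_field.csv')
    options = {'schema': {'schema': schema_path}}

    validator = Pipeline(filepath, processors=('schema',), options=options)
    result, report = validator.run()
    print(report.generate()['results'])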
Example #23
    def test_header_index_invalid(self):

        filepath = os.path.join(self.data_dir, 'invalid_header_index_1.csv')
        options = {}
        validator = Pipeline(filepath, options=options, header_index=1)
        result, report = validator.run()

        self.assertFalse(result)
Example #24
    def test_gla_source_five(self):

        data = os.path.join(self.data_dir, 'jungle', 'gla-2012-13-P10-250.csv')
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertFalse(result)
        self.assertTrue(pipeline.data)
Example #25
    def test_gla_source_six(self):

        data = os.path.join(self.data_dir, 'jungle', 'gla-december_2009.csv')
        pipeline = Pipeline(data)
        result, report = pipeline.run()

        self.assertFalse(result)
        self.assertTrue(pipeline.data)
Example #28
    def test_pipeline_info_result_for_required_false(self):
        filepath = os.path.join(self.data_dir, 'required_false.csv')
        schema = os.path.join(self.data_dir, 'required_false_schema.json')
        options = {'schema': {'schema': schema, 'result_level': 'info'}}
        validator = Pipeline(filepath, processors=('schema',), options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 1)
Example #31
    def test_pipeline_row_limit_in_range(self):

        filepath = os.path.join(self.data_dir, 'row_limit_structure.csv')
        options = {}
        validator = Pipeline(filepath, processors=('structure',),
                             row_limit=2, options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 0)
Example #32
    def test__report_limit_in_range(self):

        filepath = os.path.join(self.data_dir, 'report_limit_structure.csv')
        options = {}
        validator = Pipeline(filepath, processors=('structure',),
                             report_limit=1, options=options)
        result, report = validator.run()

        self.assertEqual(len([r for r in report.generate()['results'] if r['processor'] == 'structure']), 1)
Example #33
    def test_pipeline_report_stream_none(self):
        filepath = os.path.join(self.data_dir, 'valid.csv')
        report_stream = None
        options = {}
        validator = Pipeline(filepath, processors=('schema',),
                             report_stream=report_stream, options=options)
        result, report = validator.run()

        self.assertTrue(result)
Example #34
    def test_hmt_three(self):
        data = 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/407609/Publishable_December_2014_Spend.csv'
        schema = os.path.join(self.data_dir, 'hmt', 'spend-publishing-schema.json')
        options = {'schema': {'schema': schema}}
        pipeline = Pipeline(data, processors=('structure', 'schema'),
                            options=options)
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #35
    def test_pipeline_custom_empty_strings(self):

        filepath = os.path.join(self.data_dir, 'empty_rows_custom.csv')
        options = {'structure': {'empty_strings': ('-',)}}
        validator = Pipeline(filepath, processors=('structure',),
                             options=options)
        result, report = validator.run()

        self.assertFalse(result)
Example #37
    def test_pipeline_fail_fast_false(self):

        filepath = os.path.join(self.data_dir, 'fail_fast_two_structure_errors.csv')
        options = {}
        validator = Pipeline(filepath, processors=('structure',),
                             options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 2)
Example #38
    def test_pipeline_ignore_duplicate_rows_true(self):

        filepath = os.path.join(self.data_dir, 'duplicate_rows.csv')
        options = {'structure': {'ignore_duplicate_rows': True}}
        validator = Pipeline(filepath, processors=('structure',),
                             options=options)
        result, report = validator.run()

        self.assertTrue(result)
Example #39
    def test_create_file(self):

        filepath = 'example.file'
        headers = ['first', 'second', 'three']
        row = '1,2,3\n'
        pipeline = Pipeline(self.data_string, dry_run=False)
        pipeline.create_file(row, filepath, headers=headers)

        self.assertTrue(os.path.exists(os.path.join(pipeline.workspace, filepath)))
Example #42
    def test_report_summary(self):

        filepath = os.path.join(self.data_dir, 'invalid_header_index_1.csv')
        options = {}
        validator = Pipeline(filepath, options=options, header_index=1)
        result, report = validator.run()
        generated = report.generate()

        self.assertEqual(generated['meta']['bad_row_count'], 1)
        self.assertEqual(generated['meta']['row_count'], 9)
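Examples #32 and #42 expose two parts of the generated report: a 'results' list whose entries carry a 'processor' key, and a 'meta' block with counters such as 'row_count' and 'bad_row_count'. A sketch of inspecting both, under the same assumptions about the import path and the input file:

    import os

    from tabular_validator.pipeline import Pipeline  # assumed import path

    # Hypothetical file whose header sits on the second row.
    filepath = os.path.join('data', 'invalid_header_index_1.csv')

    validator = Pipeline(filepath, header_index=1)
    result, report = validator.run()
    generated = report.generate()

    # Each result records which processor produced it.
    structure_results = [r for r in generated['results']
                         if r['processor'] == 'structure']
    print(len(structure_results))

    # Summary counters live in the report's meta block.
    print(generated['meta']['row_count'], generated['meta']['bad_row_count'])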
Example #43
    def test_report_results_grouped_by_rows(self):

        filepath = os.path.join(self.data_dir, 'fail_fast_two_schema_errors.csv')
        schema = os.path.join(self.data_dir, 'test_schema.json')
        options = {'schema': {'schema': schema}}
        validator = Pipeline(filepath, processors=('schema',), options=options,
                             fail_fast=True, report_type='grouped')
        result, report = validator.run()
        generated = report.generate()
        self.assertEqual(1, len(generated['results']))
Example #44
    def test_hmt_bis_two(self):
        # excel
        data = os.path.join(self.data_dir, 'hmt', 'BIS_monthly_spend_December_2012.xls')
        schema = os.path.join(self.data_dir, 'hmt', 'bis-modified.json')
        options = {'schema': {'schema': schema}}
        pipeline = Pipeline(data, processors=('structure', 'schema'),
                            options=options, format='excel')
        result, report = pipeline.run()

        self.assertTrue(pipeline.data)
Example #46
    def test_pipeline_fail_fast_false(self):

        filepath = os.path.join(self.data_dir, 'fail_fast_two_schema_errors.csv')
        schema = os.path.join(self.data_dir, 'test_schema.json')
        options = {'schema': {'schema': schema}}
        validator = Pipeline(filepath, processors=('schema',),
                             options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 7)
Example #48
    def test_pipeline_fail_fast_false(self):

        filepath = os.path.join(self.data_dir, 'fail_fast_two_schema_errors.csv')
        schema = os.path.join(self.data_dir, 'test_schema.json')
        options = {'schema': {'schema': schema}}
        validator = Pipeline(filepath, processors=('schema',),
                             options=options)
        result, report = validator.run()

        self.assertEqual(len(report.generate()['results']), 5)