def test_warning_tf_multiple_dp_with_update(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })
    print('running dp1')
    profile1 = dp.Profiler(data, profiler_options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })
    print('running dp2')
    profile2 = dp.Profiler(data, profiler_options=profile_options)
    profile1.update_profile(data)
def test_odd_merge_profile_list(self, mock_data_labeler, *mocks):
    """
    Tests merge_profile_list, a top-level function which takes in a list of
    profile objects, merges all the profiles together into one profile, and
    returns the single merged profile as the return value.

    The labeler object is removed prior to merge and added back to the
    single merged profile object.
    """
    self._setup_data_labeler_mock(mock_data_labeler)

    data = pd.DataFrame([1, 2, 3, 4, 5, 60, 1])
    profile_one = dp.Profiler(data[:2])
    profile_two = dp.Profiler(data[2:])
    profile_three = dp.Profiler(data[2:])

    list_of_profiles = [profile_one, profile_two, profile_three]
    single_profile = utils.merge_profile_list(
        list_of_profiles=list_of_profiles)
    single_report = single_profile.report()

    self.assertEqual(1, len(single_report["data_stats"]))
    self.assertEqual(1, single_report["global_stats"]["column_count"])
    self.assertEqual(12, single_report["global_stats"]["row_count"])

    self.assertEqual("int", single_report["data_stats"][0]["data_type"])
    self.assertEqual(
        1, single_report["data_stats"][0]["statistics"]["min"])
    self.assertEqual(
        60.0, single_report["data_stats"][0]["statistics"]["max"])
def test_sample_size_passed_to_profile(self, *mocks):
    update_mock = mocks[0]

    # data setup
    data = pd.DataFrame([0] * int(50e3))

    # option setup
    profiler_options = ProfilerOptions()
    profiler_options.structured_options.multiprocess.is_enabled = False
    profiler_options.set({'data_labeler.is_enabled': False})

    # test data size < min_sample_size = 5000 by default
    profiler = dp.Profiler(data[:1000], profiler_options=profiler_options)
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    self.assertEqual(1000, update_mock.call_args[0][1])

    # test data size * 0.20 < min_sample_size < data size
    profiler = dp.Profiler(data[:10000], profiler_options=profiler_options)
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    self.assertEqual(5000, update_mock.call_args[0][1])

    # test data size * 0.20 > min_sample_size
    profiler = dp.Profiler(data, profiler_options=profiler_options)
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    self.assertEqual(10000, update_mock.call_args[0][1])
def test_null_calculation_with_differently_sampled_cols(self):
    opts = ProfilerOptions()
    opts.structured_options.multiprocess.is_enabled = False
    data = pd.DataFrame({"full": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "sparse": [1, None, 3, None, 5, None, 7, None, 9]})
    profile = dp.Profiler(data, samples_per_update=5, min_true_samples=5,
                          profiler_options=opts)
    # Rows 2, 4, 5, 6, 7 are sampled in the first column,
    # so only those rows should be considered for null calculations.
    # Within that subset, the second column is null only in rows 5 and 7,
    # so only 2 rows count toward row_has_null_count.
    self.assertEqual(0, profile.row_is_null_count)
    self.assertEqual(2, profile.row_has_null_count)
    # Accordingly, make sure the null-row ratios account for the fact that
    # only 5 total rows were sampled (5 in col 1, 9 in col 2)
    self.assertEqual(0, profile._get_row_is_null_ratio())
    self.assertEqual(0.4, profile._get_row_has_null_ratio())

    data2 = pd.DataFrame(
        {"sparse": [1, None, 3, None, 5, None, 7, None],
         "sparser": [1, None, None, None, None, None, None, 8]})
    profile2 = dp.Profiler(data2, samples_per_update=2, min_true_samples=2,
                           profiler_options=opts)
    # Rows are sampled as follows: [6, 5], [1, 4], [2, 3], [0, 7]
    # First column gets min true samples from ids 1, 4, 5, 6
    # Second column gets completely sampled (has a null in 1, 4, 5, 6)
    # Rows 1 and 5 are completely null; rows 4 and 6 are null only in col 2
    self.assertEqual(2, profile2.row_is_null_count)
    self.assertEqual(4, profile2.row_has_null_count)
    # Only 4 total rows sampled, ratio accordingly
    self.assertEqual(0.5, profile2._get_row_is_null_ratio())
    self.assertEqual(1, profile2._get_row_has_null_ratio())
def test_integrated_merge_diff_options(self):
    options = dp.ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})
    data = pd.DataFrame([1, 2, 3, 4])
    profile1 = dp.Profiler(data, profiler_options=options)
    profile2 = dp.Profiler(data)
    with self.assertRaisesRegex(ValueError,
                                'Structured profilers were not setup with '
                                'the same options, hence they do not '
                                'calculate the same profiles and cannot be '
                                'added together.'):
        profile1 + profile2
def test_duplicate_column_names(self, *mocks):
    # validate works first
    valid_data = pd.DataFrame([[1, 2]], columns=['a', 'b'])
    profile = dp.Profiler(valid_data)
    self.assertIn('a', profile._profile)
    self.assertIn('b', profile._profile)

    # data has duplicate column names
    invalid_data = pd.DataFrame([[1, 2]], columns=['a', 'a'])
    with self.assertRaisesRegex(ValueError,
                                '`Profiler` does not currently support '
                                'data which contains columns with duplicate'
                                ' names.'):
        profile = dp.Profiler(invalid_data)
def test_stream_profilers(self, *mocks):
    data = pd.DataFrame([
        ['test1', 1.0],
        ['test2', None],
        ['test1', 1.0],
        [None, None],
        [None, 5.0],
        [None, 5.0],
        [None, None],
        ['test3', 7.0]])

    # check prior to update
    profiler = dp.Profiler(data[:3])
    self.assertEqual(1, profiler.row_has_null_count)
    self.assertEqual(0, profiler.row_is_null_count)
    self.assertEqual(3, profiler.total_samples)
    self.assertEqual(2, len(profiler.hashed_row_dict))

    # check after update
    profiler.update_profile(data[3:])

    self.assertIsNone(profiler.encoding)
    self.assertEqual(
        "<class 'pandas.core.frame.DataFrame'>", profiler.file_type)
    self.assertEqual(5, profiler.row_has_null_count)
    self.assertEqual(2, profiler.row_is_null_count)
    self.assertEqual(8, profiler.total_samples)
    self.assertEqual(5, len(profiler.hashed_row_dict))
def setUpClass(cls):
    test_dict = {
        '1': ['nan', 'null', None, None, ''],
        1: ['nan', 'None', 'null', None, ''],
    }
    test_dataset = pd.DataFrame(data=test_dict)
    cls.trained_schema = dp.Profiler(test_dataset, len(test_dataset))
def test_correct_total_sample_size_and_counts_and_mutability(self):
    data = [['test1', 1.0],
            ['test2', 2.0],
            ['test3', 3.0],
            [None, None],
            ['test5', 5.0],
            ['test6', 6.0],
            [None, None],
            ['test7', 7.0]]
    data = pd.DataFrame(data, columns=['NAME', 'VALUE'])
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})

    col_one_len = len(data['NAME'])
    col_two_len = len(data['VALUE'])

    # Test reloading data, ensuring immutable
    for i in range(2):

        # Profile Once
        data.index = pd.RangeIndex(0, 8)
        profile = dp.Profiler(data, profiler_options=profiler_options,
                              samples_per_update=2)

        # Profile Twice
        data.index = pd.RangeIndex(8, 16)
        profile.update_profile(data)

        # rows sampled are [5, 6], [13, 14] (0 index)
        self.assertEqual(16, profile.total_samples)
        self.assertEqual(4, profile._max_col_samples_used)
        self.assertEqual(2, profile.row_has_null_count)
        self.assertEqual(0.5, profile._get_row_has_null_ratio())
        self.assertEqual(2, profile.row_is_null_count)
        self.assertEqual(0.5, profile._get_row_is_null_ratio())
        self.assertEqual(0.4375, profile._get_unique_row_ratio())
        self.assertEqual(9, profile._get_duplicate_row_count())

        self.assertEqual(col_one_len, len(data['NAME']))
        self.assertEqual(col_two_len, len(data['VALUE']))
def test_save_and_load(self):
    datapth = "dataprofiler/tests/data/"
    test_files = ["csv/guns.csv", "csv/iris.csv"]

    def _clean_report(report):
        data_stats = report["data_stats"]
        for key in data_stats:
            stats = data_stats[key]["statistics"]
            if "histogram" in stats:
                if "bin_counts" in stats["histogram"]:
                    stats["histogram"]["bin_counts"] = \
                        stats["histogram"]["bin_counts"].tolist()
                if "bin_edges" in stats["histogram"]:
                    stats["histogram"]["bin_edges"] = \
                        stats["histogram"]["bin_edges"].tolist()
        return report

    for test_file in test_files:
        # Create Data and Profiler objects
        data = dp.Data(os.path.join(datapth, test_file))
        save_profile = dp.Profiler(data)

        # Save and Load profile with Mock IO
        with mock.patch('builtins.open') as m:
            mock_file = setup_save_mock_open(m)
            save_profile.save()

            mock_file.seek(0)
            load_profile = dp.Profiler.load("mock.pkl")

        # Check that reports are equivalent
        save_report = _clean_report(save_profile.report())
        load_report = _clean_report(load_profile.report())
        self.assertDictEqual(save_report, load_report)
def test_warning_tf_run_dp_multiple_times(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    for i in range(3):
        print('running dp =============================', i)
        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })

        profile = dp.Profiler(data, profiler_options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for col in results['data_stats']:
            columns.append(col)
            predictions.append(results['data_stats'][col]['data_label'])
def _pandas(
    cls,
    execution_engine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)

    first_profile = None
    try:
        first_profile_path = metric_value_kwargs["profile_path"]
        first_profile = dp.Profiler.load(first_profile_path)
    except FileNotFoundError:
        raise ValueError(
            "'profile_path' does not point to a valid DataProfiler stored profile."
        )

    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False
    new_profile = dp.Profiler(df, options=profiler_opts)

    # Results in diff of new_prof - first_prof
    # Values in this report indicate +/- change from old profile
    report_diff = new_profile.diff(first_profile)

    return report_diff
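# The metric above assumes a DataProfiler profile has already been serialized to
# the location given by "profile_path". A minimal sketch of producing such a
# baseline is shown below; the file names ("baseline.csv", "baseline_profile.pkl")
# are illustrative assumptions, not part of the metric.
import dataprofiler as dp
import pandas as pd

baseline_df = pd.read_csv("baseline.csv")  # hypothetical baseline dataset

opts = dp.ProfilerOptions()
opts.structured_options.multiprocess.is_enabled = False

# Profile the baseline once and persist it; the saved pickle is what the metric
# later reloads via dp.Profiler.load(profile_path).
baseline_profile = dp.Profiler(baseline_df, options=opts)
baseline_profile.save(filepath="baseline_profile.pkl")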
def test_text_data_raises_error(self):
    text_file_path = os.path.join(
        test_root_path, 'data', 'txt/sentence-10x.txt')
    with self.assertRaisesRegex(TypeError, 'Cannot provide TextData object'
                                           ' to Profiler'):
        profile = dp.Profiler(dp.Data(text_file_path))
def test_correct_rows_ingested(self):
    test_dict = {
        '1': ['nan', 'null', None, None, ''],
        1: ['nan', 'None', 'null', None, ''],
    }
    test_dataset = pd.DataFrame(data=test_dict)
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    trained_schema = dp.Profiler(test_dataset, len(test_dataset),
                                 profiler_options=profiler_options)

    self.assertCountEqual(['', 'nan', 'None', 'null'],
                          trained_schema.profile['1'].null_types)
    self.assertEqual(5, trained_schema.profile['1'].null_count)
    self.assertEqual({'': {4}, 'nan': {0}, 'None': {2, 3}, 'null': {1}},
                     trained_schema.profile['1'].null_types_index)
    self.assertCountEqual(['', 'nan', 'None', 'null'],
                          trained_schema.profile[1].null_types)
    self.assertEqual(5, trained_schema.profile[1].null_count)
    self.assertEqual({'': {4}, 'nan': {0}, 'None': {1, 3}, 'null': {2}},
                     trained_schema.profile[1].null_types_index)
def test_min_col_samples_used(self, *mocks):
    # No cols sampled since no cols to sample
    empty_df = pd.DataFrame([])
    empty_profile = dp.Profiler(empty_df)
    self.assertEqual(0, empty_profile._min_col_samples_used)

    # Every column fully sampled
    full_df = pd.DataFrame([[1, 2, 3],
                            [4, 5, 6],
                            [7, 8, 9]])
    full_profile = dp.Profiler(full_df)
    self.assertEqual(3, full_profile._min_col_samples_used)

    # First col sampled only twice, so that is min
    sparse_df = pd.DataFrame([[1, None, None],
                              [1, 1, None],
                              [1, None, 1]])
    sparse_profile = dp.Profiler(sparse_df, min_true_samples=2,
                                 samples_per_update=1)
    self.assertEqual(2, sparse_profile._min_col_samples_used)
def test_add_profilers(self, *mocks):
    data = pd.DataFrame([1, None, 3, 4, 5, None, 1])
    profile1 = dp.Profiler(data[:2])
    profile2 = dp.Profiler(data[2:])

    # test incorrect type
    with self.assertRaisesRegex(TypeError,
                                '`Profiler` and `int` are '
                                'not of the same profiler type.'):
        profile1 + 3

    # test mismatched profiles
    popped_profile = profile2._profile.pop(0)
    with self.assertRaisesRegex(ValueError,
                                'Profiles do not have the same schema.'):
        profile1 + profile2

    # test mismatched profiles due to options
    profile2._profile[0] = None
    with self.assertRaisesRegex(ValueError,
                                'The two profilers were not setup with the '
                                'same options, hence they do not calculate '
                                'the same profiles and cannot be added '
                                'together.'):
        profile1 + profile2

    # test success
    profile1._profile = dict(test=1)
    profile2._profile = dict(test=2)
    merged_profile = profile1 + profile2
    self.assertEqual(3, merged_profile._profile['test'])
    self.assertIsNone(merged_profile.encoding)
    self.assertEqual("<class 'pandas.core.frame.DataFrame'>",
                     merged_profile.file_type)
    self.assertEqual(2, merged_profile.row_has_null_count)
    self.assertEqual(2, merged_profile.row_is_null_count)
    self.assertEqual(7, merged_profile.total_samples)
    self.assertEqual(5, len(merged_profile.hashed_row_dict))

    # test success if drawn from multiple files
    profile2.encoding = 'test'
    profile2.file_type = 'test'
    merged_profile = profile1 + profile2
    self.assertEqual('multiple files', merged_profile.encoding)
    self.assertEqual('multiple files', merged_profile.file_type)
def test_correct_null_row_counts(self):
    file_path = os.path.join(test_root_path, 'data', 'csv/empty_rows.txt')
    data = pd.read_csv(file_path)
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    profile = dp.Profiler(data, profiler_options=profiler_options)
    self.assertEqual(2, profile.row_has_null_count)
    self.assertEqual(0.25, profile._get_row_has_null_ratio())
    self.assertEqual(2, profile.row_is_null_count)
    self.assertEqual(0.25, profile._get_row_is_null_ratio())

    file_path = os.path.join(test_root_path, 'data',
                             'csv/iris-with-null-rows.csv')
    data = pd.read_csv(file_path)
    profile = dp.Profiler(data, profiler_options=profiler_options)
    self.assertEqual(13, profile.row_has_null_count)
    self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
    self.assertEqual(3, profile.row_is_null_count)
    self.assertEqual(3 / 24, profile._get_row_is_null_ratio())
def setUpClass(cls):
    cls.input_file_path = os.path.join(
        test_root_path, 'data', 'csv/aws_honeypot_marx_geo.csv')
    cls.aws_dataset = pd.read_csv(cls.input_file_path)
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    cls.trained_schema = dp.Profiler(cls.aws_dataset, len(cls.aws_dataset),
                                     profiler_options=profiler_options)
def test_sample_size_warning_in_the_profiler(self, *mocks):
    # structured data profile mock
    sdp_mock = mock.Mock()
    sdp_mock.clean_data_and_get_base_stats.return_value = (None, None)
    mocks[0].return_value = sdp_mock

    data = pd.DataFrame([1, None, 3, 4, 5, None])
    with self.assertWarnsRegex(UserWarning,
                               "The data will be profiled with a sample "
                               "size of 3. All statistics will be based on "
                               "this subsample and not the whole dataset."):
        profile1 = dp.Profiler(data, samples_per_update=3)
def test_data_label_assigned(self):
    # only use 5 samples
    trained_schema = dp.Profiler(self.aws_dataset, samples_per_update=5)
    report = trained_schema.report()
    has_non_null_column = False
    for key in report['data_stats']:
        # only test non-null columns
        if report['data_stats'][key]['data_type'] is not None:
            self.assertIsNotNone(report['data_stats'][key]['data_label'])
            has_non_null_column = True
    if not has_non_null_column:
        self.fail(
            "Dataset tested did not have a non-null column and therefore "
            "could not validate the test.")
def test_null_in_file(self):
    filename_null_in_file = os.path.join(
        test_root_path, 'data', 'csv/sparse-first-and-last-column.txt')
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})
    data = dp.Data(filename_null_in_file)
    profile = dp.Profiler(data, profiler_options=profiler_options)

    report = profile.report(report_options={"output_format": "pretty"})

    self.assertEqual(
        report['data_stats']['COUNT']['statistics']['null_types_index'],
        {'': '[2, 3, 4, 5, 7, 8]'}
    )
    self.assertEqual(
        report['data_stats'][' NUMBERS']['statistics']['null_types_index'],
        {'': '[5, 6, 8]', ' ': '[2, 4]'}
    )
def test_get_sample_size(self):
    data = pd.DataFrame([0] * int(50e3))

    # test data size < min_sample_size = 5000 by default
    profiler = dp.Profiler(pd.DataFrame([]))
    profiler._min_sample_size = 5000
    profiler._sampling_ratio = 0.2
    sample_size = profiler._get_sample_size(data[:1000])
    self.assertEqual(1000, sample_size)

    # test data size * 0.20 < min_sample_size < data size
    sample_size = profiler._get_sample_size(data[:10000])
    self.assertEqual(5000, sample_size)

    # test data size * 0.20 > min_sample_size
    sample_size = profiler._get_sample_size(data)
    self.assertEqual(10000, sample_size)

    # test data size * 0.50 > min_sample_size
    profiler._sampling_ratio = 0.5
    sample_size = profiler._get_sample_size(data)
    self.assertEqual(25000, sample_size)
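# The four cases above pin down the sampling rule: sample `_sampling_ratio` of the
# rows, but never fewer than `_min_sample_size` and never more than the data has.
# The helper below is a standalone restatement of that rule for illustration only,
# not the library's internal implementation.
def expected_sample_size(data_len, min_sample_size=5000, sampling_ratio=0.2):
    """Illustrative restatement of the sampling rule exercised by the test."""
    return min(data_len, max(min_sample_size, int(sampling_ratio * data_len)))

assert expected_sample_size(1000) == 1000    # smaller than the minimum: use all rows
assert expected_sample_size(10000) == 5000   # ratio undershoots: clamp to the minimum
assert expected_sample_size(50000) == 10000  # ratio exceeds the minimum: use the ratio
assert expected_sample_size(50000, sampling_ratio=0.5) == 25000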
def test_marginal_dist_detection():
    iris = datasets.load_iris()
    data = pd.DataFrame(
        data=np.c_[iris["data"], iris["target"]],
        columns=iris["feature_names"] + ["target"],
    )
    data.target = data.target.astype(int)

    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "data_labeler.is_enabled": False,
        "correlation.is_enabled": True,
        "structured_options.multiprocess.is_enabled": False,
    })

    profile = dp.Profiler(data, options=profile_options)
    report = profile.report()
    marginal_dist_list = detect_dist(report)

    assert len(marginal_dist_list) == len(
        report["data_stats"]
    ), "Length of distributions list must be equal to number of columns"

    for col_num, col in enumerate(report["data_stats"]):
        dist_name = marginal_dist_list[col_num]["dist"]
        assert hasattr(
            stats, dist_name
        ), "The detected distribution must be defined in scipy.stats"

        dist_method = getattr(stats, dist_name)
        if col["data_type"] == "float":
            assert isinstance(
                dist_method, stats.rv_continuous
            ), "Detected distribution must be continuous for columns with continuous random variables"
        else:
            assert isinstance(
                dist_method, stats.rv_discrete
            ), "Detected distribution must be discrete for columns with discrete random variables"
def test_no_tensorflow(self):
    import sys
    import importlib
    import types
    import pandas
    orig_import = __import__

    # necessary for any wrapper around the library to test if snappy caught
    # as an issue
    def import_mock(name, *args):
        if name == 'tensorflow':
            raise ImportError('test')
        return orig_import(name, *args)

    with mock.patch('builtins.__import__', side_effect=import_mock):
        with self.assertWarns(RuntimeWarning) as w:
            import dataprofiler
            df = pandas.DataFrame([[1, 2.0], [1, 2.2], [-1, 3]])
            profile = dataprofiler.Profiler(df)

    warning_msg = "Partial Profiler Failure"
    self.assertIn(warning_msg, str(w.warning))
def test_generating_report_ensure_no_error(self):
    file_path = os.path.join(test_root_path, 'data', 'csv/diamonds.csv')
    data = pd.read_csv(file_path)
    profile = dp.Profiler(data[:1000])
    readable_report = profile.report(
        report_options={"output_format": "compact"})
for sample_size in sample_sizes:

    # setup time dict
    print(f"Evaluating sample size: {sample_size}")
    df = data.data.sample(sample_size, replace=True).reset_index(drop=True)

    if PERCENT_TO_NAN:
        samples_to_nan = int(len(df) * PERCENT_TO_NAN / 100)
        for col_name in df:
            ind_to_nan = random.sample(list(df.index), samples_to_nan)
            df[col_name][ind_to_nan] = "None"

    # time profiling
    start_time = time.time()
    if ALLOW_SUBSAMPLING:
        profiler = dp.Profiler(df, options=options)
    else:
        profiler = dp.Profiler(df, samples_per_update=len(df), options=options)
    total_time = time.time() - start_time

    # get overall time for merging profiles
    start_time = time.time()
    try:
        merged_profile = profiler + profiler
    except ValueError:
        pass  # empty profile merge if 0 data
    merge_time = time.time() - start_time

    # get times for each profile in the columns
def test_string_index_doesnt_cause_error(self, *mocks):
    dp.Profiler(pd.DataFrame([[1, 2, 3]], index=["hello"]))
class ExpectProfileNumericColumnsDiffBetweenInclusiveThresholdRange(TableExpectation):
    """
    This expectation takes the difference report between the data it is called on and a DataProfiler
    profile of the same schema loaded from a provided path. This function builds upon the custom table
    expectations of Great Expectations. Each numerical column will be checked against a user provided
    dictionary of columns paired with dictionaries of statistics containing lower and upper bounds.
    It is expected that a statistics value for a given column is within the specified threshold, inclusive.

    Args:
        profile_path (str): A path to a saved DataProfiler profile object on the local filesystem.
        limit_check_report_keys (dict): A dict, containing column names as keys and dicts as values that
            contain statistics as keys and dicts as values containing two keys: "lower" denoting the lower
            bound for the threshold range, and "upper" denoting the upper bound for the threshold range.
        mostly (float - optional): a value indicating the lower bound percentage of successful values that
            must be present to evaluate to success=True.

    validator.expect_profile_numerical_columns_diff_between_threshold_range(
        profile_path="C:/path_to/my_profile.pkl",
        limit_check_report_keys={
            "column_one": {
                "min": {"lower": 2.0, "upper": 10.0},
            },
            "*": {
                "*": {"lower": 0, "upper": 100},
            },
        }
    )

    Note: In limit_check_report_keys, "*" in place of a column denotes a general operator in which the
    value it stores will be applied to every column in the data that has no explicit key. "*" in place
    of a statistic denotes a general operator in which the bounds it stores will be applied to every
    statistic for the given column that has no explicit key.
    """

    example_profile_data = [
        [2, 5, "10", "ten", 25],
        [4, 10, "20", "twenty", 50],
        [6, 15, "30", "thirty", 75],
        [8, 20, "40", "forty", 100],
        [10, 25, "50", "fifty", 125],
    ]
    example_profile_columns = [
        "by_2",
        "by_5",
        "str_by_10",
        "words_by_10",
        "by_25",
    ]

    df = pd.DataFrame(example_profile_data, columns=example_profile_columns)
    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False

    example_profile = dp.Profiler(df, options=profiler_opts)

    profile_path = (
        "/example_profiles/expect_profile_diff_less_than_threshold_profile.pkl"
    )

    dir_path = os.path.dirname(os.path.abspath(__file__))
    profile_path = dir_path + profile_path

    example_profile.save(filepath=profile_path)

    examples = [
        {
            "data": {
                "by_2": [4, 6, 8, 10, 12],
                "by_5": [10, 15, 20, 25, 30],
                "str_by_10": ["20", "30", "40", "50", "60"],
                "words_by_10": ["twenty", "thirty", "forty", "fifty", "sixty"],
                "by_25": [50, 75, 100, 125, 150],
            },
            "tests": [
                {
                    "title": "profile_min_delta_witin_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "min": {"lower": 0, "upper": 50},
                            },
                        },
                    },
                    "out": {"success": True},
                },
                {
                    "title": "profile_all_stats_beyond_delta_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": 0, "upper": 0}},
                            "by_2": {
                                "min": {"lower": -1, "upper": 1},
                            },
                        },
                    },
                    "out": {"success": False},
                },
                {
                    "title": "checking_single_failure_in_one_column",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": -25, "upper": 50}},
                            "by_2": {"min": {"lower": 0, "upper": 0}},
                        },
                    },
                    "out": {"success": False},
                },
                {
                    "title": "single_failure_still_mostly_successful",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": -25, "upper": 50}},
                            "by_2": {"min": {"lower": 0, "upper": 0}},
                        },
                        "mostly": 0.75,
                    },
                    "out": {"success": True},
                },
            ],
        },
    ]

    metric_dependencies = (
        "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range",
    )

    success_keys = (
        "profile_path",
        "limit_check_report_keys",
        "numerical_diff_statistics",
        "mostly",
    )

    default_limit_check_report_keys = {
        "*": {
            "min": {"lower": 0, "upper": 0},
            "max": {"lower": 0, "upper": 0},
            "sum": {"lower": 0, "upper": 0},
            "mean": {"lower": 0, "upper": 0},
            "median": {"lower": 0, "upper": 0},
            "median_absolute_deviation": {"lower": 0, "upper": 0},
            "variance": {"lower": 0, "upper": 0},
            "stddev": {"lower": 0, "upper": 0},
            "unique_count": {"lower": 0, "upper": 0},
            "unique_ratio": {"lower": 0, "upper": 0},
            "gini_impurity": {"lower": 0, "upper": 0},
            "unalikeability": {"lower": 0, "upper": 0},
            "sample_size": {"lower": 0, "upper": 0},
            "null_count": {"lower": 0, "upper": 0},
        }
    }

    numerical_diff_statistics = list(default_limit_check_report_keys["*"].keys())

    default_kwarg_values = {
        "limit_check_report_keys": default_limit_check_report_keys,
        "numerical_diff_statistics": numerical_diff_statistics,
        "mostly": 1.0,
    }

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        delta_between_thresholds = metrics.get(
            "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range"
        )
        mostly = self.get_success_kwargs().get(
            "mostly", self.default_kwarg_values.get("mostly")
        )

        unexpected_values = {}
        total_stats = 0.0
        failed_stats = 0.0
        for column, value in delta_between_thresholds.items():
            column_unexpected_values = {}
            for stat, val in value.items():
                if val is not True:
                    column_unexpected_values[stat] = val
                    failed_stats += 1.0
                total_stats += 1.0
            if column_unexpected_values != {}:
                unexpected_values[column] = column_unexpected_values

        successful_stats = total_stats - failed_stats
        percent_successful = successful_stats / total_stats

        success = percent_successful >= mostly

        results = {
            "success": success,
            "expectation_config": configuration,
            "result": {
                "unexpected_values": unexpected_values,
            },
        }
        return results

    library_metadata = {
        "requirements": ["dataprofiler", "tensorflow", "scikit-learn", "numpy"],
        "maturity": "experimental",  # "concept_only", "experimental", "beta", or "production"
        "tags": [
            "dataprofiler",
            "dataassistance",
        ],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@stevensecreti",  # Don't forget to add your github handle here!
        ],
    }