Example #1
    def test_warning_tf_multiple_dp_with_update(self):
        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })
        print('running dp1')
        profile1 = dp.Profiler(data, profiler_options=profile_options)

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })
        print('running dp2')
        profile2 = dp.Profiler(data, profiler_options=profile_options)

        profile1.update_profile(data)
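
A minimal, self-contained sketch of the profile-then-update flow this test exercises; "data.csv" is a placeholder path, and dp.Data auto-detects common file types:

import dataprofiler as dp

data = dp.Data("data.csv")  # placeholder path; CSV/JSON/Parquet/AVRO are auto-detected
profile = dp.Profiler(data)
profile.update_profile(data)  # fold additional data into the existing profile
report = profile.report(report_options={"output_format": "compact"})
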
    def test_warning_tf(self):

        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')
        data = dp.Data(path)

        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "datetime.is_enabled": False
        })

        profile = dp.StructuredProfiler(data, options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for i, col_stats in enumerate(results['data_stats']):
            columns.append(i)
            predictions.append(col_stats['data_label'])
    def test_warning_tf_run_dp_multiple_times(self):
        test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, "data")
        path = os.path.join(test_dir, "csv/diamonds.csv")

        for i in range(3):
            print("running dp =============================", i)
            data = dp.Data(path)
            profile_options = dp.ProfilerOptions()
            profile_options.structured_options.set(
                {
                    "text.is_enabled": False,
                    "int.is_enabled": False,
                    "float.is_enabled": False,
                    "order.is_enabled": False,
                    "category.is_enabled": False,
                    "chi2_homogeneity.is_enabled": False,
                    "datetime.is_enabled": False,
                }
            )

            profile = dp.StructuredProfiler(data, options=profile_options)

            results = profile.report()

            columns = []
            predictions = []
            for j, col_stats in enumerate(results["data_stats"]):
                columns.append(j)
                predictions.append(col_stats["data_label"])
    # Great Expectations metric implementation: profiles the active batch and
    # diffs it against a previously saved DataProfiler profile.
    def _pandas(
        cls,
        execution_engine,
        metric_domain_kwargs,
        metric_value_kwargs,
        metrics,
        runtime_configuration,
    ):
        df, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)
        first_profile = None
        try:
            first_profile_path = metric_value_kwargs["profile_path"]
            first_profile = dp.Profiler.load(first_profile_path)
        except FileNotFoundError:
            raise ValueError(
                "'profile_path' does not point to a valid DataProfiler stored profile."
            )

        profiler_opts = dp.ProfilerOptions()
        profiler_opts.structured_options.multiprocess.is_enabled = False
        new_profile = dp.Profiler(df, options=profiler_opts)

        # Diff of new_profile - first_profile; values in this report indicate
        # the +/- change relative to the old profile.
        report_diff = new_profile.diff(first_profile)
        return report_diff
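
A hedged, self-contained sketch of the save/load/diff round trip the metric above depends on; "profile.pkl" is a placeholder filename:

import dataprofiler as dp
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
opts = dp.ProfilerOptions()
opts.structured_options.multiprocess.is_enabled = False

old_profile = dp.Profiler(df, options=opts)
old_profile.save(filepath="profile.pkl")  # placeholder filename

loaded = dp.Profiler.load("profile.pkl")
report_diff = dp.Profiler(df, options=opts).diff(loaded)  # new - old
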
Example #5
    def test_warning_tf_run_dp_multiple_times(self):
        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')

        for i in range(3):
            print('running dp =============================', i)
            data = dp.Data(path)
            profile_options = dp.ProfilerOptions()
            profile_options.set({
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "datetime.is_enabled": False,
            })

            profile = dp.Profiler(data, profiler_options=profile_options)

            results = profile.report()

            columns = []
            predictions = []
            # NOTE: this assumes an older DataProfiler where 'data_stats' is a
            # dict keyed by column name; newer releases return a list instead.
            for col in results['data_stats']:
                columns.append(col)
                predictions.append(results['data_stats'][col]['data_label'])
Example #6
    @classmethod
    def setUpClass(cls):
        cls.data = pd.DataFrame(
            [[1, 'a', 1.0, '1/2/2021'], [None, 'b', None, '1/2/2020'],
             [3, 'c', 3.5, '1/2/2022'], [4, 'd', 4.5, '1/2/2023'],
             [5, 'e', 6.0, '5/2/2020'], [None, 'f', None, '1/5/2020'],
             [1, 'g', 1.0, '2/5/2020'], [None, 1, 10.0, '3/5/2020']],
            columns=['int', 'str', 'float', 'datetime'])
        cls.options = dp.ProfilerOptions()
        cls.options.set({"data_labeler.is_enabled": False})
        cls.options.set({"multiprocess.is_enabled": False})
        cls.profiler = dp.StructuredProfiler(cls.data, options=cls.options)
    def test_warning_tf_multiple_dp_with_update(self):
        test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, "data")
        path = os.path.join(test_dir, "csv/diamonds.csv")

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "datetime.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "correlation.is_enabled": False,
            }
        )
        print("running dp1")
        profile1 = dp.StructuredProfiler(data, options=profile_options)

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "datetime.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "correlation.is_enabled": False,
            }
        )
        print("running dp2")
        profile2 = dp.StructuredProfiler(data, options=profile_options)

        profile1.update_profile(data)
    def test_integrated_merge_diff_options(self):
        options = dp.ProfilerOptions()
        options.set({'data_labeler.is_enabled': False})

        data = pd.DataFrame([1, 2, 3, 4])
        profile1 = dp.Profiler(data, profiler_options=options)
        profile2 = dp.Profiler(data)
        with self.assertRaisesRegex(
                ValueError, 'Structured profilers were not setup with '
                'the same options, hence they do not '
                'calculate the same profiles and cannot be '
                'added together.'):
            profile1 + profile2
    def test_warning_tf_run_dp_merge(self):
        test_root_path = os.path.dirname(
            os.path.dirname(os.path.realpath(__file__)))
        test_dir = os.path.join(test_root_path, 'data')
        path = os.path.join(test_dir, 'csv/diamonds.csv')

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False
        })
        print('running dp1')
        profile1 = dp.StructuredProfiler(data, options=profile_options)

        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False
        })
        print('running dp2')
        profile2 = dp.StructuredProfiler(data, options=profile_options)

        profile = profile1 + profile2
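
A compact sketch of the merge behavior exercised above: profiles built with identical options combine with +, mirroring the ValueError test earlier in this example:

import dataprofiler as dp
import pandas as pd

opts = dp.ProfilerOptions()
opts.set({"data_labeler.is_enabled": False})

part1 = pd.DataFrame({"x": [1, 2, 3]})
part2 = pd.DataFrame({"x": [4, 5, 6]})

profile1 = dp.Profiler(part1, options=opts)
profile2 = dp.Profiler(part2, options=opts)

merged = profile1 + profile2  # raises ValueError if the options differ
report = merged.report()
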
Example #10
    @classmethod
    def setUpClass(cls):
        cls.data = pd.DataFrame(
            [
                [1, "a", 1.0, "1/2/2021"],
                [None, "b", None, "1/2/2020"],
                [3, "c", 3.5, "1/2/2022"],
                [4, "d", 4.5, "1/2/2023"],
                [5, "e", 6.0, "5/2/2020"],
                [None, "f", None, "1/5/2020"],
                [1, "g", 1.0, "2/5/2020"],
                [None, 1, 10.0, "3/5/2020"],
            ],
            columns=["int", "str", "float", "datetime"],
        )
        cls.options = dp.ProfilerOptions()
        cls.options.set({"data_labeler.is_enabled": False})
        cls.options.set({"multiprocess.is_enabled": False})
        cls.profiler = dp.StructuredProfiler(cls.data, options=cls.options)
def test_marginal_dist_detection():

    iris = datasets.load_iris()
    data = pd.DataFrame(
        data=np.c_[iris["data"], iris["target"]],
        columns=iris["feature_names"] + ["target"],
    )
    data.target = data.target.astype(int)

    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "data_labeler.is_enabled": False,
        "correlation.is_enabled": True,
        "structured_options.multiprocess.is_enabled": False,
    })

    profile = dp.Profiler(data, options=profile_options)
    report = profile.report()
    marginal_dist_list = detect_dist(report)

    assert len(marginal_dist_list) == len(
        report["data_stats"]
    ), "Length of distributions list must be equal to number of columns"

    for col_num, col in enumerate(report["data_stats"]):
        dist_name = marginal_dist_list[col_num]["dist"]

        assert hasattr(
            stats, dist_name
        ), "The detected distribution must be defined in scipy.stats"
        dist_method = getattr(stats, dist_name)
        if col["data_type"] == "float":
            assert isinstance(
                dist_method, stats.rv_continuous
            ), "Detected distribution must be continuous for columns with continuous random variables"
        else:
            assert isinstance(
                dist_method, stats.rv_discrete
            ), "Detected distribution must be discrete for columns with discrete random variables"
Example #12
    @classmethod
    def setUpClass(cls):
        cls.options = dp.ProfilerOptions()
        cls.options.set({"data_labeler.is_enabled": False})
        cls.options.set({"multiprocess.is_enabled": False})
        cls.options.set({"correlation.is_enabled": False})
        cls.options.set({"chi2_homogeneity.is_enabled": False})
Example #13
class ExpectColumnValuesToBeEqualToOrGreaterThanProfileMin(ColumnMapExpectation):
    """
    This function builds upon the custom column map expectations of Great Expectations. This function asks a yes/no question of each row in the user-specified column;
    namely, is the value greater than or equal to the minimum value of the respective column within the provided profile report generated from the DataProfiler.

    Args:
        column(str): The column that you want to check.
        profile(dict(str, Any)): The report, which is assumed to contain a column of the same name, previously generated using the DataProfiler.

    df.expect_column_values_to_be_equal_to_or_greater_than_profile_min(
        column,
        profile
    )

    """

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.

    data = [
        [-36, -25, -44],
        [18, 45, 46],
        [-16, -29, -49],
        [21, 4, 35],
        [-18, -7, -40],
        [22, -4, -37],
        [-17, -21, 11],
        [48, -32, -48],
        [0, -44, 20],
    ]
    cols = ["col_a", "col_b", "col_c"]

    df = pd.DataFrame(data, columns=cols)
    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False
    profileObj = dp.Profiler(df, options=profiler_opts)
    profileReport = profileObj.report(report_options={"output_format": "serializable"})
    profileReport["global_stats"]["profile_schema"] = dict(
        profileReport["global_stats"]["profile_schema"]
    )

    examples = [
        {
            "data": {
                "col_a": [-3, 21, 20, 5],
                "col_b": [-7, 41, -47, 12],
                "col_c": [54, -10, 19, 19],
            },
            "tests": [
                {
                    "title": "column_lower_bounded_by_min",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "col_a",
                        "profile": profileReport,
                    },
                    "out": {"success": True},
                },
                {
                    "title": "column_has_value_less_than_min",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "col_b",
                        "profile": profileReport,
                    },
                    "out": {"success": False},
                },
            ],
        }
    ]

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name` defined in your Metric class above.
    map_metric = "column_values.greater_than_or_equal_to_profile_min"

    # This is a list of parameter names that can affect whether the Expectation evaluates to True or False
    success_keys = (
        "profile",
        "mostly",
    )

    # This dictionary contains default values for any parameters that should have default values
    default_kwarg_values = {
        "profile": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
    }

    # This object contains metadata for display in the public Gallery
    library_metadata = {
        "requirements": ["dataprofiler", "tensorflow", "scikit-learn", "numpy"],
        "maturity": "experimental",  # "concept_only", "experimental", "beta", or "production"
        "tags": ["dataprofiler"],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@stevensecreti",  # Don't forget to add your github handle here!
        ],
    }
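
A hedged, standalone sketch of the check this expectation performs; the "statistics"/"min" keys assume the standard serializable report layout:

import dataprofiler as dp
import pandas as pd

df = pd.DataFrame({"col_a": [-3, 21, 20, 5]})
opts = dp.ProfilerOptions()
opts.structured_options.multiprocess.is_enabled = False
report = dp.Profiler(df, options=opts).report(
    report_options={"output_format": "serializable"})

# the column minimum sits under each entry's "statistics" in the report
col_min = report["data_stats"][0]["statistics"]["min"]
print((df["col_a"] >= col_min).all())  # the expectation's per-row question
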
Example #14
try:
    import sys

    # prefer a local checkout of dataprofiler, if one exists three levels up
    sys.path.insert(0, "../../..")
    import dataprofiler as dp
except ImportError:
    import dataprofiler as dp

import tensorflow as tf

# suppress TF warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

################################################################################
######################## set any optional changes here #########################
################################################################################
options = dp.ProfilerOptions()

# both of these options default to True; leave a line commented out to keep the default
options.structured_options.multiprocess.is_enabled = False
# options.structured_options.data_labeler.is_enabled = False

# parameter alteration
ALLOW_SUBSAMPLING = True  # allow the profiler to subsample the dataset if it is large
PERCENT_TO_NAN = 0.0  # percentage of values to replace with NaN; must be between 0 and 100

sample_sizes = [100, 1000, 5000, 7500, int(1e5)]
################################################################################

if __name__ == "__main__":

    # set seed
class ExpectProfileNumericColumnsDiffBetweenInclusiveThresholdRange(
        TableExpectation):
    """
    This expectation takes the difference report between the data it is called on and a DataProfiler profile of the same schema loaded from a provided path.
    This function builds upon the custom table expectations of Great Expectations.
    Each numerical column will be checked against a user provided dictionary of columns paired with dictionaries of statistics containing lower and upper bounds.
    It is expected that a statistics value for a given column is within the specified threshold, inclusive.

    Args:
        profile_path (str): A path to a saved DataProfiler profile object on the local filesystem.
        limit_check_report_keys (dict): A dict, containing column names as keys and dicts as values that contain statistics as keys and dicts as values containing two keys:
                                        "lower" denoting the lower bound for the threshold range, and "upper" denoting the upper bound for the threshold range.
        mostly (float - optional): a value indicating the lower bound percentage of successful values that must be present to evaluate to success=True.
    validator.expect_profile_numerical_columns_diff_between_threshold_range(
        profile_path = "C:/path_to/my_profile.pkl",
        limit_check_report_keys = {
            "column_one": {
                "min": {"lower": 2.0, "upper": 10.0},
            },
            "*": {
                "*": {"lower": 0, "upper": 100},
            },
        }
    )
    Note: In limit_check_report_keys, "*" in place of a column denotes a general operator in which the value it stores will be applied to every column in the data that has no explicit key.
          "*" in place of a statistic denotes a general operator in which the bounds it stores will be applied to every statistic for the given column that has no explicit key.
    """

    example_profile_data = [
        [2, 5, "10", "ten", 25],
        [4, 10, "20", "twenty", 50],
        [6, 15, "30", "thirty", 75],
        [8, 20, "40", "forty", 100],
        [10, 25, "50", "fifty", 125],
    ]
    example_profile_columns = [
        "by_2",
        "by_5",
        "str_by_10",
        "words_by_10",
        "by_25",
    ]

    df = pd.DataFrame(example_profile_data, columns=example_profile_columns)
    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False

    example_profile = dp.Profiler(df, options=profiler_opts)

    profile_path = (
        "/example_profiles/expect_profile_diff_less_than_threshold_profile.pkl"
    )

    dir_path = os.path.dirname(os.path.abspath(__file__))
    profile_path = dir_path + profile_path

    example_profile.save(filepath=profile_path)

    examples = [
        {
            "data": {
                "by_2": [4, 6, 8, 10, 12],
                "by_5": [10, 15, 20, 25, 30],
                "str_by_10": ["20", "30", "40", "50", "60"],
                "words_by_10": ["twenty", "thirty", "forty", "fifty", "sixty"],
                "by_25": [50, 75, 100, 125, 150],
            },
            "tests": [
                {
                    "title": "profile_min_delta_witin_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "min": {
                                    "lower": 0,
                                    "upper": 50
                                },
                            },
                        },
                    },
                    "out": {
                        "success": True
                    },
                },
                {
                    "title": "profile_all_stats_beyond_delta_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "*": {
                                    "lower": 0,
                                    "upper": 0
                                }
                            },
                            "by_2": {
                                "min": {
                                    "lower": -1,
                                    "upper": 1
                                },
                            },
                        },
                    },
                    "out": {
                        "success": False
                    },
                },
                {
                    "title": "checking_single_failure_in_one_column",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "*": {
                                    "lower": -25,
                                    "upper": 50
                                }
                            },
                            "by_2": {
                                "min": {
                                    "lower": 0,
                                    "upper": 0
                                }
                            },
                        },
                    },
                    "out": {
                        "success": False
                    },
                },
                {
                    "title": "single_failure_still_mostly_successful",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "*": {
                                    "lower": -25,
                                    "upper": 50
                                }
                            },
                            "by_2": {
                                "min": {
                                    "lower": 0,
                                    "upper": 0
                                }
                            },
                        },
                        "mostly": 0.75,
                    },
                    "out": {
                        "success": True
                    },
                },
            ],
        },
    ]

    metric_dependencies = (
        "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range",
    )

    success_keys = (
        "profile_path",
        "limit_check_report_keys",
        "numerical_diff_statistics",
        "mostly",
    )

    default_limit_check_report_keys = {
        "*": {
            "min": {"lower": 0, "upper": 0},
            "max": {"lower": 0, "upper": 0},
            "sum": {"lower": 0, "upper": 0},
            "mean": {"lower": 0, "upper": 0},
            "median": {"lower": 0, "upper": 0},
            "median_absolute_deviation": {"lower": 0, "upper": 0},
            "variance": {"lower": 0, "upper": 0},
            "stddev": {"lower": 0, "upper": 0},
            "unique_count": {"lower": 0, "upper": 0},
            "unique_ratio": {"lower": 0, "upper": 0},
            "gini_impurity": {"lower": 0, "upper": 0},
            "unalikeability": {"lower": 0, "upper": 0},
            "sample_size": {"lower": 0, "upper": 0},
            "null_count": {"lower": 0, "upper": 0},
        }
    }

    numerical_diff_statistics = list(
        default_limit_check_report_keys["*"].keys())

    default_kwarg_values = {
        "limit_check_report_keys": default_limit_check_report_keys,
        "numerical_diff_statistics": numerical_diff_statistics,
        "mostly": 1.0,
    }

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        delta_between_thresholds = metrics.get(
            "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range"
        )
        mostly = self.get_success_kwargs().get(
            "mostly", self.default_kwarg_values.get("mostly"))

        unexpected_values = {}
        total_stats = 0.0
        failed_stats = 0.0
        for column, value in delta_between_thresholds.items():
            column_unexpected_values = {}
            for stat, val in value.items():
                if val is not True:
                    column_unexpected_values[stat] = val
                    failed_stats += 1.0
                total_stats += 1.0
            if column_unexpected_values:
                unexpected_values[column] = column_unexpected_values

        successful_stats = total_stats - failed_stats
        percent_successful = successful_stats / total_stats

        success = percent_successful >= mostly

        results = {
            "success": success,
            "expectation_config": configuration,
            "result": {
                "unexpected_values": unexpected_values,
            },
        }
        return results

    library_metadata = {
        "requirements": ["dataprofiler", "tensorflow", "scikit-learn", "numpy"],
        "maturity": "experimental",  # "concept_only", "experimental", "beta", or "production"
        "tags": [
            "dataprofiler",
            "dataassistance",
        ],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@stevensecreti",  # Don't forget to add your github handle here!
        ],
    }
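
For reference, the "mostly" logic in _validate above reduces to a single ratio test; a minimal sketch with made-up counts:

# hypothetical counts: 14 (column, statistic) checks, 3 of them out of bounds
total_stats, failed_stats = 14.0, 3.0
mostly = 0.75

percent_successful = (total_stats - failed_stats) / total_stats
print(percent_successful >= mostly)  # True: 11/14 ≈ 0.786 >= 0.75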