def hasHistogramValues(self, column, assertion, binningUdf=None, maxBins=None):
    """
    Creates a constraint that asserts on column's value distribution.

    @param column Column to run the assertion on
    @param assertion Function that receives a Distribution input parameter and returns a boolean.
           E.g. .hasHistogramValues("att2", _.absolutes("f") == 3)
           .hasHistogramValues("att2", _.ratios(Histogram.NullFieldReplacement) == 2/6.0)
    @param binningUdf An optional binning function
    @param maxBins Histogram details is only provided for N column values with top counts.
           maxBins sets the N

    NOTE(review): binningUdf and maxBins are accepted here but never forwarded to
    the JVM call below -- the Scala-side defaults ($default$3 / $default$4) are
    always used, so any values the caller passes are silently ignored. Confirm
    and wire them through (they would presumably need Scala Option wrapping), or
    drop the parameters.
    """
    function = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # The $default$N getters fetch the Scala default values for the trailing
    # optional parameters -- presumably binningUdf, maxBins, and hint; confirm
    # against the deequ Check.hasHistogramValues signature.
    jvmConstraint = self.jvmCheck.hasHistogramValues(
        column,
        function,
        getattr(self.jvmCheck, "hasHistogramValues$default$3")(),
        getattr(self.jvmCheck, "hasHistogramValues$default$4")(),
        getattr(self.jvmCheck, "hasHistogramValues$default$5")())
    return Check(self.spark, self.level, self.description, jvmConstraint)
def isNonNegative(self, column, assertion=is_one):
    """
    Creates a constraint that asserts that a column contains no negative values.

    @param column Column to run the assertion on
    @param assertion Function that receives a double input parameter and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    hint_default = getattr(self.jvmCheck, "isNonNegative$default$3")()
    constraint = self.jvmCheck.isNonNegative(column, assertion_func, hint_default)
    return Check(self.spark, self.level, self.description, constraint)
def hasApproxCountDistinct(self, column, assertion):
    """
    Creates a constraint that asserts on the approximate count distinct of the given column.

    @param column Column to run the assertion on
    @param assertion Function that receives a double input parameter and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    hint_default = getattr(self.jvmCheck, "hasApproxCountDistinct$default$3")()
    constraint = self.jvmCheck.hasApproxCountDistinct(column, assertion_func, hint_default)
    return Check(self.spark, self.level, self.description, constraint)
def hasSize(self, assertion):
    """
    Creates a constraint that calculates the data frame size and runs the assertion on it.

    Args:
        assertion (function): Receives the frame size and returns a boolean.

    Returns:
        checks.Check object including this constraint
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    default_arg = getattr(self.jvmCheck, "hasSize$default$2")()
    constraint = self.jvmCheck.hasSize(assertion_func, default_arg)
    return Check(self.spark, self.level, self.description, constraint)
def isGreaterThan(self, columnA, columnB, assertion=is_one):
    """
    Asserts that, in each row, the value of columnA is greater than the value of columnB.

    @param columnA Column to run the assertion on
    @param columnB Column to run the assertion on
    @param assertion Function that receives a double input parameter and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    default_arg = getattr(self.jvmCheck, "isGreaterThan$default$4")()
    constraint = self.jvmCheck.isGreaterThan(columnA, columnB, assertion_func, default_arg)
    return Check(self.spark, self.level, self.description, constraint)
def hasCorrelation(self, columnA, columnB, assertion):
    """
    Creates a constraint that asserts on the pearson correlation between two columns.

    @param columnA First column for correlation calculation
    @param columnB Second column for correlation calculation
    @param assertion Function that receives a double input parameter and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    default_arg = getattr(self.jvmCheck, "hasCorrelation$default$4")()
    constraint = self.jvmCheck.hasCorrelation(columnA, columnB, assertion_func, default_arg)
    return Check(self.spark, self.level, self.description, constraint)
def hasCompleteness(self, column, assertion):
    """
    Creates a constraint that asserts on a column completion.

    Uses the given history selection strategy to retrieve historical completeness
    values on this column from the history provider.

    @param column Column to run the assertion on
    @param assertion Function that receives a double input parameter and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    default_arg = getattr(self.jvmCheck, "hasCompleteness$default$3")()
    constraint = self.jvmCheck.hasCompleteness(column, assertion_func, default_arg)
    return Check(self.spark, self.level, self.description, constraint)
def hasApproxQuantile(self, column, quantile, assertion):
    """
    Creates a constraint that asserts on an approximated quantile.

    @param column Column to run the assertion on
    @param quantile Which quantile to assert on
    @param assertion Function that receives a double input parameter (the computed
           quantile) and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    default_arg = getattr(self.jvmCheck, "hasApproxQuantile$default$4")()
    constraint = self.jvmCheck.hasApproxQuantile(column, quantile, assertion_func, default_arg)
    return Check(self.spark, self.level, self.description, constraint)
def hasUniqueness(self, columns, assertion):
    """
    Creates a constraint that asserts on uniqueness in a single or combined set of key columns.

    @param columns Key columns (a single column name or a list of names)
    @param assertion Function that receives a double input parameter and returns a
           boolean. Refers to the fraction of unique values
    """
    # Accept a bare column name by wrapping it in a one-element list.
    if not isinstance(columns, list):
        columns = [columns]
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    column_seq = jc.iterable_to_scala_seq(self._jvm, columns)
    constraint = self.jvmCheck.hasUniqueness(column_seq, assertion_func)
    return Check(self.spark, self.level, self.description, constraint)
def isContainedIn(self, column, allowedValues, assertion=is_one):
    """
    Asserts that every non-null value in a column is contained in a set of predefined values.

    @param column Column to run the assertion on
    @param allowedValues Allowed values for the column (a list of strings)
    @param assertion Function that receives a double input parameter and returns a boolean
    @raises ValueError if allowedValues is not a list
    """
    # Fail fast with a clear message rather than an opaque Py4J conversion error.
    # (Idiom fix: never compare to False with `==`; use `not`.)
    if not isinstance(allowedValues, list):
        raise ValueError("'allowedValues' must be a list of strings.")
    function = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    scalaArray = jc.iterable_to_scala_array(self._jvm, allowedValues)
    # NOTE(review): the default-getter index ($default$6) is kept from the
    # original; it presumably resolves the hint default of the matching Scala
    # overload -- confirm against deequ's Check.isContainedIn signature.
    jvmConstraint = self.jvmCheck.isContainedIn(
        column,
        scalaArray,
        function,
        getattr(self.jvmCheck, "isContainedIn$default$6")())
    return Check(self.spark, self.level, self.description, jvmConstraint)
def satisfies(self, columnCondition, constraintName, assertion):
    """
    Creates a constraint that runs the given condition on the data frame.

    @param columnCondition Data frame column which is a combination of expression and
           the column name. It has to comply with Spark SQL syntax. Can be written in
           an exact same way with conditions inside the `WHERE` clause.
    @param constraintName A name that summarizes the check being made. This name is
           being used to name the metrics for the analysis being done.
    @param assertion Function that receives a double input parameter and returns a boolean
    """
    assertion_func = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # Fetch the Scala-side default for the trailing optional parameter.
    default_arg = getattr(self.jvmCheck, "satisfies$default$4")()
    constraint = self.jvmCheck.satisfies(
        columnCondition, constraintName, assertion_func, default_arg)
    return Check(self.spark, self.level, self.description, constraint)
def hasNumberOfDistinctValues(self, column, assertion, binningUdf=None, maxBins=None):
    """
    Creates a constraint that asserts on the number of distinct values a column has.

    @param column Column to run the assertion on
    @param assertion Function that receives a long input parameter and returns a boolean
    @param binningUdf An optional binning function
    @param maxBins Histogram details is only provided for N column values with top counts.
           maxBins sets the N

    NOTE(review): binningUdf and maxBins are accepted here but never forwarded to
    the JVM call below -- the Scala-side defaults ($default$3 / $default$4) are
    always used, so any values the caller passes are silently ignored. Confirm
    and wire them through (they would presumably need Scala Option wrapping), or
    drop the parameters.
    """
    function = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    # The $default$N getters fetch the Scala default values for the trailing
    # optional parameters -- presumably binningUdf, maxBins, and hint; confirm
    # against the deequ Check.hasNumberOfDistinctValues signature.
    jvmConstraint = self.jvmCheck.hasNumberOfDistinctValues(
        column,
        function,
        getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$3")(),
        getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$4")(),
        getattr(self.jvmCheck, "hasNumberOfDistinctValues$default$5")())
    return Check(self.spark, self.level, self.description, jvmConstraint)
def hasDataType(self, column, dataType, assertion):
    """
    Check to run against the fraction of rows that conform to the given data type.

    @param column Name of the column that should be checked
    @param dataType One of: 'null', 'boolean', 'string', 'numeric', 'fractional', 'integer'
    @param assertion Function that receives a double input parameter and returns a boolean
    @raises ValueError if dataType is not one of the supported names
    """
    _jconstDataTypes = self._jvm.com.amazon.deequ.constraints.ConstrainableDataTypes
    # Map the Python-facing names onto the JVM ConstrainableDataTypes values.
    dataTypes = {
        'null': _jconstDataTypes.Null(),
        'boolean': _jconstDataTypes.Boolean(),
        'string': _jconstDataTypes.String(),
        'numeric': _jconstDataTypes.Numeric(),
        'fractional': _jconstDataTypes.Fractional(),
        'integer': _jconstDataTypes.Integral()
    }
    # Validate up front: a bare KeyError from the lookup below gives the caller
    # no hint about which names are supported.
    if dataType not in dataTypes:
        raise ValueError(
            "'dataType' must be one of: " + ", ".join(sorted(dataTypes)))
    function = jc.scala_function1(self.spark.sparkContext._gateway, assertion)
    jvmConstraint = self.jvmCheck.hasDataType(
        column,
        dataTypes[dataType],
        function,
        getattr(self.jvmCheck, "hasDataType$default$4")())
    return Check(self.spark, self.level, self.description, jvmConstraint)