def _spark(cls, column, strftime_format, **kwargs):
    """Spark implementation: flag values parseable with ``strftime_format``.

    The format string is validated once on the driver by round-tripping the
    current timestamp through strftime/strptime — some directives (e.g. %D)
    can format a datetime but not parse one back. Per-value checks then run
    in a boolean UDF.

    Raises:
        ValueError: if ``strftime_format`` cannot round-trip a datetime.
    """
    try:
        # Round-trip "now" through the format to prove it both formats
        # and parses.
        formatted_now = datetime.strftime(datetime.now(), strftime_format)
        datetime.strptime(formatted_now, strftime_format)
    except ValueError as e:
        raise ValueError(f"Unable to use provided strftime_format: {str(e)}")

    def _parses_with_format(candidate):
        # Nulls never match.
        if candidate is None:
            return False
        try:
            datetime.strptime(candidate, strftime_format)
        except TypeError:
            # Non-string input is a caller error, not an unexpected value.
            raise TypeError(
                "Values passed to expect_column_values_to_match_strftime_format must be of type string.\nIf you want to validate a column of dates or timestamps, please call the expectation before converting from string format."
            )
        except ValueError:
            return False
        return True

    return F.udf(_parses_with_format, sparktypes.BooleanType())(column)
def _spark(cls, column, **kwargs):
    """Spark implementation: flag values whose string form is pure ASCII."""

    def _ascii_only(value):
        # NOTE(review): str() coerces non-strings first, so None becomes
        # "None" (which is ASCII) — presumably nulls are filtered upstream;
        # confirm against the expectation's null handling.
        return str(value).isascii()

    return F.udf(_ascii_only, sparktypes.BooleanType())(column)
def _spark(cls, column, **kwargs):
    """Spark implementation: flag values that parse as well-formed XML.

    A value is valid when ``etree.fromstring`` accepts it; any failure to
    parse marks it invalid.
    """

    def is_xml(val):
        try:
            # Parse purely for validation; the resulting document is unused.
            etree.fromstring(val)
            return True
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed; parse/type errors still
            # yield False as before.
            return False

    is_xml_udf = F.udf(is_xml, sparktypes.BooleanType())
    return is_xml_udf(column)
def _spark(cls, column, json_schema, **kwargs):
    """Spark implementation: flag values that parse as valid JSON.

    ``json_schema`` is accepted for signature compatibility with the
    schema-matching expectation but is not used here.
    """

    def is_json(val):
        try:
            json.loads(val)
            return True
        except (TypeError, ValueError):
            # Narrowed from a bare ``except:``. TypeError covers non-string
            # input (including None); ValueError covers json.JSONDecodeError
            # for malformed JSON — the only failures json.loads raises.
            return False

    is_json_udf = F.udf(is_json, sparktypes.BooleanType())
    return is_json_udf(column)
def _spark(cls, column, **kwargs):
    """Spark implementation: flag points within range of ``center_point``.

    kwargs:
        center_point: reference point handed to the projection helper.
        unit: "kilometers" or "miles" — the unit of ``range``.
        range: maximum allowed distance from ``center_point``.
        projection: "fcc" or "pythagorean" — distance approximation to use.

    Returns:
        A boolean Column: True where the projected distance is strictly
        less than the threshold. Returns None for an unrecognized
        projection (preserving the original implicit behavior).

    Raises:
        ValueError: for an unrecognized unit (the original crashed with an
            accidental NameError on the unbound ``distances`` variable).
    """
    center_point = kwargs.get("center_point")
    unit = kwargs.get("unit")
    # Renamed from ``range`` to avoid shadowing the builtin.
    distance_range = kwargs.get("range")
    projection = kwargs.get("projection")

    # Dispatch table removes the duplicated fcc/pythagorean branches.
    projections = {
        "fcc": fcc_projection,
        "pythagorean": pythagorean_projection,
    }
    if projection not in projections:
        return None
    measure = projections[projection]

    # Assumes the projection helpers return kilometers — TODO confirm.
    # Convert only the threshold for a miles caller. The original
    # multiplied BOTH the per-row distance and the threshold by 1.609344,
    # which cancels out, so the miles branch behaved identically to the
    # kilometers branch — presumably a bug.
    if unit == "miles":
        distance_range = distance_range * 1.609344
    elif unit != "kilometers":
        raise ValueError(f"Unknown unit: {unit}")

    distances = F.udf(
        lambda point, center=center_point: measure(point, center),
        sparktypes.FloatType(),
    )
    return F.when(distances(column) < distance_range, F.lit(True)).otherwise(
        F.lit(False)
    )
def _spark(cls, column, json_schema, **kwargs):
    """Spark implementation: flag values that conform to ``json_schema``."""

    def matches_json_schema(val):
        # Nulls never match.
        if val is None:
            return False
        try:
            val_json = json.loads(val)
            # jsonschema.validate raises on failure, so reaching the return
            # means validation succeeded.
            jsonschema.validate(val_json, json_schema)
            return True
        except jsonschema.ValidationError:
            return False
        # Removed the no-op ``except SchemaError: raise`` and bare
        # ``except: raise`` handlers — a bad schema or malformed JSON value
        # still propagates exactly as before.

    matches_json_schema_udf = F.udf(matches_json_schema, sparktypes.BooleanType())
    return matches_json_schema_udf(column)
def _spark(cls, column, xml_schema, **kwargs):
    """Spark implementation: flag values valid under ``xml_schema``.

    The schema is parsed and compiled once on the driver; a bad schema
    fails fast here with whatever ``etree`` raises (the original wrapped
    this in try/except blocks that only re-raised, a no-op now removed).
    """
    xmlschema_doc = etree.fromstring(xml_schema)
    xmlschema = etree.XMLSchema(xmlschema_doc)

    def matches_xml_schema(val):
        # Nulls never match.
        if val is None:
            return False
        # A value that fails to parse propagates the error, as in the
        # original (its bare ``except: raise`` was a no-op).
        xml_doc = etree.fromstring(val)
        # Calling the compiled schema returns a boolean validity check.
        return xmlschema(xml_doc)

    matches_xml_schema_udf = F.udf(matches_xml_schema, sparktypes.BooleanType())
    return matches_xml_schema_udf(column)
def _spark(cls, column, **kwargs):
    """Spark implementation: flag values recognized as valid timezone names."""
    # Delegate the per-value check to the shared is_valid_timezone helper.
    return F.udf(is_valid_timezone, sparktypes.BooleanType())(column)