# Requires at module level: `import re`, `from deepdiff import DeepDiff`,
# and `from pyspark.sql.types import StructType`.
def generate(self, df_schema: StructType, schema: StructType):
    # Strip per-field metadata so it doesn't produce spurious diffs.
    def remove_metadata(json_schema):
        for field in json_schema["fields"]:
            field["metadata"] = {}
        return json_schema

    expected_schema = remove_metadata(schema.jsonValue())
    df_schema = remove_metadata(df_schema.jsonValue())

    # Nullability differences are tolerated, so exclude them from the diff.
    exclude_nullable = re.compile(r"\['nullable'\]")
    ddiff = DeepDiff(
        expected_schema,
        df_schema,
        ignore_string_case=True,
        ignore_order=True,
        exclude_regex_paths=[exclude_nullable],
    )

    # Translate each DeepDiff change category into a validation finding.
    result = []
    if ddiff:
        if "values_changed" in ddiff:
            result.extend(self.__get_changed(expected_schema, ddiff["values_changed"]))
        if "type_changes" in ddiff:
            result.extend(
                self.__get_changed(expected_schema, ddiff["type_changes"], is_values=False)
            )
        if "iterable_item_added" in ddiff:
            result.extend(
                self.__get_iterable_item(
                    expected_schema, ddiff["iterable_item_added"], "unexpected field"
                )
            )
        if "iterable_item_removed" in ddiff:
            result.extend(
                self.__get_iterable_item(
                    expected_schema, ddiff["iterable_item_removed"], "missing field"
                )
            )
        if "dictionary_item_added" in ddiff:
            result.extend(
                self.__get_dictionary_item(
                    expected_schema, ddiff["dictionary_item_added"], "Unexpected field"
                )
            )
        if "dictionary_item_removed" in ddiff:
            result.extend(
                self.__get_dictionary_item(
                    expected_schema, ddiff["dictionary_item_removed"], "Missing field"
                )
            )
    return result
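# --- Hedged example (not part of the original class) -----------------------
# A minimal, self-contained sketch of the comparison `generate` performs,
# showing how a compiled `exclude_regex_paths` pattern keeps nullable-only
# differences out of the diff. The schemas below are illustrative assumptions.
import re

from deepdiff import DeepDiff
from pyspark.sql.types import LongType, StringType, StructField, StructType

expected = StructType([StructField("id", LongType()), StructField("name", StringType())])
actual = StructType(
    [StructField("id", StringType()), StructField("name", StringType(), nullable=False)]
)

diff = DeepDiff(
    expected.jsonValue(),
    actual.jsonValue(),
    ignore_string_case=True,
    ignore_order=True,
    exclude_regex_paths=[re.compile(r"\['nullable'\]")],
)
# The `id` type change is reported; the `name` nullability change is not.
print(diff)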
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import (
        BooleanType,
        IntegerType,
        LongType,
        FloatType,
        DoubleType,
        StringType,
        BinaryType,
        TimestampType,
    )
    from pyspark.sql.types import StructField, StructType

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)
    assert isinstance(DataType.datetime.to_spark(), TimestampType)

    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True) for t in schema.input_types()]
    )
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # round-trip: a Spark DataFrame built with the schema infers back to it
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(
        pandas_df_with_all_types, schema=actual_spark_schema
    )
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.inputs])
    expected_spark_schema = StructType(
        [
            StructField(str(i), t.to_spark(), True)
            for i, t in enumerate(schema.input_types())
        ]
    )
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
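# --- Hedged example (import path is an assumption) --------------------------
# The Schema / ColSpec / DataType API exercised above matches the one in
# mlflow.types.schema; assuming that is where these names come from, a named
# schema converts to a Spark StructType like so:
from mlflow.types.schema import ColSpec, DataType, Schema

schema = Schema([ColSpec(DataType.long, "id"), ColSpec(DataType.string, "name")])
print(schema.as_spark_schema())
# -> StructType with fields `id: long` and `name: string`, both nullable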
from collections import OrderedDict

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import StructType


def coerce_to_schema(normalized_df: DataFrame, to_schema: StructType,
                     aliases: dict) -> DataFrame:
    """
    Common transformation to ensure a DataFrame adheres to the correct schema
    for consistency across all Spark jobs. Handles column ordering, renaming,
    and type coercion.

    Currently uses a `collections.OrderedDict` to build the mapping; this can
    be switched to a native Python dict once our EMR cluster and AWS Data
    Pipeline run Python >= 3.6. Python 3.4 dicts do not preserve insertion
    order, which matters here because the mapping is translated into a select
    statement.

    :param normalized_df: DataFrame that represents the final output of a
        transformation but doesn't yet match the required schema exactly.
    :param to_schema: The `pyspark.sql.types.StructType` schema into which the
        input DataFrame should be converted.
    :param aliases: Dictionary mapping schema column names to the
        corresponding columns in the input DataFrame.
    :return: A new DataFrame that matches the required schema.
    """
    # Look up each target column's type from the schema's JSON representation.
    types = {f["name"]: f["type"] for f in to_schema.jsonValue()["fields"]}
    mapping = OrderedDict()
    for name in to_schema.names:
        if name not in aliases:
            selected = name
        else:
            # `snek` is a project helper applied to the aliased source column
            # name (presumably normalizing it, e.g. to snake_case).
            selected = snek(aliases[name])
        mapping[name] = F.col(selected).cast(types[name])
    # `build_col_expr` is a project helper that turns the ordered mapping
    # into the column expressions passed to select().
    expr = build_col_expr(mapping)
    coerced = normalized_df.select(expr)
    return coerced
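# --- Hedged usage example (illustrative data and names) ---------------------
# Assumes `coerce_to_schema` and its helpers (`snek`, `build_col_expr`) are
# importable from the surrounding project; the session, input rows, target
# schema, and aliases below are made up for the sketch. Alias values are
# already snake_cased so the assumed `snek` normalization is a no-op.
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("1", "Ada")], ["user_id", "user_name"])
target = StructType([
    StructField("id", LongType()),
    StructField("name", StringType()),
])
# Keys are schema column names; values are source DataFrame column names.
coerced = coerce_to_schema(df, target, aliases={"id": "user_id", "name": "user_name"})
coerced.printSchema()  # id: long, name: string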
def should_successfully_merge_struct_types_with(
    schema_a: StructType, schema_b: StructType, expected_schema: StructType
):
    # given ^

    # when
    merged_schema = merge_schemas(schema_a, schema_b)

    # then
    assert merged_schema.jsonValue() == expected_schema.jsonValue()
    # ...expect distinct objects
    assert merged_schema is not schema_a
    assert merged_schema is not schema_b
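# --- Hedged sketch (not the project's implementation) -----------------------
# One plausible shape for the `merge_schemas` under test: union the fields of
# both StructTypes with schema_b winning on name clashes, and return a
# brand-new StructType so the identity assertions above hold.
from pyspark.sql.types import StructType


def merge_schemas_sketch(schema_a: StructType, schema_b: StructType) -> StructType:
    fields = {f.name: f for f in schema_a.fields}
    fields.update({f.name: f for f in schema_b.fields})  # b overrides a on clashes
    return StructType(list(fields.values()))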