예제 #1
0
    def generate(self, df_schema: StructType, schema: StructType):
        """Diff an actual DataFrame schema against an expected schema.

        Both schemas are compared as their JSON representations with field
        metadata blanked out and nullability ignored; every detected
        difference is translated by the private ``__get_*`` helpers into a
        human-readable entry of the returned list.
        """

        def strip_metadata(json_schema):
            # Metadata is irrelevant for equality — blank it so it can
            # never surface as a spurious diff.
            for field in json_schema["fields"]:
                field["metadata"] = {}
            return json_schema

        expected_schema = strip_metadata(schema.jsonValue())
        actual_schema = strip_metadata(df_schema.jsonValue())

        # Nullability differences are tolerated on purpose.
        nullable_path = re.compile(r"\['nullable'\]")
        diff = DeepDiff(expected_schema,
                        actual_schema,
                        ignore_string_case=True,
                        ignore_order=True,
                        exclude_regex_paths=[nullable_path])

        if not diff:
            return []

        result = []
        if "values_changed" in diff:
            result.extend(
                self.__get_changed(expected_schema, diff["values_changed"]))
        if "type_changes" in diff:
            result.extend(
                self.__get_changed(expected_schema,
                                   diff["type_changes"],
                                   is_values=False))

        # Added/removed items share the same handler per category, so
        # dispatch them table-driven instead of four separate branches.
        for key, label in (("iterable_item_added", "unexpected field"),
                           ("iterable_item_removed", "missing field")):
            if key in diff:
                result.extend(
                    self.__get_iterable_item(expected_schema, diff[key],
                                             label))

        for key, label in (("dictionary_item_added", "Unexpected field"),
                           ("dictionary_item_removed", "Missing field")):
            if key in diff:
                result.extend(
                    self.__get_dictionary_item(expected_schema, diff[key],
                                               label))

        return result
예제 #2
0
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import (
        BooleanType,
        IntegerType,
        LongType,
        FloatType,
        DoubleType,
        StringType,
        BinaryType,
        TimestampType,
    )
    from pyspark.sql.types import StructField, StructType

    # Each MLflow DataType must map onto the corresponding Spark SQL type.
    expected_mappings = [
        (DataType.boolean, BooleanType),
        (DataType.integer, IntegerType),
        (DataType.long, LongType),
        (DataType.float, FloatType),
        (DataType.double, DoubleType),
        (DataType.string, StringType),
        (DataType.binary, BinaryType),
        (DataType.datetime, TimestampType),
    ]
    for mlflow_type, spark_cls in expected_mappings:
        assert isinstance(mlflow_type.to_spark(), spark_cls)

    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"])
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True) for t in schema.input_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # Round-trip through a Spark DataFrame: the schema inferred from the
    # DataFrame must match the one inferred from pandas.
    spark_session = pyspark.sql.SparkSession(
        pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # Unnamed columns get positional string names in the Spark schema.
    schema = Schema([ColSpec(col.type) for col in schema.inputs])
    expected_spark_schema = StructType([
        StructField(str(position), col_type.to_spark(), True)
        for position, col_type in enumerate(schema.input_types())
    ])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # A single unnamed column collapses to a bare Spark type, not a struct.
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
예제 #3
0
def coerce_to_schema(normalized_df: DataFrame, to_schema: StructType,
                     aliases: dict) -> DataFrame:
    """
    Force a DataFrame into the given target schema: select the schema's
    columns in order, rename any aliased columns, and cast every column to
    the schema's declared type.

    An `collections.OrderedDict` is used to build the mapping because the
    insertion order becomes the column order of the resulting select; once
    the runtime is guaranteed to be Python >= 3.6 (EMR / AWS Data Pipeline
    upgrade), a plain dict would preserve ordering as well.

    :param normalized_df: DataFrame holding the transformation output that
    does not yet match the required schema exactly.
    :param to_schema: The target `spark.sql.types.StructType` the DataFrame
    must be converted into.
    :param aliases: Maps schema column names to the corresponding column
    names in the input DataFrame.
    :return: A new DataFrame conforming to the target schema.
    """
    # Column name -> declared type, taken from the schema's JSON form.
    field_types = {field['name']: field['type']
                   for field in to_schema.jsonValue()['fields']}
    mapping = OrderedDict()
    for target_name in to_schema.names:
        # Aliased columns are looked up under their snake_cased source name.
        source_name = (snek(aliases[target_name])
                       if target_name in aliases else target_name)
        mapping[target_name] = F.col(source_name).cast(
            field_types[target_name])
    return normalized_df.select(build_col_expr(mapping))
예제 #4
0
    def should_successfully_merge_struct_types_with(
        schema_a: StructType, schema_b: StructType, expected_schema: StructType
    ):
        # given ^

        # when
        merged_schema = merge_schemas(schema_a, schema_b)

        # then: the merged schema is structurally identical to the expected
        # one (compared via JSON form) ...
        assert merged_schema.jsonValue() == expected_schema.jsonValue()

        # ... and is a fresh object, not either input returned in place
        assert merged_schema is not schema_a
        assert merged_schema is not schema_b