def __init__(
    self,
    column: AutoMapperColumnOrColumnLikeType,
    check: Union[AutoMapperAnyDataType, List[AutoMapperAnyDataType]],
    value: _TAutoMapperDataType,
    else_: Optional[_TAutoMapperDataType] = None,
):
    super().__init__()

    self.column: AutoMapperColumnOrColumnLikeType = column
    if isinstance(check, list):
        self.check: Union[AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]] = [
            a
            if isinstance(a, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value=a)
            for a in check
        ]
    else:
        self.check = (
            check
            if isinstance(check, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value=check)
        )
    self.value: AutoMapperDataTypeBase = (
        value
        if isinstance(value, AutoMapperDataTypeBase)
        else AutoMapperValueParser.parse_value(value=value)
    )
    if else_:
        # parse the else_ argument itself (not value) so a plain literal is wrapped correctly
        self.else_: AutoMapperDataTypeBase = (
            cast(AutoMapperDataTypeBase, else_)
            if isinstance(else_, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value=else_)
        )
    else:
        self.else_ = AutoMapperDataTypeLiteral(None)
class AutoMapperIfRegExDataType(AutoMapperDataTypeBase, Generic[_TAutoMapperDataType]):
    """
    Returns value if the column matches the regex check, else returns else_
    """

    def __init__(
        self,
        column: AutoMapperColumnOrColumnLikeType,
        check: Union[str, List[str]],
        value: _TAutoMapperDataType,
        else_: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.column: AutoMapperColumnOrColumnLikeType = column
        self.check: Union[str, List[str]] = check
        self.value: AutoMapperDataTypeBase = (
            value
            if isinstance(value, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value)
        )
        if else_:
            # parse the else_ argument itself (not value) so a plain literal is wrapped correctly
            self.else_: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, else_)
                if isinstance(else_, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(else_)
            )
        else:
            self.else_ = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties
        )

    def get_column_spec(
        self, source_df: Optional[DataFrame], current_column: Optional[Column]
    ) -> Column:
        # rlike takes a string pattern, not a column.
        # When a list of patterns is given, OR them into a single regex
        # instead of silently matching only the first one.
        value: str = (
            "|".join(self.check) if isinstance(self.check, list) else self.check
        )
        column_spec = when(
            self.column.get_column_spec(
                source_df=source_df, current_column=current_column
            ).rlike(value),
            self.value.get_column_spec(
                source_df=source_df, current_column=current_column
            ),
        ).otherwise(
            self.else_.get_column_spec(
                source_df=source_df, current_column=current_column
            )
        )
        return column_spec
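# A minimal usage sketch for the class above. It assumes the `A.if_regex`
# helper constructs AutoMapperIfRegExDataType (the helper name is an
# assumption; constructing the class directly behaves the same way).
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

mapper = AutoMapper(view="members", source_view="patients", keys=["member_id"]).columns(
    # the list of patterns is OR'd into the single regex "^5.*|^6.*"
    age_bucket=A.if_regex(
        column=A.column("my_age"),
        check=["^5.*", "^6.*"],
        value=A.text("fifties-or-sixties"),
        else_=A.text("other"),
    )
)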
def test_auto_mapper_amount(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "67.67"),
            (3, "Alex", "Hearn", "1286782.17"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.amount(A.column("my_age")),
        null_col=A.amount(AutoMapperDataTypeLiteral(None)),
    )

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("double").alias("age")
    )
    assert_compare_expressions(
        sql_expressions["null_col"], lit(None).cast("double").alias("null_col")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert approx(
        result_df.where("member_id == 1").select("age", "null_col").collect()[0][:]
    ) == (approx(54.45), None)
    assert approx(
        result_df.where("member_id == 2").select("age", "null_col").collect()[0][:]
    ) == (approx(67.67), None)
    # Ensure an exact match in situations where float arithmetic errors might occur
    assert (
        str(result_df.where("member_id == 3").select("age").collect()[0][0])
        == "1286782.17"
    )

    assert dict(result_df.dtypes)["age"] == "double"
    assert dict(result_df.dtypes)["null_col"] == "double"
def __init__(
    self,
    check: AutoMapperColumnOrColumnLikeType,
    value: _TAutoMapperDataType,
    when_null_or_empty: Optional[_TAutoMapperDataType] = None,
):
    super().__init__()

    self.check: AutoMapperColumnOrColumnLikeType = check
    self.value: AutoMapperDataTypeBase = (
        value
        if isinstance(value, AutoMapperDataTypeBase)
        else AutoMapperValueParser.parse_value(value)
    )
    if when_null_or_empty:
        # parse the when_null_or_empty argument itself (not value)
        self.when_null: AutoMapperDataTypeBase = (
            cast(AutoMapperDataTypeBase, when_null_or_empty)
            if isinstance(when_null_or_empty, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(when_null_or_empty)
        )
    else:
        self.when_null = AutoMapperDataTypeLiteral(None)
class AutoMapperIfNotNullDataType(AutoMapperDataTypeBase, Generic[_TAutoMapperDataType]):
    """
    If check is null then returns when_null (null by default), else returns value
    """

    def __init__(
        self,
        check: AutoMapperColumnOrColumnLikeType,
        value: _TAutoMapperDataType,
        when_null: Optional[Union[AutoMapperTextLikeBase, _TAutoMapperDataType]] = None,
    ):
        super().__init__()

        self.check: AutoMapperColumnOrColumnLikeType = check
        self.value: AutoMapperDataTypeBase = (
            value
            if isinstance(value, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value=value)
        )
        if when_null:
            # parse the when_null argument itself (not value) so a plain literal is wrapped correctly
            self.when_null: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, when_null)
                if isinstance(when_null, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(value=when_null)
            )
        else:
            self.when_null = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties
        )

    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        column_spec = when(
            self.check.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ).isNull(),
            self.when_null.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ),
        ).otherwise(
            self.value.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            )
        )
        return column_spec

    @property
    def children(
        self,
    ) -> Union[AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]]:
        return [c for c in [self.value, self.when_null] if c is not None]
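# A hedged usage sketch for AutoMapperIfNotNullDataType. It assumes the
# `A.if_not_null` helper constructs this type (helper name is an assumption;
# constructing the class directly is equivalent).
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

mapper = AutoMapper(view="members", source_view="patients", keys=["member_id"]).columns(
    # last_name is passed through; when it is null, the literal "unknown" is used
    last=A.if_not_null(
        check=A.column("last_name"),
        value=A.column("last_name"),
        when_null=A.text("unknown"),
    )
)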
def test_auto_mapper_number(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
            (3, "Old", "Methusela", "131026061001"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.number(A.column("my_age")),
        null_field=A.number(AutoMapperDataTypeLiteral(None)),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) in (
        str(col("b.my_age").cast("int").alias("age")),
        str(col("b.my_age").cast("long").alias("age")),
    )
    assert str(sql_expressions["null_field"]) == str(
        lit(None).cast("long").alias("null_field")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select("age").collect()[0][0] == 67
    assert (
        result_df.where("member_id == 3").select("age").collect()[0][0]
        == 131026061001
    )
    assert (
        result_df.where("member_id == 1").select("null_field").collect()[0][0] is None
    )

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def text(
    value: Union[AutoMapperNativeSimpleType, AutoMapperTextInputType]
) -> AutoMapperTextLikeBase:
    """
    Specifies that the value parameter should be used as a literal text

    :param value: text value
    :return: a text automapper type
    """
    return AutoMapperDataTypeLiteral(value, StringType())
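# A brief usage sketch for text() above: it wraps a plain Python value as a
# StringType literal. The import path of the `A` helper facade is an
# assumption; the function itself is defined above.
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

greeting = A.text("hello")  # maps to lit("hello") with an explicit StringType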
def __init__(
    self,
    column: AutoMapperColumnOrColumnLikeType,
    check: Union[str, List[str]],
    value: _TAutoMapperDataType,
    else_: Optional[_TAutoMapperDataType] = None,
):
    super().__init__()

    self.column: AutoMapperColumnOrColumnLikeType = column
    self.check: Union[str, List[str]] = check
    self.value: AutoMapperDataTypeBase = (
        value
        if isinstance(value, AutoMapperDataTypeBase)
        else AutoMapperValueParser.parse_value(value)
    )
    if else_:
        # parse the else_ argument itself (not value) so a plain literal is wrapped correctly
        self.else_: AutoMapperDataTypeBase = (
            cast(AutoMapperDataTypeBase, else_)
            if isinstance(else_, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(else_)
        )
    else:
        self.else_ = AutoMapperDataTypeLiteral(None)
def to_text(self: _TAutoMapperDataType) -> "AutoMapperTextLikeBase": """ Specifies that the value parameter should be used as a literal text :param self: Set by Python. No need to pass. :return: a text automapper type :example: A.column("paid").to_text() """ return AutoMapperDataTypeLiteral(self, StringType())
class AutoMapperIfNotNullOrEmptyDataType(
    AutoMapperDataTypeBase, Generic[_TAutoMapperDataType]
):
    """
    If check is null or an empty string then returns when_null_or_empty
    (null by default), else returns value
    """

    def __init__(
        self,
        check: AutoMapperColumnOrColumnLikeType,
        value: _TAutoMapperDataType,
        when_null_or_empty: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.check: AutoMapperColumnOrColumnLikeType = check
        self.value: AutoMapperDataTypeBase = (
            value
            if isinstance(value, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value)
        )
        if when_null_or_empty:
            # parse the when_null_or_empty argument itself (not value)
            self.when_null: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, when_null_or_empty)
                if isinstance(when_null_or_empty, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(when_null_or_empty)
            )
        else:
            self.when_null = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties
        )

    def get_column_spec(
        self, source_df: Optional[DataFrame], current_column: Optional[Column]
    ) -> Column:
        column_spec = when(
            self.check.get_column_spec(
                source_df=source_df, current_column=current_column
            ).isNull()
            | self.check.get_column_spec(
                source_df=source_df, current_column=current_column
            ).eqNullSafe(""),
            self.when_null.get_column_spec(
                source_df=source_df, current_column=current_column
            ),
        ).otherwise(
            self.value.get_column_spec(
                source_df=source_df, current_column=current_column
            )
        )
        return column_spec
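# A hedged usage sketch for AutoMapperIfNotNullOrEmptyDataType. It assumes an
# `A.if_not_null_or_empty` helper constructs this type (the helper name is an
# assumption; the null-or-empty-string test comes from get_column_spec above).
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

mapper = AutoMapper(view="members", source_view="patients", keys=["member_id"]).columns(
    # both "" and null fall back to the default text
    first=A.if_not_null_or_empty(
        check=A.column("first_name"),
        value=A.column("first_name"),
        when_null_or_empty=A.text("n/a"),
    )
)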
def __init__(
    self,
    check: AutoMapperColumnOrColumnLikeType,
    value: _TAutoMapperDataType,
    when_null: Optional[Union[AutoMapperTextLikeBase, _TAutoMapperDataType]] = None,
):
    super().__init__()

    self.check: AutoMapperColumnOrColumnLikeType = check
    self.value: AutoMapperDataTypeBase = (
        value
        if isinstance(value, AutoMapperDataTypeBase)
        else AutoMapperValueParser.parse_value(value=value)
    )
    if when_null:
        # parse the when_null argument itself (not value) so a plain literal is wrapped correctly
        self.when_null: AutoMapperDataTypeBase = (
            cast(AutoMapperDataTypeBase, when_null)
            if isinstance(when_null, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value=when_null)
        )
    else:
        self.when_null = AutoMapperDataTypeLiteral(None)
def _parse_value(
    value: Union[Dict[str, Any], List[Any], AutoMapperAnyDataType]
) -> AutoMapperDataTypeBase:
    # convert any short syntax to long syntax
    if isinstance(value, str):
        if len(value) > 0 and value[0] == "[":
            from spark_auto_mapper.data_types.column import AutoMapperDataTypeColumn

            return AutoMapperDataTypeColumn(
                value=value[1:-1]
            )  # strip the surrounding [ and ]
        else:
            from spark_auto_mapper.data_types.literal import (
                AutoMapperDataTypeLiteral,
            )

            return AutoMapperDataTypeLiteral(value=value)

    if isinstance(value, int):
        from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

        return AutoMapperDataTypeLiteral(value=value, type_=IntegerType())

    if isinstance(value, float):
        from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

        return AutoMapperDataTypeLiteral(value=value, type_=FloatType())

    # datetime must be checked before date since datetime is a subclass of date;
    # otherwise this branch is unreachable.  A datetime carries a time component,
    # so it maps to TimestampType rather than DateType.
    if isinstance(value, datetime):
        from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

        return AutoMapperDataTypeLiteral(value=value, type_=TimestampType())

    if isinstance(value, date):
        from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

        return AutoMapperDataTypeLiteral(value=value, type_=DateType())

    # if value is a dict then wrap with struct
    if isinstance(value, dict):
        from spark_auto_mapper.data_types.complex.struct_type import (
            AutoMapperDataTypeStruct,
        )

        return AutoMapperDataTypeStruct(value=value)

    if isinstance(value, list):
        from spark_auto_mapper.data_types.list import AutoMapperList

        # ignore the type because we're using a list and it cannot ensure the type of the list
        return AutoMapperList(value=value)  # type: ignore

    if isinstance(value, AutoMapperDataTypeBase):
        return value

    if value is None:
        from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

        return AutoMapperDataTypeLiteral(None)

    if isinstance(value, Column):
        return AutoMapperDataTypeColumnWrapper(value)

    raise ValueError(f"{type(value)} is not supported for {value}")
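# A small sketch of the short-syntax rules implemented above, called through
# the public wrapper used elsewhere in this codebase (the module path is an
# assumption; the behavior follows _parse_value directly):
from spark_auto_mapper.helpers.value_parser import AutoMapperValueParser

col_ref = AutoMapperValueParser.parse_value(value="[last_name]")  # "[...]" -> column reference
literal = AutoMapperValueParser.parse_value(value="Qureshi")      # plain string -> string literal
number = AutoMapperValueParser.parse_value(value=42)              # int -> literal typed IntegerType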
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    column_spec = self.column.get_column_spec(
        source_df=source_df,
        current_column=current_column,
        parent_columns=parent_columns,
    )
    # noinspection Mypy,PyProtectedMember
    col_name: str = (
        column_spec._jc.toString()  # type: ignore
    )  # Get the Spark representation of the column
    try:
        # Force the Spark analyzer to confirm that the column/expression is valid.
        # This does not actually compute anything; it just triggers the analyzer
        # to check validity, which is what we want.  If AnalysisException is
        # thrown, fall back to the default; otherwise we can proceed.
        if source_df:
            source_df.selectExpr(col_name.replace("b.", ""))
        # the column exists, so use if_exists
        if self.if_exists_column:
            column_spec = self.if_exists_column.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            )
    except AnalysisException:
        if self.if_not_exists:
            column_spec = self.if_not_exists.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            )
        else:
            column_spec = AutoMapperDataTypeLiteral(None).get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            )
    return column_spec
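# A hedged usage sketch for the exists-check above. The `A.if_column_exists`
# helper name and its parameter names are assumptions (the snippet only shows
# get_column_spec); the behavior is taken from the code: if the column resolves,
# the if_exists expression is used, otherwise if_not_exists (or a null literal).
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

mapper = AutoMapper(view="members", source_view="patients", keys=["member_id"]).columns(
    middle=A.if_column_exists(
        column=A.column("middle_name"),
        if_exists=A.column("middle_name"),
        if_not_exists=A.text(""),
    )
)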
def to_text(self: _TAutoMapperDataType) -> 'AutoMapperTextLikeBase':
    """
    Specifies that the value parameter should be used as a literal text

    :return: a text automapper type
    """
    return AutoMapperDataTypeLiteral(self, StringType())
class AutoMapperIfDataType(AutoMapperDataTypeBase, Generic[_TAutoMapperDataType]):
    """
    Returns value if the column matches check, else returns else_
    """

    @property
    def children(
        self,
    ) -> Union[AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]]:
        return self.value

    def __init__(
        self,
        column: AutoMapperColumnOrColumnLikeType,
        check: Union[AutoMapperAnyDataType, List[AutoMapperAnyDataType]],
        value: _TAutoMapperDataType,
        else_: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.column: AutoMapperColumnOrColumnLikeType = column
        if isinstance(check, list):
            self.check: Union[AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]] = [
                a
                if isinstance(a, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(value=a)
                for a in check
            ]
        else:
            self.check = (
                check
                if isinstance(check, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(value=check)
            )
        self.value: AutoMapperDataTypeBase = (
            value
            if isinstance(value, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value=value)
        )
        if else_:
            # parse the else_ argument itself (not value) so a plain literal is wrapped correctly
            self.else_: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, else_)
                if isinstance(else_, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(value=else_)
            )
        else:
            self.else_ = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties
        )

    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        if isinstance(self.check, list):
            column_spec = when(
                self.column.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ).isin(
                    *[
                        c.get_column_spec(
                            source_df=source_df,
                            current_column=current_column,
                            parent_columns=parent_columns,
                        )
                        for c in self.check
                    ]
                ),
                self.value.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ),
            ).otherwise(
                self.else_.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                )
            )
        else:
            column_spec = when(
                self.column.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ).eqNullSafe(
                    self.check.get_column_spec(
                        source_df=source_df,
                        current_column=current_column,
                        parent_columns=parent_columns,
                    )
                ),
                self.value.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ),
            ).otherwise(
                self.else_.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                )
            )
        return column_spec
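# A hedged usage sketch for AutoMapperIfDataType. It assumes the `A.if_`
# helper constructs this type (the helper name is an assumption). Per
# get_column_spec above, a list check compiles to isin() and a scalar check
# to eqNullSafe().
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A

mapper = AutoMapper(view="members", source_view="patients", keys=["member_id"]).columns(
    is_imran=A.if_(
        column=A.column("first_name"),
        check=["Imran", "imran"],  # list -> isin("Imran", "imran")
        value=A.text("yes"),
        else_=A.text("no"),
    )
)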
def test_auto_mapper_datatype_literal(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperDataTypeLiteral(None),
        dst3=AutoMapperDataTypeLiteral(""),
        dst4=AutoMapperDataTypeLiteral("literal"),
        dst5=AutoMapperDataTypeLiteral(1234),
        dst6=AutoMapperDataTypeLiteral(0),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["dst1"]) == str(lit("src1").alias("dst1"))
    assert str(sql_expressions["dst2"]) == str(lit(None).alias("dst2"))
    assert str(sql_expressions["dst3"]) == str(lit("").alias("dst3"))
    assert str(sql_expressions["dst4"]) == str(lit("literal").alias("dst4"))
    assert str(sql_expressions["dst5"]) == str(lit(1234).alias("dst5"))
    assert str(sql_expressions["dst6"]) == str(lit(0).alias("dst6"))

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    result = result_df.collect()
    assert result == [
        Row(
            member_id=1,
            dst1="src1",
            dst2=None,
            dst3="",
            dst4="literal",
            dst5=1234,
            dst6=0,
        ),
        Row(
            member_id=2,
            dst1="src1",
            dst2=None,
            dst3="",
            dst4="literal",
            dst5=1234,
            dst6=0,
        ),
    ]