Example no. 1
    def __init__(
        self,
        column: AutoMapperColumnOrColumnLikeType,
        check: Union[AutoMapperAnyDataType, List[AutoMapperAnyDataType]],
        value: _TAutoMapperDataType,
        else_: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.column: AutoMapperColumnOrColumnLikeType = column
        if isinstance(check, list):
            self.check: Union[
                AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]
            ] = [
                a if isinstance(a, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(value=a)
                for a in check
            ]
        else:
            self.check = (check if isinstance(check, AutoMapperDataTypeBase)
                          else AutoMapperValueParser.parse_value(value=check))
        self.value: AutoMapperDataTypeBase = (
            value if isinstance(value, AutoMapperDataTypeBase) else
            AutoMapperValueParser.parse_value(value=value))
        if else_ is not None:
            # normalize else_ itself (not value); `is not None` keeps falsy
            # literals such as 0 or "" usable as fallbacks
            self.else_: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, else_) if isinstance(
                    else_, AutoMapperDataTypeBase) else
                AutoMapperValueParser.parse_value(value=else_))
        else:
            self.else_ = AutoMapperDataTypeLiteral(None)
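The constructor above only normalizes its inputs so the rest of the class can assume AutoMapperDataTypeBase everywhere; the actual comparison is built later in get_column_spec (see Example no. 15). A minimal standalone sketch of that normalize-or-parse pattern, with a hypothetical Wrapped class and parse_value function standing in for AutoMapperDataTypeBase and AutoMapperValueParser.parse_value:

from typing import Any, List, Union


class Wrapped:
    """Hypothetical stand-in for AutoMapperDataTypeBase."""
    def __init__(self, raw: Any) -> None:
        self.raw = raw


def parse_value(value: Any) -> Wrapped:
    # hypothetical stand-in for AutoMapperValueParser.parse_value
    return Wrapped(value)


def normalize(check: Union[Any, List[Any]]) -> Union[Wrapped, List[Wrapped]]:
    # a list is normalized element by element; already-wrapped values pass through
    if isinstance(check, list):
        return [c if isinstance(c, Wrapped) else parse_value(c) for c in check]
    return check if isinstance(check, Wrapped) else parse_value(check)


assert isinstance(normalize("active"), Wrapped)
assert all(isinstance(c, Wrapped) for c in normalize(["active", Wrapped("x")]))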
Example no. 2
class AutoMapperIfRegExDataType(AutoMapperDataTypeBase,
                                Generic[_TAutoMapperDataType]):
    """
    Returns `value` if the column matches the regex check(s); otherwise returns `else_`.
    """
    def __init__(
        self,
        column: AutoMapperColumnOrColumnLikeType,
        check: Union[str, List[str]],
        value: _TAutoMapperDataType,
        else_: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.column: AutoMapperColumnOrColumnLikeType = column
        self.check: Union[str, List[str]] = check
        self.value: AutoMapperDataTypeBase = (
            value if isinstance(value, AutoMapperDataTypeBase) else
            AutoMapperValueParser.parse_value(value))
        if else_ is not None:
            # normalize else_ itself, not `value`
            self.else_: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, else_) if isinstance(
                    else_, AutoMapperDataTypeBase) else
                AutoMapperValueParser.parse_value(else_))
        else:
            self.else_ = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties)

    def get_column_spec(self, source_df: Optional[DataFrame],
                        current_column: Optional[Column]) -> Column:
        # rlike takes a string pattern, not a column
        if isinstance(self.check, list):
            # combine all alternatives into one regex instead of silently
            # matching only the first entry
            value: str = "|".join(self.check)
        else:
            value = self.check
        column_spec = when(
            self.column.get_column_spec(
                source_df=source_df,
                current_column=current_column).rlike(value),
            self.value.get_column_spec(source_df=source_df,
                                       current_column=current_column),
        ).otherwise(
            self.else_.get_column_spec(source_df=source_df,
                                       current_column=current_column))

        return column_spec
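For reference, the column spec this class builds is an ordinary PySpark when/rlike chain. A minimal runnable sketch with made-up column names and patterns; only the rlike and alternation behavior is the point here:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("active",), ("closed",), (None,)], ["status"])

# rlike takes a string pattern; a list of checks becomes one alternation
pattern = "|".join(["^act", "^pend"])
expr = when(col("status").rlike(pattern), lit("open")).otherwise(lit("other"))

df.select(col("status"), expr.alias("bucket")).show()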
Example no. 3
def test_auto_mapper_amount(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54.45"),
            (2, "Vidal", "Michael", "67.67"),
            (3, "Alex", "Hearn", "1286782.17"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        age=A.amount(A.column("my_age")),
        null_col=A.amount(AutoMapperDataTypeLiteral(None)),
    )

    debug_text: str = mapper.to_debug_string()
    print(debug_text)

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"], col("b.my_age").cast("double").alias("age")
    )

    assert_compare_expressions(
        sql_expressions["null_col"], lit(None).cast("double").alias("null_col")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    row1 = result_df.where("member_id == 1").select("age", "null_col").collect()[0]
    assert row1["age"] == approx(54.45)
    assert row1["null_col"] is None
    row2 = result_df.where("member_id == 2").select("age", "null_col").collect()[0]
    assert row2["age"] == approx(67.67)
    assert row2["null_col"] is None
    # Ensuring exact match in situations in which float arithmetic errors might occur
    assert (
        str(result_df.where("member_id == 3").select("age").collect()[0][0])
        == "1286782.17"
    )

    assert dict(result_df.dtypes)["age"] == "double"
    assert dict(result_df.dtypes)["null_col"] == "double"
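The assertions above lean on Spark's string-to-double cast, which is what A.amount compiles down to per the expected expression col("b.my_age").cast("double"). A quick sketch of that cast in isolation (the data is illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("54.45",), ("1286782.17",), ("not a number",)], ["my_age"])

# non-numeric strings cast to null rather than raising
df.select(col("my_age"), col("my_age").cast("double").alias("age")).show()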
Example no. 4
    def __init__(self,
                 check: AutoMapperColumnOrColumnLikeType,
                 value: _TAutoMapperDataType,
                 when_null_or_empty: Optional[_TAutoMapperDataType] = None):
        super().__init__()

        self.check: AutoMapperColumnOrColumnLikeType = check
        self.value: AutoMapperDataTypeBase = value \
            if isinstance(value, AutoMapperDataTypeBase) \
            else AutoMapperValueParser.parse_value(value)
        if when_null_or_empty is not None:
            # normalize the fallback itself, not `value`
            self.when_null: AutoMapperDataTypeBase = cast(AutoMapperDataTypeBase, when_null_or_empty) \
                if isinstance(when_null_or_empty, AutoMapperDataTypeBase) \
                else AutoMapperValueParser.parse_value(when_null_or_empty)
        else:
            self.when_null = AutoMapperDataTypeLiteral(None)
Example no. 5
class AutoMapperIfNotNullDataType(AutoMapperDataTypeBase,
                                  Generic[_TAutoMapperDataType]):
    """
    If `check` is null, returns `when_null` (null by default); otherwise returns `value`.
    """
    def __init__(
        self,
        check: AutoMapperColumnOrColumnLikeType,
        value: _TAutoMapperDataType,
        when_null: Optional[Union[AutoMapperTextLikeBase,
                                  _TAutoMapperDataType]] = None,
    ):
        super().__init__()

        self.check: AutoMapperColumnOrColumnLikeType = check
        self.value: AutoMapperDataTypeBase = (
            value if isinstance(value, AutoMapperDataTypeBase) else
            AutoMapperValueParser.parse_value(value=value))
        if when_null is not None:
            # normalize the fallback itself, not `value`
            self.when_null: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, when_null) if isinstance(
                    when_null, AutoMapperDataTypeBase) else
                AutoMapperValueParser.parse_value(value=when_null))
        else:
            self.when_null = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties)

    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        column_spec = when(
            self.check.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ).isNull(),
            self.when_null.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ),
        ).otherwise(
            self.value.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ))

        return column_spec

    @property
    def children(
        self,
    ) -> Union[AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]]:
        return [c for c in [self.value, self.when_null] if c is not None]
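Stripped of the AutoMapper plumbing, the column spec above is a plain null dispatch. A minimal sketch with hypothetical column names:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a", 1), (None, 2)], ["check_col", "value_col"])

# when the check column is null, emit the fallback; otherwise the value
expr = when(col("check_col").isNull(), lit(None)).otherwise(col("value_col"))
df.select(col("check_col"), expr.alias("result")).show()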
Example no. 6
def test_auto_mapper_number(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", "67"),
            (3, "Old", "Methusela", "131026061001"),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        age=A.number(A.column("my_age")),
        null_field=A.number(AutoMapperDataTypeLiteral(None)),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) in (
        str(col("b.my_age").cast("int").alias("age")),
        str(col("b.my_age").cast("long").alias("age")),
    )

    assert str(sql_expressions["null_field"]) == str(
        lit(None).cast("long").alias("null_field"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == 54
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] == 67
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            131026061001)
    assert (
        result_df.where("member_id == 1").select("null_field").collect()[0][0]
        is None)

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
Example no. 7
def text(
    value: Union[AutoMapperNativeSimpleType, AutoMapperTextInputType]
) -> AutoMapperTextLikeBase:
    """
    Specifies that the value parameter should be used as a literal text
    :param value: text value
    :return: a text automapper type
    """
    return AutoMapperDataTypeLiteral(value, StringType())
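As a usage sketch: with an explicit StringType, the literal plausibly resolves to a typed lit() column, so it composes like any other expression. The exact generated expression may differ; this only shows the shape:

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

spark = SparkSession.builder.master("local[1]").getOrCreate()

# roughly what a StringType literal resolves to: a typed lit() column
expr = lit("hello").cast(StringType())
spark.range(1).select(expr.alias("greeting")).show()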
Example no. 8
    def __init__(self,
                 column: AutoMapperColumnOrColumnLikeType,
                 check: Union[str, List[str]],
                 value: _TAutoMapperDataType,
                 else_: Optional[_TAutoMapperDataType] = None):
        super().__init__()

        self.column: AutoMapperColumnOrColumnLikeType = column
        self.check: Union[str, List[str]] = check
        self.value: AutoMapperDataTypeBase = value \
            if isinstance(value, AutoMapperDataTypeBase) \
            else AutoMapperValueParser.parse_value(value)
        if else_ is not None:
            # normalize else_ itself, not `value`
            self.else_: AutoMapperDataTypeBase = cast(AutoMapperDataTypeBase, else_) \
                if isinstance(else_, AutoMapperDataTypeBase) \
                else AutoMapperValueParser.parse_value(else_)
        else:
            self.else_ = AutoMapperDataTypeLiteral(None)
Example no. 9
    def to_text(self: _TAutoMapperDataType) -> "AutoMapperTextLikeBase":
        """
        Specifies that the value parameter should be used as a literal text

        :param self: Set by Python.  No need to pass.
        :return: a text automapper type
        :example: A.column("paid").to_text()
        """
        return AutoMapperDataTypeLiteral(self, StringType())
Example no. 10
class AutoMapperIfNotNullOrEmptyDataType(
    AutoMapperDataTypeBase, Generic[_TAutoMapperDataType]
):
    """
    If `check` is null or empty, returns `when_null_or_empty` (null by default); otherwise returns `value`.
    """

    def __init__(
        self,
        check: AutoMapperColumnOrColumnLikeType,
        value: _TAutoMapperDataType,
        when_null_or_empty: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.check: AutoMapperColumnOrColumnLikeType = check
        self.value: AutoMapperDataTypeBase = (
            value
            if isinstance(value, AutoMapperDataTypeBase)
            else AutoMapperValueParser.parse_value(value)
        )
        if when_null_or_empty is not None:
            # normalize the fallback itself, not `value`
            self.when_null: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, when_null_or_empty)
                if isinstance(when_null_or_empty, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(when_null_or_empty)
            )
        else:
            self.when_null = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties
        )

    def get_column_spec(
        self, source_df: Optional[DataFrame], current_column: Optional[Column]
    ) -> Column:
        column_spec = when(
            self.check.get_column_spec(
                source_df=source_df, current_column=current_column
            ).isNull()
            | self.check.get_column_spec(
                source_df=source_df, current_column=current_column
            ).eqNullSafe(""),
            self.when_null.get_column_spec(
                source_df=source_df, current_column=current_column
            ),
        ).otherwise(
            self.value.get_column_spec(
                source_df=source_df, current_column=current_column
            )
        )

        return column_spec
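The predicate above pairs isNull with eqNullSafe("") so a null check value compares as false instead of propagating null through the OR. A standalone sketch of the same predicate:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("a",), ("",), (None,)], ["check_col"])

# null OR empty-string routes to the fallback branch
is_missing = col("check_col").isNull() | col("check_col").eqNullSafe("")
expr = when(is_missing, lit("fallback")).otherwise(col("check_col"))
df.select(col("check_col"), expr.alias("result")).show()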
Example no. 11
    def __init__(
        self,
        check: AutoMapperColumnOrColumnLikeType,
        value: _TAutoMapperDataType,
        when_null: Optional[Union[AutoMapperTextLikeBase,
                                  _TAutoMapperDataType]] = None,
    ):
        super().__init__()

        self.check: AutoMapperColumnOrColumnLikeType = check
        self.value: AutoMapperDataTypeBase = (
            value if isinstance(value, AutoMapperDataTypeBase) else
            AutoMapperValueParser.parse_value(value=value))
        if when_null is not None:
            # normalize the fallback itself, not `value`
            self.when_null: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, when_null) if isinstance(
                    when_null, AutoMapperDataTypeBase) else
                AutoMapperValueParser.parse_value(value=when_null))
        else:
            self.when_null = AutoMapperDataTypeLiteral(None)
Example no. 12
    def _parse_value(
        value: Union[Dict[str, Any], List[Any], AutoMapperAnyDataType]
    ) -> AutoMapperDataTypeBase:
        # convert any short syntax to long syntax
        if isinstance(value, str):
            if value.startswith("["):
                from spark_auto_mapper.data_types.column import AutoMapperDataTypeColumn

                return AutoMapperDataTypeColumn(
                    value=value[1:-1])  # strip the surrounding [ ] to get the column name
            else:
                from spark_auto_mapper.data_types.literal import (
                    AutoMapperDataTypeLiteral, )

                return AutoMapperDataTypeLiteral(value=value)

        if isinstance(value, int):
            from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

            return AutoMapperDataTypeLiteral(value=value, type_=IntegerType())

        if isinstance(value, float):
            from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

            return AutoMapperDataTypeLiteral(value=value, type_=FloatType())

        # datetime must be checked before date: datetime is a subclass of
        # date, so the date branch would otherwise shadow it
        if isinstance(value, datetime):
            from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

            # a datetime keeps its time component via TimestampType
            return AutoMapperDataTypeLiteral(value=value, type_=TimestampType())

        if isinstance(value, date):
            from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

            return AutoMapperDataTypeLiteral(value=value, type_=DateType())

        # if value is a dict then wrap with struct
        if isinstance(value, dict):
            from spark_auto_mapper.data_types.complex.struct_type import (
                AutoMapperDataTypeStruct, )

            return AutoMapperDataTypeStruct(value=value)
        if isinstance(value, list):
            from spark_auto_mapper.data_types.list import AutoMapperList

            # type is ignored: a plain list carries no element type information
            return AutoMapperList(value=value)  # type: ignore

        if isinstance(value, AutoMapperDataTypeBase):
            return value

        if value is None:
            from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral

            return AutoMapperDataTypeLiteral(None)

        if isinstance(value, Column):
            return AutoMapperDataTypeColumnWrapper(value)

        raise ValueError(f"{type(value)} is not supported for {value}")
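The short syntax is the part worth remembering: a string wrapped in square brackets parses as a column reference, while a bare string stays a literal. A hedged usage sketch; the import path for AutoMapperValueParser is an assumption (the column and literal imports appear in the snippet itself):

# NOTE: the AutoMapperValueParser import path below is assumed; adjust it to
# wherever the class lives in your checkout.
from spark_auto_mapper.data_types.column import AutoMapperDataTypeColumn
from spark_auto_mapper.data_types.literal import AutoMapperDataTypeLiteral
from spark_auto_mapper.helpers.value_parser import AutoMapperValueParser

# "[my_col]" is short syntax for a column reference
parsed = AutoMapperValueParser.parse_value(value="[my_col]")
assert isinstance(parsed, AutoMapperDataTypeColumn)

# a bare string stays a literal
parsed = AutoMapperValueParser.parse_value(value="my text")
assert isinstance(parsed, AutoMapperDataTypeLiteral)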
Example no. 13
    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        column_spec = self.column.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        )
        # noinspection Mypy,PyProtectedMember
        col_name: str = (
            column_spec._jc.toString()  # type: ignore
        )  # Get the Spark representation of the column
        try:
            # Force the Spark analyzer to confirm that the column/expression is
            # valid. This does not actually compute anything; it just triggers
            # the analyzer, which is all we need. If an AnalysisException is
            # thrown, fall back to the default; otherwise we can proceed.
            if source_df is not None:
                source_df.selectExpr(col_name.replace("b.", ""))
                # the column exists, so use the if_exists branch
                if self.if_exists_column:
                    column_spec = self.if_exists_column.get_column_spec(
                        source_df=source_df,
                        current_column=current_column,
                        parent_columns=parent_columns,
                    )
        except AnalysisException:
            if self.if_not_exists:
                column_spec = self.if_not_exists.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                )
            else:
                column_spec = AutoMapperDataTypeLiteral(None).get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                )
        return column_spec
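The try/except probe generalizes to any "does this expression resolve?" question. A minimal standalone sketch of the same pattern in plain PySpark; selectExpr triggers the analyzer eagerly, so the exception surfaces without computing anything:

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, "a")], ["id", "name"])


def expression_resolves(df: DataFrame, expr: str) -> bool:
    # analysis happens when the plan is built; no job is run
    try:
        df.selectExpr(expr)
        return True
    except AnalysisException:
        return False


assert expression_resolves(df, "name")
assert not expression_resolves(df, "no_such_column")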
Example no. 14
    def to_text(self: _TAutoMapperDataType) -> "AutoMapperTextLikeBase":
        """
        Specifies that the value parameter should be used as a literal text
        :return: a text automapper type
        """
        return AutoMapperDataTypeLiteral(self, StringType())
Example no. 15
class AutoMapperIfDataType(AutoMapperDataTypeBase,
                           Generic[_TAutoMapperDataType]):
    """
    Returns `value` if the check passes; otherwise returns `else_`.
    """
    @property
    def children(
            self
    ) -> Union[AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]]:
        return self.value

    def __init__(
        self,
        column: AutoMapperColumnOrColumnLikeType,
        check: Union[AutoMapperAnyDataType, List[AutoMapperAnyDataType]],
        value: _TAutoMapperDataType,
        else_: Optional[_TAutoMapperDataType] = None,
    ):
        super().__init__()

        self.column: AutoMapperColumnOrColumnLikeType = column
        if isinstance(check, list):
            self.check: Union[
                AutoMapperDataTypeBase, List[AutoMapperDataTypeBase]
            ] = [
                a if isinstance(a, AutoMapperDataTypeBase)
                else AutoMapperValueParser.parse_value(value=a)
                for a in check
            ]
        else:
            self.check = (check if isinstance(check, AutoMapperDataTypeBase)
                          else AutoMapperValueParser.parse_value(value=check))
        self.value: AutoMapperDataTypeBase = (
            value if isinstance(value, AutoMapperDataTypeBase) else
            AutoMapperValueParser.parse_value(value=value))
        if else_ is not None:
            # normalize else_ itself (not value); `is not None` keeps falsy
            # literals such as 0 or "" usable as fallbacks
            self.else_: AutoMapperDataTypeBase = (
                cast(AutoMapperDataTypeBase, else_) if isinstance(
                    else_, AutoMapperDataTypeBase) else
                AutoMapperValueParser.parse_value(value=else_))
        else:
            self.else_ = AutoMapperDataTypeLiteral(None)

    def include_null_properties(self, include_null_properties: bool) -> None:
        self.value.include_null_properties(
            include_null_properties=include_null_properties)

    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        if isinstance(self.check, list):
            column_spec = when(
                self.column.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ).isin(*[
                    c.get_column_spec(
                        source_df=source_df,
                        current_column=current_column,
                        parent_columns=parent_columns,
                    ) for c in self.check
                ]),
                self.value.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ),
            ).otherwise(
                self.else_.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ))
        else:
            column_spec = when(
                self.column.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ).eqNullSafe(
                    self.check.get_column_spec(
                        source_df=source_df,
                        current_column=current_column,
                        parent_columns=parent_columns,
                    )),
                self.value.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ),
            ).otherwise(
                self.else_.get_column_spec(
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                ))

        return column_spec
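The two branches compile down to isin for a list of checks and eqNullSafe for a single check. A compact sketch of both shapes with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("active",), ("pending",), (None,)], ["status"])

# list of checks -> membership test (null input falls through to otherwise)
multi = when(col("status").isin("active", "pending"), lit(1)).otherwise(lit(0))
# single check -> null-safe equality, so null never poisons the comparison
single = when(col("status").eqNullSafe(lit("active")), lit(1)).otherwise(lit(0))

df.select(col("status"), multi.alias("multi"), single.alias("single")).show()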
Example no. 16
def test_auto_mapper_datatype_literal(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperDataTypeLiteral(None),
        dst3=AutoMapperDataTypeLiteral(""),
        dst4=AutoMapperDataTypeLiteral("literal"),
        dst5=AutoMapperDataTypeLiteral(1234),
        dst6=AutoMapperDataTypeLiteral(0),
    )

    assert isinstance(mapper, AutoMapper)

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)

    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["dst1"]) == str(lit("src1").alias("dst1"))
    assert str(sql_expressions["dst2"]) == str(lit(None).alias("dst2"))
    assert str(sql_expressions["dst3"]) == str(lit("").alias("dst3"))
    assert str(sql_expressions["dst4"]) == str(lit("literal").alias("dst4"))
    assert str(sql_expressions["dst5"]) == str(lit(1234).alias("dst5"))
    assert str(sql_expressions["dst6"]) == str(lit(0).alias("dst6"))

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    result = result_df.collect()

    assert result == [
        Row(
            member_id=1,
            dst1="src1",
            dst2=None,
            dst3="",
            dst4="literal",
            dst5=1234,
            dst6=0,
        ),
        Row(
            member_id=2,
            dst1="src1",
            dst2=None,
            dst3="",
            dst4="literal",
            dst5=1234,
            dst6=0,
        ),
    ]