Example #1
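Note (applies to all examples below): the test functions reference a number of helpers without showing their imports. Below is a minimal sketch of the module-level setup such a test file would need; the spark_auto_mapper module paths and the clean_spark_session body are assumptions rather than code from the original project, and the Framework* loaders/transformers, ColumnSpec, ProgressLogger, the pipeline classes, MyObject/MyClass, SpiderTestClass, assert_results and assert_compare_expressions are assumed to come from spark_pipeline_framework and the test package's own modules.

import os
import shutil
from os import environ, makedirs, path
from pathlib import Path
from shutil import rmtree
from typing import Any, Dict, List

import pytest
from pyspark.sql import Column, DataFrame, SparkSession
from pyspark.sql.functions import (array, col, concat, exists, filter, lit,
                                   struct, transform, when)
from pyspark.sql.types import (ArrayType, DoubleType, IntegerType, StringType,
                               StructField, StructType)

# Assumed import paths for the AutoMapper API (spark_auto_mapper library)
from spark_auto_mapper.automappers.automapper import AutoMapper
from spark_auto_mapper.helpers.automapper_helpers import AutoMapperHelpers as A
from spark_auto_mapper.data_types.list import AutoMapperList


def clean_spark_session(session: SparkSession) -> None:
    # Hypothetical helper: the tests only rely on it resetting Spark state
    # between runs, so dropping temp views and clearing the cache is enough here.
    for table in session.catalog.listTables("default"):
        if table.isTemporary:
            session.catalog.dropTempView(table.name)
    session.catalog.clearCache()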
def test_automapper_field(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier").select_one(A.field("type.coding[0].code"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]
    #            ) == str(col("b.identifier[0]").alias("age"))
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "PRN"
Example #2
def test_automapper_flatten(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)

    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"
    source_df = spark_session.createDataFrame([([[1], [2, 3, 4], [3, 5]], )],
                                              ["column"])
    source_df.createOrReplaceTempView(source_view_name)

    # Act
    mapper = AutoMapper(view=result_view_name,
                        source_view=source_view_name).columns(
                            column_flat=A.flatten(A.column("column")))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # assert
    assert result_df.select("column_flat").collect()[0][0] == [
        1, 2, 3, 4, 3, 5
    ]
def test_automapper_flatten_with_null(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)

    source_view_name = "cascaded_list_view"
    result_view_name = "flatten_list_view"
    schema = StructType([
        StructField(
            "column",
            ArrayType(elementType=ArrayType(elementType=IntegerType())))
    ])
    source_df = spark_session.createDataFrame(
        [([[1], [2, 3, 4], [3, 5], None], )], schema=schema)
    source_df.printSchema()
    source_df.createOrReplaceTempView(source_view_name)

    # Act
    mapper = AutoMapper(view=result_view_name,
                        source_view=source_view_name).columns(
                            column_flat=A.flatten(A.column("column")))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # assert
    assert result_df.select("column_flat").collect()[0][0] == [
        1, 2, 3, 4, 3, 5
    ]
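Both flatten tests above are expected to produce what a plain-PySpark flatten would, except that Spark's built-in flatten() returns null when the outer array contains a null element, so the null-tolerant behaviour presumably drops null inner arrays first. A sketch of that equivalent, inside the same test:

    # hypothetical plain-PySpark equivalent of A.flatten(A.column("column"))
    from pyspark.sql.functions import col, filter, flatten

    equivalent_df = source_df.select(
        flatten(filter(col("column"), lambda x: x.isNotNull())).alias("column_flat")
    )
    assert equivalent_df.collect()[0][0] == [1, 2, 3, 4, 3, 5]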
Example #4
def test_can_load_xml_file_with_schema(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.xml')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    xml_schema = StructType([
        StructField("_id", StringType(), True),
        StructField("author", StringType(), True),
        StructField("description", StringType(), True),
        StructField("genre", StringType(), True),
        StructField("price", DoubleType(), True),
        StructField("publish_date", StringType(), True),
        StructField("title", StringType(), True),
    ])
    # Act
    FrameworkXmlLoader(view="my_view",
                       filepath=test_file_path,
                       row_tag="book",
                       schema=xml_schema).transform(df)

    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()
    assert result.count() == 12
    assert len(result.columns) == 7
Example #5
def test_can_load_non_standard_delimited_csv(
        spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.psv')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    loader = FrameworkCsvLoader(view="my_view",
                                filepath=test_file_path,
                                delimiter="|")
    loader.transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")

    result.show()

    # Assert
    assert loader.getDelimiter() == "|"
    assert_results(result)
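The FrameworkCsvLoader step above with delimiter="|" should amount to a plain spark.read.csv call registered as a view; a sketch (header handling is an assumption):

    # hypothetical plain-PySpark equivalent of the loader step above
    equivalent_df = spark_session.read.csv(test_file_path, sep="|", header=True)
    equivalent_df.createOrReplaceTempView("my_view")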
Example #6
def test_correctly_loads_csv_with_clean_flag_on(
        spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('column_name_test.csv')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkCsvLoader(
        view="my_view",
        filepath=test_file_path,
        delimiter=",",
        clean_column_names=True,
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")

    # Assert
    assert_results(result)
    assert result.collect()[1][0] == "2"
    assert (result.columns[2] ==
            "Ugly_column_with_chars_that_parquet_does_not_like_much_-")
def test_can_convert_json_folder_to_jsonl(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test_files')}"

    temp_folder = data_dir.joinpath("temp")
    if path.isdir(temp_folder):
        rmtree(temp_folder)
    makedirs(temp_folder)

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkJsonToJsonlConverter(file_path=test_file_path,
                                  output_folder=temp_folder).transform(df)

    # Assert
    with open(temp_folder.joinpath("test.json"), "r+") as file:
        lines: List[str] = file.readlines()
        assert len(lines) == 2
        assert (
            lines[0] ==
            '{"title":"A Philosophy of Software Design","authors":[{"given":["John"],"surname":"Ousterhout"}],"edition":null}\n'
        )
        assert (
            lines[1] ==
            '{"title":"Essentials of Programming Languages","authors":[{"given":["Dan","P."],"surname":"Friedman"},{"given":["Mitchell"],"surname":"Wand"}],"edition":3}\n'
        )
Example #8
def test_can_keep_columns(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.csv')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkCsvLoader(view="my_view", filepath=test_file_path,
                       delimiter=",").transform(df)

    FrameworkSelectColumnsTransformer(view="my_view",
                                      keep_columns=["Column2"]).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")

    result.show()

    # Assert
    assert len(result.columns) == 1

    assert result.count() == 3

    assert result.collect()[1][0] == "bar"
def test_can_run_unvalidated_framework_pipeline(
        spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    spark_session.sql("DROP TABLE IF EXISTS default.flights")

    # Act
    parameters = {"flights_path": flights_path}

    with ProgressLogger() as progress_logger:
        pipeline: MyUnValidatedPipeline = MyUnValidatedPipeline(
            parameters=parameters, progress_logger=progress_logger)
        transformer = pipeline.fit(df)
        transformer.transform(df)

    # Assert
    result_df: DataFrame = spark_session.sql("SELECT * FROM flights2")
    result_df.show()

    assert result_df.count() > 0
def test_fail_fast_validated_framework_pipeline_writes_results(
    spark_session: SparkSession, ) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"
    output_path: str = f"file://{data_dir.joinpath('temp').joinpath('validation.csv')}"

    if path.isdir(data_dir.joinpath("temp")):
        shutil.rmtree(data_dir.joinpath("temp"))

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    spark_session.sql("DROP TABLE IF EXISTS default.flights")

    # Act
    parameters = {
        "flights_path": flights_path,
        "validation_source_path": str(data_dir),
        "validation_output_path": output_path,
    }

    try:
        with ProgressLogger() as progress_logger:
            pipeline: MyFailFastValidatedPipeline = MyFailFastValidatedPipeline(
                parameters=parameters, progress_logger=progress_logger)
            transformer = pipeline.fit(df)
            transformer.transform(df)
    except AssertionError:
        validation_df = df.sql_ctx.read.csv(output_path, header=True)
        validation_df.show(truncate=False)
        assert validation_df.count() == 1
def test_automapper_filter_and_transform(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").complex(
        MyObject(age=A.transform(
            A.filter(column=A.column("identifier"),
                     func=lambda x: x["use"] == lit("usual")),
            A.complex(bar=A.field("value"), bar2=A.field("system")))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        transform(
            filter("b.identifier", lambda x: x["use"] == lit("usual")),
            lambda x: struct(x["value"].alias("bar"), x["system"].alias("bar2")
                             )).alias("age"))
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)
def test_can_run_validated_framework_pipeline(
        spark_session: SparkSession) -> None:
    with pytest.raises(AssertionError):
        # Arrange
        clean_spark_session(spark_session)
        data_dir: Path = Path(__file__).parent.joinpath("./")
        flights_path: str = f"file://{data_dir.joinpath('flights.csv')}"
        output_path: str = f"file://{data_dir.joinpath('temp').joinpath('validation.csv')}"

        if path.isdir(data_dir.joinpath("temp")):
            shutil.rmtree(data_dir.joinpath("temp"))

        schema = StructType([])

        df: DataFrame = spark_session.createDataFrame(
            spark_session.sparkContext.emptyRDD(), schema)

        spark_session.sql("DROP TABLE IF EXISTS default.flights")

        # Act
        parameters = {
            "flights_path": flights_path,
            "validation_source_path": str(data_dir),
            "validation_output_path": output_path,
        }

        with ProgressLogger() as progress_logger:
            pipeline: MyValidatedPipeline = MyValidatedPipeline(
                parameters=parameters, progress_logger=progress_logger)
            transformer = pipeline.fit(df)
            transformer.transform(df)
def test_auto_mapper_handles_duplicates(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame([
        (1, 'Qureshi', 'Imran'),
        (2, 'Qureshi', 'Imran'),
        (3, 'Qureshi', 'Imran2'),
        (4, 'Vidal', 'Michael'),
    ], ['member_id', 'last_name', 'first_name']).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"
                              ]).columns(dst1="src1",
                                         dst2=A.column("last_name"),
                                         dst3=A.column("first_name"))

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.count() == 3
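The duplicate handling above (four input rows, three output rows) suggests the mapper keeps only distinct mapped rows when keys are supplied and the key columns are dropped; a plain-PySpark sketch of the same outcome, under that assumption:

    # hypothetical equivalent: rows 1 and 2 map to identical output rows
    from pyspark.sql.functions import col, lit

    equivalent_df = source_df.select(
        lit("src1").alias("dst1"),
        col("last_name").alias("dst2"),
        col("first_name").alias("dst3"),
    ).dropDuplicates()
    assert equivalent_df.count() == 3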
Example #14
def test_validation_recurses_query_dir(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    query_dir: Path = Path(__file__).parent.joinpath("./queries")
    more_queries_dir: str = "more_queries"
    data_dir: Path = Path(__file__).parent.joinpath("./data")
    test_data_file: str = f"{data_dir.joinpath('test.csv')}"
    validation_query_file: str = "validate.sql"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    FrameworkCsvLoader(view="my_view", filepath=test_data_file).transform(df)

    FrameworkValidationTransformer(
        validation_source_path=str(query_dir),
        validation_queries=[validation_query_file, more_queries_dir],
    ).transform(df)

    df_validation = df.sql_ctx.table("pipeline_validation")
    df_validation.show(truncate=False)
    assert df_validation.count() == 3, "Expected 3 total rows in pipeline_validation"
    assert df_validation.filter("is_failed == 1").count() == 1, \
        "Expected one failing row in the validation table"
Example #15
def test_correctly_loads_csv_with_clean_flag_off(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('column_name_test.json')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema
    )

    # Act
    FrameworkJsonLoader(
        view="books", filepath=test_file_path, clean_column_names=False
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM books")

    # Assert
    assert result.count() == 2

    assert result.collect()[1]["title"] == "Essentials of Programming Languages"
    assert len(result.collect()[1]["authors"]) == 2
    assert result.collect()[1]["authors"][0]["surname"] == "Friedman"
    assert (
        result.collect()[1]["Ugly column,with;chars{that}parquet(does)not	like=much_-"]
        == 3
    )
Example #16
def test_can_load_simple_json(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.json')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkJsonLoader(view="books", filepath=test_file_path).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM books")

    result.show()

    # Assert
    assert result.count() == 2

    assert result.collect()[1]["title"] == "Essentials of Programming Languages"
    assert len(result.collect()[1]["authors"]) == 2
    assert result.collect()[1]["authors"][0]["surname"] == "Friedman"
    assert result.collect()[1]["edition"] == 3
Example #17
def test_auto_mapper_full_no_keys(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame([
        ('Qureshi', 'Imran'),
        ('Vidal', 'Michael'),
    ], ['last_name', 'first_name']).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # example of a variable
    client_address_variable: str = "address1"

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        dst1="src1",
        dst2=AutoMapperList([client_address_variable]),
        dst3=AutoMapperList([client_address_variable, "address2"]))

    company_name: str = "Microsoft"

    if company_name == "Microsoft":
        mapper = mapper.columns(
            dst4=AutoMapperList(
                [A.complex(use="usual", family=A.column("last_name"))]),
            dst5=AutoMapperList(
                [A.complex(use="usual", first=A.column("first_name"))]),
        )

    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    mapper.transform(df=source_df)
    result_df: DataFrame = spark_session.table("members")

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5, list(result_df.columns)
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst1").collect()[0][0] == "src1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst2").collect()[0][0][0] == "address1"

    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][0] == "address1"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst3").collect()[0][0][1] == "address2"

    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst4").collect()[0][0][0][1] == "Qureshi"
    assert result_df.where("dst4[0].family == 'Qureshi'").select(
        "dst5").collect()[0][0][0][1] == "Imran"
def test_automapper_if_not_null_or_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "54"),
            (2, 'Vidal', 'Michael', ""),
            (3, 'Vidal3', 'Michael', None),
        ], ['member_id', 'last_name', 'first_name', "my_age"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False
    ).columns(
        age=A.if_not_null_or_empty(
            A.column("my_age"), A.column("my_age"), A.text("100")
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df
    )
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["age"]) == str(
        when(
            col("b.my_age").isNull() | col("b.my_age").eqNullSafe(""),
            lit("100").cast(StringType())
        ).otherwise(col("b.my_age")).alias("age")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("age"
                                                    ).collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select("age"
                                                    ).collect()[0][0] == "100"
    assert result_df.where("member_id == 3").select("age"
                                                    ).collect()[0][0] == "100"

    assert dict(result_df.dtypes)["age"] == "string"
Example #19
def test_can_load_fixed_width(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.txt')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkFixedWidthLoader(
        view="my_view",
        filepath=test_file_path,
        columns=[
            ColumnSpec(column_name="id",
                       start_pos=1,
                       length=3,
                       data_type=StringType()),
            ColumnSpec(column_name="some_date",
                       start_pos=4,
                       length=8,
                       data_type=StringType()),
            ColumnSpec(
                column_name="some_string",
                start_pos=12,
                length=3,
                data_type=StringType(),
            ),
            ColumnSpec(
                column_name="some_integer",
                start_pos=15,
                length=4,
                data_type=IntegerType(),
            ),
        ],
    ).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")

    result.show()

    # Assert
    assert result.count() == 2
    assert result.collect()[0][0] == "001"
    assert result.collect()[1][0] == "002"
    assert result.collect()[0][1] == "01292017"
    assert result.collect()[1][1] == "01302017"
    assert result.collect()[0][2] == "you"
    assert result.collect()[1][2] == "me"
    assert result.collect()[0][3] == 1234
    assert result.collect()[1][3] == 5678
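A rough idea of what the fixed-width loader presumably does under the hood, sketched with a plain-text read and 1-based substring slicing matching the ColumnSpec entries above:

    # hypothetical plain-PySpark equivalent of FrameworkFixedWidthLoader
    # (any header line in test.txt would need to be skipped first)
    from pyspark.sql.functions import col, substring
    from pyspark.sql.types import IntegerType

    raw_df = spark_session.read.text(test_file_path)
    equivalent_df = raw_df.select(
        substring(col("value"), 1, 3).alias("id"),
        substring(col("value"), 4, 8).alias("some_date"),
        substring(col("value"), 12, 3).alias("some_string"),
        substring(col("value"), 15, 4).cast(IntegerType()).alias("some_integer"),
    )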
Example #20
def test_automapper_null_if_empty(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(session=spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "54"),
            (2, "Vidal", "Michael", ""),
            (3, "Vidal3", "Michael", None),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df.show()

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(age=A.column("my_age").to_null_if_empty())

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        when(col("b.my_age").eqNullSafe(""),
             lit(None)).otherwise(col("b.my_age")).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "age").collect()[0][0] == "54"
    assert result_df.where("member_id == 2").select(
        "age").collect()[0][0] is None
    assert result_df.where("member_id == 3").select(
        "age").collect()[0][0] is None

    assert dict(result_df.dtypes)["age"] == "string"
def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession, ) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    environ["LOGLEVEL"] = "DEBUG"

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"], lambda x: x["reference"] == lit("bar").cast(
                    "string")),
        ).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert result_df.select(
        "age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
Example #22
def test_automapper_concat_array(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        drop_key_columns=False).columns(
            age=A.column("identifier").concat(A.text("foo").to_array()))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        concat(col("b.identifier"),
               array(lit("foo").cast("string"))).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.where("id == 1730325416").select(
        "age").collect()[0][0] == [
            "bar",
            "foo",
        ]

    assert result_df.where("id == 1467734301").select(
        "age").collect()[0][0] == [
            "John",
            "foo",
        ]
def test_web_crawler(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    response: Dict[str, Any] = {}

    # Act
    response = FrameworkWebCrawler(
        spider_class=SpiderTestClass, name="test_crawler"
    ).transform(df, response)

    # Assert
    print(response)
    assert response
Example #24
def test_auto_mapper_schema_pruning_with_defined_class(
    spark_session: SparkSession, ) -> None:
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", 45),
            (2, "Vidal", "Michael", 35),
        ],
        ["member_id", "last_name", "first_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
    ).complex(
        MyClass(name=A.column("last_name"), age=A.number(A.column("my_age"))))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    assert_compare_expressions(sql_expressions["name"],
                               col("b.last_name").cast("string").alias("name"))
    assert_compare_expressions(sql_expressions["age"],
                               col("b.my_age").cast("long").alias("age"))

    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select(
        "name").collect()[0][0] == "Qureshi"

    assert dict(result_df.dtypes)["age"] in ("int", "long", "bigint")
def test_automapper_select_one(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"
def test_automapper_null_remover(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        address=A.if_not_null(
            A.column("address"),
            value=A.column("address").select(
                A.if_not_null(
                    A.field("line"),
                    A.field("line").select(A.current().sanitize()).remove_null_or_empty(),
                )
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # assert str(sql_expressions["age"]) == str(
    #     filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age")
    # )
    result_df: DataFrame = mapper.transform(df=source_df)

    print(result_df.select("address").collect()[0][0])
    assert result_df.select("address").collect()[0][0][0] == [
        "1111 STREET LN",
        "SUITE 256",
    ]
    result_df.show(truncate=False)
Example #27
def test_can_load_xml_file(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('test.xml')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkXmlLoader(view="my_view", filepath=test_file_path,
                       row_tag="book").transform(df)

    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    result.show()
    assert result.count() == 12
    assert len(result.columns) == 7
Example #28
def test_can_load_multiline_csv(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('multiline_row.csv')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkCsvLoader(view="my_view",
                       filepath=test_file_path,
                       delimiter=",",
                       multiline=True).transform(df)

    # noinspection SqlDialectInspection
    result: DataFrame = spark_session.sql("SELECT * FROM my_view")
    assert 1 == result.count()
Example #29
def test_can_load_simple_json_with_schema(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    test_file_path: str = f"{data_dir.joinpath('schema_test.json')}"
    test_file_path_2: str = f"{data_dir.joinpath('schema_test_2.json')}"

    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    # Act
    FrameworkJsonLoader(view="books", filepath=test_file_path).transform(df)
    FrameworkJsonLoader(view="books_schema",
                        filepath=test_file_path_2,
                        use_schema_from_view="books").transform(df)
    result: DataFrame = spark_session.sql("SELECT * FROM books")
    result_2: DataFrame = spark_session.sql("SELECT * FROM books_schema")

    # Assert
    assert result.schema == result_2.schema
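use_schema_from_view appears to reuse the schema already inferred for the "books" view when loading the second file; a plain-PySpark sketch of the same effect:

    # hypothetical equivalent of use_schema_from_view="books"
    books_schema = spark_session.table("books").schema
    equivalent_df = spark_session.read.json(test_file_path_2, schema=books_schema)
    assert equivalent_df.schema == books_schema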
Example #30
def test_file_downloader(spark_session: SparkSession) -> None:
    # Arrange
    clean_spark_session(spark_session)
    schema = StructType([])

    df: DataFrame = spark_session.createDataFrame(
        spark_session.sparkContext.emptyRDD(), schema)

    response: Dict[str, Any] = {}

    download_url: str = "https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip"
    download_to_path: str = f"file://{os.path.join(Path(__file__).parent, 'data')}"

    # Act
    response = FrameworkFileDownloader(
        download_urls=[download_url],
        download_to_path=download_to_path,
        extract_zips=True,
    ).transform(df, response)

    # Assert
    assert response