Example #1
    def test_filter_df_based_on_column_value(self):
        source_data = [
            ("2016-02-01T00:00:00", -3.9, "Highland & Eilean Siar"),
            ("2016-02-01T00:00:00", 1.3, "Highland & Eilean Siar"),
            ("2016-02-01T00:00:00", 2.8, "Grampian"),
            ("2016-02-01T00:00:00", 2.5, "Grampian"),
            ("2016-02-01T00:00:00", 0.8, "Grampian"),
            ("2016-03-01T00:00:00", 3.2, "Grampian"),
            ("2016-03-01T00:00:00", 10.4, "Strathclyde"),
            ("2016-03-01T00:00:00", 2.7, "Central Tayside & Fife"),
            ("2016-03-01T00:00:00", 2.3, "Central Tayside & Fife"),
        ]
        source_df = get_spark_session().createDataFrame(
            source_data,
            ["ObservationDate", "ScreenTemperature", "Region"]
        )

        actual_df = filter_df_based_on_column_value(source_df, "ScreenTemperature", 10.4)

        expected_data = [
            ("2016-03-01T00:00:00", 10.4, "Strathclyde")
        ]
        expected_df = get_spark_session().createDataFrame(
            expected_data,
            ["ObservationDate", "ScreenTemperature", "Region"]
        )

        assert expected_df.collect() == actual_df.collect()
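
These examples exercise helpers whose definitions are not shown. For context, here is a minimal sketch of what filter_df_based_on_column_value presumably looks like, inferred from this test's behavior (the body is an assumption, not the project's actual code):

from pyspark.sql import DataFrame

def filter_df_based_on_column_value(df: DataFrame, column_name: str, value) -> DataFrame:
    # Assumed implementation: keep only the rows whose column equals the given value
    return df.filter(df[column_name] == value)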
Example #2
    def test_select_distinct_Region_from_df(self):
        source_data = [
            ("2016-03-01T00:00:00", 10.4, "Strathclyde"),
        ]
        source_df = get_spark_session().createDataFrame(
            source_data,
            ["ObservationDate", "ScreenTemperature", "Region"]
        )

        actual_df = select_distinct_column_from_df(source_df, "Region")

        expected_data = [
            ("Strathclyde",)
        ]
        expected_df = get_spark_session().createDataFrame(
            expected_data,
            ["Region"]
        )

        assert expected_df.collect() == actual_df.collect()
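
Likewise, a hedged sketch of select_distinct_column_from_df as this test implies it behaves (an assumption inferred from the call site, not the shown code):

from pyspark.sql import DataFrame

def select_distinct_column_from_df(df: DataFrame, column_name: str) -> DataFrame:
    # Assumed implementation: project a single column and drop duplicate values
    return df.select(column_name).distinct()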
Example #3
    def test_convert_timestamp_column_to_date_column(self):
        source_data = [
            ("20160301", 10.4, "Strathclyde"),
        ]
        source_df = get_spark_session().createDataFrame(
            source_data,
            ["ObservationDate", "ScreenTemperature", "Region"]
        )
        source_df = source_df.withColumn("ObservationDate", F.to_date(source_df.ObservationDate, 'yyyyMMdd').cast('timestamp'))

        actual_df = convert_timestamp_column_to_date_column(source_df, "ObservationDate")

        expected_data = [
            ("20160301",)
        ]
        expected_df = get_spark_session().createDataFrame(
            expected_data,
            ["ObservationDate"]
        )
        expected_df = expected_df.withColumn("ObservationDate", F.to_date(expected_df.ObservationDate, 'yyyyMMdd').cast('timestamp').cast('date'))

        assert expected_df.collect() == actual_df.collect()
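
The test above prepares a timestamp column and expects a date column back. A minimal sketch of convert_timestamp_column_to_date_column consistent with that behavior (assumed, not the project's code):

from pyspark.sql import DataFrame

def convert_timestamp_column_to_date_column(df: DataFrame, column_name: str) -> DataFrame:
    # Assumed implementation: truncate the timestamp column to a plain date in place
    return df.withColumn(column_name, df[column_name].cast('date'))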
Example #4
    def test_select_distinct_Region_from_df_if_max_is_in_multiple_records(self):
        source_data = [
            ("2016-03-01T00:00:00", 10.4, "Strathclyde"),
            ("2016-02-01T00:00:00", 10.4, "Central Tayside & Fife"),
        ]
        source_df = get_spark_session().createDataFrame(
            source_data,
            ["ObservationDate", "ScreenTemperature", "Region"]
        )

        actual_df = select_distinct_column_from_df(source_df, "Region")

        expected_data = [
            ("Strathclyde",),
            ("Central Tayside & Fife",)
        ]
        expected_df = get_spark_session().createDataFrame(
            expected_data,
            ["Region"]
        )

        assert expected_df.collect() == actual_df.collect()
Example #5
    def test_find_max_in_df(self):
        source_data = [
            ("2016-02-01T00:00:00", -3.9, "Highland & Eilean Siar"),
            ("2016-02-01T00:00:00", 1.3, "Highland & Eilean Siar"),
            ("2016-02-01T00:00:00", 2.8, "Grampian"),
            ("2016-02-01T00:00:00", 2.5, "Grampian"),
            ("2016-02-01T00:00:00", 0.8, "Grampian"),
            ("2016-03-01T00:00:00", 3.2, "Grampian"),
            ("2016-03-01T00:00:00", 10.4, "Strathclyde"),
            ("2016-03-01T00:00:00", 2.7, "Central Tayside & Fife"),
            ("2016-03-01T00:00:00", 2.3, "Central Tayside & Fife"),
        ]
        source_df = get_spark_session().createDataFrame(
            source_data,
            ["ObservationDate", "ScreenTemperature", "Region"]
        )

        actual_data = find_max_in_df(source_df, "ScreenTemperature")

        expected_data = 10.4

        assert expected_data == actual_data
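
Unlike the other helpers, find_max_in_df returns a plain Python value rather than a DataFrame. A sketch consistent with the test (assumed implementation):

import pyspark.sql.functions as F
from pyspark.sql import DataFrame

def find_max_in_df(df: DataFrame, column_name: str):
    # Assumed implementation: aggregate the maximum and unwrap it from the single result row
    return df.agg(F.max(column_name)).collect()[0][0]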
def main():

    print('Mission Started')
    result = {}

    # The results will be stored as parquet in 3 different subfolders
    RESULTS_LOCATION_for_Hottest_ObservationDate = conf[
        'RESULTS_LOCATION'] + "/" + "Hottest_" + "ObservationDate"
    RESULTS_LOCATION_for_Hottest_ScreenTemperature = conf[
        'RESULTS_LOCATION'] + "/" + "Hottest_" + "ScreenTemperature"
    RESULTS_LOCATION_for_Hottest_Region = conf[
        'RESULTS_LOCATION'] + "/" + "Hottest_" + "Region"

    # Read 3 columns "ObservationDate", "ScreenTemperature" and "Region" from the input data
    session = get_spark_session()
    df = read_csv_into_df(session, conf['CSV_LOCATION'])
    output_validation(conf['PARQUET_LOCATION'], conf['OVERWRITE'])
    store_df_as_parquet(df, conf['PARQUET_LOCATION'])
    df = read_parquet_into_df(
        session, conf['PARQUET_LOCATION'],
        ["ObservationDate", "ScreenTemperature", "Region"])

    # Fetch the maximum value of the column named "ScreenTemperature" as the hottest temperature
    max_value = find_max_in_df(df, "ScreenTemperature")

    # Fetch record(s) including the hottest temperature
    df = filter_df_based_on_column_value(df, "ScreenTemperature", max_value)

    # Find and store the hottest ObservationDate(s)
    hottest_timestamp_df = select_distinct_column_from_df(
        df, "ObservationDate")
    hottest_date_df = convert_timestamp_column_to_date_column(
        hottest_timestamp_df, "ObservationDate")
    print("\n==== Hottest Day(s) =====")
    hottest_date_df.show()
    output_validation(RESULTS_LOCATION_for_Hottest_ObservationDate,
                      conf['OVERWRITE'])
    store_df_as_parquet(hottest_date_df,
                        RESULTS_LOCATION_for_Hottest_ObservationDate)
    result['hottest_date_df'] = hottest_date_df

    # Find and store the hottest ScreenTemperature
    hottest_temperature_df = select_distinct_column_from_df(
        df, "ScreenTemperature")
    print("\n==== Hottest Temperature =====")
    hottest_temperature_df.show()
    output_validation(RESULTS_LOCATION_for_Hottest_ScreenTemperature,
                      conf['OVERWRITE'])
    store_df_as_parquet(hottest_temperature_df,
                        RESULTS_LOCATION_for_Hottest_ScreenTemperature)
    result['hottest_temperature_df'] = hottest_temperature_df

    # Find and store the hottest Region(s)
    hottest_region_df = select_distinct_column_from_df(df, "Region")
    print("\n==== Hottest Region(s) =====")
    hottest_region_df.show()
    output_validation(RESULTS_LOCATION_for_Hottest_Region, conf['OVERWRITE'])
    store_df_as_parquet(hottest_region_df, RESULTS_LOCATION_for_Hottest_Region)
    result['hottest_region_df'] = hottest_region_df

    print('Mission Ended Successfully - Result is: {}'.format(str(result)))
    return result
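
main() also leans on a few I/O helpers (get_spark_session, read_csv_into_df, store_df_as_parquet, read_parquet_into_df, output_validation) that are not shown in these examples. The following is a hedged sketch of plausible implementations, inferred only from how main() calls them; the names and signatures match the call sites, everything else is an assumption:

import shutil
from pyspark.sql import DataFrame, SparkSession

def get_spark_session() -> SparkSession:
    # getOrCreate() reuses one session across repeated calls
    return SparkSession.builder.appName("weather-analysis").getOrCreate()

def read_csv_into_df(session: SparkSession, csv_location: str) -> DataFrame:
    # Assumed: the CSV has a header row and column types are inferred
    return session.read.csv(csv_location, header=True, inferSchema=True)

def store_df_as_parquet(df: DataFrame, location: str) -> None:
    df.write.parquet(location)

def read_parquet_into_df(session: SparkSession, location: str, columns) -> DataFrame:
    # Assumed: project only the requested columns after reading
    return session.read.parquet(location).select(columns)

def output_validation(location: str, overwrite: bool) -> None:
    # Assumed: clear the target folder before writing when overwriting is allowed,
    # since DataFrame.write.parquet fails by default if the path already exists
    if overwrite:
        shutil.rmtree(location, ignore_errors=True)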
    def test_main_if_max_is_in_multiple_regions_and_multiple_dates(self):

        # Change configurations to a temporary folder
        conf['CSV_LOCATION'] = "tmp/input/"
        conf['PARQUET_LOCATION'] = "tmp/output/parquet"
        conf['RESULTS_LOCATION'] = "tmp/output/results"

        # Some real-world sample input data
        source_csv = """ForecastSiteCode,ObservationTime,ObservationDate,WindDirection,WindSpeed,WindGust,Visibility,ScreenTemperature,Pressure,SignificantWeatherCode,SiteName,Latitude,Longitude,Region,Country
3002,0,2016-03-01T00:00:00,12,8,,30000,9.80,997,8,BALTASOUND (3002),60.7490,-0.8540,Orkney & Shetland,SCOTLAND
3005,0,2016-02-01T00:00:00,10,2,,35000,0.10,997,7,LERWICK (S. SCREEN) (3005),60.1390,-1.1830,Orkney & Shetland,SCOTLAND
3008,0,2016-02-01T00:00:00,8,6,,50000,2.80,997,-99,FAIR ISLE (3008),59.5300,-1.6300,Orkney & Shetland,
3017,0,2016-02-01T00:00:00,6,8,,40000,1.60,996,8,KIRKWALL (3017),58.9540,-2.9000,Orkney & Shetland,SCOTLAND
3023,0,2016-02-01T00:00:00,10,30,37,2600,9.80,991,11,SOUTH UIST RANGE (3023),57.3580,-7.3970,Highland & Eilean Siar,SCOTLAND
3026,0,2016-02-01T00:00:00,5,15,,3900,4.30,991,12,STORNOWAY (3026),58.2140,-6.3250,Highland & Eilean Siar,SCOTLAND
3031,0,2016-02-01T00:00:00,4,9,,5000,1.10,995,11,LOCH GLACARNOCH SAWS (3031),57.7250,-4.8960,Highland & Eilean Siar,SCOTLAND
3034,0,2016-02-01T00:00:00,5,7,,5000,3.10,992,11,AULTBEA (3034),57.8590,-5.6360,Highland & Eilean Siar,SCOTLAND
3037,0,2016-02-01T00:00:00,8,15,,3400,8.60,993,15,SKYE/LUSA (SAMOS) (3037),57.2570,-5.8090,Highland & Eilean Siar,SCOTLAND
3039,0,2016-02-01T00:00:00,9,32,43,,3.90,,-99,BEALACH NA BA (3039),57.4200,-5.6900,Highland & Eilean Siar,
3041,0,2016-02-01T00:00:00,10,44,82,,3.90,,-99,AONACH MOR (3041),56.8200,-4.9700,Highland & Eilean Siar,
3044,0,2016-02-01T00:00:00,12,6,,40000,1.30,994,8,ALTNAHARRA SAWS (3044),58.2880,-4.4420,Highland & Eilean Siar,SCOTLAND
3047,0,2016-02-01T00:00:00,9,15,,10000,4.80,997,15,TULLOCH BRIDGE (3047),56.8670,-4.7080,Highland & Eilean Siar,SCOTLAND
3062,0,2016-02-01T00:00:00,6,6,,5000,2.40,995,9,TAIN RANGE (3062),57.8200,-3.9700,Highland & Eilean Siar,
3063,0,2016-02-01T00:00:00,9,9,22,22000,2.70,996,9,AVIEMORE (3063),57.2060,-3.8270,Highland & Eilean Siar,SCOTLAND
3075,0,2016-02-01T00:00:00,7,10,,27000,4.20,996,8,WICK AIRPORT (3075),58.4540,-3.0890,Highland & Eilean Siar,SCOTLAND
3066,0,2016-02-01T00:00:00,4,5,,12000,2.80,995,12,KINLOSS (3066),57.6494,-3.5606,Grampian,SCOTLAND
3068,0,2016-02-01T00:00:00,6,8,,23000,2.50,996,8,LOSSIEMOUTH (3068),57.7120,-3.3220,Grampian,SCOTLAND
3080,0,2016-02-01T00:00:00,12,2,,16000,0.80,998,7,ABOYNE (3080),57.0770,-2.8360,Grampian,SCOTLAND
3088,0,2016-02-01T00:00:00,8,17,,400,4.30,998,6,INVERBERVIE (3088),56.8500,-2.2700,Grampian,
3091,0,2016-02-01T00:00:00,8,8,,4700,3.00,998,11,ABERDEEN DYCE (3091),57.2060,-2.2020,Grampian,SCOTLAND
"""
        # Write the real-world data to a CSV file
        # (assumes `import os` at the top of the test module; the temporary
        # input folder may not exist yet, so create it first)
        os.makedirs(conf['CSV_LOCATION'], exist_ok=True)
        with open(conf['CSV_LOCATION'] + "sample.csv", "w", encoding='utf8') as csv_file:
            csv_file.write(source_csv)

        result = main()

        # Test Hottest Date
        actual_df = result['hottest_date_df']
        expected_data = [("2016-03-01", ), ("2016-02-01", )]
        expected_df = get_spark_session().createDataFrame(
            expected_data, ["ObservationDate"])
        expected_df = expected_df.withColumn(
            "ObservationDate",
            F.to_date(expected_df.ObservationDate,
                      'yyyy-MM-dd').cast('timestamp').cast('date'))
        assert expected_df.collect() == actual_df.collect()

        # Test Hottest Temperature
        actual_df = result['hottest_temperature_df']
        expected_data = [(9.8, )]
        expected_df = get_spark_session().createDataFrame(
            expected_data, ["ScreenTemperature"])
        expected_df = expected_df.withColumn(
            "ScreenTemperature", expected_df.ScreenTemperature.cast('double'))
        assert expected_df.collect() == actual_df.collect()

        # Test Hottest Region
        actual_df = result['hottest_region_df']
        expected_data = [
            ("Orkney & Shetland", ),
            ("Highland & Eilean Siar", ),
        ]
        expected_df = get_spark_session().createDataFrame(
            expected_data, ["Region"])
        expected_df = expected_df.withColumn("Region",
                                             expected_df.Region.cast('string'))
        assert expected_df.collect() == actual_df.collect()