def test_extended_string_to_timestamp(self, spark_session, input_value, expected_value):
    # test uses timezone set to GMT / UTC (pytest.ini)!
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_timestamp")]
    ).transform(input_df)
    # workaround via pandas necessary due to bug with direct conversion
    # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
    output_pd_df = output_df.toPandas()
    output_value = output_pd_df.iloc[0]["output_key"]
    if isinstance(output_value, type(pd.NaT)):
        actual_value = None
    else:
        actual_value = output_value.to_pydatetime()
    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.TimestampType)
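
# A minimal sketch of the `create_input_df` helper the tests rely on, assuming
# the input DataFrame carries a single nullable string column named "input_key"
# and that `pyspark.sql.types` is imported as `T` at module level (as the
# assertions above suggest). Hypothetical; the actual helper is defined
# elsewhere on the test class:
def create_input_df(self, input_value, spark_session):
    schema = T.StructType([T.StructField("input_key", T.StringType(), True)])
    return spark_session.createDataFrame([(input_value,)], schema=schema)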

def test_extended_string_to_timestamp_spark2(self, spark_session, input_value, expected_value):
    # test uses timezone set to GMT / UTC (pytest.ini)!
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_timestamp")]
    ).transform(input_df)
    # workaround via pandas necessary due to bug with direct conversion
    # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
    try:
        output_pd_df = output_df.toPandas()
        actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
    except ValueError:
        # If input is in milliseconds it will still be stored in the DF but cannot be collected in Python
        actual_value = "out_of_range_for_python"
    except AttributeError:
        # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on other objects / None
        actual_value = None
    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.TimestampType)
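
# Hedged illustration of (input_value, expected_value) pairs the Spark 2 test
# above is presumably parametrized with (hypothetical data and name, assuming
# `import datetime` at module level and the GMT/UTC session timezone from
# pytest.ini). A 13-digit millisecond epoch string gets read as seconds under
# Spark 2, yielding a timestamp too large to collect in Python, which is the
# `except ValueError` branch above:
EXAMPLE_SPARK2_TIMESTAMP_PARAMS = [
    ("2020-08-12 12:43:14", datetime.datetime(2020, 8, 12, 12, 43, 14)),
    ("1597236194000", "out_of_range_for_python"),
    (None, None),
]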

def test_extended_string_unix_timestamp_ms_to_timestamp_spark2(
    self, spark_session, input_value, expected_value
):
    # test uses timezone set to GMT / UTC (pytest.ini)!
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_unix_timestamp_ms_to_timestamp")]
    ).transform(input_df)
    # workaround via pandas necessary due to bug with direct conversion
    # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
    try:
        output_pd_df = output_df.toPandas()
        actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
        assert actual_value.toordinal() == expected_value.toordinal(), (
            "actual_value: {act_val}, expected value: {expected_val}".format(
                act_val=actual_value, expected_val=expected_value
            )
        )
    except AttributeError:
        # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on None
        assert expected_value is None
    assert isinstance(output_df.schema["output_key"].dataType, T.TimestampType)
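
# Hedged illustration for the millisecond variant above (hypothetical data and
# name, again assuming `import datetime` and a GMT/UTC session timezone): the
# conversion presumably scales epoch milliseconds down to seconds before
# casting, so a 13-digit string maps back to the expected calendar day, which
# the test compares via `.toordinal()`:
EXAMPLE_MS_TIMESTAMP_PARAMS = [
    ("1597236194000", datetime.datetime(2020, 8, 12, 12, 43, 14)),
    (None, None),
]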