def test_udf_as_join_condition(self):
    left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
    f = udf(lambda a: a, IntegerType())

    df = left.join(right, [f("a") == f("b"), left.a1 == right.b1])
    self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)])
def test_udf_in_generate(self):
    from pyspark.sql.functions import udf, explode

    df = self.spark.range(5)
    f = udf(lambda x: list(range(x)), ArrayType(LongType()))
    row = df.select(explode(f(*df))).groupBy().sum().first()
    self.assertEqual(row[0], 10)

    df = self.spark.range(3)
    res = df.select("id", explode(f(df.id))).collect()
    self.assertEqual(res[0][0], 1)
    self.assertEqual(res[0][1], 0)
    self.assertEqual(res[1][0], 2)
    self.assertEqual(res[1][1], 0)
    self.assertEqual(res[2][0], 2)
    self.assertEqual(res[2][1], 1)

    range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
    res = df.select("id", explode(range_udf(df.id))).collect()
    self.assertEqual(res[0][0], 0)
    self.assertEqual(res[0][1], -1)
    self.assertEqual(res[1][0], 0)
    self.assertEqual(res[1][1], 0)
    self.assertEqual(res[2][0], 1)
    self.assertEqual(res[2][1], 0)
    self.assertEqual(res[3][0], 1)
    self.assertEqual(res[3][1], 1)
def test_nondeterministic_udf3(self):
    # regression test for SPARK-23233
    f = udf(lambda x: x)
    # Here we cache the JVM UDF instance.
    self.spark.range(1).select(f("id"))
    # This should reset the cache to set the deterministic status correctly.
    f = f.asNondeterministic()
    # Check the deterministic status of udf.
    df = self.spark.range(1).select(f("id"))
    deterministic = df._jdf.logicalPlan().projectList().head().deterministic()
    self.assertFalse(deterministic)
def test_udf_in_filter_on_top_of_join(self):
    # regression test for SPARK-18589
    left = self.spark.createDataFrame([Row(a=1)])
    right = self.spark.createDataFrame([Row(b=1)])
    f = udf(lambda a, b: a == b, BooleanType())
    df = left.crossJoin(right).filter(f("a", "b"))
    self.assertEqual(df.collect(), [Row(a=1, b=1)])
def test_udf_in_subquery(self):
    f = udf(lambda x: x, "long")
    with self.tempView("v"):
        self.spark.range(1).filter(f("id") >= 0).createTempView("v")
        sql = self.spark.sql
        result = sql("select i from values(0L) as data(i) where i in (select id from v)")
        self.assertEqual(result.collect(), [Row(i=0)])
def runWithJoinType(join_type, type_string):
    with self.assertRaisesRegex(
        AnalysisException,
        """Python UDF in the ON clause of a %s JOIN.""" % type_string,
    ):
        left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect()
def runWithJoinType(join_type, type_string):
    with self.assertRaisesRegex(
        AnalysisException,
        """Using PythonUDF in join condition of join type "%s" is not supported""" % type_string,
    ):
        left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect()
def test_udf_and_common_filter_in_join_condition(self):
    # regression test for SPARK-25314
    # test the complex scenario with both a udf and a common filter
    left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
    f = udf(lambda a, b: a == b, BooleanType())
    df = left.join(right, [f("a", "b"), left.a1 == right.b1])
    # We do not need spark.sql.crossJoin.enabled=true here because the udf is not the only
    # join condition.
    self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)])
def test_udf_in_join_condition(self):
    # regression test for SPARK-25314
    left = self.spark.createDataFrame([Row(a=1)])
    right = self.spark.createDataFrame([Row(b=1)])
    f = udf(lambda a, b: a == b, BooleanType())
    df = left.join(right, f("a", "b"))
    with self.assertRaisesRegexp(AnalysisException, 'Detected implicit cartesian product'):
        df.collect()
    with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
        self.assertEqual(df.collect(), [Row(a=1, b=1)])
def test_udf_in_left_outer_join_condition(self):
    # regression test for SPARK-26147
    from pyspark.sql.functions import col

    left = self.spark.createDataFrame([Row(a=1)])
    right = self.spark.createDataFrame([Row(b=1)])
    f = udf(lambda a: str(a), StringType())
    # The join condition can't be pushed down, as it refers to attributes from both sides.
    # The Python UDF only refers to attributes from one side, so it's evaluable.
    df = left.join(right, f("a") == col("b").cast("string"), how="left_outer")
    with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
        self.assertEqual(df.collect(), [Row(a=1, b=1)])
def detect_barometric_anamoly(barometric_reading, TimeStamp):
    '''Driver function to detect barometric anomalies.'''
    barometric_reading = np.asarray(barometric_reading)
    TimeStamp = np.asarray(TimeStamp)
    try:
        if np.amin(barometric_reading) > 370:
            return False
        elif np.amin(barometric_reading) < 0:
            return False
        else:
            # Time at which the drone height is lowest
            sorted_TimeStamp = TimeStamp.argsort()
            barometric_reading = barometric_reading[sorted_TimeStamp]
            TimeStamp = TimeStamp[sorted_TimeStamp]
            Minimum_time = TimeStamp[np.where(barometric_reading == np.amin(barometric_reading))]
            Mid_time = TimeStamp[int(TimeStamp.size / 2)]
            # Window size (in seconds) used to slice data around the minimum, i.e.
            # window = [Minimum_time - Window_Size_Secs : Minimum_time + Window_Size_Secs]
            Window_Size_Secs = 10
            sliced_barometric = barometric_reading[np.where((TimeStamp > (Minimum_time - Window_Size_Secs)) &
                                                            (TimeStamp < (Minimum_time + Window_Size_Secs)))]
            sliced_TimeStamp = TimeStamp[np.where((TimeStamp > (Minimum_time - Window_Size_Secs)) &
                                                  (TimeStamp < (Minimum_time + Window_Size_Secs)))]
            # Generate the expected malfunctioning-device data of matching length.
            length_array = TimeStamp[np.where((TimeStamp > (Mid_time - Window_Size_Secs)) &
                                              (TimeStamp < (Mid_time + Window_Size_Secs)))].size
            anomalous_event, ts = get_anomalous_event(length_array)
            sliced_anomalous = anomalous_event[np.where((ts > (np.amin(sliced_TimeStamp) - Minimum_time)[0]) &
                                                        (ts <= (np.amax(sliced_TimeStamp) - Minimum_time)[0]))]
            sliced_ts = ts[np.where((ts > (np.amin(sliced_TimeStamp) - Minimum_time)[0]) &
                                    (ts <= (np.amax(sliced_TimeStamp) - Minimum_time)[0]))]
            # Resample the reference signal onto the observed window and compare.
            f = interpolate.interp1d(np.linspace(0, 1, len(sliced_anomalous)), sliced_anomalous)
            x = np.linspace(0, 1, sliced_barometric.size)
            compare_anomalous = f(x)
            Error = RMSE(sliced_barometric, compare_anomalous)
            if Error < 11:
                return True
            else:
                return False
    except ValueError:
        return False
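# The driver above depends on two helpers that are not defined in this listing: RMSE and
# get_anomalous_event. get_anomalous_event is assumed to return a reference malfunction
# signal together with its timestamps (centered on the minimum-pressure time). A minimal
# sketch of the RMSE helper under that assumption:
import numpy as np

def RMSE(observed, expected):
    # Root-mean-square error between two equal-length 1-D arrays; the driver compares
    # the result against a fixed threshold (Error < 11).
    observed = np.asarray(observed, dtype=float)
    expected = np.asarray(expected, dtype=float)
    return np.sqrt(np.mean((observed - expected) ** 2))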
def test_udf_in_join_condition(self):
    # regression test for SPARK-25314
    left = self.spark.createDataFrame([Row(a=1)])
    right = self.spark.createDataFrame([Row(b=1)])
    f = udf(lambda a, b: a == b, BooleanType())
    # The udf uses attributes from both sides of join, so it is pulled out as Filter +
    # Cross join.
    df = left.join(right, f("a", "b"))
    with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
        with self.assertRaisesRegex(AnalysisException, 'Detected implicit cartesian product'):
            df.collect()
    with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
        self.assertEqual(df.collect(), [Row(a=1, b=1)])
def test_udf_defers_judf_initialization(self):
    # This is kept separate from UDFInitializationTests to avoid context initialization
    # when the udf is called.
    f = UserDefinedFunction(lambda x: x, StringType())

    self.assertIsNone(
        f._judf_placeholder, "judf should not be initialized before the first call."
    )

    self.assertIsInstance(f("foo"), Column, "UDF call should return a Column.")

    self.assertIsNotNone(
        f._judf_placeholder, "judf should be initialized after UDF has been called."
    )
def runWithJoinType(join_type, type_string):
    with self.assertRaisesRegexp(
            AnalysisException,
            'Using PythonUDF.*%s is not supported.' % type_string):
        left.join(right, [f("a", "b"), left.a1 == right.b1], join_type).collect()
def test_udf_globals_not_overwritten(self):
    @udf('string')
    def f():
        assert "itertools" not in str(map)

    self.spark.range(1).select(f()).collect()
def transform(self, f):
    # Apply f to self and return the result, so custom transformations can be chained.
    return f(self)
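# If the helper above is the PySpark DataFrame.transform hook, usage looks roughly like
# the sketch below; the with_doubled_id function and local SparkSession are illustrative
# and not part of the original code.
from pyspark.sql import SparkSession, functions as F

def with_doubled_id(df):
    # illustrative transformation: add a column derived from "id"
    return df.withColumn("doubled", F.col("id") * 2)

spark = SparkSession.builder.master("local[1]").getOrCreate()
result = spark.range(3).transform(with_doubled_id)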
    byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen,
    date_gen,
    # we are limiting TimestampGen to avoid overflowing the INT96 value
    # see https://github.com/rapidsai/cudf/issues/8070
    limited_timestamp()
]

parquet_basic_map_gens = [
    MapGen(f(nullable=False), f()) for f in [
        BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, DateGen,
        limited_timestamp
    ]
] + [
    simple_string_to_string_map_gen,
    MapGen(DecimalGen(20, 2, nullable=False), decimal_gen_128bit)
]

parquet_struct_gen_no_maps = [
    StructGen([['child' + str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)]),
    StructGen([['child0', StructGen([['child1', byte_gen]])]])
]

parquet_struct_of_map_gen = StructGen(
array_gens_sample = single_level_array_gens + nested_array_gens_sample

# all of the basic types in a single struct
all_basic_struct_gen = StructGen(
    [['child' + str(ind), sub_gen] for ind, sub_gen in enumerate(all_basic_gens)])

# Some struct gens, but not all because of nesting
nonempty_struct_gens_sample = [all_basic_struct_gen,
        StructGen([['child0', byte_gen], ['child1', all_basic_struct_gen]]),
        StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])]
struct_gens_sample = nonempty_struct_gens_sample + [StructGen([])]

simple_string_to_string_map_gen = MapGen(StringGen(pattern='key_[0-9]', nullable=False),
        StringGen(), max_length=10)

all_basic_map_gens = [MapGen(f(nullable=False), f()) for f in [BooleanGen, ByteGen, ShortGen,
        IntegerGen, LongGen, FloatGen, DoubleGen, DateGen, TimestampGen]] + [simple_string_to_string_map_gen]

# Some map gens, but not all because of nesting
map_gens_sample = all_basic_map_gens + [
        MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10),
        MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10),
        MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)]

allow_negative_scale_of_decimal_conf = {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true'}

no_nans_conf = {'spark.rapids.sql.hasNans': 'false'}

def copy_and_update(conf, *more_confs):
    local_conf = conf.copy()
    for more in more_confs:
        local_conf.update(more)
    return local_conf
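# Example of combining the config dictionaries above with copy_and_update; neither input
# dict is mutated, and later dicts win on key conflicts.
combined_conf = copy_and_update(allow_negative_scale_of_decimal_conf, no_nans_conf)
# combined_conf == {'spark.sql.legacy.allowNegativeScaleOfDecimal': 'true',
#                   'spark.rapids.sql.hasNans': 'false'}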