def test_array_repeat(self):
    """Check ``array_repeat`` gives the same rows for an int and a Column count.

    Selects ``array_repeat("id", 3)`` and ``array_repeat("id", lit(3))`` from
    a one-row range DataFrame and asserts that both collected results are
    equal.
    """
    from pyspark.sql.functions import array_repeat, lit

    df = self.spark.range(1)
    with_int_count = df.select(array_repeat("id", 3)).toDF("val").collect()
    with_column_count = (
        df.select(array_repeat("id", lit(3))).toDF("val").collect()
    )
    # assertEqual rather than the deprecated assertEquals alias, which no
    # longer exists on unittest.TestCase in Python 3.12+.
    self.assertEqual(with_int_count, with_column_count)
def test_array_repeat(self):
    """Check ``array_repeat`` for each supported way of passing the count.

    The count is given as a Python int, as a literal Column and as the name
    of a column (``repeat_n``); every form must yield ``[0, 0, 0]`` for the
    single row produced by ``range(1)``.
    """
    df = self.spark.range(1).withColumn("repeat_n", lit(3))
    expected = [Row(val=[0, 0, 0])]

    count_forms = (
        ("int literal", 3),
        ("literal Column", lit(3)),
        ("column name", "repeat_n"),
    )
    # One assertEqual per form (instead of assertTrue(all([...]))) so that a
    # failure names the offending form and shows the differing rows.
    for label, count in count_forms:
        actual = df.select(array_repeat("id", count).alias("val")).collect()
        self.assertEqual(
            actual, expected, "array_repeat with count given as " + label
        )
def billing_events(df):
    """Build a DataFrame of billing events derived from customer records.

    Reads the ``customerID``, ``TotalCharges``, ``tenure`` and ``Churn``
    columns of ``df`` and unions three kinds of events:

    * ``"Charge"``: ``tenure`` rows per customer, each valued at
      ``TotalCharges / tenure``;
    * ``"AccountCreation"``: one zero-valued row per customer;
    * ``"AccountTermination"``: one zero-valued row per customer whose
      ``Churn`` is ``"Yes"``.

    Event dates are computed with ``add_months`` relative to ``now``, which
    is not defined in this function and is read from the enclosing scope.

    Returns:
        A DataFrame with columns ``customerID``, ``kind``, ``value``,
        ``date`` and ``month``, ordered by ``date``.
    """
    MAX_MONTH = 72

    def get_last_month(col):
        """Return a non-positive month offset derived from a hash of ``col``.

        Five successive bytes of ``abs(xxhash64(col))`` are each reduced
        modulo a different fraction of ``MAX_MONTH``; the offset is the
        negated sum of those five values.
        """
        h = F.abs(F.xxhash64(col))
        h1 = (h.bitwiseAND(0xff)) % (MAX_MONTH // 2)
        h2 = (F.shiftRight(h, 8).bitwiseAND(0xff)) % (MAX_MONTH // 3)
        h3 = (F.shiftRight(h, 16).bitwiseAND(0xff)) % (MAX_MONTH // 5)
        h4 = (F.shiftRight(h, 24).bitwiseAND(0xff)) % (MAX_MONTH // 7)
        h5 = (F.shiftRight(h, 32).bitwiseAND(0xff)) % (MAX_MONTH // 11)
        return -(h1 + h2 + h3 + h4 + h5)

    # Numbers each customer's exploded charge rows 1..tenure; the constant
    # ordering key gives no particular order within a customer.
    w = pyspark.sql.Window.orderBy(F.lit("")).partitionBy(df.customerID)

    # Month offset applied only to churned customers (0 for everyone else).
    # Built once here and shared by all three event kinds.
    churn_offset = F.when(
        df.Churn == "Yes", get_last_month(df.customerID)
    ).otherwise(0)

    charges = (
        df.select(
            df.customerID,
            F.lit("Charge").alias("kind"),
            # One array element (and hence one row) per unit of tenure.
            F.explode(
                F.array_repeat(
                    (df.TotalCharges / df.tenure).cast(get_currency_type()),
                    df.tenure.cast("int"),
                )
            ).alias("value"),
            churn_offset.alias("last_month"),
        )
        .withColumn("now", F.lit(now).cast("date"))
        # NOTE(review): last_month is negated together with the row number
        # here, whereas the other two event kinds add it un-negated --
        # confirm this sign is intended.
        .withColumn(
            "month_number",
            -(F.row_number().over(w) + F.col("last_month")),
        )
        .withColumn("date", F.expr("add_months(now, month_number)"))
        .drop("now", "month_number", "last_month")
    )

    serviceStarts = (
        df.withColumn("last_month", churn_offset)
        .select(
            df.customerID,
            F.lit("AccountCreation").alias("kind"),
            F.lit(0.0).cast(get_currency_type()).alias("value"),
            F.lit(now).alias("now"),
            (-df.tenure - 1 + F.col("last_month")).alias("month_number"),
        )
        .withColumn("date", F.expr("add_months(now, month_number)"))
        .drop("now", "month_number")
    )

    serviceTerminations = (
        df.withColumn("last_month", churn_offset)
        .where(df.Churn == "Yes")
        .withColumn("now", F.lit(now))
        .select(
            df.customerID,
            F.lit("AccountTermination").alias("kind"),
            F.lit(0.0).cast(get_currency_type()).alias("value"),
            F.expr("add_months(now, last_month)").alias("date"),
        )
    )

    # union() matches columns by position; all three frames are laid out as
    # (customerID, kind, value, date).
    billingEvents = (
        charges.union(serviceStarts)
        .union(serviceTerminations)
        .orderBy("date")
        # Leading 7 characters of the date's string form (presumably
        # "yyyy-MM" -- verify against the session's date formatting).
        .withColumn("month", F.substring("date", 0, 7))
    )
    return billingEvents
def generate(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    scale,
    partition_size_mb,
):
    """Generate encoded shares of random data and write them as JSON.

    Creates ``n_rows`` payloads of ``n_data`` random 0/1 values, encodes each
    payload with ``udf.encode`` into a struct with binary fields ``a`` and
    ``b``, repeats every encoded row ``scale`` times, and tags each resulting
    row with a fresh UUID string. The result is written to ``output`` as
    JSON, partitioned by ``server_id`` (``"a"`` or ``"b"``), overwriting any
    existing data.

    The number of write partitions is the estimated dataset size in MB
    (extrapolated from the first row) divided by ``partition_size_mb``,
    rounded up.
    """
    # n_rows * n_data random bits, bucketed into n_rows payload lists.
    random_bits = spark_session().range(n_rows * n_data).select(
        (F.col("id") % n_rows).alias("row_id"),
        F.when(F.rand() > 0.5, 1).otherwise(0).alias("payload"),
    )
    payload_per_row = random_bits.groupBy("row_id").agg(
        F.collect_list("payload").alias("payload")
    )

    encode_payload = F.pandas_udf(
        partial(
            udf.encode,
            batch_id,
            n_data,
            public_key_hex_internal,
            public_key_hex_external,
        ),
        returnType="a: binary, b: binary",
    )
    encoded = payload_per_row.select(encode_payload("payload").alias("shares"))

    # repeat this data `scale` times
    scaled = encoded.withColumn(
        "_repeat", F.explode(F.array_repeat(F.lit(0), scale))
    ).drop("_repeat")

    make_uuid = F.udf(lambda: str(uuid4()), returnType="string")
    shares = scaled.withColumn("id", make_uuid())

    # we can make an estimate with just a single row, since the configuration
    # is the same here.
    row = shares.first()
    bytes_per_row = len(b64encode(row.shares.a)) + len(str(uuid4()))
    dataset_estimate_mb = bytes_per_row * n_rows * scale * 1.0 / 10**6
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")

    # try to be efficient without caching by repartitioning
    # Turn the (a, b) struct into a map so it can be exploded into one
    # (server_id, payload) row per share.
    shares_as_map = shares.withColumn(
        "shares",
        F.map_from_arrays(
            F.array(F.lit("a"), F.lit("b")),
            F.array("shares.a", "shares.b"),
        ),
    )
    ranged = shares_as_map.repartitionByRange(num_partitions, "id")
    repartitioned = ranged.select(
        "id", F.explode("shares").alias("server_id", "payload")
    )
    repartitioned.write.partitionBy("server_id").json(output, mode="overwrite")
def compile_array_repeat(t, expr, scope, **kwargs):
    """Compile an array-repeat expression into a PySpark column.

    Args:
        t: Translator whose ``translate`` method compiles the operand.
        expr: Expression whose operation exposes ``arg`` and ``times``.
        scope: Scope forwarded unchanged to ``t.translate``.
        **kwargs: Accepted but not used by this rule.

    Returns:
        The column produced by applying ``F.array_repeat`` to the translated
        operand and then ``F.flatten`` to the result.
    """
    operation = expr.op()
    operand_column = t.translate(operation.arg, scope)
    # The repeat count is taken from the ``value`` of the ``times`` operand's
    # operation (presumably a literal -- verify against callers).
    repeat_count = operation.times.op().value
    repeated = F.array_repeat(operand_column, repeat_count)
    # Collapse the extra nesting level introduced by array_repeat.
    return F.flatten(repeated)
# Keep only the show name and its array-typed ``genres`` column.
array_subset = shows.select("name", "genres")

# Four spellings of "take element 0 of genres": attribute access or F.col,
# combined with bracket indexing or getItem().
array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot_and_index"),
    F.col("genres")[0].alias("col_and_index"),
    array_subset.genres.getItem(0).alias("dot_and_method"),
    F.col("genres").getItem(0).alias("col_and_method"),
)

# array_subset.show()

# Build two array columns: one from three literal columns, one by repeating
# the first genre five times.
array_subset_repeated = array_subset.select(
    "name",
    F.lit("Comedy").alias("one"),
    F.lit("Horror").alias("two"),
    F.lit("Drama").alias("three"),
    F.col("dot_and_index"),
).select(
    "name",
    F.array("one", "two", "three").alias("Some_Genres"),
    F.array_repeat("dot_and_index", 5).alias("Repeated_Genres"),
)  # this closing parenthesis was missing, leaving the statement unterminated

array_subset_repeated.show(1, False)

# Number of elements in each array column.
array_subset_repeated.select(
    "name", F.size("Some_Genres"), F.size("Repeated_Genres")
).show()

# Each array column with duplicate elements removed.
array_subset_repeated.select(
    "name", F.array_distinct("Some_Genres"), F.array_distinct("Repeated_Genres")
).show(1, False)