Exemplo n.º 1
0
    def test_array_repeat(self):
        """Check that ``array_repeat`` accepts its count as an int or a Column.

        Both forms are applied to the same single-row DataFrame and the
        collected results are compared with each other.
        """
        from pyspark.sql.functions import array_repeat, lit

        df = self.spark.range(1)

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            df.select(array_repeat("id", 3)).toDF("val").collect(),
            df.select(array_repeat("id", lit(3))).toDF("val").collect(),
        )
Exemplo n.º 2
0
    def test_array_repeat(self):
        """Check ``array_repeat`` with the count supplied three different ways.

        The count is passed as a Python int, as a literal Column, and as the
        name of a column on the DataFrame. Every variant is expected to
        produce a single row whose ``val`` is ``[0, 0, 0]``.
        """
        df = self.spark.range(1)
        df = df.withColumn("repeat_n", lit(3))

        expected = [Row(val=[0, 0, 0])]
        variants = [
            ("python int", 3),
            ("literal column", lit(3)),
            ("column name", "repeat_n"),
        ]
        for label, count in variants:
            actual = df.select(array_repeat("id", count).alias("val")).collect()
            # Assert each variant separately so a failure names the variant
            # and shows the differing rows, rather than a bare "False is not
            # true" from assertTrue(all([...])).
            self.assertEqual(actual, expected, msg=label)
Exemplo n.º 3
0
def billing_events(df):
    """Build a DataFrame of billing events from a customer DataFrame.

    Reads the ``customerID``, ``TotalCharges``, ``tenure`` and ``Churn``
    columns of ``df`` and produces three kinds of event, unioned together:

    * ``"Charge"`` -- one row per month of tenure, each valued at
      ``TotalCharges / tenure``.
    * ``"AccountCreation"`` -- one zero-valued row per customer.
    * ``"AccountTermination"`` -- one zero-valued row per customer whose
      ``Churn`` is ``"Yes"``.

    The names ``now`` and ``get_currency_type`` are not defined here; they
    are resolved from the enclosing scope.

    Returns:
        A DataFrame with columns ``customerID``, ``kind``, ``value``,
        ``date`` and ``month`` (the first seven characters of ``date``),
        ordered by ``date``.
    """
    MAX_MONTH = 72

    def get_last_month(col):
        # Hash the column and fold five separate bytes of the hash into small
        # remainders; the negated sum is a non-positive month offset.
        h = F.abs(F.xxhash64(col))
        h1 = (h.bitwiseAND(0xff)) % (MAX_MONTH // 2)
        h2 = (F.shiftRight(h, 8).bitwiseAND(0xff)) % (MAX_MONTH // 3)
        h3 = (F.shiftRight(h, 16).bitwiseAND(0xff)) % (MAX_MONTH // 5)
        h4 = (F.shiftRight(h, 24).bitwiseAND(0xff)) % (MAX_MONTH // 7)
        h5 = (F.shiftRight(h, 32).bitwiseAND(0xff)) % (MAX_MONTH // 11)
        return -(h1 + h2 + h3 + h4 + h5)

    # Month offset applied to churned customers only; everyone else gets 0.
    # Built once and reused by all three event kinds.
    last_month = F.when(
        df.Churn == "Yes", get_last_month(df.customerID)
    ).otherwise(0)

    # The constant ordering key means row_number() simply numbers each
    # customer's rows 1..N without imposing a meaningful order.
    w = pyspark.sql.Window.orderBy(F.lit("")).partitionBy(df.customerID)

    # array_repeat + explode yields `tenure` identical charge rows per
    # customer; row_number() then assigns each one a distinct month.
    # NOTE(review): last_month is <= 0, so -(row_number + last_month) can be
    # positive for churned customers, i.e. after `now`, whereas the creation
    # and termination events below add last_month directly. Confirm the sign
    # here is intended.
    charges = (
        df.select(
            df.customerID,
            F.lit("Charge").alias("kind"),
            F.explode(
                F.array_repeat(
                    (df.TotalCharges / df.tenure).cast(get_currency_type()),
                    df.tenure.cast("int"),
                )
            ).alias("value"),
            last_month.alias("last_month"),
        )
        .withColumn("now", F.lit(now).cast("date"))
        .withColumn(
            "month_number",
            -(F.row_number().over(w) + F.col("last_month")),
        )
        .withColumn("date", F.expr("add_months(now, month_number)"))
        .drop("now", "month_number", "last_month")
    )

    # Account creation is dated one month before the first month of tenure,
    # shifted by the churn offset.
    service_starts = (
        df.withColumn("last_month", last_month)
        .select(
            df.customerID,
            F.lit("AccountCreation").alias("kind"),
            F.lit(0.0).cast(get_currency_type()).alias("value"),
            F.lit(now).alias("now"),
            (-df.tenure - 1 + F.col("last_month")).alias("month_number"),
        )
        .withColumn("date", F.expr("add_months(now, month_number)"))
        .drop("now", "month_number")
    )

    # Only churned customers get a termination event.
    service_terminations = (
        df.withColumn("last_month", last_month)
        .where(df.Churn == "Yes")
        .withColumn("now", F.lit(now))
        .select(
            df.customerID,
            F.lit("AccountTermination").alias("kind"),
            F.lit(0.0).cast(get_currency_type()).alias("value"),
            F.expr("add_months(now, last_month)").alias("date"),
        )
    )

    # union() matches columns by position; all three frames are laid out as
    # (customerID, kind, value, date).
    return (
        charges.union(service_starts)
        .union(service_terminations)
        .orderBy("date")
        .withColumn("month", F.substring("date", 0, 7))
    )
Exemplo n.º 4
0
def generate(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    scale,
    partition_size_mb,
):
    """Generate random encoded shares and write them to ``output`` as JSON.

    Builds ``n_rows`` lists of random 0/1 payload values, passes each list
    through ``udf.encode`` (bound to the batch id, ``n_data`` and the two
    public keys) to obtain a struct with binary fields ``a`` and ``b``,
    repeats every resulting row ``scale`` times, and tags each row with a
    fresh UUID. The result is range-partitioned on that id, split into one
    row per share, and written as JSON partitioned by ``server_id``,
    overwriting any existing output.

    The partition count is estimated from the base64 size of one sample
    share plus a UUID string, scaled by ``n_rows * scale`` and divided by
    ``partition_size_mb``.
    """
    # n_rows * n_data ids folded onto n_rows row ids, each carrying one
    # random 0/1 value.
    random_bits = spark_session().range(n_rows * n_data).select(
        (F.col("id") % n_rows).alias("row_id"),
        F.when(F.rand() > 0.5, 1).otherwise(0).alias("payload"),
    )
    payload_lists = random_bits.groupBy("row_id").agg(
        F.collect_list("payload").alias("payload")
    )

    encode_payload = F.pandas_udf(
        partial(
            udf.encode,
            batch_id,
            n_data,
            public_key_hex_internal,
            public_key_hex_external,
        ),
        returnType="a: binary, b: binary",
    )
    encoded = payload_lists.select(encode_payload("payload").alias("shares"))

    # Exploding a `scale`-element array duplicates every row `scale` times;
    # the helper column is dropped straight away.
    repeated = encoded.withColumn(
        "_repeat", F.explode(F.array_repeat(F.lit(0), scale))
    ).drop("_repeat")
    shares = repeated.withColumn(
        "id", F.udf(lambda: str(uuid4()), returnType="string")()
    )

    # A single row is enough for the size estimate because every row is
    # produced with the same configuration.
    sample = shares.first()
    bytes_per_row = len(b64encode(sample.shares.a)) + len(str(uuid4()))
    dataset_estimate_mb = bytes_per_row * n_rows * scale * 1.0 / 10**6
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")

    # Repartition rather than cache: turn the struct into a map keyed by
    # "a"/"b" so that explode emits one (server_id, payload) row per share.
    shares_as_map = shares.withColumn(
        "shares",
        F.map_from_arrays(
            F.array(F.lit("a"), F.lit("b")),
            F.array("shares.a", "shares.b"),
        ),
    )
    ranged = shares_as_map.repartitionByRange(num_partitions, "id")
    repartitioned = ranged.select(
        "id",
        F.explode("shares").alias("server_id", "payload"),
    )
    repartitioned.write.partitionBy("server_id").json(output, mode="overwrite")
Exemplo n.º 5
0
def compile_array_repeat(t, expr, scope, **kwargs):
    """Compile an array-repeat expression into a Spark column expression.

    The operation's ``arg`` is translated with ``t.translate`` and repeated
    ``times`` times, where the count is read from ``op.times.op().value``.
    ``array_repeat`` wraps each repetition in an outer array, so the result
    is flattened by one level before being returned.
    """
    operation = expr.op()
    return F.flatten(
        F.array_repeat(
            t.translate(operation.arg, scope),
            operation.times.op().value,
        )
    )
Exemplo n.º 6
0
array_subset = shows.select("name", "genres")

# Read the first element of `genres` four equivalent ways: attribute or
# F.col() access, combined with either [] indexing or getItem().
array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot_and_index"),
    F.col("genres")[0].alias("col_and_index"),
    array_subset.genres.getItem(0).alias("dot_and_method"),
    F.col("genres").getItem(0).alias("col_and_method"),
)

# array_subset.show()

# Build two array columns: one from three literal genre columns, one by
# repeating the first genre five times.
array_subset_repeated = array_subset.select(
    "name",
    F.lit("Comedy").alias("one"),
    F.lit("Horror").alias("two"),
    F.lit("Drama").alias("three"),
    F.col("dot_and_index"),
).select(
    "name",
    F.array("one", "two", "three").alias("Some_Genres"),
    F.array_repeat("dot_and_index", 5).alias("Repeated_Genres"),
)  # this closing parenthesis was missing, which made the script a SyntaxError

array_subset_repeated.show(1, False)

# Number of elements in each array column.
array_subset_repeated.select(
    "name", F.size("Some_Genres"), F.size("Repeated_Genres")
).show()

# Each array column with duplicate elements removed.
array_subset_repeated.select(
    "name", F.array_distinct("Some_Genres"), F.array_distinct("Repeated_Genres")
).show(1, False)