Example #1
def topn(df, c, by=None, n=3, others=False, index='_idx', result='_res'):
    cnt = f'{c}##cnt'
    cnt_tot = f'{c}##tot'
    cnt_top = f'{c}##top'

    _gcols = [by] if isinstance(by, str) and by else by or []
    s, a = _topn(df, c, _gcols, n)
    r = a.select(*_gcols, c, cnt)

    if others:
        # calculate the top-n total and the overall total
        o = a.groupby(_gcols).agg(F.sum(cnt).alias(cnt_top))
        t = s.groupby(*_gcols).agg(F.sum(cnt).alias(cnt_tot))

        # corner case single row
        o = o.join(t, on=_gcols) if _gcols else o.withColumn(
            cnt_tot, F.lit(t.take(1)[0][0]))

        # append the 'others' bucket to the top-n rows
        r = r.union(
            o.select(*_gcols,
                     F.lit(others).alias(c),
                     (F.col(cnt_tot) - F.col(cnt_top)).alias(cnt)))

    r = r.groupby(*_gcols).agg(
        F.map_from_arrays(F.collect_list(F.col(c)),
                          F.collect_list(F.col(cnt))).alias(result))

    # the following forces colname to be nullable
    r = r.withColumn(index, F.udf(lambda x: x, T.StringType())(F.lit(c)))

    return r
Example #2
def percentiles(df,
                c,
                by=None,
                p=[10, 25, 50, 75, 90],
                index='_idx',
                result='_res'):
    _gcols = [by] if isinstance(by, str) and by else by or []
    ptile = f'{c}##p'

    # percentiles per row
    w = Window.partitionBy(*_gcols).orderBy(c)
    d = df.select(c, *_gcols,
                  F.floor(100 * (F.percent_rank().over(w))).alias(ptile))

    # aggregate
    agg_keys = F.array(*[F.lit(x) for x in p])
    agg_values = F.array(
        *[F.max(F.when(F.col(ptile) < x, F.col(c))) for x in p])
    r = d.groupby(*_gcols).agg(
        F.map_from_arrays(agg_keys, agg_values).alias(result))

    # add colname
    r = r.withColumn(index, F.lit(c))

    return r
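A minimal usage sketch for the function above, assuming `F` is `pyspark.sql.functions` and `Window` comes from `pyspark.sql.window`, as in the original module; the DataFrame and column names are illustrative.
# one row per country, with '_res' mapping each requested percentile
# (10, 25, 50, 75, 90 by default) to the corresponding 'price' value
# and '_idx' holding the literal column name 'price'
stats = percentiles(sales_df, 'price', by='country')
stats.show(truncate=False)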
Example #3
def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)

    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(updated_col,
                     F.coalesce(add_ids, F.array([])).alias(add_ids),
                     F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names),
                              F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'), 'changes')

    return res.orderBy('updated')
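A minimal usage sketch, assuming a slowly-changing-dimension history DataFrame with the `_state` (0 = insert, 1 = delete) and `_updated` columns the function above expects; the DataFrame and key names are illustrative.
# one row per update timestamp with counts of added ('add'), deleted
# ('del') and updated ('upd') keys, plus a 'changes' map of per-column
# change counts
summary = scd_analyze(history_df, merge_on=['customer_id'])
summary.show(truncate=False)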
Example #4
def sampleVCF(hdfs, spark):
    vcf = spark.sparkContext.textFile(hdfs)
    col_name = vcf.filter(lambda x: x.startswith("#CHROM")).first().split("\t")
    vcf_data = vcf.filter(lambda x : re.match("[^#][^#]", x))\
                       .map(lambda x : x.split("\t"))\
                       .toDF(col_name)\
                       .withColumn("POS", F.col("POS").cast(IntegerType()))

    sample_name = vcf_data.columns[-1]
    vcf_data = vcf_data.select(F.col("#CHROM"), F.col("POS"), F.col("FORMAT"), F.col(sample_name))\
                       .withColumn("FORMAT", F.array_remove(F.split(F.col("FORMAT"), ":"), "GT"))\
                       .withColumn(sample_name, firstremove(F.split(F.col(sample_name), ":")))\
                       .withColumn(sample_name, F.map_from_arrays(F.col("FORMAT"), F.col(sample_name)))
    return vcf_data.select("#CHROM", "POS", sample_name)
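A short usage sketch, assuming `firstremove` is a UDF defined elsewhere that drops the leading GT entry from the split sample array, and that `F`, `re` and `IntegerType` are imported as in the original module; the HDFS path is illustrative.
# returns #CHROM, POS and the sample column converted to a map keyed by
# the remaining FORMAT fields (GT removed)
sample_df = sampleVCF("hdfs:///data/example.vcf", spark)
sample_df.show(5, truncate=False)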
Example #5
def generate(
    batch_id,
    n_data,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    scale,
    partition_size_mb,
):
    shares = (
        spark_session().range(n_rows * n_data).select(
            (F.col("id") % n_rows).alias("row_id"),
            F.when(F.rand() > 0.5, 1).otherwise(0).alias("payload"),
        ).groupBy("row_id").agg(
            F.collect_list("payload").alias("payload")).select(
                F.pandas_udf(
                    partial(
                        udf.encode,
                        batch_id,
                        n_data,
                        public_key_hex_internal,
                        public_key_hex_external,
                    ),
                    returnType="a: binary, b: binary",
                )("payload").alias("shares"))
        # repeat this data `scale` times
        .withColumn("_repeat", F.explode(F.array_repeat(
            F.lit(0), scale))).drop("_repeat").withColumn(
                "id",
                F.udf(lambda: str(uuid4()), returnType="string")()))
    # we can make an estimate with just a single row, since the configuration
    # is the same here.
    row = shares.first()
    dataset_estimate_mb = ((len(b64encode(row.shares.a)) + len(str(uuid4()))) *
                           n_rows * scale * 1.0 / 10**6)
    num_partitions = math.ceil(dataset_estimate_mb / partition_size_mb)
    click.echo(f"writing {num_partitions} partitions")

    # try to be efficient without caching by repartitioning
    repartitioned = (shares.withColumn(
        "shares",
        F.map_from_arrays(F.array(F.lit("a"), F.lit("b")),
                          F.array("shares.a", "shares.b")),
    ).repartitionByRange(num_partitions, "id").select(
        "id",
        F.explode("shares").alias("server_id", "payload")))
    repartitioned.write.partitionBy("server_id").json(output, mode="overwrite")
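A hypothetical invocation of the function above, assuming `udf.encode` and `spark_session` are available as imported in the original module; the key material, output path and sizing values are placeholders.
generate(
    batch_id="test-batch",
    n_data=7,
    public_key_hex_internal="<hex-encoded public key>",
    public_key_hex_external="<hex-encoded public key>",
    output="data/shares",
    n_rows=100,
    scale=10,
    partition_size_mb=250,
)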
Example #6
def topn(df, c, by=None, n=3, others=False, index='_idx', result='_res'):
    """

    :param df: input dataframe
    :param c: column to aggregate
    :param by: column name, or list of column names, to group by
    :param n: number of top results to keep
    :param others: if truthy, the remaining values are folded into a single
        bucket labelled with this value; it must have the same type as the
        column being aggregated, e.g. 'others' for strings and -1 for
        positive numbers
    :param index: name of the output column holding the aggregated column name
    :param result: name of the output map column ({value: count})
    :return: one row per group, with the top-n counts collected into `result`
        and the column name in `index`
    """
    cnt = f'{c}##cnt'
    cnt_tot = f'{c}##tot'
    cnt_top = f'{c}##top'

    _gcols = [by] if isinstance(by, str) and by else by or []
    s, a = _topn(df, c, _gcols, n)
    r = a.select(*_gcols, c, cnt)

    if others:
        # calculate the top-n total and the overall total
        o = a.groupby(_gcols).agg(F.sum(cnt).alias(cnt_top))
        t = s.groupby(*_gcols).agg(F.sum(cnt).alias(cnt_tot))

        # corner case single row
        o = o.join(t, on=_gcols) if _gcols else o.withColumn(
            cnt_tot, F.lit(t.take(1)[0][0]))

        o = o.select(*_gcols,
                     F.lit(others).alias(c),
                     (F.col(cnt_tot) - F.col(cnt_top)).alias(cnt))

        # append the 'others' bucket to the top-n rows
        r = r.union(o)

    r = r.groupby(*_gcols).agg(
        F.map_from_arrays(F.collect_list(F.col(c)),
                          F.collect_list(F.col(cnt))).alias(result))

    # the following forces colname to be nullable
    r = r.withColumn(index, F.udf(lambda x: x, T.StringType())(F.lit(c)))

    return r
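A minimal usage sketch for the function above, assuming the helper `_topn` returns the (source, aggregated-counts) pair used here and that `F`/`T` are the `pyspark.sql.functions` and `pyspark.sql.types` aliases; the DataFrame and column names are illustrative.
# top 3 products per country, folding the remaining counts into an
# 'others' bucket; one row per country with a '_res' map of
# {product: count} and '_idx' set to 'product'
r = topn(sales_df, 'product', by='country', n=3, others='others')
r.show(truncate=False)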
Example #7
    def applyLabels(self, df, labels, date):
        
        map_keys = array([lit(k) for k in labels.keys()])
        map_values = array([lit(v) for v in labels.values()])
        map_func = map_from_arrays(map_keys, map_values) 

        verdict = udf(self.spark_functions.finalVerdict, StringType())

        labelled_df = df.withColumn('src_verdict', map_func.getItem(df.src_ip)).withColumn('dest_verdict', map_func.getItem(df.dest_ip))
        
        labelled_df = labelled_df.withColumn('label', verdict('src_verdict', 'dest_verdict', 'src_ip', 'dest_ip'))

        to_drop = ['src_verdict', 'dest_verdict', 'asn', 'country', 'timestamp']

        labelled_df = labelled_df.drop(*to_drop)

        #Drop ips that are not labelled
        labelled_df.show()

        return labelled_df
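For context, a hypothetical call to the method above; `labels` is assumed to be a plain dict mapping IP strings to verdict strings, and `pipeline`/`flows_df` are illustrative names.
# labels = {'10.0.0.1': 'malicious', '10.0.0.2': 'benign'}
# labelled = pipeline.applyLabels(flows_df, labels, date='2020-01-01')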
Example #8
def map_from_array(theArray, theDelim):
    def pull_key_val(x, d, kind):
        retval = []
        index = -1
        if (kind == "key"):
            index = 0
        if (kind == "val"):
            index = 1
        if index == -1:
            raise "Bad input"
        for i in x:
            retval.append(i.split(d)[index])
        return (retval)

    pull_key_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "key")),
        ArrayType(StringType()))
    pull_val_udf = f.pandas_udf(
        lambda x: x.apply(pull_key_val, args=(theDelim, "val")),
        ArrayType(StringType()))

    return (f.map_from_arrays(pull_key_udf(theArray), pull_val_udf(theArray)))
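A minimal usage sketch, assuming the usual `pyspark.sql.functions as f` alias and a DataFrame with an array column of 'key<delim>value' strings; names are illustrative.
# 'tags' holds values such as ['env=prod', 'tier=web']; the result is a
# map column {'env': 'prod', 'tier': 'web'}
df.select(map_from_array(f.col("tags"), "=").alias("tag_map"))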
Example #9
def generate_integration(
    data_config,
    public_key_hex_internal,
    public_key_hex_external,
    output,
    n_rows,
    n_partitions,
    n_drop_batch,
):
    """Generate test data from a configuration file.

    The data is generated in a deterministic way and fits into memory."""
    spark = spark_session()

    assert n_rows > 0
    config_data = spark.read.json(data_config, multiLine=True)

    def generate_data(batch_id, n_data):
        return dict(
            batch_id=batch_id,
            n_data=n_data,
            payload=[int(x % 3 == 0 or x % 5 == 0) for x in range(n_data)],
        )

    test_data = []
    # by dropping batch_ids, we can test whether the main processing script is
    # robust enough when the data doesn't exist.
    collected_rows = sorted(config_data.collect(),
                            key=lambda x: x["batch_id"],
                            reverse=True)
    for conf in collected_rows[n_drop_batch:]:
        batch_id = conf["batch_id"]
        n_data = conf["n_data"]
        test_data += [
            generate_data(batch_id, n_data) for _ in range(n_rows - 1)
        ]
        # include invalid data for a batch
        test_data += [generate_data(batch_id, n_data + 1)]
    # include unknown batch id
    test_data += [generate_data("bad-id", 10) for _ in range(n_rows)]

    shares = (spark.createDataFrame(test_data).select(
        "batch_id",
        F.udf(udf.encode_single, returnType="a: binary, b: binary")(
            "batch_id",
            "n_data",
            F.lit(public_key_hex_internal),
            F.lit(public_key_hex_external),
            "payload",
        ).alias("shares"),
    ).withColumn("id",
                 F.udf(lambda: str(uuid4()), returnType="string")()))

    repartitioned = (shares.withColumn(
        "shares",
        F.map_from_arrays(F.array(F.lit("a"), F.lit("b")),
                          F.array("shares.a", "shares.b")),
    ).repartitionByRange(n_partitions, "batch_id", "id").select(
        "batch_id", "id",
        F.explode("shares").alias("server_id", "payload")))
    repartitioned.write.partitionBy("server_id",
                                    "batch_id").json(output, mode="overwrite")
Example #10
df2 = spark.sql(sql)
#df2.cache()
#df2.show()

columns = df2.columns
columns.remove("uid")
columns.remove("time_interval")

# merge all metric columns into one array column
df3 = df2.withColumn('data', f.array(columns))

#df3.show(10, False)

# merge across time intervals: time_interval is the key, the metric array is the value
df4 = df3.groupBy("uid").agg(
    f.map_from_arrays(f.collect_list("time_interval"),
                      f.collect_list("data")).alias("map"))

#df4.show(10, False)

# udf: cumulatively accumulate each metric over the preceding time intervals
# def agg(data):
#   f = {}
#   m = data
#   for ci,cn in enumerate(columns):
#     d = {}
#     for i in range(0,144):
#       if i in m:
#          num = m[i][ci]
#       else:
#          num = 0
#       for j in range(0,i):