Example #1
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    # build the resulting Spark column: array_max applied to the column spec(s)
    # produced by the wrapped value(s) in self.value
    column_spec = array_max(*[
        col.get_column_spec(source_df=source_df,
                            current_column=current_column)
        for col in self.value
    ])
    return column_spec
Example #2
def column_revalue(vcf):
    # INFO values need to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
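For reference, a minimal sketch of what F.array_max does to an array column like the ones rewritten above (toy data and column names, not from the VCF pipeline): each array collapses to its single largest element.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo = spark.createDataFrame([([10, 40, 20],), ([5, 5],)], ["vals"])  # toy array column
demo.withColumn("vals_max", F.array_max("vals")).show()
# vals_max is 40 for the first row and 5 for the second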
Example #3
def max_element_column(parquetFiles):
    # add a *_max column holding the largest element of each array-valued feature column
    parquetFiles = parquetFiles.withColumn('bars_confidence_max', F.array_max(col('bars_confidence')))
    parquetFiles = parquetFiles.withColumn('bars_start_max', F.array_max(col('bars_start')))
    parquetFiles = parquetFiles.withColumn('beats_confidence_max', F.array_max(col('beats_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_confidence_max', F.array_max(col('segments_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_loudness_max_time_max',
                                           F.array_max(col('segments_loudness_max_time')))
    parquetFiles = parquetFiles.withColumn('tatums_confidence_max', F.array_max(col('tatums_confidence')))

    return parquetFiles
Example #4
def main(args):
    sparkConf = (SparkConf()
                 .set("spark.driver.memory", "10g")
                 .set("spark.executor.memory", "10g")
                 .set("spark.driver.maxResultSize", "0")
                 .set("spark.debug.maxToStringFields", "2000")
                 .set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000"))

    if args.local:
        spark = (SparkSession.builder
                 .config(conf=sparkConf)
                 .master('local[*]')
                 .getOrCreate())
    else:
        spark = (SparkSession.builder.config(conf=sparkConf).getOrCreate())

    print('args: ', args)
    print('Spark version: ', spark.version)
    start_time = time()

    # load co-occurrences from the parquet dataset at the given input path
    coocs = (spark.read.parquet(args.in_cooccurrences))

    # we need some filtering; not all data is ready to be used:
    # 1. enough data points per association (dtCount >= 12)
    # 2. recent data (maxYear >= 2019) covering at least 3 distinct years
    w2 = Window.partitionBy(*predictions_grouped_keys)

    # curry function to pass to transform with the keys to group by
    tfn = partial(assoc_fn, group_by_cols=grouped_keys)
    aggregated = (
        coocs
        .withColumn("year", year(coocs.pubDate))
        .withColumn("month", month(coocs.pubDate))
        .withColumn("day", lit(1))
        .filter((coocs.isMapped == True) & (coocs.type == "GP-DS")
                & col("year").isNotNull() & col("month").isNotNull())
        .selectExpr(*coocs_columns)
        .transform(tfn)
        .withColumn("ds", to_date(concat_ws("-", col("year"), col("month"), col("day"))))
        .withColumn("y", col(harmonic_col))
        .dropna(subset=predictions_selection_keys)
        .withColumn("years", collect_set(col("year")).over(w2))
        .withColumn("nYears", array_size(col("years")))
        .withColumn("minYear", array_min(col("years")))
        .withColumn("maxYear", array_max(col("years")))
        .withColumn("dtCount", count(col("y")).over(w2))
        .withColumn("dtMaxYear", max(col("year")).over(w2))
        .filter((col("maxYear") >= 2019) & (col("nYears") >= 3) & (col("dtCount") >= 12))
        .select(*predictions_selection_keys)
        .repartition(*predictions_grouped_keys)
        .persist())

    aggregated.write.parquet(f"{args.out_prefix}/associationsFromCoocsTS")
    print('Completed aggregated data in {:.1f} secs'.format(time() - start_time))

    # generate the models
    start_time = time()

    fbp = (aggregated.groupBy(*predictions_grouped_keys).applyInPandas(
        make_predictions, prediction_schema).persist())

    # fbp.show(20, False)

    fbp.write.parquet(f"{args.out_prefix}/associationsFromCoocsTSPredictions")
    print('Completed TS analysis (FB Prophet) data in {:.1f} secs'.format(
        time() - start_time))

    # clean all up just in case
    spark.stop()
    return 0
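The year-coverage filter in the example above hinges on collect_set over a window followed by array_min/array_max/size of the collected array. A stripped-down sketch of just that pattern on toy data (hypothetical names; F.size stands in for the array_size used above and agrees with it on non-null arrays):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.master("local[1]").getOrCreate()
toy = spark.createDataFrame(
    [("a", 2018), ("a", 2019), ("a", 2020), ("b", 2017)], ["key", "year"])

w = Window.partitionBy("key")
(toy.withColumn("years", F.collect_set("year").over(w))
    .withColumn("nYears", F.size("years"))
    .withColumn("minYear", F.array_min("years"))
    .withColumn("maxYear", F.array_max("years"))
    .filter((F.col("maxYear") >= 2019) & (F.col("nYears") >= 3))  # keeps only key "a"
    .show())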
Example #5
#Okay, now we have a list of outages, restore_times, locations, core_ids
#First let's calculate some high-level metrics

#size of each outage (number of distinct core_ids in the cluster)
pw_finalized_outages = pw_finalized_outages.withColumn("cluster_size", F.size(F.array_distinct("core_id")))

#standard deviation of outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"),*exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_range", F.array_max("outage_times") - F.array_min("outage_times"))

#standard deviation and range of restore times
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.avg("restore_time").alias("restore_times_mean"),*exprs)

pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_stddev", F.explode("restore_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("restore_times_stddev").alias("restore_times_stddev"),*exprs)
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_range", F.array_max("restore_times") - F.array_min("restore_times"))
Example #6
def argmax(v):
    # 1-based index of the largest entry in the topic-distribution vector
    return int(v.argmax()) + 1


argmax_udf = udf(argmax, IntegerType())

fit.printSchema()
fit = fit.withColumn("topTopic", argmax_udf("topicDistribution"))


def arraymaker(v):
    return list([float(x) for x in v])


arraymaker_udf = udf(arraymaker, ArrayType(FloatType()))
fit = fit.withColumn("arrayTopics", arraymaker_udf("topicDistribution"))
fit = fit.withColumn("topTopicScore", array_max("arrayTopics"))
fit.write.mode("overwrite").json("s3://covid-tweets/fit-tweets" +
                                 str(num_topics))

# use window function instead a la
# https://stackoverflow.com/questions/38397796/retrieve-top-n-in-each-group-of-a-dataframe-in-pyspark
dfs = []
w = Window.partitionBy(fit.topTopic).orderBy(fit.topTopicScore.desc())
fit = fit.select("*", rank().over(w).alias("rank"))
fit = fit.where(fit.rank < 20)
# for i in range(num_topics):
#    top_topic_df = fit.where(fit.topTopic == i + 1)
#    top_topic_df = top_topic_df.sort("topTopicScore", ascending=False)
#    dfs.append(top_topic_df.limit(20))
#
# df_complete = reduce(DataFrame.unionAll, dfs)
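A note on the arraymaker UDF above: on Spark 3.0+ the built-in pyspark.ml.functions.vector_to_array does the same vector-to-array conversion without a Python UDF, so the two columns could be derived roughly like this (a sketch under that version assumption):

from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import array_max

# assumes Spark 3.0+; replaces arraymaker_udf with a native conversion
fit = fit.withColumn("arrayTopics", vector_to_array("topicDistribution", dtype="float32"))
fit = fit.withColumn("topTopicScore", array_max("arrayTopics"))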
    model_path = "s3://" + bucket_name + "/models/w2v_model"
    loaded_model = Word2VecModel.load(model_path)
    # We add the output columns : it is the average of the words' vectors for each tweet
    tweets_df = loaded_model.transform(tweets_df)

    # We load the classifier
    clf_path = "s3://" + bucket_name + "/models/mpc_model"
    loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path)
    predictions = loaded_clf.transform(tweets_df)

    # We keep the probability only for the predicted sentiment
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
    predictions = predictions.withColumn("probability",
                                         to_array("probability"))
    predictions = predictions.withColumn("probability",
                                         array_max("probability"))

    # We assign a weight of 0.5 to negative tweets
    compute_weights = udf(lambda x: x if x == 1.0 else 0.5, FloatType())

    # The sentiment score is in [0, 0.5] if the value is negative and [0.5, 1] if positive
    predictions = predictions.withColumn("weights", compute_weights("prediction")) \
    .withColumn("sentiment_score", col("probability")*col("weights")) \
    .groupBy("symbol") \
    .agg({"sentiment_score" : "avg"})\
    .withColumnRenamed("avg(sentiment_score)", "sentiment_score") \
    .drop("features", "rawPrediction", "processed_tweets")

    predictions.repartition(1).write.mode("overwrite").csv("hdfs:///sentiment",
                                                           header=True)
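Two small observations on the sentiment step above: array_max of the probability array is simply the probability the classifier assigned to its predicted class, and the compute_weights UDF could be expressed with a built-in conditional, sketched here with the same column names:

from pyspark.sql import functions as F

# same effect as compute_weights: weight 1.0 for positive predictions, 0.5 otherwise
predictions = predictions.withColumn(
    "weights", F.when(F.col("prediction") == 1.0, F.lit(1.0)).otherwise(F.lit(0.5)))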
Example #8
def foreach_jdbc_writer(df, epoch_id):
    df.write.jdbc(url="jdbc:mysql://localhost/world",
                  table="amazon_products",
                  mode='append',
                  properties={"driver": "com.mysql.cj.jdbc.Driver",
                              "user": "******"})


spark = SparkSession.builder.master('local[2]').appName(
    'StreamingDemo').getOrCreate()

df = spark.readStream.format('kafka')\
    .option('kafka.bootstrap.servers','localhost:9092')\
    .option('subscribe','amazon')\
    .load()

deser = udf(lambda x: pickle.loads(x), MapType(StringType(), StringType()))

deserlizedDF = df.withColumn('map', deser(df['value']))
parsedDF = deserlizedDF.withColumn('title',element_at('map','productTitle'))\
    .withColumn('Categories',element_at('map','productCategories'))\
    .withColumn('Rating',element_at('map','productRating'))\
    .withColumn('Description',element_at('map','productDescription'))\
    .withColumn('Prices',element_at('map','productPrices'))\
    .withColumn('Min_Price',array_min(split(element_at('map','productPrices'),r'#*\$').cast(ArrayType(FloatType()))))\
    .withColumn('Max_Price',array_max(split(element_at('map','productPrices'),r'#*\$').cast(ArrayType(FloatType()))))

projectedDF = parsedDF.select('title', 'Categories', 'Rating', 'Prices',
                              'Min_Price', 'Max_Price')

result = projectedDF.writeStream.foreachBatch(foreach_jdbc_writer).start()

result.awaitTermination()
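The Min_Price/Max_Price step above splits the raw price string on its "$" separators, casts the pieces to floats, and reduces with array_min/array_max. A standalone sketch of just that parsing on a toy literal (outside the streaming job):

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, array_min, array_max
from pyspark.sql.types import ArrayType, FloatType

spark = SparkSession.builder.master("local[1]").getOrCreate()
toy = spark.createDataFrame([("$12.99#$9.99#$24.50",)], ["productPrices"])
prices = split("productPrices", r"#*\$").cast(ArrayType(FloatType()))
# the leading "$" yields an empty token that casts to null; array_min/array_max skip nulls
toy.select(array_min(prices).alias("Min_Price"),
           array_max(prices).alias("Max_Price")).show()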
Example #9
#standard deviation of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [
    F.first(x).alias(x) for x in pw_finalized_outages.columns
    if x != 'outage_times_stddev' and x != 'outage_time'
]
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range",
    F.array_max("outage_times") - F.array_min("outage_times"))

#Okay now to effectively calculate SAIDI/SAIFI we need to know the sensor population
#join the number-of-sensors-reporting metric computed above with our outage groupings
#then we can calculate the relative SAIDI/SAIFI contribution of each outage
pw_finalized_outages = pw_finalized_outages.join(
    pw_distinct_user_id,
    F.date_trunc("day", F.from_unixtime(
        pw_finalized_outages["outage_time"])) == F.date_trunc(
            "day", pw_distinct_user_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select(
    "outage_time", "cluster_size", "phones_reporting", "user_id",
    "outage_times", "outage_times_range", "outage_times_stddev")
pw_finalized_outages = pw_finalized_outages.withColumn(
    "relative_cluster_size",