def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    column_spec = array_max(
        *[
            col.get_column_spec(source_df=source_df, current_column=current_column)
            for col in self.value
        ]
    )
    return column_spec
def column_revalue(vcf):
    # the INFO values still need to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            # merge the nested FORMAT arrays, de-duplicate and sort the keys,
            # then render them as a "GT:"-prefixed, colon-separated string
            vcf = vcf.withColumn(
                name,
                F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name,
                F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            # for the other columns, keep only the largest value in each array
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
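# A minimal, self-contained sketch of what the FORMAT branch above does, on made-up
# data (only the column name "FORMAT" comes from the original; the rows are hypothetical):
# flatten the nested arrays, drop duplicates, sort, then join with ":" behind "GT:".
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [([["DP", "GQ"], ["GQ", "AD"]],)],
    "FORMAT: array<array<string>>",
)
demo = demo.withColumn(
    "FORMAT",
    F.concat(F.lit("GT:"),
             F.array_join(F.array_sort(F.array_distinct(F.flatten("FORMAT"))), ":")))
demo.show(truncate=False)  # expected value: GT:AD:DP:GQ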
def max_element_column(parquetFiles):
    parquetFiles = parquetFiles.withColumn('bars_confidence_max',
                                           F.array_max(col('bars_confidence')))
    parquetFiles = parquetFiles.withColumn('bars_start_max',
                                           F.array_max(col('bars_start')))
    parquetFiles = parquetFiles.withColumn('beats_confidence_max',
                                           F.array_max(col('beats_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_confidence_max',
                                           F.array_max(col('segments_confidence')))
    parquetFiles = parquetFiles.withColumn('segments_loudness_max_time_max',
                                           F.array_max(col('segments_loudness_max_time')))
    parquetFiles = parquetFiles.withColumn('tatums_confidence_max',
                                           F.array_max(col('tatums_confidence')))
    return parquetFiles
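# A quick standalone illustration (toy data, hypothetical values) of what each
# withColumn above computes: array_max reduces an array column to its largest element.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [([0.2, 0.9, 0.5],), ([0.1, 0.3],)],
    "bars_confidence: array<double>",
)
toy.withColumn("bars_confidence_max", F.array_max("bars_confidence")).show()
# expected: [0.2, 0.9, 0.5] -> 0.9 and [0.1, 0.3] -> 0.3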
def main(args):
    sparkConf = (
        SparkConf()
        .set("spark.driver.memory", "10g")
        .set("spark.executor.memory", "10g")
        .set("spark.driver.maxResultSize", "0")
        .set("spark.debug.maxToStringFields", "2000")
        .set("spark.sql.execution.arrow.maxRecordsPerBatch", "500000")
    )

    if args.local:
        spark = (SparkSession.builder
                 .config(conf=sparkConf)
                 .master('local[*]')
                 .getOrCreate())
    else:
        spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

    print('args: ', args)
    print('Spark version: ', spark.version)
    start_time = time()

    # load co-occurrences from the parquet dataset at the given path
    coocs = spark.read.parquet(args.in_cooccurrences)

    # we need some filtering; not all data is ready to be used:
    # 1. enough history per association (at least 3 distinct years and 12 data points)
    # 2. there must be recent data (2019 or later)
    w2 = Window.partitionBy(*predictions_grouped_keys)

    # curry the association function with the keys to group by so it can be passed to transform
    tfn = partial(assoc_fn, group_by_cols=grouped_keys)

    aggregated = (
        coocs
        .withColumn("year", year(coocs.pubDate))
        .withColumn("month", month(coocs.pubDate))
        .withColumn("day", lit(1))
        .filter((coocs.isMapped == True)
                & (coocs.type == "GP-DS")
                & col("year").isNotNull()
                & col("month").isNotNull())
        .selectExpr(*coocs_columns)
        .transform(tfn)
        .withColumn("ds", to_date(concat_ws("-", col("year"), col("month"), col("day"))))
        .withColumn("y", col(harmonic_col))
        .dropna(subset=predictions_selection_keys)
        .withColumn("years", collect_set(col("year")).over(w2))
        .withColumn("nYears", array_size(col("years")))
        .withColumn("minYear", array_min(col("years")))
        .withColumn("maxYear", array_max(col("years")))
        .withColumn("dtCount", count(col("y")).over(w2))
        .withColumn("dtMaxYear", max(col("year")).over(w2))
        .filter((col("maxYear") >= 2019)
                & (col("nYears") >= 3)
                & (col("dtCount") >= 12))
        .select(*predictions_selection_keys)
        .repartition(*predictions_grouped_keys)
        .persist()
    )

    aggregated.write.parquet(f"{args.out_prefix}/associationsFromCoocsTS")
    print('Completed aggregated data in {:.1f} secs'.format(time() - start_time))

    # generate the models
    start_time = time()
    fbp = (aggregated
           .groupBy(*predictions_grouped_keys)
           .applyInPandas(make_predictions, prediction_schema)
           .persist())
    # fbp.show(20, False)

    fbp.write.parquet(f"{args.out_prefix}/associationsFromCoocsTSPredictions")
    print('Completed TS analysis (FB Prophet) data in {:.1f} secs'.format(
        time() - start_time))

    # clean all up just in case
    spark.stop()
    return 0
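# A minimal sketch of the windowed year bookkeeping used above, on made-up data
# (the grouping key "targetId" and the toy rows are hypothetical; the real keys come
# from predictions_grouped_keys): collect_set(...).over(w) gathers the distinct years
# per group without collapsing rows, so size/array_min/array_max can then be applied
# as ordinary column expressions before filtering.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("T1", 2018), ("T1", 2019), ("T1", 2020), ("T2", 2017)],
    "targetId: string, year: int",
)
w = Window.partitionBy("targetId")
(toy.withColumn("years", F.collect_set("year").over(w))
    .withColumn("nYears", F.size("years"))      # F.size is the widely available analogue of array_size
    .withColumn("minYear", F.array_min("years"))
    .withColumn("maxYear", F.array_max("years"))
    .filter((F.col("maxYear") >= 2019) & (F.col("nYears") >= 3))
    .show(truncate=False))                       # only the "T1" rows survive the filter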
# Okay now we have a list of outages, restore_times, locations, core_ids
# First let's calculate some high level metrics

# size of outages
pw_finalized_outages = pw_finalized_outages.withColumn(
    "cluster_size", F.size(F.array_distinct("core_id")))

# standard deviation of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))

# this expression essentially takes the first value of each column
# (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

# range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range",
    F.array_max("outage_times") - F.array_min("outage_times"))

# standard deviation and range of restore times
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))

# this expression essentially takes the first value of each column
# (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.avg("restore_time").alias("restore_times_mean"), *exprs)

pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_times_stddev", F.explode("restore_times"))

# this expression essentially takes the first value of each column
# (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'restore_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("restore_times_stddev").alias("restore_times_stddev"), *exprs)

pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_times_range",
    F.array_max("restore_times") - F.array_min("restore_times"))
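# A compact sketch of the explode -> groupBy -> first() trick used above, on made-up
# data (the column names "outage_time"/"outage_times" mirror the real ones, the values
# and the "location" column are hypothetical): every non-aggregated column is carried
# through with F.first(), which works because all exploded rows of a group share the
# same values for those columns.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(100, [100, 104, 110], "site-A")],
    "outage_time: bigint, outage_times: array<bigint>, location: string",
)
exploded = toy.withColumn("outage_times_stddev", F.explode("outage_times"))
carry = [F.first(c).alias(c) for c in exploded.columns
         if c not in ("outage_times_stddev", "outage_time")]
exploded.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *carry
).show(truncate=False)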
def argmax(v):
    # 1-based index of the largest entry in the topic-distribution vector
    return int(v.argmax()) + 1

argmax_udf = udf(argmax, IntegerType())
fit.printSchema()
fit = fit.withColumn("topTopic", argmax_udf("topicDistribution"))

def arraymaker(v):
    # convert the ML vector into a plain Python list so it becomes an array column
    return list([float(x) for x in v])

arraymaker_udf = udf(arraymaker, ArrayType(FloatType()))
fit = fit.withColumn("arrayTopics", arraymaker_udf("topicDistribution"))
fit = fit.withColumn("topTopicScore", array_max("arrayTopics"))
fit.write.mode("overwrite").json("s3://covid-tweets/fit-tweets" + str(num_topics))

# use window function instead a la
# https://stackoverflow.com/questions/38397796/retrieve-top-n-in-each-group-of-a-dataframe-in-pyspark
dfs = []
w = Window.partitionBy(fit.topTopic).orderBy(fit.topTopicScore.desc())
fit = fit.select("*", rank().over(w).alias("rank"))
fit = fit.where(fit.rank < 20)

# for i in range(num_topics):
#     top_topic_df = fit.where(fit.topTopic == i + 1)
#     top_topic_df = top_topic_df.sort("topTopicScore", ascending=False)
#     dfs.append(top_topic_df.limit(20))
#
# df_complete = reduce(DataFrame.unionAll, dfs)
model_path = "s3://" + bucket_name + "/models/w2v_model"
loaded_model = Word2VecModel.load(model_path)

# We add the output column: the average of the word vectors for each tweet
tweets_df = loaded_model.transform(tweets_df)

# We load the classifier
clf_path = "s3://" + bucket_name + "/models/mpc_model"
loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path)
predictions = loaded_clf.transform(tweets_df)

# We keep the probability only for the predicted sentiment
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
predictions = predictions.withColumn("probability", to_array("probability"))
predictions = predictions.withColumn("probability", array_max("probability"))

# We assign a weight of 0.5 to negative tweets
compute_weights = udf(lambda x: x if x == 1.0 else 0.5, FloatType())

# The sentiment score is in [0, 0.5] if the value is negative and [0.5, 1] if positive
predictions = predictions.withColumn("weights", compute_weights("prediction")) \
    .withColumn("sentiment_score", col("probability") * col("weights")) \
    .groupBy("symbol") \
    .agg({"sentiment_score": "avg"}) \
    .withColumnRenamed("avg(sentiment_score)", "sentiment_score") \
    .drop("features", "rawPrediction", "processed_tweets")

predictions.repartition(1).write.mode("overwrite").csv("hdfs:///sentiment", header=True)
def foreach_jdbc_writer(df, epoch_id):
    # write each micro-batch out to MySQL over JDBC
    df.write \
        .jdbc(url="jdbc:mysql://localhost/world",
              table="amazon_products",
              mode='append',
              properties={"driver": "com.mysql.cj.jdbc.Driver", "user": "******"})

spark = SparkSession.builder.master('local[2]').appName('StreamingDemo').getOrCreate()

df = spark.readStream.format('kafka') \
    .option('kafka.bootstrap.servers', 'localhost:9092') \
    .option('subscribe', 'amazon') \
    .load()

# deserialize the pickled Kafka message value into a string-to-string map
deser = udf(lambda x: pickle.loads(x), MapType(StringType(), StringType()))
deserlizedDF = df.withColumn('map', deser(df['value']))

# pull the product fields out of the map; the prices string is split on the r'#*\$'
# pattern and cast to an array of floats so array_min/array_max can be applied
parsedDF = deserlizedDF.withColumn('title', element_at('map', 'productTitle')) \
    .withColumn('Categories', element_at('map', 'productCategories')) \
    .withColumn('Rating', element_at('map', 'productRating')) \
    .withColumn('Description', element_at('map', 'productDescription')) \
    .withColumn('Prices', element_at('map', 'productPrices')) \
    .withColumn('Min_Price', array_min(split(element_at('map', 'productPrices'), r'#*\$').cast(ArrayType(FloatType())))) \
    .withColumn('Max_Price', array_max(split(element_at('map', 'productPrices'), r'#*\$').cast(ArrayType(FloatType()))))

projectedDF = parsedDF.select('title', 'Categories', 'Rating', 'Prices', 'Min_Price', 'Max_Price')

result = projectedDF.writeStream.foreachBatch(foreach_jdbc_writer).start()
result.awaitTermination()
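# A small standalone check of the Min_Price / Max_Price expressions above. The price
# string below is made up (the real productPrices format is an assumption): with the
# default non-ANSI cast behaviour, the empty leading token produced by the split casts
# to NULL, and array_min / array_max skip NULL elements.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import ArrayType, FloatType

spark = SparkSession.builder.getOrCreate()
prices = spark.createDataFrame([("$12.99#$15.49#$9.99",)], "productPrices: string")
as_floats = F.split("productPrices", r'#*\$').cast(ArrayType(FloatType()))
prices.select(
    as_floats.alias("as_floats"),
    F.array_min(as_floats).alias("Min_Price"),
    F.array_max(as_floats).alias("Max_Price"),
).show(truncate=False)
# expected: as_floats -> [NULL, 12.99, 15.49, 9.99], Min_Price -> 9.99, Max_Price -> 15.49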
# standard deviation of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))

# this expression essentially takes the first value of each column
# (which should all be the same after the explode)
exprs = [
    F.first(x).alias(x) for x in pw_finalized_outages.columns
    if x != 'outage_times_stddev' and x != 'outage_time'
]
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

# range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range",
    F.array_max("outage_times") - F.array_min("outage_times"))

# Okay now to effectively calculate SAIDI/SAIFI we need to know the sensor population
# join the number of sensors reporting metric above with our outage groupings
# then we can calculate the relative SAIDI/SAIFI contribution of each outage
pw_finalized_outages = pw_finalized_outages.join(
    pw_distinct_user_id,
    F.date_trunc("day", F.from_unixtime(pw_finalized_outages["outage_time"])) ==
    F.date_trunc("day", pw_distinct_user_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select(
    "outage_time", "cluster_size", "phones_reporting", "user_id",
    "outage_times", "outage_times_range", "outage_times_stddev")

pw_finalized_outages = pw_finalized_outages.withColumn(
    "relative_cluster_size",