def test_nested_higher_order_function(self):
    # SPARK-35382: lambda vars must be resolved properly in nested higher order functions
    from pyspark.sql.functions import flatten, struct, transform

    df = self.spark.sql(
        "SELECT array(1, 2, 3) as numbers, array('a', 'b', 'c') as letters"
    )
    actual = df.select(
        flatten(
            transform(
                "numbers",
                lambda number: transform(
                    "letters",
                    lambda letter: struct(number.alias("n"), letter.alias("l")),
                ),
            )
        )
    ).first()[0]
    expected = [
        (1, "a"), (1, "b"), (1, "c"),
        (2, "a"), (2, "b"), (2, "c"),
        (3, "a"), (3, "b"), (3, "c"),
    ]
    self.assertEqual(actual, expected)
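# A minimal standalone sketch of the same nested-transform pattern (not from the
# test suite above); assumes an active SparkSession named `spark`. flatten()
# collapses the array-of-arrays produced by the outer transform() into the
# cross product of numbers and letters.
from pyspark.sql import SparkSession
from pyspark.sql.functions import flatten, struct, transform

spark = SparkSession.builder.getOrCreate()
df = spark.sql("SELECT array(1, 2) AS numbers, array('a', 'b') AS letters")
pairs = df.select(
    flatten(
        transform(
            "numbers",
            lambda n: transform("letters",
                                lambda l: struct(n.alias("n"), l.alias("l"))),
        )
    ).alias("pairs")
)
pairs.show(truncate=False)  # [{1, a}, {1, b}, {2, a}, {2, b}]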
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    # filter/flatten here are pyspark.sql.functions (SQL higher-order
    # functions), not the Python builtins: drop null inner arrays, then
    # collapse the array of arrays into one flat array.
    return flatten(
        filter(
            self.column.get_column_spec(source_df=source_df,
                                        current_column=current_column),
            lambda x: x.isNotNull(),
        ))
def query3(df, beg, end):
    # Most frequent topic in each country's collected list, and how often it occurs.
    most_common_topic = udf(lambda x: max(set(x), key=x.count))
    count_entrances = udf(lambda x: x.count(max(set(x), key=x.count)))
    return (
        df.filter(col('time').between(beg, end))
        .groupBy(col('group_country').alias('country'))
        .agg(flatten(collect_list('topic_name')).alias('list'))
        .withColumn('topic', most_common_topic('list'))
        .withColumn('count', count_entrances('list'))
        .select(col('country'), col('topic'), col('count'))
    )
def column_revalue(vcf):
    # INFO values still need fixing
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
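# A minimal sketch (made-up data, not from the original VCF pipeline) of the
# FORMAT rebuild above: flatten the nested arrays, de-duplicate, sort, then
# join with ":" behind a "GT:" prefix.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([["DP", "GQ"], ["GQ", "AD"]],)], ["FORMAT"])
df = df.withColumn(
    "FORMAT", F.array_sort(F.array_distinct(F.flatten(F.col("FORMAT")))))
df = df.withColumn(
    "FORMAT", F.concat(F.lit("GT:"), F.array_join(F.col("FORMAT"), ":")))
df.show(truncate=False)  # GT:AD:DP:GQ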
def create_normalization_spec_spark(df, column, num_samples: int, seed: int):
    """Returns approximately num_samples random rows from column of df."""
    df = df.select(
        explode(col(column).alias("features")).alias("feature_name",
                                                     "feature_value"))

    # calculate fractions
    counts_df = df.groupBy("feature_name").count()
    frac = {}
    for row in counts_df.collect():
        assert num_samples <= row["count"]
        frac[row["feature_name"]] = num_samples / row["count"]

    # TODO(T64843081): change to reservoir sampling, currently it approximates
    # perform sampling and collect them
    df = df.sampleBy("feature_name", fractions=frac, seed=seed)
    df = df.groupBy("feature_name").agg(
        collect_list("feature_value").alias("feature_value_list"))
    df = df.select("feature_name",
                   flatten("feature_value_list").alias("feature_values"))
    return df
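# A small sketch (synthetic data, not from the original codebase) of the
# sampleBy pattern used above: per-key sampling fractions give an
# approximately fixed number of rows per stratum, however skewed the keys are.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
rows = [("a", i) for i in range(100)] + [("b", i) for i in range(1000)]
df = spark.createDataFrame(rows, ["key", "value"])
# Aim for ~50 rows per key: fraction = target / count(key).
fractions = {"a": 50 / 100, "b": 50 / 1000}
sampled = df.sampleBy("key", fractions=fractions, seed=42)
sampled.groupBy("key").count().show()  # roughly 50 rows per key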
# these last two were specifically requested
(col("resp_pricings_osm_ann_ele_name") == F.lit("rfisc"))
).withColumn(
    "resp_pricings_osm_ann_ele_type",
    F.when(
        (F.substring(col("resp_pricings_osm_ann_ele_name"), 15, 3) == F.lit("row")),
        F.substring(col("resp_pricings_osm_ann_ele_name"), 19, 3),
    ).otherwise(col("resp_pricings_osm_ann_ele_name"))
)  # __goog_ptable_row_??? this will be "row"
.groupby(
    *[key[2:] for key in row_key], "os_id"
)  # represents transaction x passenger x segment x leg rows x optionalServices
.agg(
    F.collect_list(col("resp_pricings_osm_ann_ele_type")).alias("google_tables"),
    F.flatten(F.collect_list(col("resp_pricings_osm_ann_ele_val"))).alias(
        "google_table_values"),
))

# fcols_annotations.select("f_google_tables", "f_google_table_values").show(20, False, vertical=True)

# COMMAND ----------

annotations.count()

# COMMAND ----------

flattened = os4.select(*psl_cols, *os_cols, "os_id").join(
    annotations, [*[key[2:] for key in row_key], "os_id"]).drop("os_id")

# COMMAND ----------

flattened.select("optionalServices_taxes").show(vertical=True)
(col("diff_lead1") < CD) & ((col("diff_lead1") <= col("diff_lead2")) | col("diff_lead2").isNull()) & ((col("diff_lead1") <= col("diff_lag1")) | col("diff_lag1").isNull()), col("lead1")).when((col("diff_lag1") < CD) & ( (col("diff_lag1") <= col("diff_lag2")) | col("diff_lag2").isNull()) & ((col("diff_lag1") <= col("diff_lead1")) | col("diff_lead1").isNull()), col("outage_time")).otherwise(None) pw_df = pw_df.withColumn("merge_time", merge_time) pw_null_merge_time = pw_df.filter(col("merge_time").isNull()) pw_df = pw_df.filter(col("merge_time").isNotNull()) pw_df = pw_df.groupBy("merge_time").agg( F.flatten(F.collect_list("core_id")).alias("core_id"), F.flatten(F.collect_list("tx")).alias("tx"), F.flatten(F.collect_list("feeder_id")).alias("feeder_id"), F.flatten(F.collect_list("outage_times")).alias("outage_times"), F.flatten(F.collect_list("restore_time")).alias("restore_time"), F.flatten(F.collect_list("location")).alias("location")) pw_df = pw_df.select("core_id", "outage_times", "restore_time", "location", "tx", "feeder_id") pw_null_merge_time = pw_null_merge_time.select("core_id", "outage_times", "restore_time", "location", "tx", "feeder_id") pw_df = pw_df.union(pw_null_merge_time) udfTimestampAverage = udf(timestamp_average, LongType()) pw_df = pw_df.withColumn("outage_time",
    #func.max(sort_pub_year).alias('lastyr'),
    # number of cited papers.
    func.sum(
        func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear + ' AND ' +
                  maxyear + ',IF(CitationCountNonSelf>0,1,0),0)')
    ).alias('ns_npcY1Y3'),
    func.sum(
        func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear + ' AND ' +
                  maxyear + ',IF(CitationCount>0,1,0),0)')).alias('ws_npcY1Y3'),
    func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'),
    func.size(
        func.array_distinct(
            func.flatten(func.collect_list(
                'CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'),
    func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias('ns_hY3'),
    func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')).alias('ns_hmY3'),
    func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'),
    func.sum(func.expr('IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'),
    func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias('ns_npsf'),
    func.sum(
        func.expr('IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')
    ).alias('ns_ncsf'),
    func.sum(
        func.expr(
pw_df = pw_df.withColumn("diff_lead2", col("lead2") - col("lead1")) pw_df = pw_df.withColumn("diff_lag1", col("outage_time") - col("lag1")) pw_df = pw_df.withColumn("diff_lag2", col("lag1") - col("lag2")) merge_time = when((col("diff_lead1") < CD) & ((col("diff_lead1") <= col("diff_lead2")) | col("diff_lead2").isNull()) & ((col("diff_lead1") <= col("diff_lag1")) | col("diff_lag1").isNull()), col("lead1")).when( (col("diff_lag1") < CD) & ((col("diff_lag1") <= col("diff_lag2")) | col("diff_lag2").isNull()) & ((col("diff_lag1") <= col("diff_lead1")) | col("diff_lead1").isNull()), col("outage_time")).otherwise(None) pw_df = pw_df.withColumn("merge_time", merge_time) pw_null_merge_time = pw_df.filter(col("merge_time").isNull()) pw_df = pw_df.filter(col("merge_time").isNotNull()) pw_df = pw_df.groupBy("merge_time").agg(F.flatten(F.collect_list("core_id")).alias("core_id"), F.flatten(F.collect_list("outage_times")).alias("outage_times"), F.flatten(F.collect_list("restore_time")).alias("restore_time"), F.flatten(F.collect_list("location")).alias("location")) pw_df = pw_df.select("core_id","outage_times","restore_time","location") pw_null_merge_time = pw_null_merge_time.select("core_id","outage_times","restore_time","location") pw_df = pw_df.union(pw_null_merge_time) udfTimestampAverage = udf(timestamp_average, LongType()) pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times")) pw_df = pw_df.localCheckpoint(eager = True) print("Merged to:", pw_df.count()) print() #Okay now we have a list of outages, restore_times, locations, core_ids
sum("DislikeCount") #calculate LikeCount , sum("LikeCount") #calculate Rating , sum("Rating") #calculate Duration , sum("Duration") #calculate ViewCount , sum("ViewCount") #calculate textwords , flatten(f.collect_list("TextWords")) #count videos , f.count("*")) #rename .withColumnRenamed("sum(DislikeCount)", "DislikeCount").withColumnRenamed( 'sum(LikeCount)', "LikeCount").withColumnRenamed( "sum(Rating)", "Rating").withColumnRenamed( "sum(Duration)", "Duration").withColumnRenamed( "sum(ViewCount)", "ViewCount").withColumnRenamed( "flatten(collect_list(TextWords))", "TextWords").withColumnRenamed("count(1)", "VideoCount")) #cache results df2.cache()
# Schema for the user-defined function
schema = T.ArrayType(
    T.StructType([
        T.StructField("word", T.StringType(), False),
        T.StructField("count", T.IntegerType(), False)
    ]))

# Sorting dictionary in the ascending order
SorterUDF = f.udf(sort_dict_f, schema)
udf_take_n_words = f.udf(lambda x: [i for i in x[:5]])

# Stopwords filter is applied
testdf = billdf.withColumn('lst', f.split(f.col('Lyrics'), ' '))
testdf2 = testdf.select(['Year', 'lst']).groupby('Year').agg(f.collect_list('lst'))\
    .withColumn("collect_list(lst)", f.flatten("collect_list(lst)"))\
    .withColumnRenamed("collect_list(lst)", "All words")
testdf3 = testdf2.withColumn(
    "cnt", SorterUDF(udf_flatten_counter(udf_filter_words("All words"))))

lsc = testdf3.select("All words").collect()
lsc = [i for i in lsc[0][0] if i not in stpwr]
ds = dict(Counter(lsc))
ds = sorted(ds.items(), key=operator.itemgetter(1), reverse=True)
popular_words = pd.Series(lsc).str.cat(sep=' ')
wordcloud = WordCloud(width=1600, height=800, max_font_size=200,
                      background_color='white').generate(popular_words)
plt.figure(figsize=(12, 10))
(col("diff_lead1") < CD) & ((col("diff_lead1") <= col("diff_lead2")) | col("diff_lead2").isNull()) & ((col("diff_lead1") <= col("diff_lag1")) | col("diff_lag1").isNull()), col("lead1")).when((col("diff_lag1") < CD) & ( (col("diff_lag1") <= col("diff_lag2")) | col("diff_lag2").isNull()) & ((col("diff_lag1") <= col("diff_lead1")) | col("diff_lead1").isNull()), col("outage_time")).otherwise(None) pw_df = pw_df.withColumn("merge_time", merge_time) pw_null_merge_time = pw_df.filter(col("merge_time").isNull()) pw_df = pw_df.filter(col("merge_time").isNotNull()) pw_df = pw_df.groupBy("merge_time").agg( F.flatten(F.collect_list("user_id")).alias("user_id"), F.flatten(F.collect_list("outage_times")).alias("outage_times"), ) pw_df = pw_df.select("user_id", "outage_times") pw_null_merge_time = pw_null_merge_time.select("user_id", "outage_times") pw_df = pw_df.union(pw_null_merge_time) udfTimestampAverage = udf(timestamp_average, LongType()) pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times")) pw_df = pw_df.localCheckpoint(eager=True) print("Merged to:", pw_df.count()) print() #Okay now we have a list of outages, restore_times, locations, user_ids
def calc_features(spark_session: SparkSession) -> DataFrame:
    df: DataFrame = spark_session \
        .read \
        .option("header", True) \
        .option("inferSchema", False) \
        .csv("daily-raw") \
        .withColumn("closingPrice", F.col("closingPrice").cast(types.DoubleType()))

    def days(x: int) -> int:
        return x * 24 * 3600

    window = Window.partitionBy("ISIN").orderBy("Date")
    window2 = Window.partitionBy("ISIN", "localMin", "localMax").orderBy("Date")
    window3 = Window \
        .partitionBy("ISIN") \
        .orderBy(F.col("Date").cast("timestamp").cast("long")) \
        .rangeBetween(days(-31), days(-1))
    label_window = Window \
        .partitionBy("ISIN") \
        .orderBy(F.col("Date").cast("timestamp").cast("long")) \
        .rangeBetween(days(1), days(30))

    # TODO: showcase a pandas UDF
    # https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html
    # https://docs.databricks.com/spark/latest/spark-sql/udf-python-pandas.html
    # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html
    # https://stackoverflow.com/questions/40006395/applying-udfs-on-groupeddata-in-pyspark-with-functioning-python-example
    # https://intellipaat.com/community/11611/applying-udfs-on-groupeddata-in-pyspark-with-functioning-python-example
    @F.udf(
        types.StructType([
            types.StructField("Date", types.StringType(), True),
            types.StructField("closingPrice", types.DoubleType(), True)
        ]))
    def my_udf(entries: Sequence[Tuple[str, float]]):
        return min([e for e in entries if e[1] >= 65] + [entries[-1]],
                   key=lambda x: x[0])

    df2: DataFrame = df \
        .withColumn("availableDays", F.datediff("Date", F.min("Date").over(window))) \
        .withColumn("label", F.collect_list(F.struct(F.col("Date"), F.col("closingPrice"))).over(label_window)) \
        .filter(F.size("label") > 0) \
        .withColumn("label", my_udf(F.col("label"))) \
        .withColumn("sellAt", F.col("label.Date")) \
        .withColumn("sellPrice", F.col("label.closingPrice")) \
        .drop("label") \
        .withColumn("diffToPrev", F.col("closingPrice") / F.lag("closingPrice", 1).over(window) - 1) \
        .withColumn("up", F.col("diffToPrev") >= 0) \
        .withColumn("change", F.col("up") != F.lag("up", 1).over(window)) \
        .withColumn("nextChange", F.lead("change", 1, False).over(window)) \
        .withColumn("localMax", F.col("up") & F.col("nextChange")) \
        .withColumn("localMin", ~F.col("up") & F.col("nextChange")) \
        .drop("up", "change", "nextChange") \
        .withColumn("index", F.row_number().over(window)) \
        .filter(F.col("localMin") | F.col("localMax")) \
        .withColumn("higher", F.col("closingPrice") >= F.lag("closingPrice", 1).over(window2)) \
        .withColumn("daysBetween", F.col("index") - F.lag("index", 1).over(window)) \
        .drop("index") \
        .withColumn("hallo", F.concat_ws(",", "localMax", "localMin", "higher"))

    indexer = StringIndexer(inputCol="hallo", outputCol="categoryIndex")
    df3: DataFrame = indexer.fit(df2).transform(df2) \
        .withColumn("hallo", F.format_string("%.0fx", "categoryIndex")) \
        .filter(F.col("higher").isNotNull()) \
        .withColumn("events", F.flatten(F.collect_list(F.array("daysBetween", "hallo")).over(window3))) \
        .drop("hallo", "categoryIndex") \
        .filter(F.col("availableDays") >= 30) \
        .orderBy("Date")
    return df3
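# The TODO above asks to showcase a pandas UDF; here is a minimal hedged sketch
# (my addition, not the author's implementation) of swapping a plain Python UDF
# for a vectorized one. Names and the threshold are illustrative only.
import pandas as pd
from pyspark.sql.functions import pandas_udf

@pandas_udf("double")
def pct_change(price: pd.Series) -> pd.Series:
    # Vectorized price / previous-price - 1 within one Arrow batch; for a true
    # per-ISIN lag you would still use F.lag over a window, as calc_features does.
    return price / price.shift(1) - 1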
def main(self, sc: SparkContext, *args: Any):
    """
    Solr Core loader

    :param list argv: the list elements should be:
    [1]: observations parquet path
    [2]: pipeline core parquet path
    [3]: omero ids CSV path
    [4]: output path
    """
    observations_parquet_path = args[0]
    pipeline_core_parquet_path = args[1]
    omero_ids_csv_path = args[2]
    output_path = args[3]
    spark = SparkSession.builder.getOrCreate()

    observations_df = spark.read.parquet(observations_parquet_path)
    pipeline_core_df = spark.read.parquet(pipeline_core_parquet_path)
    pipeline_core_df = pipeline_core_df.select(
        "fully_qualified_name",
        "mouse_anatomy_id",
        "mouse_anatomy_term",
        "embryo_anatomy_id",
        "embryo_anatomy_term",
        col("mp_id").alias("impress_mp_id"),
        col("mp_term").alias("impress_mp_term"),
        "top_level_mouse_anatomy_id",
        "top_level_mouse_anatomy_term",
        "top_level_embryo_anatomy_id",
        "top_level_embryo_anatomy_term",
        col("top_level_mp_id").alias("impress_top_level_mp_id"),
        col("top_level_mp_term").alias("impress_top_level_mp_term"),
        col("intermediate_mp_id").alias("impress_intermediate_mp_id"),
        col("intermediate_mp_term").alias("impress_intermediate_mp_term"),
    ).distinct()
    omero_ids_df = spark.read.csv(omero_ids_csv_path, header=True).dropDuplicates()
    omero_ids_df = omero_ids_df.alias("omero")

    image_observations_df = observations_df.where(
        col("observation_type") == "image_record")
    image_observations_df = image_observations_df.alias("obs")
    image_observations_df = image_observations_df.join(
        omero_ids_df,
        [
            "observation_id",
            "download_file_path",
            "phenotyping_center",
            "pipeline_stable_id",
            "procedure_stable_id",
            "parameter_stable_id",
            "datasource_name",
        ],
    )
    image_observations_df = image_observations_df.select(
        "obs.*", "omero.omero_id")

    parameter_association_fields = [
        "parameter_association_stable_id",
        "parameter_association_sequence_id",
        "parameter_association_name",
        "parameter_association_value",
    ]
    image_observations_exp_df = image_observations_df
    for parameter_association_field in parameter_association_fields:
        image_observations_exp_df = image_observations_exp_df.withColumn(
            f"{parameter_association_field}_exp",
            explode_outer(parameter_association_field),
        )
    image_observations_x_impress_df = image_observations_exp_df.withColumn(
        "fully_qualified_name",
        concat_ws(
            "_",
            "pipeline_stable_id",
            "procedure_stable_id",
            "parameter_association_stable_id_exp",
        ),
    )
    image_observations_x_impress_df = image_observations_x_impress_df.join(
        pipeline_core_df,
        (image_observations_x_impress_df["fully_qualified_name"]
         == pipeline_core_df["fully_qualified_name"]),
        "left_outer",
    )
    group_by_expressions = [
        collect_set(
            when(
                col("mouse_anatomy_id").isNotNull(),
                col("mouse_anatomy_id")).otherwise(
                    col("embryo_anatomy_id"))).alias("embryo_anatomy_id_set"),
        collect_set(
            when(
                col("mouse_anatomy_term").isNotNull(),
                col("mouse_anatomy_term")).otherwise(
                    col("embryo_anatomy_term"))).alias(
                        "embryo_anatomy_term_set"),
        collect_set(
            when(
                col("mouse_anatomy_id").isNotNull(),
                col("mouse_anatomy_id")).otherwise(
                    col("embryo_anatomy_id"))).alias("anatomy_id"),
        collect_set(
            when(
                col("mouse_anatomy_term").isNotNull(),
                col("mouse_anatomy_term")).otherwise(
                    col("embryo_anatomy_term"))).alias("anatomy_term"),
        flatten(
            collect_set(
                when(
                    col("mouse_anatomy_id").isNotNull(),
                    col("top_level_mouse_anatomy_id"),
                ).otherwise(col("top_level_embryo_anatomy_id")))).alias(
                    "selected_top_level_anatomy_id"),
        flatten(
            collect_set(
                when(
                    col("mouse_anatomy_id").isNotNull(),
                    col("top_level_mouse_anatomy_term"),
                ).otherwise(col("top_level_embryo_anatomy_term")))).alias(
                    "selected_top_level_anatomy_term"),
collect_set("impress_mp_id").alias("mp_id"), collect_set("impress_mp_term").alias("mp_term"), flatten(collect_set("impress_top_level_mp_id")).alias( "top_level_mp_id_set"), flatten(collect_set("impress_top_level_mp_term")).alias( "top_level_mp_term_set"), flatten(collect_set("impress_intermediate_mp_id")).alias( "intermediate_mp_id_set"), flatten(collect_set("impress_intermediate_mp_term")).alias( "intermediate_mp_term_set"), ] image_observations_x_impress_df = image_observations_x_impress_df.select( [ "observation_id", "mouse_anatomy_id", "embryo_anatomy_id", "mouse_anatomy_term", "embryo_anatomy_term", "top_level_mouse_anatomy_id", "top_level_embryo_anatomy_id", "top_level_mouse_anatomy_term", "top_level_embryo_anatomy_term", "impress_mp_id", "impress_mp_term", "impress_top_level_mp_id", "impress_top_level_mp_term", "impress_intermediate_mp_id", "impress_intermediate_mp_term", ]) image_observations_x_impress_df = image_observations_x_impress_df.groupBy( "observation_id").agg(*group_by_expressions) image_observations_df = image_observations_df.join( image_observations_x_impress_df, "observation_id") image_observations_df = image_observations_df.withColumn( "download_url", concat( lit("//www.ebi.ac.uk/mi/media/omero/webgateway/archived_files/download/" ), col("omero_id"), ), ) image_observations_df = image_observations_df.withColumn( "jpeg_url", concat( lit("//www.ebi.ac.uk/mi/media/omero/webgateway/render_image/"), col("omero_id"), ), ) image_observations_df = image_observations_df.withColumn( "thumbnail_url", concat( lit("//www.ebi.ac.uk/mi/media/omero/webgateway/render_birds_eye_view/" ), col("omero_id"), ), ) image_observations_df.write.parquet(output_path)
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    return flatten(
        self.column.get_column_spec(source_df=source_df,
                                    current_column=current_column))
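# A minimal sketch (toy data; the column-spec classes above are assumed, not
# shown) of what these two specs compute. flatten() returns NULL when any
# inner element is NULL, which is why the earlier variant wraps the column in
# filter(..., lambda x: x.isNotNull()) first.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, filter, flatten

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([[1, 2], None, [3]],)], ["nested"])
df.select(
    flatten(filter(col("nested"), lambda x: x.isNotNull())).alias("flat")
).show()  # [1, 2, 3]; flatten(col("nested")) alone would yield NULL here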
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, flatten

spark = SparkSession.builder.appName('pyspark-by-examples').getOrCreate()

arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]])
]

df = spark.createDataFrame(data=arrayArrayData, schema=['name', 'subjects'])
# df.printSchema()
# df.show(truncate=False)

# explode turns each inner array into its own row;
# flatten keeps one row per name with a single merged array
df.select(df.name, explode(df.subjects)).show()
df.select(df.name, flatten(df.subjects)).show(truncate=False)
def main(self, sc: SparkContext, *args: Any):
    """
    Pipeline Solr Core loader
    """
    pipeline_parquet_path = args[0]
    observations_parquet_path = args[1]
    ontology_parquet_path = args[2]
    emap_emapa_tsv_path = args[3]
    emapa_metadata_csv_path = args[4]
    ma_metadata_csv_path = args[5]
    output_path = args[6]
    spark = SparkSession(sc)

    pipeline_df = spark.read.parquet(pipeline_parquet_path)
    observations_df = spark.read.parquet(observations_parquet_path)
    ontology_df = spark.read.parquet(ontology_parquet_path)
    emap_emapa_df = spark.read.csv(emap_emapa_tsv_path, header=True, sep="\t")
    for col_name in emap_emapa_df.columns:
        emap_emapa_df = emap_emapa_df.withColumnRenamed(
            col_name, col_name.lower().replace(" ", "_"))
    emapa_metadata_df = spark.read.csv(emapa_metadata_csv_path, header=True)
    ma_metadata_df = spark.read.csv(ma_metadata_csv_path, header=True)

    pipeline_df = pipeline_df.withColumnRenamed("increment", "incrementStruct")
    for column, source in COLUMN_MAPPER.items():
        pipeline_df = pipeline_df.withColumn(column, col(source))
    pipeline_df = pipeline_df.withColumn(
        "unit_y",
        when(col("incrementStruct").isNotNull(),
             col("unitName")).otherwise(lit(None)),
    )
    pipeline_df = pipeline_df.withColumn(
        "unit_x",
        when(col("incrementStruct").isNotNull(),
             col("incrementStruct.incrementUnit")).otherwise(col("unitName")),
    )
    pipeline_df = pipeline_df.withColumn(
        "metadata", col("parameter.type") == "procedureMetadata")
    pipeline_df = pipeline_df.withColumn(
        "fully_qualified_name",
        concat_ws("_", "pipeline_stable_id", "procedure_stable_id",
                  "parameter_stable_id"),
    )
    observations_df = observations_df.withColumn(
        "fully_qualified_name",
        concat_ws("_", "pipeline_stable_id", "procedure_stable_id",
                  "parameter_stable_id"),
    )
    observations_df = observations_df.groupBy("fully_qualified_name").agg(
        first(col("observation_type")).alias("observation_type"))
    pipeline_df = pipeline_df.join(observations_df, "fully_qualified_name",
                                   "left_outer")

    pipeline_categories_df = pipeline_df.select(
        "fully_qualified_name",
        when(
            col("option.name").rlike(r"^\d+$")
            & col("option.description").isNotNull(),
            col("option.description"),
        ).otherwise(col("option.name")).alias("name"),
    )
    pipeline_categories_df = pipeline_categories_df.groupBy(
        "fully_qualified_name").agg(collect_set("name").alias("categories"))
    pipeline_df = pipeline_df.join(pipeline_categories_df,
                                   "fully_qualified_name", "left_outer")

    pipeline_mp_terms_df = pipeline_df.select(
        "fully_qualified_name", "parammpterm.selectionOutcome",
        "termAcc").where(col("termAcc").startswith("MP"))
    pipeline_mp_terms_df = pipeline_mp_terms_df.join(
        ontology_df, col("id") == col("termAcc"))

    uniquify = udf(self._uniquify, ArrayType(StringType()))
    pipeline_mp_terms_df = pipeline_mp_terms_df.groupBy(
        "fully_qualified_name"
    ).agg(
        collect_set("id").alias("mp_id"),
        collect_set("term").alias("mp_term"),
        uniquify(flatten(
            collect_list("top_level_ids"))).alias("top_level_mp_id"),
        uniquify(flatten(
            collect_list("top_level_terms"))).alias("top_level_mp_term"),
        uniquify(flatten(collect_list("top_level_synonyms"))).alias(
            "top_level_mp_term_synonym"),
        uniquify(flatten(
            collect_list("intermediate_ids"))).alias("intermediate_mp_id"),
        uniquify(flatten(collect_list("intermediate_terms"))).alias(
            "intermediate_mp_term"),
        collect_set(
            when(col("selectionOutcome") == "ABNORMAL",
                 col("termAcc")).otherwise(lit(None))).alias("abnormal_mp_id"),
        collect_set(
            when(col("selectionOutcome") == "ABNORMAL",
                 col("term")).otherwise(lit(None))).alias("abnormal_mp_term"),
        collect_set(
            when(col("selectionOutcome") == "INCREASED",
col("termAcc")).otherwise( lit(None))).alias("increased_mp_id"), collect_set( when(col("selectionOutcome") == "INCREASED", col("term")).otherwise( lit(None))).alias("increased_mp_term"), collect_set( when(col("selectionOutcome") == "DECREASED", col("termAcc")).otherwise( lit(None))).alias("decreased_mp_id"), collect_set( when(col("selectionOutcome") == "DECREASED", col("term")).otherwise( lit(None))).alias("decreased_mp_term"), ) pipeline_df = pipeline_df.join(pipeline_mp_terms_df, "fully_qualified_name", "left_outer") pipeline_df = pipeline_df.withColumn( "embryo_anatomy_id", when(col("termAcc").contains("EMAPA:"), col("termAcc")).otherwise(lit(None)), ) emapa_metadata_df = emapa_metadata_df.select( "acc", col("name").alias("emapaName")) pipeline_df = pipeline_df.join(emapa_metadata_df, col("embryo_anatomy_id") == col("acc"), "left_outer") pipeline_df = pipeline_df.withColumn("embryo_anatomy_term", col("emapaName")) pipeline_df = pipeline_df.drop(*emapa_metadata_df.columns) pipeline_df = pipeline_df.join(ontology_df, col("embryo_anatomy_id") == col("id"), "left_outer") pipeline_df = pipeline_df.withColumn("top_level_embryo_anatomy_id", col("top_level_ids")) pipeline_df = pipeline_df.withColumn("top_level_embryo_anatomy_term", col("top_level_terms")) pipeline_df = pipeline_df.drop(*ontology_df.columns) pipeline_df = pipeline_df.withColumn( "mouse_anatomy_id", when(col("termAcc").startswith("MA:"), col("termAcc")).otherwise(lit(None)), ) ma_metadata_df = ma_metadata_df.withColumnRenamed("name", "maName") pipeline_df = pipeline_df.join(ma_metadata_df, col("mouse_anatomy_id") == col("curie"), "left_outer") pipeline_df = pipeline_df.withColumn("mouse_anatomy_term", col("maName")) pipeline_df = pipeline_df.drop(*ma_metadata_df.columns) pipeline_df = pipeline_df.join(ontology_df, col("mouse_anatomy_id") == col("id"), "left_outer") pipeline_df = pipeline_df.withColumn("top_level_mouse_anatomy_id", col("top_level_ids")) pipeline_df = pipeline_df.withColumn("top_level_mouse_anatomy_term", col("top_level_terms")) missing_parameter_information_df = pipeline_df.where( col("parameter_stable_id").isNull()) missing_parameter_rows = missing_parameter_information_df.collect() if len(missing_parameter_rows) > 0: print("MISSING PARAMETERS") for missing in missing_parameter_rows: print(missing.asDict()) pipeline_df = pipeline_df.where(col("parameter_stable_id").isNotNull()) pipeline_df = pipeline_df.drop(*ontology_df.columns) pipeline_df.write.parquet(output_path)
arrayArrayData = [("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]), ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]), ("Robert", [["CSharp", "VB"], ["Spark", "Python"]])] df = spark.createDataFrame(data=arrayArrayData, schema=['name', 'subjects']) df.printSchema() df.show(truncate=False) df_explode = df.select(df.name, explode(df.subjects).alias("Exploded_Subjects")) df_explode.printSchema() df_explode.show(truncate=False) df_flatten = df.select(df.name, flatten(df.subjects).alias("Flattened_Subjects")) df_flatten.printSchema() df_flatten.show(truncate=False) df_flatten_zip=df_flatten \ .withColumn("tmp", arrays_zip("Flattened_Subjects")) \ .withColumn("tmp", explode("tmp")) \ .select("name", col("tmp.Flattened_Subjects")) \ df_flatten_zip.printSchema() df_flatten_zip.show(truncate=False) '''Above is not performant hence below solution if array size is known ''' # Length of array n = 5
    .withColumn('ns_r_eff', func.sum(1 / func.col('n_authors')).over(
        wns.rangeBetween(Window.unboundedPreceding, 0)))
    .withColumn('ns_r', func.rank().over(wns))
    .withColumn('ws_r_eff', func.sum(1 / func.col('n_authors')).over(
        wws.rangeBetween(Window.unboundedPreceding, 0)))
    .withColumn('ws_r', func.rank().over(wws))
    .groupBy('auid')
    .agg(
        func.sort_array(func.collect_set("subfield_tuple"), True).alias("subFields"),
        func.sort_array(func.collect_set("field_tuple"), True).alias("Fields"),
        func.sum(func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear +
                           ' AND ' + maxyear + ',1,0)')).alias('npY1Y3'),
        # no longer capture first/last here; we want to get those values from
        # the full database and therefore collect them with the author names
        # dataframe (where we also get the last known full preferred name)
        #func.min(sort_pub_year).alias('firstyr'),
        #func.max(sort_pub_year).alias('lastyr'),
        func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'),
        func.size(func.array_distinct(func.flatten(
            func.collect_list('CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'),
        func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias('ns_hY3'),
        func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')).alias('ns_hmY3'),
        func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'),
        func.sum(func.expr('IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'),
        func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias('ns_npsf'),
        func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')).alias('ns_ncsf'),
        func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,1,0)')).alias('ns_npsfl'),
        func.sum(func.expr('IF(n_authors=1 OR Authorseq=1 OR Authorseq=n_authors,CitationCountNonSelf,0)')).alias('ns_ncsfl'),
        func.sum('CitationCount').alias('ws_ncY2Y3'),
        func.size(func.array_distinct(func.flatten(
            func.collect_list('CitingEids')))).alias('ws_ncY2Y3_cp'),
        func.max(func.expr('IF(ws_r<=CitationCount,ws_r,0)')).alias('ws_hY3'),
        func.max(func.expr('IF(ws_r_eff<=CitationCount,ws_r_eff,0)')).alias('ws_hmY3'),
        func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ws_nps'),
        func.sum(func.expr('IF(n_authors=1,CitationCount,0)')).alias('ws_ncs'),
def compile_array_repeat(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    times = op.times.op().value
    return F.flatten(F.array_repeat(src_column, times))
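# A minimal standalone sketch (outside the compiler context above) of the
# expression this rule emits: array_repeat() builds an array of n copies of
# the source array, and flatten() concatenates those copies end to end.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([1, 2],)], ["arr"])
df.select(F.flatten(F.array_repeat(F.col("arr"), 3)).alias("repeated")).show()
# [1, 2, 1, 2, 1, 2]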
my_list = [('a', 2, 3), ('b', 5, 6), ('c', 8, 9),
           ('a', 2, 3), ('b', 5, 6), ('c', 8, 9)]
col_name = ['col1', 'col2', 'col3']
ds = spark.createDataFrame(my_list, schema=col_name)
ds.withColumn('concat', F.concat('col1', 'col2')).show()

# Group, then join the strings in each group with "#"
df.groupBy("col1").agg(
    F.concat_ws("#", F.collect_list(F.col('col2'))).alias("col2_set"))

# Group, then merge the lists in each group
ds2 = spark.createDataFrame([(1, [1, 2, 3]), (1, [4, 5, 6]), (2, [2]), (2, [3])],
                            ["store", "values"])

# Method 1 (idea: collect the lists first, then flatten)
merged1 = ds2.groupBy("store").agg(F.collect_list("values").alias('values_list'))
merged1 = merged1.withColumn("flatten_array", F.flatten(F.col("values_list")))
merged1.show()

# Method 2: RDD map + reduceByKey
ds2.rdd.map(lambda r: (r.store, r.values)).reduceByKey(
    lambda x, y: x + y).toDF(['store', 'values']).show()

# Method 3: UDF
import functools

def concat_list(val):
    return functools.reduce(lambda x, y: x + y, val)

concat_list_udf = F.udf(concat_list, ArrayType(IntegerType()))
merged3 = ds2.groupBy("store").agg(
    concat_list_udf(F.collect_list("values")).alias("values_list"))
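# A fourth option (my addition, not from the original notes): the aggregate()
# higher-order function folds the collected arrays together in one expression,
# again without leaving the JVM. Sketch assumes the ds2 defined above, whose
# values column is inferred as array<bigint>.
merged4 = ds2.groupBy("store").agg(F.collect_list("values").alias("values_list"))
merged4 = merged4.withColumn(
    "merged",
    F.expr("aggregate(values_list, cast(array() as array<bigint>), "
           "(acc, x) -> concat(acc, x))"))
merged4.show()  # store=1 -> [1, 2, 3, 4, 5, 6]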