def test_mixed_sql_and_udf(self):
    df = self.data
    w = self.unbounded_window
    ow = self.ordered_window
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
    expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

    # Test mixing sql window function and window udf in the same expression
    result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
    expected2 = expected1

    # Test chaining sql aggregate function and udf
    result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('min_v', min(df['v']).over(w)) \
        .withColumn('v_diff', col('max_v') - col('min_v')) \
        .drop('max_v', 'min_v')
    expected3 = expected1

    # Test mixing sql window function and udf
    result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))
    expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
def test_window_functions(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove if folder still exists (expand the glob ourselves; rm does not expand patterns without a shell)
    stale_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if stale_parquet:
        subprocess.call(['rm', '-rf'] + stale_parquet)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
def extractor(df, min_count, output_path):
    n_gram_df = make_ngrams(df)
    n_gram_score = chi_square_procedur(n_gram_df, min_count)
    window = Window.partitionBy(n_gram_score['category'])\
        .orderBy(n_gram_score['aprx_chi_scr'].desc())
    n_gram_score = n_gram_score.dropDuplicates(['n_gram', 'category'])
    top_word_df = n_gram_score.select('*', F.rank().over(window).alias('rank'))\
        .filter(F.col('rank') <= 1000)
    top_word_df = top_word_df.join(categories, on=['category'], how='left')
    top_words = top_word_df.orderBy(F.col('category'), F.col('count').desc())\
        .select('n_gram', 'category', 'count', 'distinct_user_count', 'aprx_chi_scr')\
        .toPandas()
    top_words.to_csv(output_path)
    return top_words
def test_window_functions_without_partitionBy(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.orderBy("key", df.value)
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w))
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 4, 1, 1, 1, 1),
        ("2", 1, 1, 1, 4, 2, 2, 2, 1),
        ("2", 1, 2, 1, 4, 3, 2, 2, 2),
        ("2", 2, 2, 2, 4, 4, 4, 3, 2)
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
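# The two tests above exercise the legacy 1.x API (sqlCtx, F.rowNumber, F.denseRank,
# float('inf') frame bounds). Below is a minimal standalone sketch of the same window
# frame on the current API; it assumes a local SparkSession and is only an illustration,
# not part of the test suite above.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
w = Window.orderBy("key", "value")

df.select(
    "value", "key",
    # Window.unboundedPreceding / unboundedFollowing replace the float('inf') bounds
    F.count("key").over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)),
    F.row_number().over(w),   # rowNumber() was renamed to row_number()
    F.rank().over(w),
    F.dense_rank().over(w),   # denseRank() was renamed to dense_rank()
).show()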
def retrieve_next_layer(self, f, e, topx, direction='out'):
    if direction == 'out':
        orig_node = 'src'
        dest_node = 'dst'
    else:
        orig_node = 'dst'
        dest_node = 'src'
    df = f.select("id").join(e.drop('in_scope'), f.id == e[orig_node], 'inner').drop(orig_node)
    window = Window.partitionBy(df['id']).orderBy(df['amount'].desc())
    df = df.select('*', F.rank().over(window).alias('rank')).filter(F.col('rank') <= topx)
    dummy_tmp = self.create_dummy_edges(df, f, topx, direction)
    df = dummy_tmp.union(df.select(dummy_tmp.columns))
    df = df.withColumn("direction", F.lit(direction))
    df = df.withColumnRenamed(dest_node, "adj")
    return df
def spark_mysql(title_basics, title_ratings):
    titles = title_basics.select('tconst', 'startYear', 'originalTitle', 'titleType')
    titles = titles.withColumn('startYear', titles['startYear'].cast(IntegerType()))\
        .where(titles['startYear'].isNotNull())
    ratings = title_ratings.select('tconst', 'averageRating')
    result = ratings.join(titles, on=['tconst'])
    window = Window.partitionBy(['titleType', 'startYear']).orderBy(desc('averageRating'))
    result = result.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 10)
    result = result.orderBy('titleType', 'startYear', desc('averageRating'))
    return result
def find_most_trees_address(self, df):
    """Find the address with the most trees planted

    :param df: Input DataFrame containing all details of trees
    :return: DataFrame of the address with the most trees
    """
    # dataframe of address and corresponding tree_id(s); find the total count of trees at each address
    max_trees = df.select('address', 'tree_id').filter(
        df.address.isNotNull()).groupBy(col('address')).count().sort(
            desc("count"))
    # rank the addresses by decreasing number of trees planted and keep the top address, i.e. rank 1
    max_trees_place = max_trees.withColumn(
        "rank", rank().over(Window.orderBy(col("count").desc()))).filter(
            col("rank") == 1).select('address')
    self.log.warn('Found the address with the most trees planted')
    return max_trees_place
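# A standalone sketch of the same "top address" pattern on a toy DataFrame, assuming a
# local SparkSession (column names mirror the method above). Note that rank() == 1 keeps
# every address tied for the maximum count, and a global Window.orderBy pulls all rows
# into a single partition, which is acceptable here because the input is already a small
# per-address aggregate.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, desc, rank

spark = SparkSession.builder.master("local[*]").getOrCreate()
trees = spark.createDataFrame(
    [(1, "12 Oak St"), (2, "12 Oak St"), (3, "5 Elm Ave"), (4, None)],
    ["tree_id", "address"])

counts = trees.filter(col("address").isNotNull()).groupBy("address").count()
top_address = counts.withColumn(
    "rank", rank().over(Window.orderBy(desc("count")))
).filter(col("rank") == 1).select("address")
top_address.show()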
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

    Parameters
    ----------
    spark: SparkSession object
    model_file: string, path to store the model
    data_file: string, path to the parquet file to load
    K: int, evaluations are based on predictions of the top K items for each user
    '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()
    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    # after the select, the exploded column is simply named 'track_idx'
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('track_idx', 'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()
    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
def get_users_dataframe(complete_listens_df, metadata):
    """ Prepare users dataframe

        Args:
            complete_listens_df (dataframe): Dataframe with all the columns/fields that a typical listen has.

        Returns:
            users_df (dataframe): Columns can be depicted as:
                [
                    'user_name', 'user_id'
                ]
    """
    # We use a window function to assign a rank to distinct user_names.
    # Note that if user_names were not distinct, the rank would repeat and give unexpected results.
    user_window = Window.orderBy('user_name')
    users_df = complete_listens_df.select('user_name').distinct().withColumn(
        'user_id', rank().over(user_window))

    metadata['users_count'] = users_df.count()
    save_dataframe(users_df, path.USERS_DATAFRAME_PATH)
    return users_df
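# The function above relies on the input being distinct so that rank() yields unique,
# gapless ids. A minimal sketch of the same idea with row_number(), which assigns unique
# ids even when the ordering column has ties, is shown below; it assumes a local
# SparkSession and is only an illustration, not part of the pipeline above.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import row_number

spark = SparkSession.builder.master("local[*]").getOrCreate()
listens = spark.createDataFrame([("alice",), ("bob",), ("alice",)], ["user_name"])

user_window = Window.orderBy("user_name")
users = listens.select("user_name").distinct().withColumn(
    "user_id", row_number().over(user_window))
users.show()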
def main(sc, out_file_name):
    """ Read GDELT data from S3, clean themes from taxonomy words,
        and perform frequency count of cleaned themes.
        Pick top 1000 most popular themes and write to out_file_name
    """
    # Obtain list of taxonomy words for theme cleaning
    tax_file = os.environ['TAX_LIST_FILE']
    tax_list = f.read_tax_file(tax_file)
    rdd_tax_list = sc.broadcast(tax_list)

    # Read 'GKG' table from GDELT S3 bucket. Transform into RDD and clean taxonomy words
    gkgRDD = sc.textFile('s3a://gdelt-open-data/v2/gkg/201[5-9]*000000.gkg.csv')
    gkgRDD = gkgRDD.map(lambda x: x.encode("utf", "ignore"))
    gkgRDD.cache()
    gkgRDD = gkgRDD.map(lambda x: x.split('\t'))
    gkgRDD = gkgRDD.filter(lambda x: len(x) == 27)
    gkgRDD = gkgRDD.filter(lambda x: f.is_not_empty(x[7]))
    gkgRowRDD = gkgRDD.map(lambda x: Row(themes=f.clean_taxonomy(x[7].split(';')[:-1], rdd_tax_list)))

    sqlContext = SQLContext(sc)

    # Transform RDDs to dataframes
    gkgDF = sqlContext.createDataFrame(gkgRowRDD)

    # Each document could contain multiple themes. Explode on the themes and make a new column
    explodedDF = gkgDF.select(explode(gkgDF.themes).alias("theme"))

    # Count the frequency of each theme
    testDF = explodedDF.groupBy('theme').agg(count('*').alias('num_mentions'))

    # Find top 1000 most popular themes, use Pandas to write to output file
    window = Window.orderBy(testDF['num_mentions'].desc())
    rankDF = testDF.select('*', rank().over(window).alias('rank'))\
        .filter(col('rank') <= 1000).where(col('theme') != '')
    pandasDF = rankDF.toPandas()
    pandasDF.to_csv(out_file_name, columns=["theme", "num_mentions", "rank"])
def transform_as4_invoice(self, sources: dict) -> DataFrame:
    """
    Dim Location records and attributes from dataB Invoice Plant data
    """
    spark = self.get_spark()
    inv_loc_df = spark.read.orc(sources['dataB_invoice_extract']['path'])
    df = (inv_loc_df.select(
        col('plant').alias('location_id'),
        'invoice_date',
        'invoice',
        col('plant_name').alias('description')))
    window = Window.partitionBy('location_id').orderBy(
        df['invoice_date'].desc(), df['invoice'].desc())
    df = df.withColumn("rank", F.rank().over(window))
    df_rank = df.filter("rank = 1").distinct()
    df_final = (df_rank.select('location_id', 'invoice_date', 'invoice', 'description'))
    return df_final
def top_payments_monthly(data_df):
    payment_otc = spark.read.jdbc(mysql_url, "source.payment_otc",
                                  properties={"user": mysql_un, "password": mysql_password})
    payment_df = data_df.groupBy("payment_type", "month", "year").count()
    window = Window.partitionBy("month", "year").orderBy(fn.desc("count"))
    payment_df2 = payment_df.withColumn("rank", fn.rank().over(window))
    payment_df3 = payment_df2.where("rank <= 3").select("month", "year", "payment_type", "count")
    payment_df4 = payment_df3.join(payment_otc, ["payment_type"]).select(
        "month", "year", "payment_name", "count")
    payment_df4.orderBy("year", "month").write.orc(hdfs_output + "top_payments_monthly",
                                                   mode="overwrite")
def obtener_topN_ciclistas_por_provincia_en_total_de_kilometros(ciclistas_kilometros_df, N):
    # get the total kilometres per cyclist, grouped by provincia, cedula and nombre_Completo
    provincia_ciclistas_kilometros_total_df = ciclistas_kilometros_df.groupBy(
        "provincia", "cedula", "nombre_Completo").sum("TotalKilometros")
    # provincia_ciclistas_kilometros_total_df.show()
    provincia_ciclistas_kilometros_total_df = \
        provincia_ciclistas_kilometros_total_df.select(
            col('provincia'), col('cedula'), col('nombre_Completo'),
            col('sum(TotalKilometros)').alias('TotalKilometros'))
    # provincia_ciclistas_kilometros_total_df.show()

    # partition the data by provincia, ordered by TotalKilometros descending and cedula ascending,
    # then add a column assigning each row its position within the province
    window = Window.partitionBy('provincia').orderBy(
        col('TotalKilometros').desc(), col('cedula').asc())
    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.withColumn(
        "Posicion_Por_Provincia", rank().over(window))
    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.withColumn(
        "Tipo_Top_N_Ciclistas_Por_Provincia", lit("Total de Km"))

    # take the top N
    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.filter(
        provincia_ciclistas_kilometros_total_df.Posicion_Por_Provincia <= N)
    provincia_ciclistas_kilometros_total_df = provincia_ciclistas_kilometros_total_df.select(
        col('Tipo_Top_N_Ciclistas_Por_Provincia'), col('provincia'), col('cedula'),
        col('nombre_Completo'), col('TotalKilometros').alias('Valor'),
        col('Posicion_Por_Provincia'))
    return provincia_ciclistas_kilometros_total_df
def transform(retail_df):
    """
    :param retail_df: raw retail DataFrame (CustomerId, InvoiceDate, Quantity, ...)
    :return: DataFrame with per-customer rank, dense rank and running max of Quantity
    """
    from pyspark.sql.window import Window
    from pyspark.sql.functions import col, date_format, desc, dense_rank, rank, max

    # convert date format on retail_df
    transform_step1 = (retail_df.withColumn('InvoiceDate',
                                            date_format(col("InvoiceDate"), "MM/dd/yyyy H:mm")))

    # window function
    window_function = (Window.partitionBy("CustomerId")
                       .orderBy(desc("Quantity"))
                       .rowsBetween(Window.unboundedPreceding, Window.currentRow))

    # aggregate functions
    max_purchase_quantity = max(col("Quantity")).over(window_function)

    # rank functions
    purchase_dense_rank = dense_rank().over(window_function)
    purchase_rank = rank().over(window_function)

    transformed_df = (retail_df.withColumn('InvoiceDate',
                                           date_format(col("InvoiceDate"), "MM/dd/yyyy H:mm"))
                      .where("CustomerId IS NOT NULL")
                      .orderBy("CustomerId")
                      .select(col("CustomerId"), col("InvoiceDate"), col("Quantity"),
                              purchase_rank.alias("quantityRank"),
                              purchase_dense_rank.alias("quantityDenseRank"),
                              max_purchase_quantity.alias("maxPurchaseQuantity")))
    return transformed_df
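# A hedged usage sketch for transform() on a tiny in-memory DataFrame; the column names
# (CustomerId, InvoiceDate, Quantity) are taken from the function body, the sample values
# are made up, and a local SparkSession is assumed.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
retail_sample = spark.createDataFrame(
    [(17850, "2010-12-01 08:26:00", 6),
     (17850, "2010-12-01 08:28:00", 8),
     (13047, "2010-12-01 08:34:00", 2)],
    ["CustomerId", "InvoiceDate", "Quantity"])

transform(retail_sample).show()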
def run_job(self, sc, sqlc):
    input_data = sc.textFile(self.text_file, minPartitions=4)
    output = input_data.mapPartitionsWithIndex(self.process_warcs).reduce(add)
    output_json = sc.parallelize(output)
    self.create_db_connection()
    self.reference_to_instagram_df = output_json.toDF() \
        .orderBy("reference_link", "warc_date")
    window = Window.partitionBy("instagram_link", "reference_link").orderBy(
        "warc_date", 'tiebreak')
    self.reference_to_instagram_df = (
        self.reference_to_instagram_df.withColumn(
            'tiebreak', monotonically_increasing_id()).withColumn(
                'rank', rank().over(window)).filter(col('rank') == 1).drop(
                    'rank', 'tiebreak'))
    self.log_aggregators(sc)
    self.prepare_csv(sc, sqlc)
    try:
        self.drop_outdated_references()
        self.perform_aggregations()
        self.conn.commit()
        self.conn.close()
    finally:
        pass
def runOtherFunctions(spark, personDf): df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]); # array df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False) # desc, asc personDf.show() personDf.sort(functions.desc("age"), functions.asc("name")).show() # pyspark 2.1.0 버전은 desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last 지원하지 않음 # split, length (pyspark에서 컬럼은 df["col"] 또는 df.col 형태로 사용 가능) df2 = spark.createDataFrame([("Splits str around pattern",)], ['value']) df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False) # rownum, rank f1 = StructField("date", StringType(), True) f2 = StructField("product", StringType(), True) f3 = StructField("amount", IntegerType(), True) schema = StructType([f1, f2, f3]) p1 = ("2017-12-25 12:01:00", "note", 1000) p2 = ("2017-12-25 12:01:10", "pencil", 3500) p3 = ("2017-12-25 12:03:20", "pencil", 23000) p4 = ("2017-12-25 12:05:00", "note", 1500) p5 = ("2017-12-25 12:05:07", "note", 2000) p6 = ("2017-12-25 12:06:25", "note", 1000) p7 = ("2017-12-25 12:08:00", "pencil", 500) p8 = ("2017-12-25 12:09:45", "note", 30000) dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema) w1 = Window.partitionBy("product").orderBy("amount") w2 = Window.orderBy("amount") dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"), functions.rank().over(w2).alias("rank")).show()
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove if folder still exists (expand the glob ourselves; rm does not expand patterns without a shell)
    stale_parquet = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if stale_parquet:
        subprocess.call(['rm', '-rf'] + stale_parquet)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(
        os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(
        save_dir, 'medline_raw_%s.parquet' % date_update_str), compression='gzip')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      compression='gzip')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(
        save_dir, 'medline_grant_%s.parquet' % date_update_str), compression='gzip')
def __get_recent_items():
    window = Window.partitionBy(date_format(df.dateAdded, 'yyyy-MM-dd'))\
        .orderBy(df['dateAdded'].desc())
    recent_items = (
        df.select('*', rank().over(window).alias('rank')).filter(col('rank') <= 1)
        # Sometimes several products share the most recent timestamp on a given day.
        # Since the output has to be a single product per day, drop the duplicates.
        .dropDuplicates(['dateAdded'])
        .orderBy(df.dateAdded, ascending=False))

    # removing unnecessary fields
    recent_items = recent_items.select(
        date_format(df.dateAdded, 'yyyy-MM-dd').alias('dateAdded'),
        'id', 'brand', 'colors')

    # writing the results to redis (uses HASH data structure)
    recent_items.write \
        .format("org.apache.spark.sql.redis") \
        .option("table", "recent") \
        .option("key.column", "dateAdded") \
        .mode("overwrite") \
        .save()
def get_recordings_df(mapped_listens_df, metadata):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('mb_recording_mbid')

    recordings_df = mapped_listens_df.select('mb_artist_credit_id',
                                             'mb_artist_credit_mbids',
                                             'mb_recording_mbid',
                                             'mb_release_mbid',
                                             'msb_artist_credit_name_matchable',
                                             'msb_recording_name_matchable') \
        .distinct() \
        .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, path.RECORDINGS_DATAFRAME_PATH)
    return recordings_df
def get_recordings_df(mapped_listens_df, metadata, save_path):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            save_path (str): path where recordings_df should be saved

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('recording_mbid')

    recordings_df = mapped_listens_df \
        .select(
            'artist_credit_id',
            'recording_mbid',
        ) \
        .distinct() \
        .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, save_path)
    return recordings_df
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--posts', type=str, required=True)
    parser.add_argument('--num_top', type=int, default=1)
    parser.add_argument('--output_dir', type=str, required=True)
    args = parser.parse_args()

    conf = SparkConf().set("spark.driver.maxResultSize", "10G"). \
        set("spark.hadoop.validateOutputSpecs", "false"). \
        set('spark.default.parallelism', '400')
    spark = SparkSession.builder.\
        appName("SO Tag first usage date").\
        config(conf=conf).\
        getOrCreate()
    sc = spark.sparkContext

    in_rdd = sc.textFile(args.posts).filter(lambda x: get_field(x, 'Id') is not None).\
        map(lambda x: (int(get_field(x, 'Id')), x))
    in_rdd = in_rdd.filter(lambda x: get_field(x[1], 'Tags') is not None
                           and get_field(x[1], 'CreationDate') is not None).\
        map(lambda x: (datetime.strptime(get_field(x[1], 'CreationDate').decode('utf-8'), DT_FORMAT),
                       get_tags(get_field(x[1], 'Tags').decode('utf-8')))).\
        flatMap(lambda x: [(x[0], y) for y in x[1]])

    tag_date_df = in_rdd.toDF(['CreationDate', 'Tag'])
    window = Window.partitionBy(tag_date_df['Tag']).orderBy(tag_date_df['CreationDate'].asc())
    # tag_first_appearances = tag_date_df.groupBy('Tag').agg({'CreationDate': 'min'})
    tag_first_appearances = tag_date_df.select('*', rank().over(window).alias('rank')).\
        filter(col('rank') <= args.num_top)
    tag_first_appearances_pd = tag_first_appearances.toPandas().drop(columns=['rank'])

    make_sure_path_exists(args.output_dir)
    with open(os.path.join(args.output_dir,
                           'tag_' + str(args.num_top) + '_earliest_appearance.csv'), 'w') as f:
        tag_first_appearances_pd.to_csv(f)
def get_topN(df, group_by_columns, order_by_column, n=10):
    window = Window.partitionBy(group_by_columns).orderBy(order_by_column.desc())
    return df.select('*', f.rank().over(window).alias('rank')).filter(
        f.col('rank') <= n).drop("rank")
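# A hedged usage sketch for get_topN(): top 2 products per category by amount on a toy
# DataFrame. It assumes a local SparkSession and `from pyspark.sql import functions as f`,
# matching the alias used inside the helper. Note that rank() can return more than n rows
# per group when amounts tie at the cut-off; row_number() would enforce exactly n.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.master("local[*]").getOrCreate()
sales = spark.createDataFrame(
    [("note", 1000), ("note", 1500), ("note", 2000), ("pencil", 500), ("pencil", 3500)],
    ["product", "amount"])

get_topN(sales, ["product"], f.col("amount"), n=2).show()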
from pyspark.sql import Window

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

df1 = spark.sql('''
    select * from app.app_saas_sfs_model_input where dt='2018-07-31'
''').select(['sku_code', 'sale_date', 'sale'])
df1.show()

day_len = 90
day_end = '2018-07-31'
day_start = (parse(day_end) - datetime.timedelta(day_len)).strftime('%Y-%m-%d')

df1_sum = df1.where('''
    sale_date >= '{day_start}' and sale_date <= '{day_end}'
'''.format(day_start=day_start, day_end=day_end)).groupBy('sku_code').agg(F.sum('sale').alias('sale_sum'))

# Temp rank of the sale, just to split into 5
windowspec_r = Window.orderBy(F.col('sale_sum').desc())
df1_rank = df1_sum.withColumn('rank', F.rank().over(windowspec_r))

# 16483
df1_cnt = df1_sum.select(F.countDistinct('sku_code').alias('sku_count'))
df1_rcnt = df1_rank.crossJoin(F.broadcast(df1_cnt))
df1_rcnt = df1_rcnt.withColumn('rank_rate', F.col('rank') / F.col('sku_count'))

band_sql = '''
    Case When rank_rate < 0.2 Then 1
         When rank_rate < 0.4 Then 2
         When rank_rate < 0.6 Then 3
         When rank_rate < 0.8 Then 4
         else 5 end as
           fontsize=30)
graph2.set_xlabel("Players Age", fontsize=20)
graph2.set_ylabel("Player Overall ratings", fontsize=20)
graph2.set_xlim(15, 45)
graph2.set_ylim(50, 100)
graph2.xaxis.set_major_locator(tck.MultipleLocator(5))
graph2.yaxis.set_major_locator(tck.MultipleLocator(5))
plt.show()

# Finding top 10 best players with respect to each position considering their overall
# Adding a rank column to rank the best players in each position
window_for_highest_overall = Window.partitionBy(
    cleaned_set['Position']).orderBy(cleaned_set['Overall'].desc())
top_worthy_players = cleaned_set.select(
    '*', rank().over(window_for_highest_overall).alias('rank')).filter(col('rank') <= 10)
print("\nThe top 10 best players who are worthy based on their Overall for each position are\n")
print("The column rank indicates the rank of players")
top_worthy_players.show(1000)

# Plotting average market value of top 10 worthy players for each position with required plot properties
# Adding avg(Value) column containing average of market values for each position
positionvsavgvalue_dataset = top_worthy_players.groupBy('Position').agg({'Value': 'avg'})
g = positionvsavgvalue_dataset.toPandas()
graph3 = g.plot(x='Position', y='avg(Value)', kind='bar',
def createDataFile(start_date, end_date, spark_instance, jackknife_buckets,
                   sample_percent, output_path):
    feature_data_phase1 = spark_instance.table(_TABLE_SOURCE).select([
        _COL_ID.alias("id"),
        _DATE_PARSED.alias("date"),
        # TODO: Use MD5 instead of CRC32
        (F.floor(F.crc32(_COL_ID) / 100) % jackknife_buckets).alias("bucket"),
        lit(1).alias("is_active"),
        F.when(_COL_URI_COUNT >= _NUM_ADAU_THRESHOLD, 1).otherwise(0).alias("is_active_active"),
        F.to_date(_COL_PC_DATE).alias("profile_creation_date")
    ] + list(_MAP_NATURAL_DIMENSIONS.keys())).filter(
        (_DATE_PARSED.between(start_date, end_date))
        & (_COL_SAMPLE < sample_percent)).withColumn(
            "young_profile",
            F.when(col("date") < F.date_add(col("profile_creation_date"), 14),
                   "TRUE").otherwise("FALSE"))

    new_profile_window = Window.partitionBy(col("id")).orderBy(col("date"))
    new_profile_data = feature_data_phase1.filter(
        (col("date") >= col("profile_creation_date"))
        & (col("date") <= F.date_add(col("profile_creation_date"), 6))).select(
            "*", F.rank().over(new_profile_window).alias('rank')).filter(
                col('rank') == 1).withColumn("new_profile", lit(1)).drop("date").withColumn(
                    "date", col("profile_creation_date"))

    feature_data = feature_data_phase1.alias("fd").join(
        new_profile_data.alias("np"),
        (col("fd.id") == col("np.id")) & (col("fd.date") == col("np.date")),
        how='full',
    ).select(
        [F.coalesce(col("np.new_profile"), lit(0)).alias("new_profile")]
        + [F.coalesce(col("fd.is_active"), lit(0)).alias("is_active")]
        + [F.coalesce(col("fd.is_active_active"), lit(0)).alias("is_active_active")]
        + [F.coalesce(col("fd.{}".format(c)), col("np.{}".format(c))).alias(c)
           for c in feature_data_phase1.columns
           if c not in ["is_active", "is_active_active"]])

    once_ever_profiles = feature_data.filter(
        col("is_active") == 1).groupBy("id").count().filter(
            col("count") == 1).select("id").withColumn("single_day_profile", lit("1"))
    feature_data = feature_data.alias("fd").join(
        once_ever_profiles.alias("oep"), "id", "outer").fillna({"single_day_profile": "0"})

    ppi_profiles = spark_instance.table("main_summary").select(
        col("client_id").alias("id"),
        lit(1).alias("ppi")).filter('''submission_date_s3 >= '20190121'
            AND scalar_parent_startup_profile_selection_reason IN (
                'firstrun-skipped-default', 'restart-skipped-default'
            )''')
    feature_data = feature_data.alias("fd").join(ppi_profiles.alias("ppip"),
                                                 "id", "left").fillna({"ppi": 0})

    feature_data.write.partitionBy("date").mode('overwrite').parquet(output_path)
# 1. human interfere
df_cleanning = human_interfere(spark, df_cleanning, df_interfere)
df_cleanning.persist()

# 2. join using MOLE_NAME as the key
df_result = similarity(spark, df_cleanning, df_standard)
df_result.persist()
df_result.show()
print(df_result.count())

# 3. rank the candidate matches for each value that needs matching
# windowSpec = Window.partitionBy("id").orderBy(desc("SIMILARITY"))
windowSpec = Window.partitionBy("id").orderBy("SIMILARITY")
df_match = df_result.withColumn("RANK", rank().over(windowSpec))
df_match = df_match.where(df_match.RANK <= 5)
df_match.persist()
# df_match.printSchema()

df_match = df_match.withColumn("check",
                               check_similarity(df_match.PACK_ID_CHECK,
                                                df_match.PACK_ID_STANDARD,
                                                df_match.SIMILARITY))
# df_match.show(5)
df_match = df_match.orderBy("id").drop("ORIGIN", "STANDARD")
df_match.persist()
df_match.repartition(1).write.format("parquet").mode("overwrite").save(
    "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/azsanofi/0.0.4/all")
# df_match.repartition(1).write.format("parquet").mode("overwrite").save("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/0.0.15/all")

df_replace = df_match.filter(df_match.check == 1)
                                                           table + '/*.' + fileFormat)

        # load today's changes
        logger.info("Load the latest changes")
        latestChanges = spark.read.format(fileFormat).load(landingDir + '/' + table +
                                                           '/today/*.' + fileFormat)

        # Combine the datasets
        logger.info("Merge two datasets which will have duplicates for the records modified")
        dataMerge = baseData.union(latestChanges)

        # Filter the old data and save the files with latest changes
        logger.info("Filter the old data and have the latest changes for the records modified")
        dataMerge.select("*", rank().over(Window.partitionBy(partitionByColumn)
                                          .orderBy(desc(lmtColumn))).alias("latestRecord"))\
            .where("latestRecord == 1").drop("latestRecord").repartition(1)\
            .write.option('path', targetDir + '/' + table).mode(fileMode)\
            .bucketBy(noBuckets, bucketByColumn).saveAsTable(destDB + '.' + table)
        logger.info(f"Latest changes merged with the base data and it is available in {targetDir}/{table}")
    except Exception as error:
        logger.exception(f"Failed with error- {error}")
    else:
        logger.info("Latest changes are merged successfully")
    finally:
        spark.stop()
# 7. Saving the end results
# You cannot save a DataFrame directly as a text file; convert it to an RDD first.
# dff = dff.withColumn('year', dff['_c1'].substr(8, 4))  # << position, length
# Note: dff = sc.textFile(trainingdata + "part-00000").map(
#     lambda x: x.replace('[', '').replace(']', '').split('|')).toDF()
# best combination of python and spark
#########################################################################################
# lines = Source.fromFile("E:\scala\spark-data\movie-description")  # scala
from pyspark.sql import Window
from pyspark.sql.functions import rank

dff = spark.read.csv("E:\scala\spark-data\movie-description", sep="|")
movie_year_df = dff.withColumn('year', dff['_c2'].substr(8, 4)).select("_c0", "_c1", "_c2", "year")
movie_number_df = spark.read.csv("E:\scala\spark-data\movie-data.data",
                                 inferSchema=True, header=False, sep="\t").select("_c0", "_c3")
cond = movie_number_df._c0 == movie_year_df._c0
mv_fi_df = movie_year_df.join(movie_number_df, cond, 'right')\
    .withColumnRenamed("_c1", "movie").withColumnRenamed("_c3", "gross")\
    .select("movie", "year", "gross")
windowSpec = Window.partitionBy("year").orderBy("gross")
mv_fi_df = mv_fi_df.withColumn("rank", rank().over(windowSpec))
mv_fi_df.filter(mv_fi_df.rank == 1).show()
"PROVINCE", regexp_replace(df_cleanning.PROVINCE, "市", "")) df_cleanning = df_cleanning.withColumn( "PROVINCE", regexp_replace(df_cleanning.PROVINCE, "自治区", "")) df_cleanning = df_cleanning.withColumn( "CITY", regexp_replace(df_cleanning.CITY, "市", "")) # 2. Join 一下 df_cleanning = df_cleanning.join(broadcast(df_standard), on=["PROVINCE", "CITY"], how="left") df_cleanning.persist() df_not_match = df_cleanning.where(isnull(df_cleanning.STANDARD_NAME)) df_cleanning = df_cleanning.where(~isnull(df_cleanning.STANDARD_NAME)) df_cleanning = df_cleanning.repartition(800).withColumn( "SIMILARITY", efftiveness_with_jaro_winkler_similarity_in_hc_mapping( df_cleanning.NAME, df_cleanning.STANDARD_NAME)) windowSpec = Window.partitionBy("ID").orderBy(desc("SIMILARITY")) df_cleanning = df_cleanning.withColumn("RANK", rank().over(windowSpec)) df_cleanning = df_cleanning.where(df_cleanning.RANK == 1) df_cleanning.repartition(1).write.mode("overwrite").option( "header", "true" ).csv( "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_cleanning/hc_result_2" ) df_not_match.repartition(1).write.mode("overwrite").option( "header", "true" ).csv( "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_cleanning/hc_result_not_match" )
def compile_rank(t, expr, scope, *, window, **kwargs):
    return F.rank().over(window).astype('long') - F.lit(1)
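# Spark's rank() is 1-based, so the compile function above subtracts 1 to expose a
# 0-based rank. A minimal standalone sketch of the same expression outside the compiler,
# assuming a local SparkSession; the DataFrame and column names here are made up for
# illustration.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
scores = spark.createDataFrame([("a", 10), ("b", 20), ("c", 20)], ["k", "v"])

w = Window.orderBy("v")
scores.withColumn("rank0", F.rank().over(w).astype('long') - F.lit(1)).show()
# rank0 is 0 for the lowest value and repeats on ties, exactly like RANK() - 1 in SQL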
train.select(['user_id']).distinct().count()     # 5298
val_new.select(['user_id']).distinct().count()   # 1751
test_new.select(['user_id']).distinct().count()  # 1636
# there are some users in val_add and test_add but not in val_new and test_new,
# therefore 8774 > 5298 + 1751 + 1636 = 8685

# example of writing x% data
train_add_test.write.parquet("train01.parquet")
val_new.write.parquet("val01.parquet")
test_new.write.parquet("test01.parquet")

# create the true rank list (example of 1% data)
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(val['user_id']).orderBy(val['rating'].desc())
val_true_order = val.select('*', rank().over(window).alias('rank'))
val_true_list = val_true_order.select('user_id', 'book_id').groupBy('user_id').agg(
    expr('collect_list(book_id) as books'))
val_true_list.write.parquet("val01_true_list.parquet")

window = Window.partitionBy(test['user_id']).orderBy(test['rating'].desc())
test_true_order = test.select('*', rank().over(window).alias('rank'))
test_true_list = test_true_order.select('user_id', 'book_id').groupBy('user_id').agg(
    expr('collect_list(book_id) as books'))
test_true_list.write.parquet("test01_true_list.parquet")
def match_accidents_with_roads(spark, road_df, accident_df, use_cache=True):
    cache_path = workdir + "data/matches_accident-road.parquet"
    if isdir(cache_path) and use_cache:
        print("Reading accident-road matches from cache...")
        return spark.read.parquet(cache_path)

    nb_top_road_center_preselected = 5
    max_distance_accepted = 10  # in meters

    # Compute distance between accident and road centers to identify the
    # top nb_top_road_center_preselected closest roads
    road_centers = road_df.select(
        ["street_id", "center_long", "center_lat"]
    ).drop_duplicates()

    acc_window = Window.partitionBy("accident_id").orderBy("distance_measure")
    accidents_top_k_roads = (
        accident_df.select("loc_lat", "loc_long", "accident_id")
        .crossJoin(road_centers)
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "center_lat", "center_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "street_id",
            "distance_measure",
            "loc_lat",
            "loc_long",
            rank().over(acc_window).alias("distance_rank"),
        )
        .filter(col("distance_rank") <= nb_top_road_center_preselected)
        .drop("distance_measure", "distance_rank")
        .persist()
    )

    # For each accident identify the closest road point
    accidents_roads_first_match = (
        accidents_top_k_roads.join(road_df, "street_id")
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "coord_lat", "coord_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "loc_lat",
            "loc_long",
            "coord_lat",
            "coord_long",
            "street_id",
            "street_name",
            row_number().over(acc_window).alias("distance_rank"),
            "distance_measure",
        )
        .filter(col("distance_rank") == 1)
        .withColumn("distance", col("distance_measure") * (6371 * 2 * 1000))
        .drop("distance_rank", "distance_measure", "coord_lat", "coord_long")
        .persist()
    )

    # If the distance is lower than max_distance_accepted we keep the
    # accident/street matches
    accidents_road_correct_match = accidents_roads_first_match.filter(
        col("distance") < max_distance_accepted
    ).select("accident_id", "street_id")

    # If not, we try to get a better match by adding intermediate points on
    # the preselected streets.
    # For unsatisfying matches, recompute the k closest roads
    # (recomputing is probably faster than reading from disk cache + joining on accident_ids)
    accidents_close_streets_coords = (
        accidents_roads_first_match.filter(col("distance") >= max_distance_accepted)
        .select("accident_id", "loc_lat", "loc_long")
        .crossJoin(road_centers)
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "center_lat", "center_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "street_id",
            "distance_measure",
            "loc_lat",
            "loc_long",
            rank().over(acc_window).alias("distance_rank"),
        )
        .filter(col("distance_rank") <= nb_top_road_center_preselected)
        .drop("distance_measure", "distance_rank")
        .join(road_df.select("street_id", "coord_lat", "coord_long"), "street_id")
    )

    # Add the intermediate points
    street_rolling_window = (
        Window.partitionBy("street_id").orderBy("coord_long").rowsBetween(0, +1)
    )
    accidents_close_streets_with_additional_coords = (
        accidents_close_streets_coords.select(
            "accident_id",
            "street_id",
            "loc_lat",
            "loc_long",
            avg("coord_long").over(street_rolling_window).alias("coord_long"),
            avg("coord_lat").over(street_rolling_window).alias("coord_lat"),
        )
        .union(accidents_close_streets_coords)
        .dropDuplicates()
    )
    accidents_close_streets_coords.unpersist()

    # Recompute distances between accident and new set of points
    # and use the closest point to identify the street
    accidents_roads_first_match_with_additional_coords = (
        accidents_close_streets_with_additional_coords.withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "coord_lat", "coord_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .select(
            "accident_id",
            "street_id",
            "loc_lat",
            "loc_long",
            "coord_lat",
            "coord_long",
            row_number().over(acc_window).alias("distance_rank"),
        )
        .filter(col("distance_rank") == 1)
        .drop("distance_rank", "loc_lat", "loc_long", "coord_lat", "coord_long")
    )

    # Union accidents matched correctly with the first method with the accidents
    # for which we used more street points
    final_match = accidents_road_correct_match.union(
        accidents_roads_first_match_with_additional_coords
    )

    # Make sure there is only one road per accident
    final_match = (
        final_match.join(road_centers, "street_id")
        .join(accident_df.select("loc_lat", "loc_long", "accident_id"), "accident_id")
        .withColumn(
            "distance_inter",
            distance_intermediate_formula(
                "loc_lat", "loc_long", "center_lat", "center_long"
            ),
        )
        .withColumn("distance_measure", distance_measure())
        .withColumn("dist_rank", row_number().over(acc_window))
        .filter(col("dist_rank") == 1)
        .select("accident_id", "street_id")
    )

    return final_match
.partitionBy("CustomerId", "date")\ .orderBy(desc("Quantity"))\ .rowsBetween(Window.unboundedPreceding, Window.currentRow) # COMMAND ---------- from pyspark.sql.functions import max maxPurchaseQuantity = max(col("Quantity")).over(windowSpec) # COMMAND ---------- from pyspark.sql.functions import dense_rank, rank purchaseDenseRank = dense_rank().over(windowSpec) purchaseRank = rank().over(windowSpec) # COMMAND ---------- from pyspark.sql.functions import col dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\ .select( col("CustomerId"), col("date"), col("Quantity"), purchaseRank.alias("quantityRank"), purchaseDenseRank.alias("quantityDenseRank"), maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()