def test_window_functions(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value,
        df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)),
        F.row_number().over(w),
        F.rank().over(w),
        F.dense_rank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[: len(r)])
def train(self, df, featureCols):
    ntiles = []
    for col in featureCols:
        w = Window.partitionBy().orderBy(col)
        aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
        ntiles.append(list(aux.groupby('ntile').max(col).collect()))
    self.ntiles_ = np.array(ntiles)
    self.columns_ = list(map(str, featureCols))  # materialize the map for Python 3
    self._is_trained = True
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
        (wikipediadata.day >= day - 30) & (wikipediadata.day <= day - 1))
    # Test subset
    # sums = sums.where((sums.page == 'Cadillac_Brougham') |
    #                   ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))
    # Sum the hours of each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later
    sums.cache()
    # Define a window := the previous day
    window_spec = Window.partitionBy(sums.projectcode, sums.page) \
                        .orderBy(sums.day.asc()).rowsBetween(-1, -1)
    # Compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views)
                            .over(window_spec))
    # Compute the coefficient to apply to each day
    coefs = pd.DataFrame({'day': range(day - 30, day)})
    coefs['coef'] = 1. / (day - coefs.day)
    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')
    # Compute the score of each day
    diffs = diffs.withColumn('sub_score', diffs['diff'] * diffs.coef)
    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # Normalize by the square root of the sum of views
    totals = totals.withColumn('score',
                               totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
                   .orderBy(F.desc('score')) \
                   .withColumnRenamed('SUM(views)', 'total_views') \
                   .limit(10)
    views = sums.select('projectcode', 'page', 'day', 'views') \
                .join(totals.select('projectcode', 'page', 'total_views', 'score'),
                      (totals.projectcode == sums.projectcode) & (totals.page == sums.page),
                      'right_outer')
    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'],
                          index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    # Clear the cache
    hc.clearCache()
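# A minimal, self-contained sketch (toy page counts, assumed for illustration) of the rowsBetween(-1, -1)
# pattern used above: fetch the previous day's value within each (projectcode, page) partition and
# take the day-over-day difference.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
daily = spark.createDataFrame(
    [("fr", "Roald_Dahl", 1, 100), ("fr", "Roald_Dahl", 2, 160), ("fr", "Roald_Dahl", 3, 90)],
    ["projectcode", "page", "day", "views"])
prev_day = Window.partitionBy("projectcode", "page").orderBy(F.col("day").asc()).rowsBetween(-1, -1)
daily.withColumn("diff", F.col("views") - F.sum("views").over(prev_day)).show()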
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
def sliding_row_window(self):
    return Window.partitionBy('id').orderBy('v').rowsBetween(-2, 1)
def ordered_window(self):
    return Window.partitionBy('id').orderBy('v')
def shrinking_range_window(self):
    return Window.partitionBy('id').orderBy('v') \
        .rangeBetween(-3, Window.unboundedFollowing)
def growing_range_window(self):
    return Window.partitionBy('id').orderBy('v') \
        .rangeBetween(Window.unboundedPreceding, 4)
tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option(
    "inferSchema", "true").load(srcfilePath)

data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn(
    "createdDatePT",
    sf.from_utc_timestamp(sf.regexp_replace(data.createdDate, "T", " "), "US/Pacific"))

df = dfPT.withColumn("year", sf.split("createdDate", "\-")[0]) \
         .withColumn("month", sf.split("createdDate", "\-")[1]) \
         .withColumn("day", sf.split(sf.split(sf.split("createdDate", "\-")[2], "T")[0], " ")[0])

dfbaseData = df.select(
    [col for col in df.columns if not col.startswith("xmlns")])
# dfbaseData.show(10, False)

dfrankedId = dfbaseData.withColumn("row_num",
                                   sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
    .where(sf.col("row_num") == 1) \
    .select(dfbaseData["*"])

dfrankedId.repartition(sf.col("year"), sf.col("month"), sf.col("day")) \
    .write.format("parquet") \
    .partitionBy("year", "month", "day") \
    .mode("overwrite") \
    .save(tgtfilePath)
def test_sliding_window(data_gen):
    row_gen = StructGen([['ts', _restricted_ts_gen], ['data', data_gen]], nullable=False)
    w = Window.partitionBy(f.window('ts', '5 hour', '1 hour'))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, row_gen).withColumn('rolling_max', f.max("data").over(w)))
def cal_mat_window(sc, sqlContext, dfSC, window):
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(-1 * window + 1, 1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat)
    print(dfSC.collect())
def sliding_range_window(self):
    return Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
def shrinking_row_window(self):
    return Window.partitionBy('id').orderBy('v').rowsBetween(-2, Window.unboundedFollowing)
def growing_row_window(self):
    return Window.partitionBy('id').orderBy('v').rowsBetween(Window.unboundedPreceding, 3)
                        sf.col('inputData.trackingId').alias('trackingId'),
                        sf.col('sort'),
                        sf.col('filters'),
                        sf.col('inputData.filterInput').alias('filterInput'),
                        sf.col('algo'),
                        sf.col('pid'),
                        sf.col('outputData.rankList.ranks').alias('ranks'))

# display(_searches)

# COMMAND ----------

columnsToPartitionOn = ['vcid', 'flavour', 'checkIn', 'checkOut', 'pax', 'userId', 'email',
                        'trackingId', 'sort', 'filters', 'filterInput', 'algo']

searches = _searches.withColumn('lastSearchTime', sf.lag('currentSearchTime')
                                .over(Window.partitionBy(columnsToPartitionOn)
                                      .orderBy('currentSearchTime'))) \
    .withColumn('isNewSession', is_new_session(sf.col('currentSearchTime'),
                                               sf.col('lastSearchTime'),
                                               sf.col('pid'))) \
    .withColumn('sessionId', sf.sum('isNewSession')
                .over(Window.partitionBy(columnsToPartitionOn)
                      .orderBy('currentSearchTime'))) \
    .groupBy(columnsToPartitionOn + ['sessionId']) \
    .agg(sf.min('searchTime').alias('searchTime'),
         sf.collect_list('ranks').alias('hotels')) \
    .withColumn('hotels', flatten(sf.col('hotels')))

display(searches)

# COMMAND ----------

# DBTITLE 1,Details
hotels_e_detail = spark.table('hotels_e.detail')
                        AVG(peakBytes) peakBytesDaily,
                        AVG(peakLoad1) peakLoad1Daily,
                        AVG(peakLoad2) peakLoad2Daily,
                        AVG(peakLoad3) peakLoad3Daily,
                        AVG(peakLoad4) peakLoad4Daily,
                        AVG(peakLoad5) peakLoad5Daily,
                        AVG(peakLoadSecure) peakLoadSecureDaily
                    FROM featureeddf
                    group by ServerIP, SessionStartDay """
dailyStatisticdf = spark.sql(sqlStatement)

# lag features
# previous week average
# rolling mean features with 2-days/48-hours lag
rollingLags = [2]
lagColumns = [x for x in dailyStatisticdf.columns if 'Daily' in x]
windowSize = [7]
for w in windowSize:
    for i in rollingLags:
        wSpec = Window.partitionBy('d_ServerIP').orderBy('d_SessionStartDay').rowsBetween(-i - w, -i - 1)
        for j in lagColumns:
            dailyStatisticdf = dailyStatisticdf.withColumn(j + 'Lag' + str(i) + 'Win' + str(w),
                                                           F.avg(col(j)).over(wSpec))

selectColumn = ['d_ServerIP', 'd_SessionStartDay']
selectColumn.extend([x for x in dailyStatisticdf.columns if 'Lag' in x])
dailyStatisticdf = dailyStatisticdf.select(selectColumn)
dailyStatisticdf = dailyStatisticdf.withColumn(
    "d_key2",
    concat(dailyStatisticdf.d_ServerIP, lit("_"), dailyStatisticdf.d_SessionStartDay.cast('string')))
featureeddf = featureeddf.withColumn(
    "d_key2",
    concat(featureeddf.ServerIP, lit("_"), featureeddf.SessionStartDay.cast('string')))
dailyStatisticdf.cache()
# Single column join is much faster than two columns join
featureeddf = featureeddf.join(dailyStatisticdf,
                               (featureeddf.d_key2 == dailyStatisticdf.d_key2),
                               'outer')
featureeddf.show(1)
featureeddf.persist()
featureeddf = featureeddf.select([x for x in featureeddf.columns if 'd_' not in x])
priceDF = sqlContext.read.format('csv') \
    .options(header='true', inferSchema='true') \
    .load(INPUT_CSV)
print('File loaded')

priceDF = priceDF.select(
    F.col('code').alias('CODE'),
    F.col('Price').alias('PRICE'),
    F.col('Date').alias('DATE'))
priceDF = priceDF.withColumn('YEAR', F.year(F.to_date('DATE', 'MM-dd-yyyy')))
print('Dataset created')

# Daily returns
windowSpec = Window.partitionBy("CODE").orderBy(
    F.col('DATE').asc()).rowsBetween(-1, 0)
priceDF = priceDF.withColumn('AUX', F.sum("price").over(windowSpec))
priceDF = priceDF.withColumn("RETURNS",
                             (F.col("PRICE") - (F.col("AUX") - F.col("PRICE"))) /
                             (F.col("AUX") - F.col("PRICE"))).drop("AUX")
print('Daily returns')

# Cumulative returns
precioIniDF = priceDF.sort(F.desc('DATE')).groupBy('YEAR', 'CODE').agg(
    F.last('PRICE').alias('PRICE_START'))
priceDF = precioIniDF.join(priceDF, on=['YEAR', 'CODE'])
priceDF = priceDF.withColumn('CUMULATIVE_RETURNS',
                             ((F.col('PRICE') - F.col('PRICE_START')) /
                              (F.col('PRICE_START'))))
try:
    import pandas as pd
except ImportError as e:
    print("It wasn't possible to import packages. Error:", e)

path = '/home/rafaelrs/Documents/case_stone/data'
relative_read_path = f'{path}/transaction_history/'
relative_write_path = f'{path}/balance/'

balance_schema = T.StructType([
    T.StructField('account_id', T.StringType(), True),
    T.StructField('amount', T.LongType(), True),
    T.StructField('counterparty_account_id', T.StringType(), True),
    T.StructField('inserted_at', T.LongType(), True),
    T.StructField('transaction_id', T.StringType(), True)
])

try:
    balance = spark.read.schema(balance_schema).parquet(relative_read_path)
except Exception as e:
    print("Something went wrong with the transaction history path", e)

# Running balance per account, ordered by insertion time
by_account_id = Window.partitionBy(['account_id']).orderBy("inserted_at")
balance = balance.withColumn('balance', F.sum(F.col('amount')).over(by_account_id))
balance = balance.withColumn(
    'inserted_at',
    F.to_timestamp(F.from_unixtime(F.col('inserted_at') / 1000)))

balance.write.parquet(relative_write_path)
                      dfbaseData.loanApplicationId, \
                      dfbaseData.clientID, \
                      dfbaseData.mvpClientID, \
                      sf.regexp_replace(dfbaseData.createdDate, "T", " ").cast(TimestampType()).alias("idologyTimestampUTC"), \
                      dfbaseData.createdDatePT.alias("idologyTimestampPT"), \
                      dfrmi.id.alias("rmiId"), \
                      dfrmi.date_created.alias("riskTimestampPT"), \
                      dfrmsNS.score_type.alias("scoreType"), \
                      dfrmi.risk_model_scorex_plus_input, \
                      dfrmi.risk_model_vantage_input, \
                      dfrmi.risk_model_vantage3_input, \
                      dfbaseData.year, \
                      dfbaseData.month, \
                      dfbaseData.day)

dfranked = dfjoin.withColumn("row_num",
                             sf.row_number().over(Window.partitionBy("loanApplicationId", "idologyTimestampPT")
                                                  .orderBy(sf.desc("riskTimestampPT"), sf.desc("scoreType")))) \
    .where(sf.col("row_num") == 1) \
    .select(dfjoin["*"])

dfsplitCol.repartition(sf.col("year"), sf.col("month"), sf.col("day")) \
    .write.format("parquet") \
    .partitionBy("year", "month", "day") \
    .mode("overwrite") \
    .save(tgtfilePathResultCodes)

dfbaseData.repartition(sf.col("year"), sf.col("month"), sf.col("day")) \
    .write.format("parquet") \
    .partitionBy("year", "month", "day") \
    .mode("overwrite") \
    .save(tgtfilePathBaseData)
def unbounded_window(self):
    return Window.partitionBy('id') \
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
def calculate_time_window_indicator(df, column_name, time_window, group_column,
                                    filter_criteria, timestamp_column,
                                    agg_column='sum_column'):
    """
    Function to calculate count aggregation grouped by a column and using provided time window and filter criteria
    :param df: spark dataframe with data to aggregate
    :param column_name: new column name
    :param time_window: time window to aggregate values
    :param group_column: column used to group by
    :param filter_criteria: filter criteria to apply before aggregation
    :param timestamp_column: column with timestamp to use for aggregation
    :return df: spark dataframe with aggregated column
    """
    # Time window dict
    dict_time_window = {
        '60M': 60,
        '2H': 2,
        '3H': 3,
        '12H': 12,
        '24H': 24,
        '3D': 3,
        '7D': 7,
        '15D': 15,
        '30D': 30,
        '60D': 60,
        '90D': 90,
        '365D': 365,
    }
    # Check if time window is correct
    if time_window not in dict_time_window:
        print('ERR PARAM: invalid time_window')
        return df
    # Check if column_name is correct
    if column_name in df.columns:
        print('ERR PARAM: invalid column_name, already exists on dataframe')
        return df
    # Check if group_column is correct
    if group_column not in df.columns:
        if not all(item in df.columns for item in group_column):
            print('ERR PARAM: invalid group_column')
            return df
    # Lambdas to convert the window size to seconds
    days = lambda i: i * 86400
    hours = lambda i: i * 3600
    minutes = lambda i: i * 60
    # Create time window from the parameter
    if 'M' in time_window:
        window_parameter = minutes(dict_time_window[time_window])
    elif 'H' in time_window:
        window_parameter = hours(dict_time_window[time_window])
    elif 'D' in time_window:
        window_parameter = days(dict_time_window[time_window])
    # Define time window with partition
    windowSpec = Window.partitionBy(group_column).orderBy(
        F.col(timestamp_column).cast('long')).rangeBetween(
            -window_parameter, 0)
    # Add temp column used for aggregation of rows that meet the criteria
    if agg_column == 'sum_column':
        df = df.withColumn(agg_column, F.when((filter_criteria), 1).otherwise(0))
    else:
        df = df.withColumn(
            agg_column,
            F.when((filter_criteria), df[agg_column]).otherwise(0))
    # Calculate rolling aggregation on the provided window
    df = df.withColumn(column_name, F.sum(agg_column).over(windowSpec))
    # Subtract one when the current row itself matched, so the count excludes it
    if agg_column == 'sum_column':
        df = df.withColumn(
            column_name,
            F.when((df.sum_column == 1) & (df[column_name] > 0),
                   df[column_name] - 1).otherwise(df[column_name]))
    # Drop temporary columns
    df = df.drop(*['sum_column'])
    return df
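# A minimal, hypothetical usage sketch (the account/amount column names are illustrative, not from
# the original project): count, per account, the transactions above 100 seen in the trailing 24
# hours before each row.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [('a1', '2021-01-01 10:00:00', 150.0),
     ('a1', '2021-01-01 12:00:00', 50.0),
     ('a1', '2021-01-02 09:00:00', 200.0)],
    ['account_id', 'event_ts', 'amount']
).withColumn('event_ts', F.to_timestamp('event_ts'))

events = calculate_time_window_indicator(
    events, column_name='big_tx_24h', time_window='24H',
    group_column='account_id', filter_criteria=(F.col('amount') > 100),
    timestamp_column='event_ts')
events.show()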
def unpartitioned_window(self):
    return Window.partitionBy()
def with_window_column(df):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import percent_rank
    windowSpec = Window.partitionBy(df['id']).orderBy(df['forecast'])
    return df.withColumn("r", percent_rank().over(windowSpec))
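# A minimal, self-contained usage sketch (toy 'id'/'forecast' values, assumed for illustration):
# the added column 'r' holds the percent_rank of 'forecast' within each 'id' partition.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(1, 10.0), (1, 20.0), (1, 30.0), (2, 5.0), (2, 15.0)],
    ["id", "forecast"])
with_window_column(toy).show()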
# Reading dataframes from MongoDB
df = spark.read.format("mongo").load()
df.printSchema()

# Change the tx_time data type to a timestamp
df = df.withColumn("tx_time", df.tx_time.cast("timestamp"))

# Calculate a moving average
from pyspark.sql.window import Window
from pyspark.sql import functions as F

movAvg = df.withColumn(
    "movingAverage",
    F.avg("price").over(
        Window.partitionBy("company_symbol").rowsBetween(-1, 1)))
movAvg.show()

# Saving dataframes to MongoDB
movAvg.write.format("mongo").option("replaceDocument", "true").mode("append").save()

# Reading dataframes from an aggregation pipeline in MongoDB
pipeline = "[{'$group': {_id:'$company_name', 'maxprice': {$max:'$price'}}},{$sort:{'maxprice':-1}}]"
aggPipelineDF = spark.read.format("mongo").option("pipeline", pipeline).load()
aggPipelineDF.show()

# Using Spark SQL with MongoDB
movAvg.createOrReplaceTempView("avgs")
sqlDF = spark.sql("SELECT * FROM avgs WHERE movingAverage > 43.0")
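# A minimal, self-contained sketch of the same 3-row centered moving average on an in-memory
# dataframe (toy symbols/prices, assumed for illustration); an explicit orderBy is added here so
# the window frame is deterministic.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
ticks = spark.createDataFrame(
    [("ABC", 1, 10.0), ("ABC", 2, 12.0), ("ABC", 3, 14.0), ("XYZ", 1, 40.0), ("XYZ", 2, 44.0)],
    ["company_symbol", "tick", "price"])
ticks.withColumn(
    "movingAverage",
    F.avg("price").over(Window.partitionBy("company_symbol").orderBy("tick").rowsBetween(-1, 1))
).show()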
def unbounded_window(self):
    return Window.partitionBy('id') \
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing).orderBy('v')
def chain_delay_feature_engineering(airline_df):
    '''Takes airline df with created columns CRS_DEP_TIME_UTC, CRS_DEP_MINUS_TWO_FIFTEEN_UTC and returns
    new airline df with 5 added columns:
        dep_time_diff_one_flight_before: time between departure of current flight and previous flight (in seconds)
        dep_time_diff_two_flights_before: time between departure of current flight and flight two previous (in seconds)
        delay_one_before: was flight before delayed? (binary)
        delay_two_before: was flight two before delayed? (binary)
        PREVIOUS_FLIGHT_DELAYED_FOR_MODELS: If previous flight is at least 2 hours 15 minutes prior (8100 seconds),
        was it delayed? If less than 2:15, was flight 2 before delayed? (binary)'''

    airline_df.createOrReplaceTempView("airlines_temp_view")

    # Store new df with limited number of ordered columns that we can use to window
    airlines_aircraft_tracking = airline_df[[
        "tail_num", "fl_date", "origin_city_name", "dest_city_name",
        "dep_del15", "crs_dep_time_utc", "crs_dep_minus_two_fifteen_utc",
        "crs_arr_time_utc"
    ]].orderBy("tail_num", "fl_date", "crs_dep_time_utc")

    # This section is related to windowing so that we can pull information from the previous flight and
    # the flight 2 before the current flight. Windowing will only pull for the same tail number.
    w = Window.partitionBy("tail_num").orderBy("crs_dep_time_utc")
    diff = col("crs_dep_time_utc").cast("long") - lag("crs_dep_time_utc", 1).over(w).cast("long")
    diff2 = col("crs_dep_time_utc").cast("long") - lag("crs_dep_time_utc", 2).over(w).cast("long")
    delay_one_before = lag("dep_del15", 1).over(w)
    delay_two_before = lag("dep_del15", 2).over(w)
    arr_time_one_before = col("crs_dep_time_utc").cast("long") - lag(
        "crs_arr_time_utc", 1).over(w).cast("long")
    arr_time_two_before = col("crs_dep_time_utc").cast("long") - lag(
        "crs_arr_time_utc", 2).over(w).cast("long")

    airlines_aircraft_tracking_diff = airlines_aircraft_tracking.withColumn("dep_time_diff_one_flight_before", diff)\
        .withColumn("dep_time_diff_two_flights_before", diff2)\
        .withColumn("delay_one_before", delay_one_before)\
        .withColumn("delay_two_before", delay_two_before)\
        .withColumn("arr_time_one_before", arr_time_one_before)\
        .withColumn("arr_time_two_before", arr_time_two_before)

    def chain_delay_analysis(crs_dep_time_utc, dep_time_diff_one_flight_before,
                             dep_time_diff_two_flights_before, delay_one_before,
                             delay_two_before, arr_time_one_before,
                             arr_time_two_before):
        '''Takes info on flight before: departure time difference, whether flight was delayed and returns 1 if
        flight before was delayed AND outside of 2:15 from current flight. If inside of 2:15 looks at flight
        2 before and returns 1 if that one was delayed, 0 if not.
        If scheduled arrival of previous flight is greater than 5 hours, or flight 2 before greater than 7 hours,
        before the current flight, we mark it as 0.'''
        try:
            if dep_time_diff_one_flight_before >= 8100:
                if arr_time_one_before <= 18000:
                    return delay_one_before
                else:
                    return int(0)
            else:
                if arr_time_two_before <= 25200:
                    return delay_two_before
                else:
                    return int(0)
        except:
            return int(0)

    chain_delay_analysis_udf = f.udf(chain_delay_analysis)

    airlines_aircraft_tracking_diff_for_join = airlines_aircraft_tracking_diff.withColumn(
        "PREVIOUS_FLIGHT_DELAYED_FOR_MODELS",
        chain_delay_analysis_udf('crs_dep_time_utc',
                                 'dep_time_diff_one_flight_before',
                                 'dep_time_diff_two_flights_before',
                                 'delay_one_before', 'delay_two_before',
                                 'arr_time_one_before', 'arr_time_two_before'))

    airline_df_with_id = airline_df.withColumn("id", monotonically_increasing_id())

    # Join chain delay analysis back to full airline data
    join_columns = [
        "tail_num", "fl_date", "origin_city_name", "dest_city_name",
        "crs_dep_time_utc"
    ]
    airlines_chain_delays = airline_df_with_id.alias("a").join(airlines_aircraft_tracking_diff_for_join.alias("j"), join_columns, 'left_outer') \
        .select('a.year', 'a.quarter', 'a.month', 'a.day_of_week', 'a.fl_date', 'a.op_unique_carrier', 'a.tail_num',
                'a.origin_airport_id', 'a.origin', 'a.origin_city_name', 'a.dest_airport_id', 'a.dest', 'a.dest_city_name',
                'a.crs_dep_time', 'a.dep_time', 'a.dep_delay', 'a.dep_del15', 'a.cancelled', 'a.diverted', 'a.distance',
                'a.distance_group', 'a.short_dest_city_name', 'a.short_orig_city_name', 'a.carrier_delay', 'a.weather_delay',
                'a.nas_delay', 'a.security_delay', 'a.late_aircraft_delay', 'a.taxi_out', 'a.dest_timezone', 'a.origin_timezone',
                'a.truncated_crs_dep_time_utc', 'a.truncated_crs_dep_minus_three_utc', 'a.crs_dep_time_utc',
                'a.crs_dep_minus_two_fifteen_utc', 'a.Holiday', 'a.id', 'j.dep_time_diff_one_flight_before',
                'j.dep_time_diff_two_flights_before', 'j.delay_one_before', 'j.delay_two_before',
                'j.PREVIOUS_FLIGHT_DELAYED_FOR_MODELS')

    # Drop duplicates created during join.
    airlines_chain_delays_no_dups = airlines_chain_delays.dropDuplicates(['id'])

    return airlines_chain_delays_no_dups
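# A minimal, self-contained sketch (toy rows, not from the original dataset) of the lag-over-a-tail_num
# window technique used above: the gap between a flight's scheduled departure and the previous departure
# of the same aircraft, plus whether that previous flight was delayed.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
flights = spark.createDataFrame(
    [("N1", "2019-01-01 08:00:00", 0), ("N1", "2019-01-01 12:30:00", 1), ("N1", "2019-01-01 15:00:00", 0)],
    ["tail_num", "crs_dep_time_utc", "dep_del15"]
).withColumn("crs_dep_time_utc", F.to_timestamp("crs_dep_time_utc"))

w = Window.partitionBy("tail_num").orderBy("crs_dep_time_utc")
flights.withColumn(
    "dep_time_diff_one_flight_before",
    F.col("crs_dep_time_utc").cast("long") - F.lag("crs_dep_time_utc", 1).over(w).cast("long")
).withColumn("delay_one_before", F.lag("dep_del15", 1).over(w)).show()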
# Join of returns and volatility
rentaVolatilDF = rentabilidadDF.join(volatilidadDF, on=['YEAR', 'CODE'])

# Final table
TotalDF = TotalDF.join(rentaVolatilDF, on=['YEAR', 'CODE'])

# RANKINGs
auxColumns_cat = [
    'PSDA__EARNINGS_PER_SHARE_DILUTED', 'PSDA__EBITDA_PER_SHARE',
    'PSDA__TOTAL_DEBT_PER_SHARE', 'CSR__NET_MARGIN', 'IS__REVENUE',
    'IS__NET_INCOME', 'IS__COST_OF_GOODS_SOLD', 'BS__TOTAL_EQUITY',
    'IS__EBITDA', 'BS__TOTAL_ASSETS', 'BS__LONG_TERM_DEBT',
    'BS__TOTAL_CURRENT_LIABILITIES'
]
wY = Window.partitionBy('YEAR')
for col in auxColumns_cat:
    TotalDF = TotalDF.withColumn(col + '_RANKING',
                                 F.row_number().over(wY.orderBy(F.col(col))))

# Add the label (next period's return)
TotalDF = TotalDF.withColumn(
    "TARGET",
    F.lead(F.col("RETURNS"), 1).over(Window.partitionBy(F.col("CODE")).orderBy(F.col("YEAR"))))

# Write the final file (CSV)
TotalDF.repartition('YEAR', 'CODE').write.csv(BUCKET + OUTPUT_FILE, header=True)
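# A minimal, self-contained sketch (hypothetical tickers and values, not from the original dataset) of the
# two window patterns above: a per-YEAR ranking with row_number(), and a next-period label with lead().
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(2019, "AAA", 0.10), (2019, "BBB", 0.25), (2020, "AAA", -0.05), (2020, "BBB", 0.15)],
    ["YEAR", "CODE", "RETURNS"])

toy = toy.withColumn("RETURNS_RANKING",
                     F.row_number().over(Window.partitionBy("YEAR").orderBy(F.col("RETURNS"))))
toy = toy.withColumn("TARGET",
                     F.lead("RETURNS", 1).over(Window.partitionBy("CODE").orderBy("YEAR")))
toy.show()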
print("OMO's Zip & Borough list,")
boro_zip_df = nyc_omo_df \
    .select("Boro", nyc_omo_df["Zip"].cast(IntegerType())) \
    .groupBy("Boro") \
    .agg({"Zip": "collect_set"}) \
    .withColumnRenamed("collect_set(Zip)", "ZipList") \
    .withColumn("ZipCount", F.size("ZipList"))

boro_zip_df \
    .select("Boro", "ZipCount", "ZipList") \
    .show(5)

# Window functions
window_spec = Window.partitionBy("OMOCreateDate")
omo_daily_freq = nyc_omo_df \
    .withColumn("OMODailyFreq", F.count("OMOID").over(window_spec))

print("# of records in window'ed OMO dataframe = " + str(omo_daily_freq.count()))
omo_daily_freq.show(5)

omo_daily_freq.select("OMOCreateDate", "OMODailyFreq") \
    .distinct() \
    .show(5)

omo_daily_freq \
    .repartition(5) \
    .write \
    .mode("overwrite") \
    .parquet("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/nyc_omo_data")
# COMMAND ----------

identity_columns = [index_column_name, delta_column_name]

nested_columns = eval(str(dbutils.widgets.get("nested_columns")))
print("Nested Columns:", nested_columns)

# COMMAND ----------

for nested_column in nested_columns:
    max_records_on_array = (newDf.select(
        size(nested_column).alias(nested_column + "len")).groupby().max(
            nested_column + "len").collect()[0]["max({})".format(nested_column + "len")])

    nestedwindowSpec = Window.partitionBy("ROW_ID").orderBy(
        monotonically_increasing_id())

    nested_parquet_filepath = (mount + basepath + zone + "/" + contry_name +
                               "/" + source_name + "/" + object_name + "_" +
                               nested_column + "/" + year + "/" + month + "/" +
                               day + "/" + object_name + "_" + nested_column +
                               "_" + filename_timestamp + ".parquet")

    if max_records_on_array > 0:
        result = denormalizer(newDf, nested_column, identity_columns)
        result = result.withColumn("ITEM_ID",
                                   sqlfn.row_number().over(nestedwindowSpec))
        # added to remove any structypes
        result = flattenDataframe(result)
        result.write.format("parquet").save(nested_parquet_filepath)

        newDf = newDf.withColumn(
            nested_column,
            sha2(newDf[nested_column].cast(StringType()), 512))
    else:
def unbounded_window(self):
    return (Window.partitionBy("id").rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing).orderBy("v"))
def get_similar_artists(top_artist_df, artist_relation_df, similar_artist_limit):
    """ Get artists similar to top artists.

        Args:
            top_artist_df: Dataframe containing top artists listened to by users
            artist_relation_df: Dataframe containing artists and similar artists.
                                For columns refer to artist_relation_schema in
                                listenbrainz_spark/schema.py.
            similar_artist_limit (int): number of similar artists to calculate

        Returns:
            similar_artist_df (dataframe): Top Z artists similar to top artists where
                                           Z = SIMILAR_ARTISTS_LIMIT.
    """
    condition = [top_artist_df.top_artist_credit_id == artist_relation_df.id_0]

    df1 = top_artist_df.join(artist_relation_df, condition, 'inner') \
                       .select(col('id_0').alias('top_artist_credit_id'),
                               col('name_0').alias('top_artist_name'),
                               col('id_1').alias('similar_artist_credit_id'),
                               col('name_1').alias('similar_artist_name'),
                               'score',
                               'user_name')

    condition = [top_artist_df.top_artist_credit_id == artist_relation_df.id_1]

    df2 = top_artist_df.join(artist_relation_df, condition, 'inner') \
                       .select(col('id_1').alias('top_artist_credit_id'),
                               col('name_1').alias('top_artist_name'),
                               col('id_0').alias('similar_artist_credit_id'),
                               col('name_0').alias('similar_artist_name'),
                               'score',
                               'user_name')

    df = df1.union(df2)

    window = Window.partitionBy('top_artist_credit_id', 'user_name')\
                   .orderBy(col('score').desc())

    similar_artist_df_html = df.withColumn('rank', row_number().over(window)) \
                               .where(col('rank') <= similar_artist_limit)\
                               .select('top_artist_credit_id',
                                       'top_artist_name',
                                       'similar_artist_credit_id',
                                       'similar_artist_name',
                                       'user_name')

    similar_artist_df_html = filter_top_artists_from_similar_artists(
        similar_artist_df_html, top_artist_df)

    # Two or more artists can have the same similar artist(s), leading to non-unique recordings,
    # therefore we keep only the distinct similar artists.
    similar_artist_df = similar_artist_df_html.select('similar_artist_credit_id',
                                                      'similar_artist_name',
                                                      'user_name') \
                                              .distinct()

    if _is_empty_dataframe(similar_artist_df):
        logger.error('Similar artists not generated.', exc_info=True)
        raise SimilarArtistNotFetchedException(
            'Artists missing from artist relation')

    return similar_artist_df, similar_artist_df_html
def ordered_window(self):
    return Window.partitionBy("id").orderBy("v")
train.select(['user_id']).distinct().count()  # 5298
val_new.select(['user_id']).distinct().count()  # 1751
test_new.select(['user_id']).distinct().count()  # 1636
# There are some users in val_add and test_add but not in val_new and test_new,
# therefore 8774 > 5298 + 1751 + 1636 = 8685

# Example of writing x% data
train_add_test.write.parquet("train01.parquet")
val_new.write.parquet("val01.parquet")
test_new.write.parquet("test01.parquet")

# Create the true rank list (example of 1% data)
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

window = Window.partitionBy(val['user_id']).orderBy(val['rating'].desc())
val_true_order = val.select('*', rank().over(window).alias('rank'))
val_true_list = val_true_order.select(
    'user_id', 'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
val_true_list.write.parquet("val01_true_list.parquet")

window = Window.partitionBy(test['user_id']).orderBy(test['rating'].desc())
test_true_order = test.select('*', rank().over(window).alias('rank'))
test_true_list = test_true_order.select(
    'user_id', 'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
test_true_list.write.parquet("test01_true_list.parquet")
def sliding_row_window(self):
    return Window.partitionBy("id").orderBy("v").rowsBetween(-2, 1)
def process_log_data(spark, input_data, output_data):
    """
    Load the Log & Song datasets from AWS S3 into spark,
    build two dimension tables (users, time) and a fact table (songplays),
    and write the tables back to S3 in parquet format.

    Arguments:
        spark: Spark Session
        input_data: Input data path
        output_data: Output data path

    Return:
        None
    """
    # build SQLContext
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    # get filepath to log data file
    log_data = os.path.join(input_data, "log_data/*/*/*.json")

    # get filepath to song data file
    song_data = os.path.join(input_data, "song_data/*/*/*/*.json")

    # read log data file
    df = spark.read.format("json").load(log_data)

    # filter by actions for song plays and keep only 'NextSong' events
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    # the users table needs special treatment to keep the last level value in the log data;
    # a Window is used to capture the last level value for each user
    win1 = Window.partitionBy('userId').orderBy('ts') \
                 .rangeBetween(Window.unboundedPreceding, 0)

    df_users = df.filter(col('userId').isNotNull()) \
        .select('userId', 'firstName', 'lastName', 'gender', 'level', 'ts') \
        .withColumn('tsSeq', count('ts').over(win1))

    win2 = Window.partitionBy("userId")
    df_users = df_users.withColumn("tsSeqMax", F.max("tsSeq").over(win2))

    users_table = df_users.filter((df_users.tsSeq == df_users.tsSeqMax)) \
        .groupBy('userId') \
        .agg(F.max("firstName"), F.max("lastName"), F.max("gender"), F.max("level")) \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('max(firstName)', 'first_name') \
        .withColumnRenamed('max(lastName)', 'last_name') \
        .withColumnRenamed('max(gender)', 'gender') \
        .withColumnRenamed('max(level)', 'level')

    # make sure users table has desired schema
    users_schema = StructType([
        StructField('user_id', StringType(), False),
        StructField('first_name', StringType(), False),
        StructField('last_name', StringType(), False),
        StructField('gender', StringType(), True),
        StructField('level', StringType(), False)
    ])
    users_table = sqlContext.createDataFrame(users_table.rdd, users_schema)

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'), mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1e3), TimestampType())
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(x / 1e3), DateType())
    df = df.withColumn("ts_date", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.groupBy('timestamp') \
        .agg({'timestamp': 'max'}) \
        .withColumn('hour', F.hour('max(timestamp)')) \
        .withColumn('day', F.dayofmonth('max(timestamp)')) \
        .withColumn('week', F.weekofyear('max(timestamp)')) \
        .withColumn('month', F.month('max(timestamp)')) \
        .withColumn('year', F.year('max(timestamp)')) \
        .withColumn('weekday', F.date_format('max(timestamp)', 'E')) \
        .withColumnRenamed('max(timestamp)', 'start_time') \
        .drop('timestamp') \
        .orderBy(F.asc('start_time'))

    # make sure time table has desired schema
    time_schema = StructType([
        StructField('start_time', TimestampType(), False),
        StructField('hour', IntegerType(), False),
        StructField('day', IntegerType(), False),
        StructField('week', IntegerType(), False),
        StructField('month', IntegerType(), False),
        StructField('year', IntegerType(), False),
        StructField('weekday', StringType(), False)
    ])
    time_table = sqlContext.createDataFrame(time_table.rdd, time_schema)

    # write time table to parquet files partitioned by year and month
    time_table.repartition(col('year'), col('month')) \
        .write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'time'), mode='overwrite')

    # read in song data to use for songplays table
    song_df = spark.read.format("json").load(song_data)

    t_log = df.alias('t_log')
    t_song = song_df.alias('t_song')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = t_log.join(t_song,
                                 (col('t_log.song') == col('t_song.title')) &
                                 (col('t_log.artist') == col('t_song.artist_name')),
                                 how='inner') \
        .select([col('t_log.timestamp'),
                 col('t_log.userId'),
                 col('t_log.level'),
                 col('t_song.song_id'),
                 col('t_song.artist_id'),
                 col('t_log.sessionId'),
                 col('t_log.location'),
                 col('t_log.userAgent')]) \
        .withColumnRenamed('timestamp', 'start_time') \
        .withColumnRenamed('userId', 'user_id') \
        .withColumnRenamed('sessionId', 'session_id') \
        .withColumnRenamed('userAgent', 'user_agent') \
        .orderBy(F.asc('start_time'))

    # make sure songplays has desired schema
    songplays_schema = StructType([
        StructField('start_time', TimestampType(), False),
        StructField('user_id', StringType(), False),
        StructField('level', StringType(), True),
        StructField('song_id', StringType(), False),
        StructField('artist_id', StringType(), False),
        StructField('session_id', LongType(), False),
        StructField('location', StringType(), True),
        StructField('user_agent', StringType(), False)
    ])
    songplays_table = sqlContext.createDataFrame(songplays_table.rdd, songplays_schema)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.withColumn('year', F.year('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .repartition(col('year'), col('month')) \
        .write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'songplays'), mode='overwrite')
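# A minimal, self-contained sketch (toy rows, not from the original dataset) of the window trick used above
# for the users table: keep, for each user, the 'level' value from that user's most recent log event.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
logs = spark.createDataFrame(
    [("u1", "free", 100), ("u1", "paid", 200), ("u2", "free", 50)],
    ["userId", "level", "ts"])

win1 = Window.partitionBy("userId").orderBy("ts").rangeBetween(Window.unboundedPreceding, 0)
win2 = Window.partitionBy("userId")
latest = (logs.withColumn("tsSeq", F.count("ts").over(win1))
              .withColumn("tsSeqMax", F.max("tsSeq").over(win2))
              .filter(F.col("tsSeq") == F.col("tsSeqMax"))
              .select("userId", "level"))
latest.show()  # u1 -> paid, u2 -> free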
def growing_range_window(self):
    return Window.partitionBy("id").orderBy("v").rangeBetween(
        Window.unboundedPreceding, 4)
inner_join.createOrReplaceTempView('inner_join')

(training, tune, test) = check.randomSplit([0.6, 0.2, 0.2], seed=0)
training.createOrReplaceTempView('training')
tune.createOrReplaceTempView('tune')
test.createOrReplaceTempView('test')

train_set = spark.sql(
    'Select training.user_id, inner_join.rating, inner_join.book_id from inner_join, training where inner_join.user_id = training.user_id'
)
val_set = spark.sql(
    'Select tune.user_id, inner_join.rating, inner_join.book_id from inner_join, tune where inner_join.user_id = tune.user_id'
)
test_set = spark.sql(
    'Select test.user_id, inner_join.rating, inner_join.book_id from inner_join, test where inner_join.user_id = test.user_id'
)

from pyspark.sql.window import Window
window = Window.partitionBy('user_id').orderBy('book_id')

import pyspark.sql.functions as F
val = (val_set.select("user_id", "book_id", "rating",
                      F.row_number().over(window).alias("row_number")))
test = (test_set.select("user_id", "book_id", "rating",
                        F.row_number().over(window).alias("row_number")))

val.createOrReplaceTempView('val')
test.createOrReplaceTempView('test')

val_data = spark.sql('Select * from val where row_number%2 = 0')
test_final = spark.sql('Select * from test where row_number%2 = 0')
val_train = spark.sql('Select * from val where row_number%2 = 1')
test_train = spark.sql('Select * from test where row_number%2 = 1')

val_train = val_train.drop('row_number')
val_data = val_data.drop('row_number')
test_final = test_final.drop('row_number')

train_set = train_set.unionByName(val_train)
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema=schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(file_path, schema=schema_def)
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parse additional data (adding to the given data: group name, node kind,
    # acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
             .withColumn("node_kind", nodef(pdf.node_id)) \
             .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
             .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
             .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",
                         lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None, None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)
        result = results[0]

        # 1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        # 2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group),
                                          getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        # 3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        # 4 join data frames
        idf = rdf.unionAll(hdf)

        # 5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        # 6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"),
                               aggres.delta_plus, aggres.delta_minus)
    else:
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header='true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)