def pivot(trades, prices):
    """
    Pivot and fill the columns on the event id so that each row contains a
    column for each id + column combination, where the value is the most
    recent non-null value for that id. For example, given the above input
    tables, the expected output is:

    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | id|    timestamp|  bid|  ask|price|quantity|10_bid|10_ask|10_price|10_quantity|20_bid|20_ask|20_price|20_quantity|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | 10|1546300799000| 37.5|37.51| null|    null|  37.5| 37.51|    null|       null|  null|  null|    null|       null|
    | 10|1546300800000| null| null| 37.5|   100.0|  37.5| 37.51|    37.5|      100.0|  null|  null|    null|       null|
    | 10|1546300801000| null| null|37.51|   100.0|  37.5| 37.51|   37.51|      100.0|  null|  null|    null|       null|
    | 10|1546300802000|37.51|37.52| null|    null| 37.51| 37.52|   37.51|      100.0|  null|  null|    null|       null|
    | 20|1546300804000| null| null|12.67|   300.0| 37.51| 37.52|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300806000| 37.5|37.51| null|    null|  37.5| 37.51|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300807000| null| null| 37.5|   200.0|  37.5| 37.51|    37.5|      200.0|  null|  null|   12.67|      300.0|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+

    :param trades: DataFrame of trade events
    :param prices: DataFrame of price events
    :return: A DataFrame of the combined events and pivoted columns.
    """
    df = fill(trades, prices) \
        .groupBy('id', 'timestamp', 'bid', 'ask', 'price', 'quantity') \
        .pivot('id') \
        .agg(func.last('bid').alias('bid'),
             func.last('ask').alias('ask'),
             func.last('price').alias('price'),
             func.last('quantity').alias('quantity')) \
        .orderBy("timestamp")
    return df
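# A minimal, self-contained sketch (hypothetical data; assumes a running
# SparkSession) of the groupBy().pivot().agg(F.last(...)) pattern the
# function above builds on: one output column per distinct id, with F.last
# choosing the value that lands in each cell.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [(10, 1, 37.50), (10, 2, 37.51), (20, 2, 12.67)],
    ["id", "timestamp", "price"],
)
events.groupBy("timestamp").pivot("id").agg(F.last("price")) \
    .orderBy("timestamp").show()
# +---------+-----+-----+
# |timestamp|   10|   20|
# +---------+-----+-----+
# |        1| 37.5| null|
# |        2|37.51|12.67|
# +---------+-----+-----+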
def set_d1_state_vector(self):
    """
    Add 1-D representations of the distributed state to self.df.

    :return: None; self.df is updated in place.
    """
    d1_state_vector_udf = F.udf(d1_state_vector,
                                ArrayType(ArrayType(FloatType())))
    # Range window of +/- OD_time_frame seconds around each event,
    # per id and per calendar day.
    window = Window.partitionBy(['id', F.to_date('avg_ts')]) \
        .orderBy('ts').rangeBetween(-OD_time_frame, OD_time_frame)
    self.df = self.df \
        .withColumn('ts', F.col('avg_ts').cast('long')) \
        .withColumn('first_ts', F.first('avg_ts').over(window)) \
        .withColumn('last_ts', F.last('avg_ts').over(window)) \
        .withColumn('first_lat_idx', F.first('lat_idx').over(window)) \
        .withColumn('last_lat_idx', F.last('lat_idx').over(window)) \
        .withColumn('first_lon_idx', F.first('lon_idx').over(window)) \
        .withColumn('last_lon_idx', F.last('lon_idx').over(window)) \
        .withColumn('d1_states1',
                    d1_state_vector_udf(F.col('first_lon_idx'),
                                        F.col('first_lat_idx'),
                                        F.lit(width), F.lit(lon_cells),
                                        F.lit(lat_cells))) \
        .withColumn('d1_states2',
                    d1_state_vector_udf(F.col('last_lon_idx'),
                                        F.col('last_lat_idx'),
                                        F.lit(width), F.lit(lon_cells),
                                        F.lit(lat_cells)))
def __generate_target_fill(self, df: DataFrame, partition_cols: List[str],
                           ts_col: str, target_col: str) -> DataFrame:
    """
    Create columns for the previous and next value of a specific target column

    :param df: input DataFrame
    :param partition_cols: partition column names
    :param ts_col: timestamp column name
    :param target_col: target column name
    :return: DataFrame with previous_<target_col>, next_null_<target_col>
        and next_<target_col> columns added
    """
    return (df.withColumn(
        f"previous_{target_col}",
        last(df[target_col], ignorenulls=True).over(
            Window.partitionBy(
                *partition_cols).orderBy(ts_col).rowsBetween(
                    Window.unboundedPreceding, 0)),
    )
            # Handle if subsequent value is null
            .withColumn(
                f"next_null_{target_col}",
                last(df[target_col], ignorenulls=True).over(
                    Window.partitionBy(*partition_cols).orderBy(
                        col(ts_col).desc()).rowsBetween(
                            Window.unboundedPreceding, 0)),
            ).withColumn(
                f"next_{target_col}",
                lead(df[target_col]).over(
                    Window.partitionBy(*partition_cols).orderBy(ts_col)),
            ))
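# The same previous/next pattern in isolation (hypothetical data): a forward
# fill over the ascending window yields the previous non-null value, and the
# identical fill over the descending window yields the next non-null value
# (current row included, as in next_null_ above).
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1, None), ("a", 2, 5.0), ("a", 3, None), ("a", 4, 7.0)],
    ["key", "ts", "val"],
)
fwd = Window.partitionBy("key").orderBy("ts") \
    .rowsBetween(Window.unboundedPreceding, 0)
bwd = Window.partitionBy("key").orderBy(F.col("ts").desc()) \
    .rowsBetween(Window.unboundedPreceding, 0)
df.withColumn("previous_val", F.last("val", ignorenulls=True).over(fwd)) \
  .withColumn("next_val", F.last("val", ignorenulls=True).over(bwd)) \
  .orderBy("ts").show()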
def bi_fluent_join(pyData): df = spark.createDataFrame(pyData) level1 = df \ .groupBy(df.grp) \ .agg( func.mean(df.C).alias("mean_of_C"), func.max(df.D).alias("max_of_D")) level2 = df \ .groupBy(df.grp, df.subgrp) \ .agg( func.variance(df.E).alias("var_of_E"), ((func.sum(df.E * df.E)- func.sum(df.E) * func.avg(df.E)) /(func.count(df.E)-1)).alias("var_of_E2") ) level3 = level2 \ .join(level1, "grp") \ .groupBy(level1.grp) \ .agg( func.last(level1.mean_of_C).alias("mean_of_C"), func.last(level1.max_of_D).alias("max_of_D"), func.avg(level2.var_of_E).alias("avg_var_of_E"), func.avg(level2.var_of_E2).alias("avg_var_of_E2") ) \ .orderBy(level1.grp) # .collect() return level3, None
def test_first_last_ignorenulls(self): from pyspark.sql import functions df = self.spark.range(0, 100) df2 = df.select(functions.when(df.id % 3 == 0, None).otherwise(df.id).alias("id")) df3 = df2.select(functions.first(df2.id, False).alias('a'), functions.first(df2.id, True).alias('b'), functions.last(df2.id, False).alias('c'), functions.last(df2.id, True).alias('d')) self.assertEqual([Row(a=None, b=1, c=None, d=98)], df3.collect())
def transform_data_with_udf(clickstream_data, purchase_data): window1 = Window.partitionBy('userId').orderBy('eventTime') window2 = Window.orderBy('sessionId') clickstream_data = (clickstream_data.withColumn( 'appOpenFlag', app_open_flag_udf(clickstream_data['eventType'])).withColumn( 'sessionId', sum(col('appOpenFlag')).over(window1)).withColumn( 'attr', attributes_udf( clickstream_data['eventType'], clickstream_data['attributes'])).withColumn( 'campaign_id', when( get_json_object('attr', '$.campaign_id').isNotNull(), get_json_object('attr', '$.campaign_id')).otherwise(None) ).withColumn( 'channel_id', when( get_json_object('attr', '$.channel_id').isNotNull(), get_json_object( 'attr', '$.channel_id')).otherwise(None)).withColumn( 'purchase_id', when( get_json_object( 'attr', '$.purchase_id').isNotNull(), get_json_object( 'attr', '$.purchase_id')).otherwise(None)). withColumn( 'campaignId', last(col('campaign_id'), ignorenulls=True).over( window2.rowsBetween( Window.unboundedPreceding, 0))).withColumn( 'channelId', last(col('channel_id'), ignorenulls=True).over( window2.rowsBetween( Window.unboundedPreceding, 0)))) target_df = clickstream_data.join( purchase_data, clickstream_data['purchase_id'] == purchase_data['purchaseId'], JOIN_TYPE.LEFT) return target_df.select(col('purchaseId'), col('purchaseTime'), col('billingCost'), col('isConfirmed'), col('sessionId'), col('campaignId'), col('channelId'))
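# The sessionization trick above, shown standalone with hypothetical data:
# a running sum over an "app open" flag stamps every event with the id of
# the session it belongs to.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
clicks = spark.createDataFrame(
    [("u1", 1, "app_open"), ("u1", 2, "click"),
     ("u1", 3, "app_open"), ("u1", 4, "purchase")],
    ["userId", "eventTime", "eventType"],
)
w = Window.partitionBy("userId").orderBy("eventTime")
clicks.withColumn(
    "sessionId",
    F.sum(F.when(F.col("eventType") == "app_open", 1).otherwise(0)).over(w),
).show()
# Events 1-2 get sessionId 1; events 3-4 get sessionId 2.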
def _get_distances(self, prob_df: DataFrame, df_cdf_0: DataFrame, df_cdf_1: DataFrame) -> DataFrame: window_fill = Window.orderBy(self.probability_col).rowsBetween( Window.unboundedPreceding, Window.currentRow) df_ks = prob_df.select(self.probability_col) \ .join(df_cdf_0, on=self.probability_col, how='left') \ .join(df_cdf_1, on=self.probability_col, how='left') \ .withColumn(self.EMPIRICAL_CDF_NEG, F.last(self.EMPIRICAL_CDF_NEG, ignorenulls=True).over(window_fill)) \ .withColumn(self.EMPIRICAL_CDF_POS, F.last(self.EMPIRICAL_CDF_POS, ignorenulls=True).over(window_fill)) \ .fillna(0) \ .withColumn(self.DISTANCE, F.abs(F.col(self.EMPIRICAL_CDF_NEG) - F.col(self.EMPIRICAL_CDF_POS))) return df_ks
def __getLastRightRow(self, left_ts_col, right_cols, sequence_col,
                      tsPartitionVal):
    """Get the last right value of each right column (incl. the right
    timestamp) for each self.ts_col value.

    self.ts_col, the combined timestamp column of both the left and right
    dataframes, is dropped at the end since it is no longer used in
    subsequent methods.
    """
    from functools import reduce

    ptntl_sort_keys = [self.ts_col, sequence_col]
    sort_keys = [
        f.col(col_name) for col_name in ptntl_sort_keys if col_name != ''
    ]
    sort_keys.append('rec_ind')

    window_spec = Window.partitionBy(
        self.partitionCols).orderBy(sort_keys).rowsBetween(
            Window.unboundedPreceding, Window.currentRow)

    # Split the condition, as we want different columns in the reduce if we
    # are implementing the skew AS OF join.
    if tsPartitionVal is None:
        df = reduce(
            lambda df, idx: df.withColumn(
                right_cols[idx],
                f.last(right_cols[idx], True).over(window_spec)),
            range(len(right_cols)), self.df)
    else:
        df = reduce(
            lambda df, idx: df.withColumn(
                right_cols[idx],
                f.last(right_cols[idx], True).over(window_spec)).
            withColumn('non_null_ct' + right_cols[idx],
                       f.count(right_cols[idx]).over(window_spec)),
            range(len(right_cols)), self.df)

    df = df.filter(f.col(left_ts_col).isNotNull()) \
        .drop(self.ts_col).drop('rec_ind')

    # Remove the null_ct stats used to record missing values in the
    # partitioned AS OF join, warning when a lookback window was empty.
    if tsPartitionVal is not None:
        for column in df.columns:
            if column.startswith("non_null"):
                any_blank_vals = (df.agg({column: 'min'}).collect()[0][0] == 0)
                newCol = column.replace("non_null_ct", "")
                if any_blank_vals:
                    print("Column " + newCol + " had no values within the "
                          "lookback window. Consider using a larger window "
                          "to avoid missing values. If this is the first "
                          "record in the data frame, this warning can be "
                          "ignored.")
                df = df.drop(column)

    return TSDF(df, left_ts_col, self.partitionCols)
def reduce_to_ohlc(time, rdd): row_rdd = rdd.map(lambda row: row.split(',')) \ .filter(lambda row: len(row) == 3) \ .map(lambda row: Row( symbol=row[0], tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'), price=float(row[1]) )) sql_context = get_sql_context_instance(rdd.context) data = sql_context.createDataFrame(row_rdd) data.cache() data.write.format('org.apache.spark.sql.cassandra') \ .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \ .mode('append') \ .save() ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \ .orderBy('tx_time') \ .groupBy('symbol', 'batch_time') \ .agg( F.first(data.price).alias('open'), F.max(data.price).alias('high'), F.min(data.price).alias('low'), F.last(data.price).alias('close'), F.first(data.tx_time).alias('open_time'), F.last(data.tx_time).alias('close_time') ) existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \ .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \ .load() \ .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time') merged_ohlc = ohlc.join(existing_ohlc, (ohlc.symbol == existing_ohlc.symbol) & (ohlc.batch_time == existing_ohlc.batch_time), 'left' ) merged_ohlc = merged_ohlc.select( ohlc.symbol.alias('symbol'), ohlc.batch_time.alias('batch_time'), F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'), F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'), F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'), F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'), F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'), F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high') ) merged_ohlc.write.format('org.apache.spark.sql.cassandra') \ .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \ .mode('append') \ .save()
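# Caveat: Spark does not guarantee that F.first/F.last honor an orderBy
# applied before groupBy, so 'open'/'close' above can be nondeterministic.
# A sketch of an order-safe alternative using struct min/max (hypothetical
# data; structs compare field by field, so tx_time drives the ordering):
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
ticks = spark.createDataFrame(
    [("AAPL", 1, 10.0), ("AAPL", 2, 12.0), ("AAPL", 3, 11.0)],
    ["symbol", "tx_time", "price"],
)
ticks.groupBy("symbol").agg(
    F.min(F.struct("tx_time", "price"))["price"].alias("open"),
    F.max("price").alias("high"),
    F.min("price").alias("low"),
    F.max(F.struct("tx_time", "price"))["price"].alias("close"),
).show()
# open=10.0 (earliest tick), close=11.0 (latest tick)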
def pivot(trades, prices):
    """
    Pivot and fill the columns on the event id so that each row contains a
    column for each id + column combination, where the value is the most
    recent non-null value for that id. For example, given the above input
    tables, the expected output is:

    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | id|    timestamp|  bid|  ask|price|quantity|10_bid|10_ask|10_price|10_quantity|20_bid|20_ask|20_price|20_quantity|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+
    | 10|1546300799000| 37.5|37.51| null|    null|  37.5| 37.51|    null|       null|  null|  null|    null|       null|
    | 10|1546300800000| null| null| 37.5|   100.0|  37.5| 37.51|    37.5|      100.0|  null|  null|    null|       null|
    | 10|1546300801000| null| null|37.51|   100.0|  37.5| 37.51|   37.51|      100.0|  null|  null|    null|       null|
    | 10|1546300802000|37.51|37.52| null|    null| 37.51| 37.52|   37.51|      100.0|  null|  null|    null|       null|
    | 20|1546300804000| null| null|12.67|   300.0| 37.51| 37.52|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300806000| 37.5|37.51| null|    null|  37.5| 37.51|   37.51|      100.0|  null|  null|   12.67|      300.0|
    | 10|1546300807000| null| null| 37.5|   200.0|  37.5| 37.51|    37.5|      200.0|  null|  null|   12.67|      300.0|
    +---+-------------+-----+-----+-----+--------+------+------+--------+-----------+------+------+--------+-----------+

    :param trades: DataFrame of trade events
    :param prices: DataFrame of price events
    :return: A DataFrame of the combined events and pivoted columns.
    """
    trades_prices = trades \
        .join(prices, ['id', 'timestamp'], 'outer') \
        .select('id', 'timestamp', 'bid', 'ask', 'price', 'quantity') \
        .orderBy(asc("timestamp"))

    unique_ids = trades_prices.select('id').distinct().collect()
    result = None
    for row in unique_ids:
        id_str = str(row.id)
        # The helper column below is a constant, so partitioning on it puts
        # every row into a single window for the forward fill.
        ffill_window = Window.partitionBy(id_str + "_id") \
            .orderBy('timestamp') \
            .rowsBetween(Window.unboundedPreceding, 0)
        dyn_columns = trades_prices \
            .withColumn("bid", when(col("id") != row.id, lit(None))
                        .otherwise(col('bid')).cast(T.DoubleType())) \
            .withColumn("ask", when(col("id") != row.id, lit(None))
                        .otherwise(col('ask')).cast(T.DoubleType())) \
            .withColumn("price", when(col("id") != row.id, lit(None))
                        .otherwise(col('price')).cast(T.DoubleType())) \
            .withColumn("quantity", when(col("id") != row.id, lit(None))
                        .otherwise(col('quantity')).cast(T.DoubleType())) \
            .withColumn(id_str + "_id", lit(row.id)) \
            .withColumn(id_str + "_bid",
                        func.last('bid', True).over(ffill_window)) \
            .withColumn(id_str + "_ask",
                        func.last('ask', True).over(ffill_window)) \
            .withColumn(id_str + "_price",
                        func.last('price', True).over(ffill_window)) \
            .withColumn(id_str + "_quantity",
                        func.last('quantity', True).over(ffill_window)) \
            .drop('bid', 'ask', 'price', 'quantity', id_str + "_id")
        if result is None:
            result = trades_prices.join(dyn_columns, ['id', 'timestamp'],
                                        how='outer')
        else:
            result = result.join(dyn_columns, ['id', 'timestamp'],
                                 how='outer')
    return result.orderBy('timestamp')
def get_editor_features(tag_user_histories): tag_features = tag_user_histories.groupby('event_user_id') \ .agg(f.last('num_groups').alias('num_groups'), f.countDistinct('page_id').alias('num_articles'), f.count('revision_id').alias('num_edits'), f.last('num_blocks_historical').alias('num_past_blocks'), f.last('num_curr_blocks').alias('num_curr_blocks'), f.sum(col("is_revert_bool")).alias('num_reverts_by_others'), f.sum(col('is_reverted_bool')).alias('num_reverts_of_others'), f.last('days_since_registration').alias('time_since_registration'), udf_page_talk_ratio(f.collect_list('page_namespace')).alias('talk_article_ratio'), udf_contribution_frac(f.collect_list('page_id')).alias('contribution_frac_entropy') ) return tag_features
def bi_fluent_window(pyData): df = spark.createDataFrame(pyData) window = Window \ .partitionBy(df.grp, df.subgrp) \ .orderBy(df.id) df = df \ .orderBy(df.grp, df.subgrp, df.id)\ .withColumn("sub_var_of_E", func.variance(df.E)\ .over(window)) df = df \ .groupBy(df.grp, df.subgrp)\ .agg(func.sum(df.C).alias("sub_sum_of_C"), func.count(df.C).alias("sub_count"), func.max(df.D).alias("sub_max_of_D"), func.last(df.sub_var_of_E).alias("sub_var_of_E1"), func.variance(df.E).alias("sub_var_of_E2")) df \ .groupBy(df.grp)\ .agg( (func.sum(df.sub_sum_of_C)/ func.sum(df.sub_count)).alias("mean_of_C"), func.max(df.sub_max_of_D).alias("max_of_D"), func.avg(df.sub_var_of_E1).alias("avg_var_of_E1"), func.avg(df.sub_var_of_E2).alias("avg_var_of_E2"))\ .orderBy(df.grp)\ .collect()
def _denoise_marker_column(self, window, start=True) -> Column: """Return marker column with noises removed and forward/backwards filled. Parameters ---------- window: pyspark.sql.Window Resembles a window specification according to groupby/order. start: bool, optional Indicate fill order. If True, forward fill for start markers. If False, backwards fill for end markers. Returns ------- denoised: pyspark.sql.column.Column Return spark column expression with denoised values. """ marker_column = F.col(self.marker_column) # remove noise values valid_values = [self.marker_start, self.marker_end] mask_no_noise = marker_column.isin(valid_values) denoised = F.when(mask_no_noise, marker_column) # forward fill with remaining start/end markers if start: ffill_window = window.rowsBetween(Window.unboundedPreceding, 0) fill = F.last(denoised, ignorenulls=True).over(ffill_window) else: bfill_window = window.rowsBetween(0, Window.unboundedFollowing) fill = F.first(denoised, ignorenulls=True).over(bfill_window) return fill
def task_4(df):
    window_duplicate_remove = Window.partitionBy('video_id').orderBy(
        col('views').desc())
    task_4_res_df = df.select('channel_title', 'video_id', 'views', 'trending_date',
                              F.row_number().over(window_duplicate_remove).alias('rn')). \
        filter(col('rn') == 1).groupBy("channel_title").agg(
        F.first('trending_date').alias("start_date"),
        F.last('trending_date').alias("end_date"),
        F.sum('views').alias("total_views"),
        F.collect_list('video_id').alias("video_id_list"),
        F.collect_list('views').alias("views_list")
    ).orderBy(col('total_views'), ascending=False).limit(20)

    query_result = task_4_res_df.collect()
    json_result = {
        "channels": [{
            "channel_name": row.channel_title,
            "start_date": row.start_date,
            "end_date": row.end_date,
            "total_views": row.total_views,
            "videos_views": [{
                "video_id": row.video_id_list[i],
                "views": row.views_list[i]
            } for i in range(len(row.video_id_list))]
        } for row in query_result]
    }
    return json_result
def metricDaysPerWeekPerProfileDay(data, needed_dimension_variables, feature_col, sampling_multiplier, days=7, include_day_of_week=False): all_user_days = data.select("id").distinct().crossJoin( data.select("date").distinct()) data = data.filter( col(feature_col) > 0).select(["id", "date", "bucket", feature_col] + needed_dimension_variables).distinct() data = data.alias("intermediate_table") all_user_days = all_user_days.alias("all_user_days") # Augment activity table to include non-active days intermediate_table2 = data.join( all_user_days, ['id', 'date'], 'outer').withColumn( "n_", F.coalesce("intermediate_table." + feature_col, lit(0))).drop(feature_col).withColumnRenamed( "n_", feature_col) if include_day_of_week: intermediate_table2 = intermediate_table2.withColumn( feature_col + "_weekend", F.when( F.date_format('date', 'u').cast(IntegerType()) >= 6, col(feature_col)).otherwise(0)).withColumn( feature_col + "_weekday", F.when( F.date_format('date', 'u').cast(IntegerType()) <= 5, col(feature_col)).otherwise(0)) # Calculate active days per week for each profile-day windowSpec = Window.partitionBy([intermediate_table2.id]).orderBy( intermediate_table2.date).rowsBetween(1 - days, 0) intermediate_table3 = intermediate_table2.withColumn( "n_", F.sum(intermediate_table2[feature_col]).over(windowSpec)).drop( feature_col).withColumnRenamed("n_", feature_col) if include_day_of_week: intermediate_table3 = intermediate_table3.withColumn( "n_", F.sum(intermediate_table2[feature_col + "_weekend"]).over( windowSpec)).drop(feature_col + "_weekend").withColumnRenamed( "n_", feature_col + "_weekend") intermediate_table3 = intermediate_table3.withColumn( "n_", F.sum(intermediate_table2[feature_col + "_weekday"]).over( windowSpec)).drop(feature_col + "_weekday").withColumnRenamed( "n_", feature_col + "_weekday") for v in needed_dimension_variables: intermediate_table3 = intermediate_table3.withColumn( v, F.last(v, True).over(windowSpec)) return intermediate_table3
def task_2(df_proper_date, categories_map): window_group_WoY = Window.partitionBy("WoY").orderBy( col('total_views').desc()) task_2_res_df = df_proper_date.withColumn('WoY', ( F.weekofyear(df_proper_date.dateframe) + F.year(df_proper_date.dateframe) * 53)). \ groupBy('WoY', "category_id", "video_id").agg( F.count('video_id').alias("count"), F.first('views').alias("start_views"), F.last('views').alias("end_views"), ).filter(col('count') > 1). \ withColumn("diff", (col("end_views") - col("start_views"))). \ groupBy("WoY", "category_id").agg( F.sum('diff').alias("total_views"), F.collect_list('video_id').alias("video_id_list"), ).withColumn("rank", F.row_number().over(window_group_WoY)).filter(col("rank") == 1) query_result = task_2_res_df.collect() json_result = { "weeks": [{ "start_date": date_by_week_n_from(int(row.WoY / 53), row.WoY % 53), "end_date": date_by_week_n_to(int(row.WoY / 53), row.WoY % 53), "category_id": row.category_id, "category_name": categories_map[row.category_id], "number_of_videos": len(row.video_id_list), "total_views": row.total_views, "video_ids": row.video_id_list } for row in query_result] } return json_result
def cond_fluent_window(pyData): dfData = spark.createDataFrame(pyData) dfData = dfData \ .withColumn("cond", func.when(dfData.E < 0, -1).otherwise( +1)) dfData = dfData \ .orderBy(dfData.grp, dfData.subgrp, dfData.cond, dfData.id) window = Window \ .partitionBy(dfData.grp, dfData.subgrp, dfData.cond) \ .orderBy(dfData.id)\ .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) dfData = dfData \ .withColumn("cond_var_of_E_2_pre1", func.when(dfData.cond < 0, func.variance(dfData.E)\ .over(window))) dfData = dfData \ .groupBy(dfData.grp, dfData.subgrp, dfData.cond)\ .agg(func.sum(dfData.C).alias("sum_of_C_pre"), func.count(dfData.C).alias("count_of_C_pre"), func.max(dfData.D).alias("max_of_D_pre"), func.variance(func.when(dfData.E < 0, dfData.E)).alias("cond_var_of_E_1_pre"), func.last(dfData.cond_var_of_E_2_pre1).alias("cond_var_of_E_2_pre2")) dfData = dfData \ .groupBy(dfData.grp, dfData.subgrp)\ .agg((func.sum(dfData.sum_of_C_pre) \ / func.sum(dfData.count_of_C_pre)\ ).alias("mean_of_C"), func.max(dfData.max_of_D_pre).alias("max_of_D"), func.max(dfData.cond_var_of_E_1_pre).alias("cond_var_of_E_1"), func.max(dfData.cond_var_of_E_2_pre2).alias("cond_var_of_E_2"))\ .orderBy(dfData.grp, dfData.subgrp)\ .collect()
def forward_fill_dataframe(df, partition_cols, order_col, filling_cols):
    # A frame of "all preceding rows up to the current one" only yields a
    # deterministic forward fill when the window is ordered, so an explicit
    # ordering column is required here.
    forward_fill_window = Window.partitionBy(partition_cols) \
        .orderBy(order_col) \
        .rowsBetween(Window.unboundedPreceding, 0)
    for column in filling_cols:
        filled_column_values = F.last(
            df[column], ignorenulls=True).over(forward_fill_window)
        df = df.withColumn(column, filled_column_values)
    return df
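# Usage sketch for the helper above (hypothetical data): nulls inherit the
# most recent earlier value within their partition; rows with no earlier
# non-null value stay null.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
readings = spark.createDataFrame(
    [("s1", 1, 20.0), ("s1", 2, None), ("s1", 3, None), ("s2", 1, None)],
    ["sensor_id", "event_time", "temperature"],
)
forward_fill_dataframe(readings, ["sensor_id"], "event_time",
                       ["temperature"]).show()
# s1 rows 2 and 3 become 20.0; the s2 row remains null.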
def join_stress_streams(self, dataStream, propagation='forward'):
    """
    Full outer join this datastream with another datastream on user,
    timestamp, localtime and version, then forward-fill the data_quality
    column and keep only rows that have an ecg value.

    Args:
        dataStream (DataStream): other datastream to join with
        propagation (str): fill direction for data_quality (only 'forward'
            is currently implemented)

    Returns:
        DataStream: a new datastream object with blank metadata
    """
    combined_df = self._data.join(
        dataStream.data,
        on=['user', 'timestamp', 'localtime', 'version'],
        how='full').orderBy('timestamp')
    combined_filled = combined_df.withColumn(
        "data_quality",
        F.last('data_quality', True).over(
            Window.partitionBy('user').orderBy('timestamp').rowsBetween(
                Window.unboundedPreceding, 0)))
    combined_filled_filtered = combined_filled.filter(
        combined_filled.ecg.isNotNull())

    return DataStream(data=combined_filled_filtered, metadata=Metadata())
def ffill_windows(cls, df, time_col, columns_to_fill):
    """
    Forward-filling strategy. This strategy fills empty spots using the
    last known value of a column.
    """
    import sys

    from pyspark.sql import Window
    from pyspark.sql.functions import last

    # define the window (and order it by time)
    window = Window.orderBy(time_col)\
        .rowsBetween(-sys.maxsize, 0)

    # fill every column and replace columns
    for col_entry in columns_to_fill:
        col_name_to_fill = col_entry[0]
        col_name_new = col_entry[1]
        if col_name_new is None:
            col_name_new = col_name_to_fill

        df = df.withColumn(
            col_name_new,
            last(df[col_name_to_fill], ignorenulls=True).over(window))
    return df
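# Usage sketch for ffill_windows above (hypothetical class, DataFrame and
# column names): each entry in columns_to_fill pairs a source column with an
# output name, and None means fill the source column in place.
filled = SomeTimeSeriesClass.ffill_windows(
    df,
    time_col="timestamp",
    columns_to_fill=[("heart_rate", None), ("speed", "speed_filled")],
)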
def extract_itineraries(job): find_spark() from util import hdfs_fn from pyspark.sql.functions import collect_list, first, last def make_line(row): return '{} {} {} {} {}'.format( row.ItinID, row.FirstAirportID, ' '.join(map(str, row.OriginAirportIDs)), row.LastAirportID, row.LastAirportID) with build_spark() as spark: df = spark.read.csv(hdfs_fn(job, 'Coupon.csv'), header=True, inferSchema=True) col_names = [ 'ItinID', 'SeqNum', 'OriginAirportID', 'Origin', 'DestAirportID', 'Dest' ] df_network = df[col_names].repartition('ItinID').sort( ['ItinID', 'SeqNum']) itins = df_network.groupby(['ItinID']).agg( first('OriginAirportID').alias('FirstAirportID'), collect_list('OriginAirportID').alias('OriginAirportIDs'), last('DestAirportID').alias('LastAirportID')) itins.rdd.map(make_line).saveAsTextFile( hdfs_fn(job, 'hon_itineraries.txt'))
def _summary(self, name=None): """ Return a summarized representation. Parameters ---------- name : str name to use in the summary representation Returns ------- String with a summarized representation of the index """ head, tail, total_count = self._kdf._sdf.select( F.first(self._scol), F.last(self._scol), F.count(F.expr("*"))).first() if total_count > 0: index_summary = ", %s to %s" % (pprint_thing(head), pprint_thing(tail)) else: index_summary = "" if name is None: name = type(self).__name__ return "%s: %s entries%s" % (name, total_count, index_summary)
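# The underlying Spark idiom, standalone: aggregate expressions in a bare
# select() run over the whole DataFrame and come back as a single row.
# Without a sort, the order F.first/F.last observe is not guaranteed.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.range(1, 6)  # ids 1..5
head, tail, total = sdf.select(
    F.first("id"), F.last("id"), F.count(F.expr("*"))).first()
print(head, tail, total)  # e.g. 1 5 5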
def countTest(): countDf = spark.read.format('csv').option("header", "true").option( "inferSchema", "true").load("../../data/airlines.csv") print(countDf.count()) countDf.select(count('Code')).show() countDf.select(countDistinct('Code')).show() countDf.select(first('Code'), last('Code')).show()
def _generate_raw_iids_special(self, start_first: bool, add_negate_shift_col: bool, reverse=False) -> Column: """Create sequence of interval ids in increasing order regardless of their validity. Parameters ---------- start_first: bool Defines if the first start is used for intervals. add_negate_shift_col: bool True if the shift col have to be negated. reverse: bool, optional Define order by. Returns ------- raw_iids: pyspark.sql.column.Column """ marker_col = F.col(self.marker_column) window = self._window_groupby(reverse) # generate forward fill depending on interval if start_first: default = 0 forward_fill = F.when(marker_col == self.marker_start, 1) \ .when(marker_col == self.marker_end, 0) \ .otherwise(None) else: default = 1 forward_fill = F.when(marker_col == self.marker_end, 1) \ .when(marker_col == self.marker_start, 0) \ .otherwise(None) ff_window = window.rowsBetween(Window.unboundedPreceding, 0) forward_fill_col = F.last(forward_fill, ignorenulls=True).over( ff_window) # shifting marker_col forward shift_col = F.lag(forward_fill_col, default=default, count=1) \ .over(window) \ .cast("integer") # compare forward fill col and shifted forward fill col end_marker_null_col = F.when(shift_col == forward_fill_col, 0) \ .otherwise(forward_fill_col) if add_negate_shift_col: shift_col_negated = F.when(shift_col == 0, 1).otherwise(0) add_col = end_marker_null_col + shift_col_negated else: add_col = end_marker_null_col # build cum sum over window raw_iids = F.sum(add_col).over(window) return raw_iids
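# The core mechanics of the method above, shown standalone with hypothetical
# markers: forward-fill a start/end state, detect 0 -> 1 transitions with
# lag(), and cumulative-sum the transitions into interval ids. Each window
# expression is materialized before the next one uses it, since Spark does
# not allow window functions nested inside window functions.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "start"), (2, "noise"), (3, "end"), (4, "start"), (5, "end")],
    ["order", "marker"],
)
w = Window.orderBy("order")
df = df.withColumn(
    "state",
    F.last(
        F.when(F.col("marker") == "start", 1)
         .when(F.col("marker") == "end", 0),
        ignorenulls=True,
    ).over(w.rowsBetween(Window.unboundedPreceding, 0)),
)
df = df.withColumn(
    "is_new",
    F.when(F.lag("state", default=0).over(w) < F.col("state"), 1).otherwise(0),
)
df.withColumn("iid", F.sum("is_new").over(w)).show()
# Rows 1-3 fall into interval 1, rows 4-5 into interval 2.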
def get_last_user_event_value(
    target_column: str, user_column: str = "user_id"
) -> Column:
    # The window below is partitioned but not ordered, so which row "last"
    # refers to is not deterministic; see the ordered variant below.
    return F.last(F.col(target_column)).over(
        Window.partitionBy(user_column)
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )
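# As noted above, the unordered window makes "last" nondeterministic. A
# sketch of a deterministic variant, assuming a hypothetical event-time
# column to order by:
from pyspark.sql import Column, Window
from pyspark.sql import functions as F


def get_last_user_event_value_ordered(
    target_column: str,
    time_column: str = "event_time",
    user_column: str = "user_id",
) -> Column:
    # Ordering plus an unbounded frame pins "last" to the row with the
    # greatest event time in each user's partition.
    return F.last(F.col(target_column)).over(
        Window.partitionBy(user_column)
        .orderBy(time_column)
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )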
def __fill_nans(self, df): # part of the fix related to JIRA ARESPY-20 window = Window.partitionBy('timestamp').orderBy( 'timestamp').rowsBetween(-100000, 0) for column in df.columns: df = df.withColumn('filled_%s' % column, F.last(F.col(column)).over(window).isNotNull()) return df
def create_prediction_df(training_df, prediction_period, other_column=None):
    """
    :param training_df: -- dataframe: for training
    :param prediction_period: -- integer: number of periods to predict
    :param other_column: -- col column for prediction other than the overall rank
    :return: -- dataframe and corresponding dates
    """
    if other_column:
        last_date = training_df.select('Date').distinct().orderBy(
            'Date').select(last('Date')).collect()[0][0]
        last_id = training_df.select('id').distinct().orderBy('id').select(
            max('id')).collect()[0][0]
        date_rows = list(
            Row(
                float(last_id + i),
                date(last_date.year, last_date.month, 1) +
                timedelta(days=i * 31))
            for i in range(1, prediction_period + 1))
        date_df = spark.createDataFrame(date_rows, ['id', 'Date'])
        specialized_rows = training_df.select(other_column,
                                              other_column + '_idx').distinct()
        prediction_df = specialized_rows.join(date_df)
        assembler = VectorAssembler(inputCols=['id', other_column + '_idx'],
                                    outputCol='features')
        return assembler.transform(prediction_df)
    else:
        last_date = training_df.orderBy('Date').select(
            last('Date')).collect()[0][0]
        last_id = training_df.orderBy('id').select(
            max('id')).collect()[0][0]
        prediction_rows = list(
            Row(
                float(last_id + i),
                date(last_date.year, last_date.month, 1) +
                timedelta(days=i * 31))
            for i in range(1, prediction_period + 1))
        prediction_df = spark.createDataFrame(prediction_rows, ['id', 'Date'])
        return prediction_df
def metricRetention(data, needed_dimension_variables, feature_col, sampling_multiplier, activated=False): activity_data = data.filter(col(feature_col) > 0).select( ["id", "date", feature_col]).distinct() pcd_table = data.select(["date", "id", "bucket"] + needed_dimension_variables) windowSpec = Window.partitionBy([pcd_table.id] + needed_dimension_variables).orderBy( pcd_table.date).rowsBetween(0, 13) for v in needed_dimension_variables: pcd_table = pcd_table.withColumn(v, F.last(v, True).over(windowSpec)) pcd_table = pcd_table.filter(col("new_profile") == 1) if activated: pcd_table = pcd_table.alias("pcd_t").join( activity_data.alias("i_t"), (col('pcd_t.id') == col('i_t.id')) & (col('i_t.date') >= F.date_add(col('pcd_t.date'), 1)) & (col('i_t.date') <= F.date_add(col('pcd_t.date'), 6)), "inner").filter(col("i_t." + feature_col) > 0).dropDuplicates([ 'id' ]).select([ col('pcd_t.{}'.format(c)) for c in ['id', 'bucket', "date"] + needed_dimension_variables ]) intermediate_table3 = pcd_table.alias("pcd_t").join( activity_data.alias("i_t"), (col('pcd_t.id') == col('i_t.id')) & (col('i_t.date') >= F.date_add(col('pcd_t.date'), 7)) & (col('i_t.date') <= F.date_add(col('pcd_t.date'), 13)), "outer").select([ 'pcd_t.{}'.format(c) for c in ['id', 'date', 'bucket'] + needed_dimension_variables ] + [feature_col]).fillna(0, [feature_col]).groupBy([ 'pcd_t.{}'.format(c) for c in ['id', 'date', 'bucket'] + needed_dimension_variables ], ).agg(F.max(col(feature_col))).drop(feature_col).withColumnRenamed( "MAX({})".format(feature_col), feature_col).select([ col("pcd_t.{}".format(c)).alias(c) for c in ['id', 'bucket', 'date'] + needed_dimension_variables ] + [feature_col]) intermediate_table4 = intermediate_table3.groupBy( ["date", "bucket"] + needed_dimension_variables).mean(feature_col).withColumnRenamed( 'avg({})'.format(feature_col), feature_col) intermediate_table4_allbucket = intermediate_table3.groupBy( ["date"] + needed_dimension_variables).mean(feature_col).withColumnRenamed( 'avg({})'.format(feature_col), feature_col).withColumn('bucket', lit("ALL")) joined_intermediate = intermediate_table4.unionByName( intermediate_table4_allbucket) return joined_intermediate
def fill_activity_na(df): # define the window window = Window.orderBy('timestamp').rowsBetween(-20, 0) # define the forward-filled column filled_column = last(df['heart_rate'], ignorenulls=True).over(window) df = df.withColumn('heart_rate', filled_column) return df
def test_aggregator(self): df = self.df g = df.groupBy() self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0])) self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect()) from pyspark.sql import functions self.assertEqual((0, u'99'), tuple(g.agg(functions.first(df.key), functions.last(df.value)).first())) self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0]) self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
def last(self): """ Compute last of group values. See Also -------- databricks.koalas.Series.groupby databricks.koalas.DataFrame.groupby """ return self._reduce_for_stat_function( lambda col: F.last(col, ignorenulls=True), only_numeric=False)
def last(self): """ Compute last of group values. See Also -------- koalas.DataFrame.groupby """ return self._reduce_for_stat_function(lambda col: F.last(col, ignorenulls=True), only_numeric=False)
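# A usage sketch for the koalas groupby last() above (assuming a
# Spark-backed koalas session): with ignorenulls=True it skips missing
# values, matching pandas' GroupBy.last.
import databricks.koalas as ks

kdf = ks.DataFrame({"g": ["a", "a", "b"], "v": [1.0, None, 3.0]})
print(kdf.groupby("g")["v"].last())
# a    1.0
# b    3.0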
# COMMAND ---------- from pyspark.sql.functions import countDistinct df.select(countDistinct("StockCode")).show() # 4070 # COMMAND ---------- from pyspark.sql.functions import approx_count_distinct df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364 # COMMAND ---------- from pyspark.sql.functions import first, last df.select(first("StockCode"), last("StockCode")).show() # COMMAND ---------- from pyspark.sql.functions import min, max df.select(min("Quantity"), max("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import sum df.select(sum("Quantity")).show() # 5176450 # COMMAND ----------