def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter

    The splitter splits the data into sets by timestamps without stratification on either
    user or item. The ratios are applied on the timestamp column which is divided
    accordingly into several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g. the latest time in split[0] <= the earliest time in split[1]).
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
            seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)
    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop("rank")
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits
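# Usage sketch (illustrative, not from the original source): calling spark_timestamp_split
# on a small ratings DataFrame. The `spark` session, the sample rows, and the explicit
# column names are assumptions; the helper just exercises the splitter defined above.
def _example_spark_timestamp_split(spark):
    ratings = spark.createDataFrame(
        [(1, 10, 4.0, 1546300800.0), (1, 11, 3.0, 1546387200.0),
         (2, 10, 5.0, 1546473600.0), (2, 12, 2.0, 1546560000.0)],
        ["userID", "itemID", "rating", "timestamp"],
    )
    # 75/25 split on the timestamp column; earlier timestamps land in the first split.
    train, test = spark_timestamp_split(
        ratings, ratio=0.75, col_user="userID", col_item="itemID", col_timestamp="timestamp"
    )
    return train, test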
def topPlayersOVA(self, data: DataFrame):
    """
    This function finds the top 10 players for each position based on the OVA indicator
    :param data: A dataframe
    :return: A dataframe with the top 10 players for each position
    """
    try:
        df = data.select(F.col(Name), F.col(OVA),
                         F.col(Single_position).alias(Position),
                         F.row_number().over(Window.partitionBy(Single_position)
                                             .orderBy(F.col(OVA).desc())).alias(row_number)) \
            .filter(F.col(row_number).isin(top10)).drop(row_number)
        return df
    except Exception as ex:
        print(ex)
def to_friday(df, dt):
    '''Convert all days to Fridays

    Note that this is not forward mapping as Saturday and Sunday are mapped backward.
    This is fine since all the data are supposed to be in "business days".
    To be consistent over the weekends, use the timestamps.'''
    cols = df.columns
    # Convert all days to the Friday of the week
    df = df.withColumn('friday', F.date_add(F.date_trunc('week', dt), 4))
    # Keep only the last record in each week
    w = Window.partitionBy('friday').orderBy(F.col(dt).desc())
    df = df.withColumn('rn', F.row_number().over(w)).where(F.col('rn') == 1)
    df = df.drop(dt).withColumnRenamed('friday', dt)
    return df.select(*cols)
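# Usage sketch (illustrative, not from the original source): mapping a small daily
# price table onto week-ending Fridays. The `spark` session and column names are assumptions.
def _example_to_friday(spark):
    from pyspark.sql import functions as F

    daily = spark.createDataFrame(
        [("2024-01-03", 101.0), ("2024-01-04", 102.5), ("2024-01-06", 99.0)],
        ["date", "price"],
    ).withColumn("date", F.to_date("date"))
    # Each row is mapped to its week's Friday; only the latest record per week is kept.
    return to_friday(daily, "date")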
def create_race_dimention(cities_fact_df):
    try:
        # Select distinct race types
        race_dim_df = cities_fact_df.select("Race").distinct()
        # Assign a unique id to each race type as race_type_id
        race_window = Window.orderBy(f.col("Race"))
        race_dim_df = race_dim_df.withColumn("race_type_id",
                                             f.row_number().over(race_window))
        return race_dim_df
    except Exception as e:
        print(str(e))
def find_otp_bus_legs_actual_start_time(otp_legs_df, clean_bus_trips_df):
    w = Window.partitionBy(
        ['date', 'user_trip_id', 'itinerary_id', 'route', 'from_stop_id']).orderBy(['timediff'])
    return otp_legs_df \
        .withColumn('stopPointId', F.col('from_stop_id')) \
        .join(clean_bus_trips_df, ['date', 'route', 'stopPointId'], how='inner') \
        .na.drop(subset=['timestamp']) \
        .withColumn('timediff', F.abs(F.unix_timestamp(F.col('timestamp')) -
                                      F.unix_timestamp(F.col('otp_start_time')))) \
        .drop('otp_duration') \
        .withColumn('rn', F.row_number().over(w)) \
        .where(F.col('rn') == 1) \
        .select(['date', 'user_trip_id', 'itinerary_id', 'leg_id', 'route', 'busCode', 'tripNum',
                 'from_stop_id', 'otp_start_time', 'timestamp', 'to_stop_id', 'otp_end_time']) \
        .withColumnRenamed('timestamp', 'from_timestamp')
def get_source_upvotes(memes_source, upvotes_column, source):
    """Get upvotes_centile and save upvotes to hdfs (for hour predictions)"""
    df = memes_source
    df.select(upvotes_column).write.format("com.databricks.spark.csv").mode(
        "overwrite").save('hdfs:///upvotes/{0}'.format(source))

    n = df.count()
    w = Window.orderBy(upvotes_column)
    df = df.withColumn('upvotes_centile',
                       functions.round(row_number().over(w) / n, 5))
    df = df.select('id', functions.col(upvotes_column).alias('upvotes'),
                   'upvotes_centile', 'timestamp')
    return df
def top_10_addons_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the number of users in the past week who have used the top N addons,
        broken down by country.

        Parameters:
        data - The main ping server.
        date - The day for which you want the top N addons.
        topN - The number of addons to get.
        period - Number of days used to calculate the metric.
        country_list - A list of country names as strings.

        Returns:
        Dataframe containing the number of users using each of the addons.
        submission_date_s3, country, addon_id, name, percent_of_active_users
    """
    addon_filter = (~col('addon.is_system')) & (~col('addon.foreign_install')) & \
        (~col('addon.addon_id').isin(NON_MOZ_TP)) & \
        (~col('addon.addon_id').like('%@mozilla%')) & \
        (~col('addon.addon_id').like('%@shield.mozilla%')) & \
        (~col('addon.addon_id').like('%' + UNIFIED_SEARCH_STR + '%'))

    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all.filter((col('submission_date_s3') > begin) &
                          (col('submission_date_s3') <= date))\
        .groupBy('country')\
        .agg(lit(date).alias('submission_date_s3'),
             F.countDistinct('client_id').alias('wau'))

    counts = data_all.select('submission_date_s3', 'country', 'client_id',
                             F.explode('active_addons').alias('addon'))\
        .filter((col('submission_date_s3') > begin) & (col('submission_date_s3') <= date))\
        .filter(addon_filter)\
        .select('country', 'client_id', 'addon.addon_id', 'addon.name')\
        .distinct()\
        .groupBy('country', 'addon_id')\
        .agg(F.count('*').alias('number_of_users'), F.last('name').alias('name'))\
        .select('*', lit(date).alias('submission_date_s3'),
                lit(begin).alias('start_date'),
                F.row_number().over(Window.partitionBy('country')
                                    .orderBy(desc('number_of_users'))
                                    .rowsBetween(Window.unboundedPreceding, Window.currentRow))
                .alias('rank'))\
        .filter(col('rank') <= topN)

    return counts.join(F.broadcast(wau), on=['country'], how='left')\
        .select(lit(date).alias('submission_date_s3'), 'country', 'addon_id',
                col('name').alias('addon_name'),
                (100.0 * col('number_of_users') / col('wau')).alias('pct_with_addon'))
def mergeCdcData(mergedDfTarget, primaryKeyArray, orderByField):
    """
    Combine data and get the latest row out for a primary key based on the ordering key
    :param mergedDfTarget: dataframe with old and new records
    :param primaryKeyArray: database primary key column(s)
    :param orderByField: ordering column, e.g. a last-updated timestamp
    :return: dataframe - with the merged records
    """
    win = Window().partitionBy(primaryKeyArray).orderBy(F.col(orderByField).desc())
    mergedTargetRanked = mergedDfTarget.withColumn("order_inc_rank",
                                                   F.row_number().over(win))
    mergeTargetRankedWithoutDuplicates = mergedTargetRanked.dropDuplicates()
    rowsWithLatestUpdatesDf = mergeTargetRankedWithoutDuplicates.filter(
        mergeTargetRankedWithoutDuplicates.order_inc_rank == 1)
    rowsWithLatestUpdatesDf = rowsWithLatestUpdatesDf.drop("order_inc_rank")
    return rowsWithLatestUpdatesDf
def get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df):
    """ Get data that has been submitted to ListenBrainz but is missing from MusicBrainz.

        Args:
            partial_listens_df (dataframe): dataframe of listens.
            msid_mbid_mapping_df (dataframe): msid->mbid mapping. For columns refer to
                msid_mbid_mapping_schema in listenbrainz_spark/schema.py

        Returns:
            missing_musicbrainz_data_itr (iterator): Data missing from MusicBrainz.
    """
    condition = [
        partial_listens_df.track_name_matchable == msid_mbid_mapping_df.msb_recording_name_matchable,
        partial_listens_df.artist_name_matchable == msid_mbid_mapping_df.msb_artist_credit_name_matchable
    ]

    df = partial_listens_df.join(msid_mbid_mapping_df, condition, 'left') \
                           .select('artist_msid',
                                   'artist_name',
                                   'listened_at',
                                   'recording_msid',
                                   'release_msid',
                                   'release_name',
                                   'track_name',
                                   'user_name') \
                           .where(col('msb_recording_name_matchable').isNull() &
                                  col('msb_artist_credit_name_matchable').isNull())

    current_app.logger.info('Number of (artist, recording) pairs missing from mapping: {}'.format(df.count()))
    window = Window.partitionBy('user_name').orderBy(col('listened_at').desc())

    # Limit listens to 200 for each user so that messages don't drop.
    # Also, we don't want to overwhelm users with the data that they
    # have submitted to LB and should consider submitting to MB.
    # The data will be sorted on "listened_at".
    missing_musicbrainz_data_itr = df.groupBy('artist_msid',
                                              'artist_name',
                                              'recording_msid',
                                              'release_msid',
                                              'release_name',
                                              'track_name',
                                              'user_name') \
                                     .agg(func.max('listened_at').alias('listened_at')) \
                                     .withColumn('rank', row_number().over(window)) \
                                     .where(col('rank') <= 200) \
                                     .toLocalIterator()

    return missing_musicbrainz_data_itr
def run_ALS(path_prefix, path_results, path_train, nr_predictions=300):
    behaviors_presel_df, behaviors_df, preselection_df = read_data(path_prefix)
    _, behaviors_train_df, preselection_train_df = read_data(path_train)

    max_index_behaviors_df = behaviors_df.groupby().max(
        'index').first().asDict()['max(index)']
    behaviors_train_df = behaviors_train_df.withColumnRenamed('index', 'i2')\
        .withColumn('index', F.col('i2') + max_index_behaviors_df)\
        .drop('i2')
    behaviors_train_df = behaviors_train_df.cache()
    behaviors_df_train = behaviors_train_df.select(
        'index', 'item_id', 'rating', 'user').union(
            behaviors_df.select('index', 'item_id', 'rating', 'user'))

    model, als = train_ALS_model(behaviors_df_train, rank=100, alpha=10,
                                 reg_par=0.1, max_iter=10)

    index_subset = behaviors_df_train.filter(
        F.col('index') <= max_index_behaviors_df)
    index_recs = model.recommendForUserSubset(index_subset, nr_predictions)
    indexitem_recs_df = index_recs.select(
        'index', F.explode(index_recs.recommendations))
    indexitem_df = indexitem_recs_df.select(
        'index',
        F.col('col.item_id').alias('item_id'),
        F.col('col.rating').alias('prediction'))
    indexitem_predictions_df = indexitem_df.join(
        preselection_df.select('item_id', 'index', 'item').distinct(),
        ['item_id', 'index'], how='inner')
    indexitem_predictions_df = indexitem_predictions_df.withColumn(
        'rank',
        F.row_number().over(
            Window.partitionBy('index').orderBy(F.desc('prediction'))))
    indexitem_predictions_df = indexitem_predictions_df.withColumn('dic', F.create_map(['item', 'rank']))\
        .select('index', 'dic')
    indexitem_predictions_df = indexitem_predictions_df.groupby('index').agg(
        F.collect_list('dic').alias('dic_list'))

    result = get_pred_rank(indexitem_predictions_df, behaviors_presel_df)
    result.repartition(1).write.json(path_results, mode='overwrite')
    return result, model
def prepare_client_rows(main_summary):
    """Coalesce client pings into a DataFrame that contains one row for each client."""
    in_columns = {
        "client_id",
        "timestamp",
        "scalar_parent_browser_engagement_total_uri_count",
        "scalar_parent_browser_engagement_unique_domains_count",
        "subsession_length"
    }
    out_columns = (set(main_summary.columns) | {
        "usage_seconds",
        "total_uri_count",
        "unique_domains_count_per_profile"
    })
    assert (in_columns <= set(main_summary.columns))

    # Get the newest ping per client and append to original dataframe
    window_spec = (Window.partitionBy(F.col('client_id'))
                   .orderBy(F.col('timestamp').desc()))
    newest_per_client = (main_summary
                         .withColumn('client_rank', F.row_number().over(window_spec))
                         .where(F.col("client_rank") == 1))

    # Compute per client aggregates lost during newest client computation
    select_expr = utils.build_col_expr({
        "client_id": None,
        "total_uri_count": (F.coalesce(
            "scalar_parent_browser_engagement_total_uri_count", F.lit(0))),
        "unique_domains_count": (F.coalesce(
            "scalar_parent_browser_engagement_unique_domains_count", F.lit(0))),
        # Clamp broken subsession values to [0, MAX_SUBSESSION_LENGTH].
        "subsession_length": (F.when(
            F.col('subsession_length') > MAX_SUBSESSION_LENGTH,
            MAX_SUBSESSION_LENGTH).otherwise(
                F.when(F.col('subsession_length') < 0, 0)
                .otherwise(F.col('subsession_length'))))
    })

    per_client_aggregates = (main_summary
                             .select(*select_expr)
                             .groupby('client_id')
                             .agg(F.sum('subsession_length').alias('usage_seconds'),
                                  F.sum('total_uri_count').alias('total_uri_count'),
                                  F.avg('unique_domains_count').alias(
                                      'unique_domains_count_per_profile')))

    # Join the two intermediate datasets
    return (newest_per_client
            .join(per_client_aggregates, 'client_id', 'inner')
            .select(*out_columns))
def evaluate_batch(self, test_ratings, weighted_ratings, users):
    total_users = test_ratings.select("user_id").distinct().count()
    weighted_ratings.cache()
    ratingsPredicted = test_ratings.select(
        "user_id", "business_id",
        col("stars").cast("float")).join(weighted_ratings, ["user_id", "business_id"])
    # predictedVsActual.orderBy("user_id").show()
    # print(ratingsPredicted.groupBy("user_id", "business_id")
    #       .agg(F.count("*").alias("count")).where(col("count") > 1).count())
    total = test_ratings.count()
    predicted = ratingsPredicted.count()
    print("Total {}".format(total), " Predicted {}".format(predicted))
    coverage = predicted * 100 / total

    evaluator = RegressionEvaluator(metricName="mae", labelCol="stars",
                                    predictionCol="pred_stars")
    mae = evaluator.evaluate(ratingsPredicted)

    precision = {}
    precision_liked = {}
    for i in [5, 10]:
        wi = Window.partitionBy("user_id").orderBy(col("pred_stars").desc())
        topi = weighted_ratings.withColumn(
            "rn", F.row_number().over(wi)).where(col("rn") <= i).drop("rn")
        # topi.show(100)
        topratings = topi.join(test_ratings.select("user_id", "business_id", "stars"),
                               ["user_id", "business_id"]) \
            .join(users.select("user_id", "average_stars"), "user_id")
        topratings = topratings.withColumn("liked", col("stars") >= col("average_stars")) \
            .withColumn("pred_liked", col("pred_stars") >= col("average_stars"))
        topratings.cache()
        hits = topratings.count()
        hits_liked = topratings.where(col("liked") == col("pred_liked")).count()
        precision[i] = hits / (i * total_users)
        precision_liked[i] = hits_liked / (i * total_users)
        print("Hits : {} ".format(hits), " Hits Liked: {} ".format(hits_liked))
        topratings.unpersist()

    weighted_ratings.unpersist()
    return mae, coverage, precision, precision_liked
def analysis(folder_name):
    tweets = spark.read.load("hdfs:///user/maria_dev/project/data/" + folder_name + "/clean_data.csv",
                             format="csv", sep=",", inferSchema="true", header="true", encoding="utf-8")

    # parse date type
    tweets = tweets.withColumn("date", to_date("date"))

    # number of mentions per date
    tweets_num = tweets.groupBy("date").count().orderBy("date", ascending=0)
    tweets_num = tweets_num.na.drop()

    # flatten word
    tweets = tweets.withColumn("word", explode(split("word", '\\|')))

    # word count per date
    tweets = tweets.groupBy(["word", "date"]).count().orderBy(["date", "count"], ascending=[0, 0])

    # split word and pos
    split_col = split(tweets.word, "\\,")
    tweets = tweets.withColumn("split_word", split_col.getItem(0))
    tweets = tweets.withColumn("pos", split_col.getItem(1))

    # compute positive/negative sentiment
    udf_calc = udf(get_sentiment, StringType())
    tweets = tweets.withColumn("sentiment", udf_calc(col("split_word"), col("pos")))

    # compute positive/negative percentages
    pos_percentage = tweets.groupBy(["date", "sentiment"]).sum().withColumnRenamed("sum(count)", "count")\
        .withColumn("total", sum("count").over(Window.partitionBy("date")))\
        .withColumn("percent", (col("count") / col("total")) * 100)\
        .filter(col("sentiment").isin(["POS", "NEG"]))\
        .select(["date", "sentiment", "percent"])\
        .orderBy(["date", "percent"], ascending=[0, 0])

    # extract the 15 most frequent words per date
    w = Window().partitionBy("date").orderBy(col("count").desc())
    tweets = tweets.withColumn("rn", row_number().over(w)).where(col("rn") <= 15)\
        .select("split_word", "date", "count", "pos", "sentiment")\
        .orderBy(["date", "rn"], ascending=[0, 1])

    # append '다' to verb/adjective stems
    tweets = tweets.withColumn("split_word",
                               when((tweets.pos == 'PV') | (tweets.pos == 'PA'),
                                    concat(col("split_word"), lit("다"))).otherwise(tweets.split_word))

    # fix a token the NLP step missed ('김정' -> '김정은')
    tweets = tweets.withColumn("split_word",
                               when(tweets.split_word == '김정',
                                    concat(col("split_word"), lit("은"))).otherwise(tweets.split_word))

    import_data(folder_name, tweets, pos_percentage, tweets_num)
def process_song_data(spark, input_data, output_data):
    """Extracts, transforms song data available in s3 and loads it into two tables:
    songs_table, artists_table. The tables are stored as parquet files in s3.

    Args:
        spark: The entry point to programming Spark with the Dataset and DataFrame API.
        input_data: A path to the song_data
        output_data: The location where songs and artists tables will be stored
    """
    # get filepath to song data file
    song_data = input_data + "song_data/*/*/*/*.json"

    # read song data file
    df = spark.read.json(song_data, schema)

    # extract columns to create songs table
    songs_table_cols = ["song_id", "title", "artist_id", "year", "duration"]
    songs_table = df[songs_table_cols].na.\
        drop(subset=["song_id"]).dropDuplicates(subset=["song_id"])

    # write songs table to parquet files partitioned by year and artist
    output_songs = output_data + "songs"
    songs_table.write.mode("overwrite").partitionBy("year", "artist_id").\
        parquet(output_songs)

    artists_cols = ["artist_id", "artist_name", "artist_location",
                    "artist_latitude", "artist_longitude"]

    # Rank the artists in order to be able to choose one in case of duplicates
    window = Window.partitionBy("artist_id").orderBy("year")
    ranked_artists = df.withColumn("row_num", row_number().over(window))

    # extract columns to create artists table
    artists_table = ranked_artists.filter(ranked_artists.row_num == 1).\
        select(artists_cols)

    # write artists table to parquet files
    output_artists = output_data + "artists"
    artists_table.write.mode("overwrite").parquet(output_artists)
def transform(self):
    self.df_order_table = self.df_order_table.withColumn(
        'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
    self.df_order_table = self.df_order_table.withColumn(
        'order_month', func.month(col('order_datetime')))
    df_filter_cust = self.df_customer_table.where(col('age') > 18)

    ### inner join
    df_order_customer = self.df_order_table.join(
        df_filter_cust,
        on=(self.df_order_table['customer_id'] == df_filter_cust['customer_id']),
        how='inner').select(df_filter_cust['customer_id'],
                            self.df_order_table['order_id'],
                            self.df_order_table['order_month'],
                            self.df_order_table['amount'])

    # total sales amount for each month of each customer older than 18
    wind = Window.partitionBy('customer_id', 'order_month')
    df_order_customer = df_order_customer.withColumn(
        'total_sale', func.sum(col('amount')).over(wind))
    df_order_customer = df_order_customer.distinct()
    df_order_customer.show()

    ### list the customer_id and the second order_id of customers who placed more than 2 orders in the last 20 days
    wind = Window.partitionBy('customer_id', 'order_date').orderBy(
        func.col('order_id').asc())
    df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))
    df_temp = df_temp.withColumn(
        'current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))
    df_temp = df_temp.withColumn(
        'diff_days', func.datediff('current_date', 'order_date'))
    df_temp = df_temp.withColumn(
        "diff", when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
    df_temp = df_temp.where(col('diff') == 1)
    wind = Window.partitionBy('customer_id')
    df_temp = df_temp.withColumn('count', func.count('order_id').over(wind))
    df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))
    df_temp.show()
def windowing(df, batch_size):
    """
    Args:
        df: dataframe to perform windowing on
        batch_size: number of rows per batch
    """
    if "timestamp" not in df.columns:
        raise ValueError("timestamp column not found!")

    df = df.withColumn("timestamp_1", F.unix_timestamp(F.col("timestamp")))
    window_spec = Window.orderBy("timestamp_1")
    return df.withColumn(
        "batch_id",
        F.floor((F.row_number().over(window_spec) - F.lit(1)) / int(batch_size)),
    )
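# Usage sketch (illustrative, not from the original source): assigning batch ids of
# size 2 to a tiny event table. The `spark` session and sample rows are assumptions.
def _example_windowing(spark):
    events = spark.createDataFrame(
        [("2024-01-01 00:00:00", "a"), ("2024-01-01 00:01:00", "b"),
         ("2024-01-01 00:02:00", "c"), ("2024-01-01 00:03:00", "d")],
        ["timestamp", "payload"],
    )
    # Rows are ordered by timestamp and grouped into consecutive batches of 2.
    return windowing(events, batch_size=2)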
def main_category_data(df, output_path, s3_partition):
    '''
    Creates a dataframe of product main categories to be a dimension table (normalized)
    Selects distinct main categories and drops duplicates
    Creates main_cat_id with a window function
    Writes .parquet file to S3
    '''
    window = Window.orderBy(col('main_cat'))
    main_cat_table = df.select(['main_cat']) \
        .where(col('main_cat').isNotNull()) \
        .dropDuplicates() \
        .withColumn('main_cat_id', row_number().over(window))

    main_cat_table.repartition(5).write.parquet(
        output_path + 'main_category/' + s3_partition, 'overwrite')

    return main_cat_table
def _remove_duplicate_indexes(self):
    """
    Removes duplicate recipes by randomly selecting one if duplicated.
    :return:
    """
    window = Window \
        .partitionBy([self.index_column]) \
        .orderBy(f.rand())

    self.df_labels = self.df_labels\
        .withColumn('rn', f.row_number().over(window))\
        .filter(f.col('rn') == 1)\
        .drop('rn')
def overall_prediction_grouping(csv):
    """
    Grouping dataset by date
    :param csv: -- dataframe: containing all the data
    :return: -- dataframe: grouped
    """
    grouped = csv.groupby('Date').agg({'Date': 'count'})
    grouped_with_date = grouped.withColumn('Date', change_to_date_func(col('Date')))
    window_row = Window().orderBy('Date')
    grouped_indexed = grouped_with_date.withColumn(
        'id', row_number().over(window_row))
    return grouped_indexed.withColumn('id', to_vector(col('id')))
def generate_variables(dataframe, partition_array, order_array):
    """
    Generates new variables to identify recurrent payments. It calculates the difference
    of days (difference_of_days) between transactions of the same amount, made in the
    same commerce; the number of such transactions (nu_payments); the amount paid
    (impoper_f); and the order of the transactions of the same kind made more than once.
    It returns a data frame with this information.
    :param dataframe:
    :param partition_array: ['customer_id', 'commerce_name']
    :param order_array: ['operation_date']
    :return: Dataframe identifying recurrent payments according to the order of the transactions.
    """
    utils = AnalyticUtils
    filtered_window = Window.partitionBy(partition_array).orderBy(order_array)
    filtered_descending = Window.partitionBy(fields.customer_id, fields.commerce_affiliation_id) \
        .orderBy(desc(fields.operation_date))
    filtered_by_range = Window.partitionBy(partition_array).orderBy(order_array) \
        .rowsBetween(-sys.maxsize, sys.maxsize)
    # Use 'yyyy' (calendar year) rather than 'YYYY' (week-based year) so dates near
    # year boundaries are formatted correctly.
    return dataframe.select(
        fields.customer_id,
        fields.commerce_affiliation_id,
        fields.transaction_amount,
        fields.operation_date,
        (utils.subtract_days(
            date_format(c(fields.operation_date), 'yyyy-MM-dd'),
            date_format(
                lag(fields.operation_date).over(filtered_window),
                'yyyy-MM-dd'))).alias('difference_of_days'),
        lag(fields.transaction_amount).over(filtered_window).alias('impoper_f'),
        utils.retrieve_day(
            date_format(c(fields.operation_date), 'yyyy-MM-dd')).alias('day_month'),
        count('*').over(filtered_by_range).alias('nu_payments'),
        row_number().over(filtered_window).alias('order'),
        row_number().over(filtered_descending).alias('order2'))
def fill_auto_increment(
        df_existing: pyspark.sql.DataFrame,
        df_new: pyspark.sql.DataFrame,
        autoincrement_column: str):
    # Add the primary key from existing to new entries by the shared business key.
    # Returns: df_new with a new primary key column.
    # For new entries, returns an autoincrement value.
    w = Window().orderBy(F.lit('dummy'))
    max_id = get_max_value(df_existing, autoincrement_column)
    df_combined = df_new.withColumn(
        autoincrement_column,
        F.when(F.col(autoincrement_column).isNull(),
               F.row_number().over(w) + max_id)
        .otherwise(F.col(autoincrement_column)))
    return df_combined
def generate_dataset_m(df_assembled, CONFIG_PREPROCESS):
    m_window_spec = Window.partitionBy('CELL_NUM').orderBy('dt')
    m_feat_cols = ['day0_features0']
    m_days_cols = ['day0_features']

    shuffled_with_cell = df_assembled.withColumn(
        'seq', row_number().over(m_window_spec)).cache()
    input_m = shuffled_with_cell.withColumnRenamed('features', m_feat_cols[0])
    vector_udt_metadata = input_m.schema[input_m.schema.fields.index(
        input_m.schema[m_feat_cols[0]])].metadata

    # Generate 1 day data (5min * 10 data)
    for i in range(1, CONFIG_PREPROCESS.INPUT_M_SIZE):
        n_features = lead(col(m_feat_cols[0]), i).over(m_window_spec)
        col_name = 'day{}_features{}'.format(0, i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_feat_cols.append('day{}_features{}'.format(0, i))

    input_m = input_m.dropna()
    input_m = VectorAssembler().setInputCols(m_feat_cols).setOutputCol(
        m_days_cols[0]).transform(input_m)
    vector_udt_metadata = input_m.schema[-1].metadata

    # for DAYS_TO_MEMORY(7) days memory in same time zone
    for i in range(1, CONFIG_PREPROCESS.DAYS_TO_MEMORY):
        n_features = lead(col('day0_features'),
                          int(CONFIG_PREPROCESS.ITEMS_PER_DAY * i)).over(m_window_spec)
        col_name = 'day{}_features'.format(i)
        input_m = input_m.withColumn(
            col_name, n_features.alias(col_name, metadata=vector_udt_metadata))
        m_days_cols.append('day{}_features'.format(i))

    m_tail_skip_size = CONFIG_PREPROCESS.ITEMS_PER_DAY  # rows to skip, for 1 Day (X & Y)
    inbound = when(
        input_m['seq'] <= (max(input_m['seq']).over(
            m_window_spec.rangeBetween(-sys.maxsize, sys.maxsize)) - m_tail_skip_size),
        1).otherwise(0)
    input_m = input_m.dropna().withColumn('inbound', inbound).filter('inbound == 1')

    # assemble DAYS_TO_MEMORY days columns into one ('features')
    input_m = VectorAssembler().setInputCols(m_days_cols).setOutputCol(
        'features').transform(input_m).select(['dt', 'CELL_NUM', 'features'])

    return input_m
def add_incrementral_id(df: DataFrame, id_column: str, partitionby: str) -> DataFrame:
    """Will create a surrogate key that starts from 1 and is ordered, since
    monotonically_increasing_id will be unique but not ordered
    OVER (PARTITION BY . . . ORDER BY . . .)

    :param df: pyspark Dataframe
    :param id_column: name of column
    :param partitionby: column to partition by
    :return: new column with a unique and consistent surrogate key
    """
    return df.withColumn(
        id_column,
        F.row_number().over(
            Window.partitionBy(partitionby).orderBy(
                F.monotonically_increasing_id())))
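# Usage sketch (illustrative, not from the original source): adding a per-country
# surrogate key to a small DataFrame. The `spark` session and column names are assumptions.
def _example_add_incrementral_id(spark):
    people = spark.createDataFrame(
        [("DE", "Anna"), ("DE", "Ben"), ("FR", "Chloe")],
        ["country", "name"],
    )
    # Each country gets its own 1..n sequence in arrival order.
    return add_incrementral_id(people, id_column="person_sk", partitionby="country")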
def addIdCol1(dataDF, idFieldName="continuousID"):
    '''
    :param dataDF: input data table; DataFrame
    :param idFieldName: name of the generated ID column; String
    :return: DataFrame with a continuously increasing ID column (starting from 1) appended as the last column
    '''
    numParitions = dataDF.rdd.getNumPartitions()
    data_withindex = dataDF.withColumn("increasing_id_temp",
                                       fn.monotonically_increasing_id())
    data_withindex = data_withindex.withColumn(
        idFieldName,
        fn.row_number().over(Window.orderBy("increasing_id_temp")))
    data_withindex = data_withindex.repartition(numParitions)
    data_withindex = data_withindex.sort("increasing_id_temp")
    data_withindex = data_withindex.drop("increasing_id_temp")
    return data_withindex
def _topn(df, c, by=None, n=3):
    cnt = f'{c}##cnt'
    rnk = f'{c}##rnk'

    _gcols = [by] if isinstance(by, str) and by else by or []
    s = df.select(*_gcols, c).groupby(*_gcols, c).agg(F.count(F.col(c)).alias(cnt))

    # calculate topn
    a = s.select(
        *_gcols, c, cnt,
        F.row_number().over(
            Window.partitionBy(*_gcols).orderBy(F.col(cnt).desc())).alias(rnk)
    ).filter(F.col(rnk) <= n)

    return a.select(*_gcols, c, cnt)
def getTargetStatsByFactors(df, factors, target):
    stats_names = ['mean', 'median', 'variance']
    output_cols = [*factors, *stats_names]

    w = Window.partitionBy(*factors).orderBy(target)
    rank_df = df.withColumn('rank', F.row_number().over(w))
    # The median row sits at rank (count+1)/2 (lower median for even counts); the
    # original expression 'count+1/2' evaluated to count + 0.5 due to operator precedence.
    result_df = df.groupBy(*factors).agg(F.avg(target).alias('mean'),
                                         F.var_pop(target).alias('variance'),
                                         F.count(F.lit(1)).alias('count')) \
        .join(rank_df, [*factors]) \
        .filter(F.col('rank') == F.expr('(count+1)/2').cast(IntegerType())) \
        .withColumn('median', F.col(target)) \
        .orderBy(*factors) \
        .select(*output_cols)
    return result_df
def transform_data(df: DataFrame) -> DataFrame:
    """Apply data deduplication.

    params:
        df -- Spark DataFrame

    outputs:
        Spark DataFrame
    """
    w = Window.partitionBy('id').orderBy(col('update_date').desc())
    df = df.withColumn('rownum', row_number().over(w)) \
           .where(col('rownum') == 1) \
           .drop('rownum')
    return df
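# Usage sketch (illustrative, not from the original source): keeping only the most
# recent record per id. The `spark` session and sample rows are assumptions.
def _example_transform_data(spark):
    raw = spark.createDataFrame(
        [(1, "old", "2024-01-01"), (1, "new", "2024-02-01"), (2, "only", "2024-01-15")],
        ["id", "value", "update_date"],
    )
    # Only the latest update_date per id survives deduplication.
    return transform_data(raw)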
def run_pyspark_pipeline(dir_path, spark, cores, out_path):
    """ Reads parquet files from 'dir_path' and parses trec_car_tools.Page objects to
    create protobufs with entity linking. """
    print('start preprocessing')
    start_preprocess = time.time()

    # Reads parquet files from 'dir_path' - each row is a TREC CAR page.
    df_in = spark.read.parquet(dir_path)
    df_in.printSchema()
    num_partitions = df_in.rdd.getNumPartitions()
    print("Number of default partitions: {}".format(num_partitions))

    print('end preprocess')
    end_preprocess = time.time()
    print("*** preprocess time: {:.2f}s ***".format(end_preprocess - start_preprocess))

    print('start pyspark_processing job')
    start_pyspark_job = time.time()

    if num_partitions < cores * 4:
        print('repartitioning df')
        df_in = df_in.repartition(cores * 4)
        print("Number of partitions should equal 4*cores --> {}".format(
            df_in.rdd.getNumPartitions()))

    @udf(returnType=BinaryType())
    def parse_udf(page_bytearray):
        # Parses a trec_car_tools.Page object to create a protobuf with entity linking.
        page = pickle.loads(page_bytearray)
        tp = TrecCarParser()
        doc = tp.parse_page_to_protobuf(page=page)
        doc_bytearray = pickle.dumps(doc.SerializeToString())
        return doc_bytearray

    # Add index to DF.
    df_parse = df_in.withColumn("doc_bytearray", parse_udf("page_bytearray"))
    df_parse = df_parse.withColumn(
        "index",
        row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
    df_parse.write.parquet(out_path)

    print('end pyspark_processing job')
    end_pyspark_job = time.time()
    print("*** pyspark_processing job time: {:.2f}s ***".format(
        end_pyspark_job - start_pyspark_job))
def spark_app():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    sql_context = SQLContext(spark)

    # loading *.CSV files as DataFrames
    team_info = sql_context.read.csv("src/team_info.csv", header=True, sep=",")
    game_skater_stats = sql_context.read.csv("src/game_skater_stats.csv", header=True, sep=",")
    player_info = sql_context.read.csv("src/player_info.csv", header=True, sep=",")

    # joining tables
    data_frame = team_info. \
        join(game_skater_stats, game_skater_stats.team_id == team_info.team_id). \
        join(player_info, game_skater_stats.player_id == player_info.player_id)

    # first task:
    data_frame.groupby('shortName', 'teamName'). \
        agg((f.sum(f.when(data_frame.primaryPosition == 'D', data_frame.goals).otherwise(0)) /
             f.sum('goals')).alias('METRIC')). \
        sort('METRIC', ascending=False).limit(5).show()

    # additional task
    game = sql_context.read.csv("src/game.csv", header=True, sep=",")
    data_frame = data_frame.join(game, game_skater_stats.game_id == game.game_id)

    data_frame_2 = data_frame. \
        groupby('shortName', 'teamName', f.year(data_frame.date_time).alias('years')). \
        agg((f.sum(f.when(data_frame.primaryPosition == 'D', data_frame.goals).otherwise(0)) /
             f.sum('goals')).alias('METRIC'))

    window = Window.partitionBy(f.col('years')).orderBy((f.col('METRIC')).desc())

    data_frame_2.select(f.col('shortName'),
                        f.col('teamName'),
                        f.col('years'),
                        f.col('METRIC'),
                        f.row_number().over(window).alias('row_number')).where(
        f.col('row_number') <= 5).show()

    spark.stop()
def attach_default_index(sdf, default_index_type=None):
    """
    This method attaches a default index to a Spark DataFrame. Spark does not have
    the index notion, so a corresponding column should be generated.
    There are several types of default index that can be configured by
    `compute.default_index_type`.

    >>> spark_frame = ks.range(10).to_spark()
    >>> spark_frame
    DataFrame[id: bigint]

    It adds the default index column '__index_level_0__'.

    >>> spark_frame = InternalFrame.attach_default_index(spark_frame)
    >>> spark_frame
    DataFrame[__index_level_0__: int, id: bigint]

    It throws an exception if the given column name already exists.

    >>> InternalFrame.attach_default_index(spark_frame)
    ... # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
    AssertionError: '__index_level_0__' already exists...
    """
    index_column = SPARK_DEFAULT_INDEX_NAME
    assert (
        index_column not in sdf.columns
    ), "'%s' already exists in the Spark column names '%s'" % (index_column, sdf.columns)

    if default_index_type is None:
        default_index_type = get_option("compute.default_index_type")

    scols = [scol_for(sdf, column) for column in sdf.columns]
    if default_index_type == "sequence":
        sequential_index = (
            F.row_number().over(Window.orderBy(F.monotonically_increasing_id())) - 1)
        return sdf.select(sequential_index.alias(index_column), *scols)
    elif default_index_type == "distributed-sequence":
        return InternalFrame.attach_distributed_sequence_column(
            sdf, column_name=index_column)
    elif default_index_type == "distributed":
        return InternalFrame.attach_distributed_column(
            sdf, column_name=index_column)
    else:
        raise ValueError(
            "'compute.default_index_type' should be one of 'sequence',"
            " 'distributed-sequence' and 'distributed'")
def process_song_data(spark: SparkSession, input_data: str, output_data: str) -> None:
    """
    Extracts raw song data from the data lake, transforms it into songs and artists tables
    and loads these tables back to the data lake.
    :param spark: instance of SparkSession
    :param input_data: data lake path to input data
    :param output_data: data lake path to output data
    """
    # get filepath to song data file
    song_data = os.path.join(input_data, 'song_data/*/*/*/*.json')

    # read song data file
    df = spark.read.json(song_data)

    # extract columns to create songs table
    songs_table = df.select('song_id', 'title', 'artist_id', 'year', 'duration') \
        .where(col('song_id').isNotNull()) \
        .drop_duplicates(['song_id'])

    # write songs table to parquet files partitioned by year and artist
    songs_table.write.mode('overwrite').partitionBy(
        'year', 'artist_id').parquet(os.path.join(output_data, 'songs'))

    # extract columns to create artists table
    # we need some extra logic to clean up duplicated artist_id records with distinct artist_name; we want to:
    # 1. Remove "featured" names (e.g. "Elton John feat. Sting" or "Elton John featuring Sting");
    # 2. From the remaining records, choose the one with the shortest artist name and try to get a record with
    #    non-null location, latitude and longitude.
    artist_cleanup_ordering = row_number().over(
        Window.partitionBy('artist_id').orderBy(
            length('name'),
            col('location').asc_nulls_last(),
            col('latitude').asc_nulls_last(),
            col('longitude').asc_nulls_last()))

    # keep only names that contain neither "feat." nor "featuring"
    artists_table = df.select(col('artist_id'),
                              col('artist_name').alias('name'),
                              col('artist_location').alias('location'),
                              col('artist_latitude').alias('latitude'),
                              col('artist_longitude').alias('longitude')) \
        .where(~col('name').like('%feat.%') & ~col('name').like('%featuring%')) \
        .withColumn('rn', artist_cleanup_ordering) \
        .where(col('rn') == 1) \
        .drop('rn')

    # write artists table to parquet files
    artists_table.write.mode('overwrite').parquet(
        os.path.join(output_data, 'artists'))
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get the input customer-item-rating tuple in the format of Spark DataFrame,
    output a Spark DataFrame in the dense format of top k items for each user.

    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
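# Usage sketch (illustrative, not from the original source): collecting each user's
# top-2 items by predicted rating. Column names are passed explicitly since the
# DEFAULT_* constants belong to the surrounding module; the sample rows are assumptions.
def _example_get_top_k_items(spark):
    scores = spark.createDataFrame(
        [(1, "a", 4.0), (1, "b", 5.0), (1, "c", 3.0), (2, "a", 2.0), (2, "c", 4.5)],
        ["userID", "itemID", "prediction"],
    )
    # Returns one row per user with a list of its k highest-scored items.
    return _get_top_k_items(
        scores, col_user="userID", col_item="itemID", col_rating="prediction",
        col_prediction="top_items", k=2,
    )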
def test_window_functions_without_partitionBy(self):
    df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.orderBy("key", df.value)
    from pyspark.sql import functions as F
    sel = df.select(df.value, df.key,
                    F.max("key").over(w.rowsBetween(0, 1)),
                    F.min("key").over(w.rowsBetween(0, 1)),
                    F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
                    F.row_number().over(w),
                    F.rank().over(w),
                    F.dense_rank().over(w),
                    F.ntile(2).over(w))
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 4, 1, 1, 1, 1),
        ("2", 1, 1, 1, 4, 2, 2, 2, 1),
        ("2", 1, 2, 1, 4, 3, 2, 2, 2),
        ("2", 2, 2, 2, 4, 4, 4, 3, 2)
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
def _get_relevant_items_by_timestamp(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(
            col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user))
        )
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # PySpark 2.1.0 does not support desc_nulls_first, desc_nulls_last,
    # asc_nulls_first, asc_nulls_last

    # split, length (in PySpark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
def spark_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark chronological splitter

    This function splits data in a chronological manner. That is, for each user / item,
    the split function takes proportions of ratings which is specified by the split
    ratio(s). The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_timestamp: "count"})
        .withColumnRenamed("count(" + col_timestamp + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )

        splits.append(rating_split)

    return splits
def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count // 500
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0, 'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0, 'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0, 'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0, 'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()