def prepare_firmware_cve_counts(firmware_cves_df: DataFrame, firmware_hashes_df: DataFrame) -> DataFrame:
    # yapf: disable
    # Ensure that the windows for each of low, med, hi, and crit are over the entire firmware space
    # instead of just those for which a CVE is known to exist
    firmware_cves_full_df = firmware_hashes_df.join(
        firmware_cves_df, 'firmware_hash', 'left'
    ).na.fill(0)

    low_window = Window.orderBy(firmware_cves_full_df['low'])
    med_window = Window.orderBy(firmware_cves_full_df['medium'])
    high_window = Window.orderBy(firmware_cves_full_df['high'])
    crit_window = Window.orderBy(firmware_cves_full_df['critical'])

    # Weighted average of the severity percentiles: sum(wi*xi) / sum(wi), with weights 1 + 2 + 3 + 4 = 10
    cve_composite_score = (percent_rank().over(low_window) +
                           (2 * percent_rank().over(med_window)) +
                           (3 * percent_rank().over(high_window)) +
                           (4 * percent_rank().over(crit_window))) / 10

    fwc_with_score_df = firmware_cves_full_df.withColumn(
        'firmware_cve_component_score', cve_composite_score
    ).select(
        'firmware_hash', 'firmware_cve_component_score'
    )
    # yapf: enable
    return fwc_with_score_df
def split_user_interactions(interactions):
    interactions_rand = interactions.withColumn('rand_num', F.rand())
    interactions_with_ranks = interactions_rand.select(
        'user_id', 'book_id', 'rating',
        F.percent_rank().over(
            Window.partitionBy(interactions_rand['user_id'])
                  .orderBy(interactions_rand['rand_num'])
        ).alias('book_id_rank'))
    interactions_train = interactions_with_ranks.filter(
        interactions_with_ranks['book_id_rank'] >= 0.5
    ).select('user_id', 'book_id', 'rating')
    interactions_val = interactions_with_ranks.filter(
        interactions_with_ranks['book_id_rank'] < 0.5
    ).select('user_id', 'book_id', 'rating')
    return interactions_train, interactions_val
def modify_group_for_dim(model_dict, df, d, colname):
    '''Given a DF with groups assigned (variable colname), apply a dictionary to
    post-process the groups according to that one dimension.
    Returns the original DF with a modified colname column.
    e.g. move specific seasons to the holdout or throwaway sets.
    '''
    dim_props = model_dict['dimensional_dataset_proportions'].items()
    for grp, grp_dict_list in dim_props:
        for grp_dict in grp_dict_list:
            window = Window.orderBy('dim_rnd')\
                           .partitionBy(grp_dict['dim'], colname)
            df = df.withColumn('dim_rk', F.percent_rank().over(window))
            ## if (1) the dimension column is within the set values,
            ## (2) the pre-existing group is one of the allowed from_groups, and
            ## (3) the random rank falls in the top prop_to_move fraction,
            ## then override and modify the group membership
            if grp_dict['prop_to_move'] > 0:
                df = df.withColumn(
                    colname,
                    F.when((col(grp_dict['dim']).isin(grp_dict['vals'])) &
                           (col(colname).isin(grp_dict['from_groups'])) &
                           (col('dim_rk') >= 1 - grp_dict['prop_to_move']),
                           grp).otherwise(col(colname)))
    return df
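# A hypothetical sketch of the dictionary shape modify_group_for_dim() expects,
# inferred from the keys it reads ('dim', 'vals', 'from_groups', 'prop_to_move');
# the group names and values below are illustrative assumptions, not taken from
# the original code.
example_model_dict = {
    'dimensional_dataset_proportions': {
        'holdout': [  # target group -> list of override rules
            {
                'dim': 'season',           # column the rule conditions on
                'vals': ['2019', '2020'],  # rows whose dim value is in this set are eligible
                'from_groups': ['train'],  # only rows currently in these groups may move
                'prop_to_move': 0.25,      # top 25% by dim_rnd within the window get reassigned
            },
        ],
    },
}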
def percentiles(df, c, by=None, p=[10, 25, 50, 75, 90], index='_idx', result='_res'):
    _gcols = [by] if isinstance(by, str) and by else by or []
    ptile = f'{c}##p'

    # percentiles per row
    w = Window.partitionBy(*_gcols).orderBy(c)
    d = df.select(c, *_gcols,
                  F.floor(100 * (F.percent_rank().over(w))).alias(ptile))

    # aggregate
    agg_keys = F.array(*[F.lit(x) for x in p])
    agg_values = F.array(
        *[F.max(F.when(F.col(ptile) < x, F.col(c))) for x in p])
    r = d.groupby(*_gcols).agg(
        F.map_from_arrays(agg_keys, agg_values).alias(result))

    # add colname
    r = r.withColumn(index, F.lit(c))
    return r
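# A hedged usage sketch for percentiles(); 'employees_df', 'dept' and 'salary'
# are assumed names used for illustration only.
salary_percentiles = percentiles(employees_df, 'salary', by='dept', p=[25, 50, 75])
# One row per dept, with '_res' holding a map like {25: ..., 50: ..., 75: ...}
# and '_idx' recording which input column the percentiles describe.
salary_percentiles.show(truncate=False)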
def sort_volume_stocks(df, n=10):
    # this function should be revised when using more than one trading day of data.
    # return: a list of pyspark dataframes with stocks sorted
    window_spec1 = Window.partitionBy().orderBy('STOCKD_SUM_YENVOL')
    grouped = df.groupBy(df['Ticker']).agg(
        expr("sum(YenVolume) as STOCKD_SUM_YENVOL"))
    prc_rank = grouped.select(
        'Ticker',
        percent_rank().over(window_spec1).alias('percent_rank')).toPandas()

    def sort_con(_n=n):
        # construct percentile bucket boundaries
        unit = 1 / _n
        first = unit
        last = unit * (_n - 1)
        conditions = []
        for i in np.arange(first, 1 + unit, unit):
            if i == first:
                conditions.append(prc_rank['percent_rank'] <= i)
            elif i == 1:
                conditions.append(prc_rank['percent_rank'] > i - unit)
            else:
                conditions.append((prc_rank['percent_rank'] <= i)
                                  & (prc_rank['percent_rank'] > i - unit))
        return conditions

    conds = sort_con(n)
    stock_lists = [prc_rank[cond]['Ticker'].tolist() for cond in conds]
    assert len(stock_lists) == n, "number of stock lists is not equal to n"
    return [df.filter(df.Ticker.isin(l)) for l in stock_lists]
def send_df_to_model(df):
    df = df.withColumn(
        "rank",
        percent_rank().over(Window.partitionBy().orderBy("timestamp")))

    train_df = df.where("rank <= .5").drop("rank")
    print("Train Set")
    train_df.show()

    test_df = df.where("rank > .5").drop("rank")
    print("Test Set")
    test_df.show()

    vectorAssembler = VectorAssembler(inputCols=['num_mentions'],
                                      outputCol='features')
    train_vdf = vectorAssembler.transform(train_df)
    train_vdf = train_vdf.select(['features', 'avg_price'])
    test_vdf = vectorAssembler.transform(test_df)
    test_vdf = test_vdf.select(['features', 'avg_price'])

    lr = LinearRegression(featuresCol='features',
                          labelCol='avg_price',
                          maxIter=10,
                          regParam=0.3,
                          elasticNetParam=0.8)
    lr_model = lr.fit(train_vdf)

    lr_predictions = lr_model.transform(test_vdf)
    lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="avg_price",
                                       metricName="r2")
    print("R Squared (R2) on test data = %g" %
          lr_evaluator.evaluate(lr_predictions))

    test_result = lr_model.evaluate(test_vdf)
    print("Root Mean Squared Error (RMSE) on test data = %g" %
          test_result.rootMeanSquaredError)

    return lr_predictions.select("prediction", "avg_price", "features")
def train_val_test_split(data):
    data = data.withColumn(
        "rank",
        percent_rank().over(Window.partitionBy().orderBy("Year")))
    train_df = data.where("rank <= .6").drop("rank")
    val_df = data.where("rank > 0.6 AND rank <= 0.8").drop("rank")
    test_df = data.where("rank > 0.8").drop("rank")
    return train_df, val_df, test_df
def add_median_salary(df, *groups, sort_field="-avg_salary"):
    window = Window.partitionBy(*groups)
    rank_spec = window.orderBy(sort(sort_field))
    df = df.withColumn("percent_rank", F.percent_rank().over(rank_spec))
    # order by distance from the median rank (0.5) and take the first value as the median
    median_spec = window.orderBy(F.pow(df.percent_rank - 0.5, 2))
    df = df.withColumn("avg_salary", F.first("avg_salary").over(median_spec))
    return df
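# The sort() helper used above is not defined in this snippet. A minimal sketch
# of what such a helper might look like, assuming a leading '-' marks descending
# order (the convention is inferred from the default sort_field="-avg_salary"
# and is an assumption):
def sort(field):
    # leading '-' -> descending on the remaining name, otherwise ascending
    return F.desc(field[1:]) if field.startswith('-') else F.asc(field)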
def _cal_percent_rank(self, dfBase, col_name, order):
    print("sort column[{0}], order by[{1}]".format(col_name, order))
    if order == "asc":
        window = Window.orderBy(col_name)
    else:
        window = Window.orderBy(fn.desc(col_name))
    df = dfBase.where("%s is not null" % col_name) \
        .withColumn(col_name + "_rank", fn.percent_rank().over(window)) \
        .persist(storageLevel=StorageLevel.DISK_ONLY) \
        .union(dfBase.where("%s is null" % col_name)
               .withColumn(col_name + "_rank", fn.lit(None))) \
        .repartition(20).persist(storageLevel=StorageLevel.DISK_ONLY)
    df.count()
    return df
def random_split_users_int(interactions, users, splitProportion):
    """
    Returns a split of the interactions of the users received.
    Used to include interactions from validation and test users in the training set.
    """
    w = Window.partitionBy('user_id').orderBy('book_id')
    ranked_interactions = interactions.join(users, 'user_id', 'leftsemi').\
        select("user_id", "book_id", "rating",
               F.percent_rank().over(w).alias("percent_rank"))
    split1 = ranked_interactions.filter(
        ranked_interactions.percent_rank <= splitProportion).drop(
            'percent_rank')
    split2 = ranked_interactions.filter(
        ranked_interactions.percent_rank > splitProportion).drop(
            'percent_rank')
    return (split1, split2)
def partitioned_split_by_cols(df, partit_col, split_col, parts_fractions):
    probs = list(np.cumsum(parts_fractions))
    window = Window.orderBy(split_col).partitionBy(partit_col)
    df_tmp = df\
        .withColumn('rank', F.percent_rank().over(window))
    parts = []
    prev_p = 0.
    sum_p = 0.
    for p in parts_fractions:
        sum_p += p
        part = df_tmp \
            .filter((F.col('rank') >= prev_p) & (F.col('rank') < sum_p)) \
            .drop('rank')
        parts.append(part)
        prev_p += p
    return parts
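# Hypothetical usage of partitioned_split_by_cols(); the column names and
# fractions below are illustrative assumptions.
train, val, test = partitioned_split_by_cols(
    df,
    partit_col='user_id',               # each user's rows are split independently
    split_col='event_time',             # earlier rows land in the earlier parts
    parts_fractions=[0.7, 0.15, 0.15])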
def get_percentile_no_recurrence(no_recurrents_work):
    """
    Get the percentile to assign the amount that corresponds to non recurrent payments
    :param no_recurrents_work: Dataframe
    :return: Dataframe
    """
    filtered_window = Window.partitionBy(
        fields.customer_id).orderBy("sum_importe")
    ranged_window = Window.partitionBy(fields.customer_id).rowsBetween(
        -sys.maxsize, sys.maxsize)
    no_recurrents = no_recurrents_work.select(
        '*',
        percent_rank().over(filtered_window).alias("percentile"),
        count('*').over(ranged_window).alias('num_rows'))
    return no_recurrents.select(
        '*',
        when(
            c("percentile") >= (1 - (1 / c("num_rows"))),
            c("sum_importe")).otherwise(None).alias("percentile_im"))
def assign_group(model_dict, df, d, strata_cols, colname):
    '''given (1) a dictionary of ranges, (2) a DF with random values ranked by
    random block, and (3) a name for the grouped column, return DF with a new
    column that assigns group membership'''
    window = Window.orderBy('dataset_rnd')\
                   .partitionBy(*model_dict['strata_cols'])
    df = df.withColumn('dataset_rk', F.percent_rank().over(window))
    for i, (k, v) in enumerate(d.items()):
        ## if the bottom is 0, make it -1 to include 0
        min_val = -1 if v[0] == 0 else v[0]
        if i == 0:
            group_assign_cond = F.when((col('dataset_rk') > min_val) &
                                       (col('dataset_rk') <= v[1]), F.lit(k))
        else:
            group_assign_cond = group_assign_cond.when(
                (col('dataset_rk') > min_val) & (col('dataset_rk') <= v[1]),
                F.lit(k))
    return df.withColumn(colname, group_assign_cond)
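# A hypothetical example of the ranges dictionary `d` expected by assign_group():
# keys are group names, values are (lower, upper] percent-rank bounds. The
# concrete names and boundaries are illustrative assumptions.
d = {
    'train': (0, 0.8),      # dataset_rk in (-1, 0.8]; 0 is included via the -1 adjustment
    'holdout': (0.8, 0.9),  # dataset_rk in (0.8, 0.9]
    'test': (0.9, 1.0),     # dataset_rk in (0.9, 1.0]
}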
def benjamini_hochberg_filter(sdf, alpha=0.001, filter=True):
    """
    Input Spark dataframe must contain columns item1_count, both_count,
    item2_prevalence as produced by `get_item_pair_stats`
    """
    import pandas as pd
    from scipy.stats import norm

    m = sdf.count()
    z_to_pval_pudf = fn.pandas_udf(lambda z: pd.Series(norm.sf(abs(z)) * 2),
                                   DoubleType())  # two-sided p-value
    sdf2 = sdf\
        .withColumn("Z", fn.expr("(both_count - item1_count*item2_prevalence)/sqrt(item1_count*item2_prevalence*(1-item2_prevalence))")) \
        .withColumn("absZ", fn.expr("abs(Z)")) \
        .withColumn("Z_rank_fraction", fn.percent_rank().over(Window.orderBy(fn.desc("absZ")))) \
        .withColumn("p_value", z_to_pval_pudf(fn.col("Z"))) \
        .withColumn('benjamini_hochberg_criterion',
                    fn.col('p_value') <= fn.col('Z_rank_fraction') * alpha)
    if filter:
        sdf2 = sdf2.filter('benjamini_hochberg_criterion')
    drop_cols = set(sdf.columns) - set(sdf2.columns)
    for col in drop_cols:
        sdf2 = sdf2.drop(col)
    return sdf2
This function leaves gaps in rank when there are ties.
'''
df.withColumn("rank", rank().over(window_spec)) \
    .show()

# dense_rank Window Function
'''
dense_rank() window function is used to get the rank of rows within a window
partition without any gaps. It is similar to rank(), the difference being that
rank() leaves gaps in the ranking when there are ties.
'''
df.withColumn("dense_rank", dense_rank().over(window_spec)) \
    .show()

# percent_rank Window Function
df.withColumn("percent_rank", percent_rank().over(window_spec)) \
    .show()

# ntile Window Function
'''
ntile(n) divides the rows of a window partition into n buckets and returns the
bucket number for each row. In the example below we pass 2 to ntile, so it
returns a ranking between two values (1 and 2).
'''
df.withColumn("ntile", ntile(2).over(window_spec)) \
    .show()

# PySpark Window Analytic functions
# cume_dist Window Function
'''
sql_ctx = pyspark.SQLContext(sc)

# Explicitly setting the schema type - may be unnecessary
# Notes: https://stackoverflow.com/questions/44706398/spark-csv-reader-quoted-numerics
schema = StructType([
    StructField("RecId", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("PassengerClass", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Sex", StringType(), True),
    StructField("Survived", IntegerType(), True),
    StructField("SexCode", IntegerType(), True),
])

# Print the average age
titanic = sql_ctx.read.option("delimiter", ",").option(
    "header", True).schema(schema).csv('titanic.csv')
titanic.groupBy().avg('Age').show()

# Print the top 75th percentile in age
# Notes: https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
# https://stackoverflow.com/questions/40885266/how-to-take-the-top-percentage-from-a-relatively-large-spark-dataframe-and-save
window = Window.partitionBy().orderBy(titanic['Age'].asc())
titanic.select('*', percent_rank().over(window).alias('age_rank')).filter(
    col('age_rank') >= 0.75).show(1000)

# Saving our DF to Parquet file
titanic.write.parquet('titanic_p', mode='overwrite')
def with_window_column(df):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import percent_rank
    windowSpec = Window.partitionBy(df['id']).orderBy(df['forecast'])
    return df.withColumn("r", percent_rank().over(windowSpec))
def num_iqr_filter(self, data_frame, inputCol="VALUE", labelCol="ITEMID",
                   REPARTITION_CONST=None, sc=None):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import abs, percent_rank, row_number, collect_list, udf, struct, count, avg, stddev_pop
    from pyspark.sql.types import MapType, StringType, DoubleType

    self.logger.debug("[NUM_IQR_FILTER] IN")
    self.logger.debug("[NUM_IQR_FILTER] BEFORE QUANTILE TAGGED")
    if REPARTITION_CONST is None:
        value_order = Window.partitionBy(labelCol).orderBy(
            col(inputCol).cast("float"))
        Q1_percentile = Window.partitionBy(labelCol).orderBy(
            abs(0.25 - col("percentile")))
        Q2_percentile = Window.partitionBy(labelCol).orderBy(
            abs(0.5 - col("percentile")))
        Q3_percentile = Window.partitionBy(labelCol).orderBy(
            abs(0.75 - col("percentile")))
        percent_data_frame = data_frame.select(
            labelCol, inputCol,
            percent_rank().over(value_order).alias("percentile"))
        Q1_data_frame = percent_data_frame.withColumn("Q1_rn", row_number().over(Q1_percentile)).where("Q1_rn == 1")\
            .select(labelCol, inputCol, lit("Q1").alias("quantile"))
        Q2_data_frame = percent_data_frame.withColumn("Q2_rn", row_number().over(Q2_percentile)).where("Q2_rn == 1")\
            .select(labelCol, inputCol, lit("Q2").alias("quantile"))
        Q3_data_frame = percent_data_frame.withColumn("Q3_rn", row_number().over(Q3_percentile)).where("Q3_rn == 1")\
            .select(labelCol, inputCol, lit("Q3").alias("quantile"))
        self.logger.debug("[NUM_IQR_FILTER] REPARTITION_CONST Not Asserted")
        merge_all = Q1_data_frame.unionAll(Q2_data_frame).unionAll(
            Q3_data_frame).persist()
        self.logger.debug("[NUM_IQR_FILTER] Qs Merged")
        udf_parse_list_to_map = udf(
            lambda maps: dict(list(tuple(x) for x in maps)),
            MapType(StringType(), StringType()))
        self.logger.debug("[NUM_IQR_FILTER] Before merge quantiles")
        aggregate_quantiles = merge_all.groupBy(labelCol).agg(
            collect_list(struct("quantile", inputCol)).alias("quant_info"))
        self.logger.debug("AGG ONLY")
        aggregate_quantiles = aggregate_quantiles.select(
            labelCol,
            udf_parse_list_to_map("quant_info").alias("quant_info"))
        self.logger.debug("TRANSFORM")
        iqr_data_frame = aggregate_quantiles.withColumn("Q1", col("quant_info").getItem("Q1").cast("float"))\
            .withColumn("Q2", col("quant_info").getItem("Q2").cast("float"))\
            .withColumn("Q3", col("quant_info").getItem("Q3").cast("float"))
        self.logger.debug("QUANTILE_EXTRACTION")
    else:
        cur_label_list = data_frame.select(
            labelCol).distinct().rdd.flatMap(list).collect()
        cur_iqr_list = list()
        cnt = -1
        for cur_item in cur_label_list:
            cnt = cnt + 1
            data_frame.where(
                col(labelCol) == cur_item).registerTempTable("cur_table")
            self.logger.debug("{0}/{1},::{2}".format(
                cnt, len(cur_label_list),
                sc.sql("select {0} from cur_table".format(labelCol)).count()))
            cur_iqr = sc.sql(
                "select {0}, percentile_approx({1},0.25) as Q1, percentile_approx({2},0.5) as Q2, percentile_approx({3},0.75) as Q3 from cur_table group by {4}"
                .format(labelCol, inputCol, inputCol, inputCol, labelCol)).first().asDict()
            cur_iqr_list.append(cur_iqr)
            sc.catalog.dropTempView("cur_table")
        #percent_data_frame = data_frame.select(labelCol, inputCol, percent_rank().over(value_order).alias("percentile")).repartition(REPARTITION_CONST).cache().checkpoint()
        iqr_data_frame = sc.createDataFrame(cur_iqr_list).repartition(
            REPARTITION_CONST)

    if REPARTITION_CONST is None:
        iqr_data_frame = iqr_data_frame.withColumn("IQR", col("Q3") - col("Q1"))\
            .withColumn("LB", col("Q1") - 1.5 * col("IQR"))\
            .withColumn("UB", col("Q3") + 1.5 * col("IQR"))\
            .select(labelCol, "LB", "UB")
    else:
        iqr_data_frame = iqr_data_frame.withColumn("IQR", col("Q3") - col("Q1"))\
            .withColumn("LB", col("Q1") - 1.5 * col("IQR"))\
            .withColumn("UB", col("Q3") + 1.5 * col("IQR"))\
            .select(labelCol, "LB", "UB").repartition(REPARTITION_CONST).persist()
        self.logger.debug("CUR_ITEMID_ALL_COUNT:{0}".format(
            iqr_data_frame.count()))

    self.logger.debug("[NUM_IQR_FILTER] iqr_data_frame merged")
    if REPARTITION_CONST is None:
        self.logger.debug(
            "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST NOT ASSERTED")
        ret_data_frame = data_frame.repartition(labelCol).join(iqr_data_frame, labelCol)\
            .where((col("LB").cast("float") <= col(inputCol).cast("float")) &
                   (col("UB").cast("float") >= col(inputCol).cast("float")))\
            .drop("LB").drop("UB").persist()
        ref_df = ret_data_frame.repartition(labelCol).groupBy(labelCol)\
            .agg(count(inputCol).alias("ref_count"),
                 avg(inputCol).alias("ref_avg"),
                 stddev_pop(inputCol).alias("ref_std")).persist()
        self.logger.debug("CHECK DF")
        self.logger.debug(ref_df.count())
        self.logger.debug(ret_data_frame.count())
        return (ret_data_frame, ref_df)
    else:
        self.logger.debug(
            "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST ASSERTED: {0}"
            .format(REPARTITION_CONST))
        ret_data_frame = data_frame.join(iqr_data_frame, labelCol)\
            .where((col("LB").cast("float") <= col(inputCol).cast("float")) &
                   (col("UB").cast("float") >= col(inputCol).cast("float")))\
            .drop("LB").drop("UB").repartition(REPARTITION_CONST)
        ref_df = ret_data_frame.groupBy(labelCol)\
            .agg(count(inputCol).alias("ref_count"),
                 avg(inputCol).alias("ref_avg"),
                 stddev_pop(inputCol).alias("ref_std")).repartition(REPARTITION_CONST)
        return (ret_data_frame, ref_df)
    'wnd_direction', 'dest_wnd_direction', 'OP_UNIQUE_CARRIER', 'OP_CARRIER',
    'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST', 'cig_code', 'cig_cavok_code',
    'dest_cig_code', 'dest_cig_cavok_code', 'dest_vis_var_code', 'unique_tail',
    'DAY_OF_WEEK', 'DEP_DEL15_PREV', 'MONTH', 'QUARTER', 'DAY_OF_MONTH',
    'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'DISTANCE_GROUP', 'OD_GROUP')

# drop some problematic NAs
df = df.na.drop(subset=["DEP_DEL15", 'unique_tail'])

# limit for small batch testing
#df = df.limit(130459)

# create a variable that will be used to split the data into train/valid/test later
df = df.withColumn(
    "rank",
    percent_rank().over(Window.partitionBy().orderBy("departure_time")))

# COMMAND ----------

# DBTITLE 1,Convert String and Int. Categorical to Zero-min Categorical
# list of str features
str_features = [
    'DEST_STATE_ABR', 'wnd_direction', 'dest_wnd_direction',
    'OP_UNIQUE_CARRIER', 'OP_CARRIER', 'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST',
    'cig_code', 'cig_cavok_code', 'dest_cig_code', 'dest_cig_cavok_code',
    'dest_vis_var_code', 'unique_tail'
]

# list of int categorical features
int_categorical = [
    'DAY_OF_WEEK', 'DEP_DEL15_PREV', 'MONTH', 'QUARTER', 'DAY_OF_MONTH',
case_status_history.show()

escalation_starts = (case_status_history.filter(
    F.col('is_escalate') == 'Y').groupby('reference_id').agg(
        F.min('seconds_since_case_start').alias('escalation_start'),
        F.max('seconds_since_case_start').alias('case_end')))

escalation_starts.count()  # 646
escalation_starts.filter(F.col('reference_id') == 100087).show()

escalation_points_distribution = (escalation_starts.withColumn(
    'escalation_point_relative_to_case_duration',
    F.col('escalation_start') / F.col('case_end')).withColumn(
        'percent_rank_escalation_point_relative_to_case_duration',
        F.percent_rank().over(Window.partitionBy().orderBy(
            F.asc("escalation_point_relative_to_case_duration")))
    ).withColumn(
        'percentile',
        F.floor(
            F.col('percent_rank_escalation_point_relative_to_case_duration')
            * 100)).groupBy('percentile').agg(
                F.avg('escalation_point_relative_to_case_duration').alias(
                    'average_percentile_escalation_point')))

escalation_points_distribution.withColumn(
    'r', F.round(F.col("average_percentile_escalation_point"), 2)).show(n=100)

escalation_training_ids = (escalation_starts.select('reference_id').distinct())
escalation_training_ids.count()

escalation_training_targets = (case_status_history.join(
    escalation_starts,
# Create a simple window specification:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

ws = Window.orderBy(desc("score"))

from pyspark.sql.functions import row_number, cume_dist, ntile
df.select("score", row_number().over(ws).alias("row_number")).show()
df.select("score", cume_dist().over(ws).alias("cume_dist")).show()
df.select("score", ntile(2).over(ws).alias("ntile(2)")).show()

from pyspark.sql.functions import rank, dense_rank, percent_rank
df.select("score",
          rank().over(ws).alias("rank"),
          dense_rank().over(ws).alias("dense_rank")).show()
df.select("score", percent_rank().over(ws).alias("percent_rank")).show()

from pyspark.sql.functions import lag, lead
df.select("score",
          lag("score", 1).over(ws).alias("lag"),
          lead("score", 2).over(ws).alias("lead")).show()


# ## Compute mean star rating over last five rides
ws = Window.partitionBy("driver_id").orderBy("date_time").rowsBetween(-4, 0)

from pyspark.sql.functions import count, mean, last
rides \
    .select("driver_id", "date_time", "star_rating",
            mean("star_rating").over(ws).alias("moving_average")) \
    .groupby("driver_id") \
    .agg(count("driver_id"), mean("star_rating"), last("moving_average")) \
def data_split(spark, sample_size=0.1, seed=1):
    '''
    Function for data splitting and sub-sampling

    Parameters
    ----------
    spark : spark session object
    sample_size: type-float: Data subsampling fraction, 1 for complete data
    seed: Seed for sampling

    Return
    ----------
    train_final_data: type-Dataframe: Training data
    val_data: type-Dataframe: Validation data
    test_data: type-Dataframe: Testing data
    '''
    # Load dataset
    interactions = spark.read.csv('hdfs:/user/bm106/pub/goodreads/goodreads_interactions.csv',
                                  header=True,
                                  schema="user_id INT, book_id INT, is_read INT, rating INT, is_reviewed INT")

    # Create temp view
    interactions.createOrReplaceTempView('interactions')

    # Keep rows with is_read = 1, is_reviewed = 1 and rating > 0
    filtered_interactions = spark.sql(
        "SELECT * FROM interactions WHERE is_read=1 AND is_reviewed=1 AND rating > 0")
    filtered_interactions.createOrReplaceTempView('filtered_interactions')

    # Keep users with more than ten interactions
    more_than_ten_interactions = spark.sql("SELECT D.book_id, D.is_read, D.rating, D.user_id, D.is_reviewed FROM filtered_interactions D INNER JOIN \
        (SELECT user_id, count(user_id) FROM filtered_interactions GROUP BY user_id HAVING count(user_id) > 10) R ON D.user_id = R.user_id")
    more_than_ten_interactions.createOrReplaceTempView('more_than_ten_interactions')

    # Get all the unique user ids from the generated dataframe
    user_id_sampled = more_than_ten_interactions.select(more_than_ten_interactions.user_id).distinct()

    # Sub-sampling
    if sample_size < 1:
        # Get sub-sampled unique user id list
        user_id_sampled = user_id_sampled.sample(withReplacement=False, fraction=sample_size, seed=seed)
        # Get interactions corresponding to the sub-sampled unique user id list
        more_than_ten_interactions = more_than_ten_interactions.join(
            user_id_sampled, more_than_ten_interactions.user_id == user_id_sampled.user_id, 'inner')\
            .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id, more_than_ten_interactions.rating)

    # Split unique user id data in the ratio Train:0.6, Val:0.2, Test:0.2
    train_users, val_users, test_users = user_id_sampled.randomSplit([0.6, 0.2, 0.2])

    # Get corresponding Train/Val/Test interactions
    train_interactions = more_than_ten_interactions.join(
        train_users, more_than_ten_interactions.user_id == train_users.user_id, 'inner')\
        .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id, more_than_ten_interactions.rating)
    val_interactions = more_than_ten_interactions.join(
        val_users, more_than_ten_interactions.user_id == val_users.user_id, 'inner')\
        .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id, more_than_ten_interactions.rating)
    test_interactions = more_than_ten_interactions.join(
        test_users, more_than_ten_interactions.user_id == test_users.user_id, 'inner')\
        .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id, more_than_ten_interactions.rating)

    # Rank each validation user's interactions
    val_interactions = (val_interactions.select(
        'user_id', 'book_id', 'rating',
        percent_rank().over(Window.partitionBy(val_interactions['user_id'])
                            .orderBy(val_interactions['book_id'])).alias('percent_50')))
    # Reserve 50 percent for training
    val_to_train = val_interactions.filter(col('percent_50') < 0.5).select('user_id', 'book_id', 'rating')
    # Take the remaining 50 percent as validation
    val_data = val_interactions.filter(col('percent_50') >= 0.5).select('user_id', 'book_id', 'rating')

    # Rank each test user's interactions
    test_interactions = (test_interactions.select(
        'user_id', 'book_id', 'rating',
        percent_rank().over(Window.partitionBy(test_interactions['user_id'])
                            .orderBy(test_interactions['book_id'])).alias('percent_50')))
    # Reserve 50 percent for training
    test_to_train = test_interactions.filter(col('percent_50') < 0.5).select('user_id', 'book_id', 'rating')
    # Take the remaining 50 percent as testing
    test_data = test_interactions.filter(col('percent_50') >= 0.5).select('user_id', 'book_id', 'rating')

    # Append the validation rows reserved for training into training
    train_val_data = train_interactions.unionByName(val_to_train)
    # Append the test rows reserved for training into training
    train_final_data = train_val_data.unionByName(test_to_train)

    return train_final_data, val_data, test_data
# In[5]:

sales = order.join(customer, order.o_custkey == customer.c_custkey, how='inner')
sales = sales.join(lineitem, lineitem.l_orderkey == sales.o_orderkey, how='full')
sales = sales.where('c_mktsegment == "BUILDING"').select('l_quantity', 'o_orderdate')
sales = (sales.groupBy('o_orderdate').agg({'l_quantity': 'sum'})
              .withColumnRenamed("sum(l_quantity)", "TOTAL_SALES")
              .withColumnRenamed("o_orderdate", "ORDERDATE"))
sales = (sales.withColumn('DATE', F.unix_timestamp(sales.ORDERDATE))
              .withColumn('DAY', F.dayofmonth(sales.ORDERDATE))
              .withColumn('WDAY', F.dayofweek(sales.ORDERDATE))
              .withColumn('YDAY', F.dayofyear(sales.ORDERDATE))
              .withColumn('WEEK', F.weekofyear(sales.ORDERDATE)))

sales = sales.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("DATE")))
training = sales.where("rank <= .8").drop("rank").drop("ORDERDATE")
testing = sales.where("rank > .8").drop("rank").drop("ORDERDATE")


# In[7]:

featuresCols = training.columns
featuresCols.remove('TOTAL_SALES')
va = VectorAssembler(inputCols=featuresCols, outputCol="features")


# In[8]:

with mlflow.start_run() as run:
    -1, Window.currentRow))

df = (data.withColumn(
    "list_salary",
    F.collect_list(F.col("salary")).over(windowSpec)).withColumn(
        "average_salary",
        F.avg(F.col("salary")).over(windowSpec)).withColumn(
            "total_salary",
            F.sum(F.col("salary")).over(windowSpec)))
df.show()


# In[14]:

windowSpec = Window.partitionBy("dept").orderBy(F.asc("salary"))
df = (data.withColumn(
    "average_salary",
    F.avg(F.col("salary")).over(windowSpec)).withColumn(
        "total_salary",
        F.sum(F.col("salary")).over(windowSpec)).withColumn(
            "rank", F.rank().over(windowSpec)).withColumn(
                "dense_rank", F.dense_rank().over(windowSpec)).withColumn(
                    "perc_rank", F.percent_rank().over(windowSpec)))
df.show()


# In[ ]:
from pyspark.sql.types import FloatType

udf_median = func.udf(median, FloatType())
group_df = df.groupby(['a', 'd'])
df_grouped = group_df.agg(udf_median(func.collect_list(col('c'))).alias('median'))
df_grouped.show()

from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank, col, count

w = Window.partitionBy('Reviewer_Nationality').orderBy(df_nationality.Reviewer_Score)
df_median_countries = df_nationality.select(
        'Reviewer_Nationality', 'Reviewer_Score',
        percent_rank().over(w).alias("percentile"),
        count(col('Reviewer_Nationality')).over(w).alias('N'))\
    .orderBy('N', ascending=False)\
    .where('percentile > 0.48 and percentile < 0.52')

spark.catalog.dropTempView("medians")
df_median_countries.createTempView('medians')
spark.sql("SELECT * FROM medians").show()
spark.sql("SELECT Reviewer_Nationality, \
    AVG(percentile) AS PERC \
    FROM medians \
    GROUP BY Reviewer_Nationality\
    'Itau_open', 'BVSP_open', 'USDBRL_open', 'lag_1', 'lag_2', 'lag_3'
]
vectorAssembler = VectorAssembler(inputCols=FEATURES_COL1, outputCol="features")
vdf = vectorAssembler.transform(df.na.drop())
vdf = vdf.select(
    ['Date', 'Itau_Close', 'features', 'class_N', 'class_NN', 'class_P'])

scale_features = MinMaxScaler(inputCol='features', outputCol='scaled_features')
model_scale = scale_features.fit(vdf)
df_scaled = model_scale.transform(vdf)

FEATURES_COL1 = ['scaled_features', 'class_N', 'class_NN', 'class_P']
vectorAssembler = VectorAssembler(inputCols=FEATURES_COL1, outputCol="Col_features")
df_completed = vectorAssembler.transform(df_scaled)
df_completed = df_completed.select(['Date', 'Itau_Close', 'Col_features'])

df_completed = df_completed.withColumn(
    "rank", percent_rank().over(Window.partitionBy().orderBy("Date")))
train_df = df_completed.where("rank <= .95").drop("rank")
test_df = df_completed.where("rank > .95").drop("rank")

lr = LinearRegression(featuresCol='Col_features', labelCol='Itau_Close')
lr_model = lr.fit(train_df)
lr_predictions = lr_model.transform(test_df)
lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                   labelCol="Itau_Close", metricName="r2")
test_result = lr_model.evaluate(test_df)

predictions = lr_model.transform(test_df)
predictions.select("prediction", "Itau_Close").write.option(
    "header", "true").mode("overwrite").parquet('pred.parquet')
def score_hive(hc, conf):
    with Timer('[score] command'):
        rdd, all_colnames = read_hive_scoring_data(
            hc,
            conf['hive_scoring_data']['input_db_name'],
            conf['hive_scoring_data']['input_table_name'],
            os.path.join(conf['path_to_output'], 'scoring', 'colnames_for_scoring.txt'),
            conf['colname_for_id'],
            log_start_time=False, indent=2, level='debug')
        rdd = rdd.cache()

        label_encoders = import_object(
            os.path.join(conf['path_to_output'], 'scoring', 'label_encoders', 'label_encoders.p'),
            log_start_time=False, indent=2, level='debug')
        label_decoders = {cat_colname: {value: key for key, value in label_encoder.items()}
                          for cat_colname, label_encoder in label_encoders.items()}
        best_model_names = pd.read_csv(
            os.path.join(conf['path_to_output'], 'scoring', 'best_models.csv')
        ).set_index('target')['best_model']

        id_index = all_colnames.index(conf['colname_for_id'].upper())
        targets = conf['targets']

        estimators, feature_info, bias_adjusters = prepare_scoring(
            all_colnames, label_decoders, targets, best_model_names,
            conf['path_to_output'], conf['train_event_rate'],
            conf['fill_values_categorical_missing'],
            log_start_time=False, indent=2, level='debug')
        estimators_bc = sc.broadcast(estimators)
        feature_info_bc = sc.broadcast(feature_info)
        bias_adjusters_bc = sc.broadcast(bias_adjusters)

        def extract_feature(row, index, label, fill_value):
            variable_raw = row[index]
            if variable_raw is None or variable_raw == u'':
                variable_raw = fill_value
            if label is None:
                variable = float(variable_raw)
            else:
                variable = 1.0 if variable_raw == label else 0.0
            return variable

        def extract_records(row):
            records = [[extract_feature(row, index, label, fill_value)
                        for index, label, fill_value in feature_info_bc.value[i]]
                       for i in range(len(targets))]
            return (row[id_index], records)

        def score_chunck(chunck):
            ids, records = zip(*chunck)
            records = zip(*records)
            results = [ids]
            for i in range(len(targets)):
                results.append(estimators_bc.value[i].predict_proba(list(records[i]))[:, 1])
            return zip(*results)

        def adjust_bias(row):
            row_adjusted = list(row)
            for i in range(len(row) - 1):
                bias_adjuster = bias_adjusters_bc.value[i]
                row_adjusted[i + 1] = bias_adjuster[0] * row[i + 1] / (bias_adjuster[1] - bias_adjuster[2] * row[i + 1])
            return row_adjusted

        def make_initial_schema(targets):
            schema = StructType()
            schema_str_list = list()
            for header in targets:
                if header.upper() == conf['colname_for_id'].upper():
                    schema.add(header, data_type=StringType())
                    schema_str_list.append(header + " STRING")
                else:
                    schema.add(header + "_SCORE", data_type=FloatType())
                    schema_str_list.append(header + "_SCORE FLOAT")
            return schema, schema_str_list

        headers = [conf['colname_for_id'].upper()] + conf['targets']
        initial_schema, initial_schema_str_list = make_initial_schema(headers)
        score_cols = [col for col in initial_schema.names if col.endswith("_SCORE")]

        def break_ties(target):
            return (funcs.col(target) +
                    (funcs.randn(conf['random_state']) / funcs.lit(10000000000))).alias(target)

        scores_df = (rdd.map(extract_records)
                        .mapPartitions(score_chunck)
                        .map(adjust_bias)
                        .map(lambda x: [z.item() if type(z) is pd.np.float64 else z for z in x])
                        .toDF(initial_schema)
                        .repartition(conf['colname_for_id'])
                        .select(conf['colname_for_id'], *(break_ties(target) for target in score_cols))
                        .cache())

        if conf['score_analysis_flag'] > 0:
            final_df = None
            for target in score_cols:
                w = Window.orderBy(scores_df[target].desc())
                perc_rank_func = (funcs.percent_rank().over(w))
                rank_func = (funcs.dense_rank().over(w))
                temp = (scores_df.select(conf['colname_for_id'], target,
                                         rank_func.alias(target + "_rank"),
                                         perc_rank_func.alias(target + "_percent_rank"))
                                 .repartition(conf['colname_for_id'])
                                 .cache())
                if final_df is None:
                    final_df = temp
                else:
                    final_df = final_df.join(temp, on=conf['colname_for_id'], how='outer')
        else:
            final_df = scores_df

        full_table_name = (conf['hive_scoring_data']['output_db_name'] + '.' +
                           conf['hive_scoring_data']['output_table_name'])
        drop_sql = "DROP TABLE IF EXISTS " + full_table_name
        hc.sql(drop_sql)
        final_df.registerTempTable('final_table')
        create_sql = ("CREATE TABLE " + full_table_name +
                      " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' STORED AS SEQUENCEFILE AS SELECT * FROM final_table")
        hc.sql(create_sql)