def get_grouped_distance(grouped_sub_chi, partition_col, threshold):
    """ Calculation of chi square percentage and distance from chi measures table """
    grouped_sub_chi_t1 = grouped_sub_chi.withColumnRenamed("chiperc", "t1_chiperc")
    grouped_sub_chi_t2 = grouped_sub_chi.select(
        F.col("category_a").alias("category_b"),
        F.col("category_b").alias("category_a"),
        F.col("chiperc").alias("t2_chiperc"))
    return grouped_sub_chi_t1.join(
        grouped_sub_chi_t2, ["category_a", "category_b"], "outer"
    ).withColumn(
        "distance",
        F.when(
            (F.col("t1_chiperc") > 0) & (F.col("t2_chiperc") > 0),
            1 / ((F.col("t1_chiperc") + F.col("t2_chiperc")) / 2)
        ).otherwise(
            F.when(F.col("t1_chiperc") > 0, 1 / F.col("t1_chiperc")).otherwise(
                F.when(F.col("t2_chiperc") > 0, 1 / F.col("t2_chiperc")).otherwise(0)))
    ).withColumn(
        "rank",
        F.dense_rank().over(Window.partitionBy(partition_col).orderBy("distance"))
    ).withColumn(
        "n_tile",
        F.ntile(100).over(Window.partitionBy(partition_col).orderBy("distance"))
    ).filter(
        (F.col("n_tile") <= threshold) & (F.col("distance") <= 1000)
    ).orderBy("category_a", "distance")
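# Minimal sketch (not part of the original code) of the distance rule used above:
# when both chi percentages are positive the distance is the inverse of their mean,
# when only one is positive it is the inverse of that one, otherwise it is 0.
# The toy DataFrame and its values below are assumptions for illustration only.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window  # get_grouped_distance above also relies on F and Window

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [(4.0, 6.0), (5.0, 0.0), (0.0, 0.0)], ["t1_chiperc", "t2_chiperc"])
demo.withColumn(
    "distance",
    F.when((F.col("t1_chiperc") > 0) & (F.col("t2_chiperc") > 0),
           1 / ((F.col("t1_chiperc") + F.col("t2_chiperc")) / 2))
     .when(F.col("t1_chiperc") > 0, 1 / F.col("t1_chiperc"))
     .when(F.col("t2_chiperc") > 0, 1 / F.col("t2_chiperc"))
     .otherwise(0)).show()
# Rows evaluate to 1/5 = 0.2, 1/5 = 0.2, and 0.0 respectively.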
def test_window_functions_without_partitionBy(self):
    df = self.spark.createDataFrame(
        [(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.orderBy("key", df.value)
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.row_number().over(w),
        F.rank().over(w),
        F.dense_rank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 4, 1, 1, 1, 1),
        ("2", 1, 1, 1, 4, 2, 2, 2, 1),
        ("2", 1, 2, 1, 4, 3, 2, 2, 2),
        ("2", 2, 2, 2, 4, 4, 4, 3, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
def test_window_functions(self):
    df = self.sqlCtx.createDataFrame(
        [(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.row_number().over(w),
        F.rank().over(w),
        F.dense_rank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
def split_by_row_index(df, num_partitions=2):
    # Let's assume you don't have a row_id column that has the row order
    t = df.withColumn('_row_id', monotonically_increasing_id())
    # Using ntile() because monotonically_increasing_id is discontinuous across partitions
    t = t.withColumn('_partition', ntile(num_partitions).over(Window.orderBy(t._row_id)))
    return [
        t.filter(t._partition == i + 1).drop('_row_id', '_partition')
        for i in range(num_partitions)
    ]
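# Hedged usage sketch (not from the original snippet): split a toy DataFrame into
# two halves by row order. These are also the imports split_by_row_index relies on.
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, ntile
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy_df = spark.createDataFrame([(i,) for i in range(10)], ["x"])
first_half, second_half = split_by_row_index(toy_df, num_partitions=2)
first_half.show()   # typically the first 5 rows by insertion order
second_half.show()  # the remaining 5 rows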
def train(self, df, featureCols):
    ntiles = []
    for col in featureCols:
        w = Window.partitionBy().orderBy(col)
        aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
        ntiles.append(list(aux.groupby('ntile').max(col).collect()))
    self.ntiles_ = np.array(ntiles)
    self.columns_ = list(map(str, featureCols))  # list() so the names remain reusable on Python 3
    self._is_trained = True
def get_median_index(grouped_chi, median_col, partition_col):
    """ Calculation of median index per category from partial index """
    median_udf = F.udf(median, T.FloatType())
    return grouped_chi.select(partition_col, median_col).withColumn(
        'ntile',
        F.ntile(2).over(
            Window.partitionBy(partition_col).orderBy(F.col(median_col).desc()))
    ).groupBy(partition_col, 'ntile').agg(
        F.count(F.col(median_col)).alias('ntile_count'),
        F.max(F.col(median_col)).alias('ntile_max'),
        F.min(F.col(median_col)).alias('ntile_min')
    ).groupBy(partition_col).agg(
        F.min('ntile_max').alias('1st_tile'),
        F.max('ntile_min').alias('2nd_tile'),
        F.sum('ntile_count').alias('count')
    ).select(
        partition_col,
        median_udf(F.col('1st_tile'), F.col('2nd_tile'),
                   F.col('count')).alias('median_index'))
def ks_func(predictions, y, probability):
    decileDF = predictions.select(y, probability)
    decileDF = decileDF.withColumn('non_target', 1 - decileDF[y])
    window = Window.orderBy(desc(probability))
    decileDF = decileDF.withColumn("rownum", F.row_number().over(window))
    decileDF.cache()
    decileDF = decileDF.withColumn("rownum", decileDF["rownum"].cast("double"))
    window2 = Window.orderBy("rownum")
    RFbucketedData = decileDF.withColumn("deciles", F.ntile(10).over(window2))
    RFbucketedData = RFbucketedData.withColumn(
        'deciles', RFbucketedData['deciles'].cast("int"))
    RFbucketedData.cache()

    ## to pandas from here
    print('KS calculation starting')
    target_cnt = RFbucketedData.groupBy('deciles').agg(
        F.sum(y).alias('target')).toPandas()
    non_target_cnt = RFbucketedData.groupBy('deciles').agg(
        F.sum("non_target").alias('non_target')).toPandas()
    overall_cnt = RFbucketedData.groupBy('deciles').count().alias('Total').toPandas()
    overall_cnt = overall_cnt.merge(target_cnt, on='deciles', how='inner').merge(
        non_target_cnt, on='deciles', how='inner')
    overall_cnt = overall_cnt.sort_values(by='deciles', ascending=True)
    overall_cnt['Pct_target'] = (overall_cnt['target'] / overall_cnt['count']) * 100
    overall_cnt['cum_target'] = overall_cnt.target.cumsum()
    overall_cnt['cum_non_target'] = overall_cnt.non_target.cumsum()
    overall_cnt['%Dist_Target'] = (overall_cnt['cum_target'] /
                                   overall_cnt.target.sum()) * 100
    overall_cnt['%Dist_non_Target'] = (overall_cnt['cum_non_target'] /
                                       overall_cnt.non_target.sum()) * 100
    overall_cnt['spread'] = builtins.abs(overall_cnt['%Dist_Target'] -
                                         overall_cnt['%Dist_non_Target'])
    decile_table = overall_cnt.round(2)
    print("KS_Value =", builtins.round(overall_cnt.spread.max(), 2))
    decileDF.unpersist()
    RFbucketedData.unpersist()
    return builtins.round(overall_cnt.spread.max(), 2), overall_cnt
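# Hedged usage sketch: ks_func expects a predictions DataFrame with a binary label
# column and a numeric probability column, both passed in by name. The toy data and
# column names ("label", "prob") below are assumptions for illustration, and the
# imports are the module-level names ks_func relies on.
import builtins
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import desc
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy_preds = spark.createDataFrame(
    [(1, 0.91), (0, 0.85), (1, 0.70), (0, 0.40), (0, 0.15), (1, 0.55)],
    ["label", "prob"])
ks_value, ks_table = ks_func(toy_preds, y="label", probability="prob")
print("KS =", ks_value)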
def compile_ntile(t, expr, scope, *, window, **kwargs):
    op = expr.op()
    buckets = op.buckets.op().value
    return F.ntile(buckets).over(window)
interactions = interactions.withColumn(
    'rating', interactions['rating'].cast(IntegerType()))
user_splits = users.randomSplit([0.6, 0.2, 0.2], seed=seed)
user_splits[0].createOrReplaceTempView('train_users')
user_splits[1].createOrReplaceTempView('val_users')
user_splits[2].createOrReplaceTempView('test_users')
interactions.createOrReplaceTempView('interactions')

byUser = Window.partitionBy('user').orderBy('book')

val_interactions_raw = spark.sql(
    'SELECT * FROM interactions WHERE user IN (SELECT _c0 FROM val_users)')
val_interactions_raw = val_interactions_raw.select(
    'user', 'book', 'rating', ntile(2).over(byUser).alias('half'))
test_interactions_raw = spark.sql(
    'SELECT * FROM interactions WHERE user IN (SELECT _c0 FROM test_users)')
test_interactions_raw = test_interactions_raw.select(
    'user', 'book', 'rating', ntile(2).over(byUser).alias('half'))
val_interactions_raw.createOrReplaceTempView('val_raw')
test_interactions_raw.createOrReplaceTempView('test_raw')

val = spark.sql('SELECT user, book, rating FROM val_raw where half = 1')
val_train = spark.sql('SELECT user, book, rating FROM val_raw where half = 2')
test = spark.sql('SELECT user, book, rating FROM test_raw where half = 1')
test_train = spark.sql('SELECT user, book, rating FROM test_raw where half = 2')
def decile_plot(df, y_column, model, model_input_col='features',
                columns_to_exclude=(), num_deciles=10):
    """Sort the data points by the predicted positive-class probability, divide
    them into bins, and plot the bins by cumulative precision and recall in two plots.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : pyspark.ml model
        The model object to be evaluated

    model_input_col : str, optional (default='features')
        The name of the input column of the model; this is also the name of the
        output column of the VectorAssembler that creates the feature vector

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    num_deciles : int, optional (default=10)
        Number of bars to be plotted; each bar represents about 1/num_deciles of the data

    Returns
    -------
    plot_wrapper : pytalite_spark.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot

    Raises
    ------
    ValueError
        If the number of deciles exceeds 50
    """
    # Validation check
    if num_deciles > 50:
        raise ValueError("The number of deciles cannot exceed 50")

    # Get preprocessed df and create vector assembler
    df, assembler = preprocess(df, y_column, model_input_col, columns_to_exclude)

    # Predict probability
    pred = model.transform(assembler.transform(df))
    prob_label = pred.select(p1_proba('probability').alias('p1'), F.col(y_column))

    # Label each row with appropriate deciles
    decile_window = Window.partitionBy().orderBy(prob_label.p1)
    decile_prob_label = prob_label.select(
        F.ntile(num_deciles).over(decile_window).alias("decile"), 'p1', y_column)
    decile_prob_label.cache()

    # Calculate decile size and true counts
    decile_stats = decile_prob_label.groupBy('decile') \
        .agg(F.count(y_column).alias('size'), F.sum(y_column).alias('true_count')) \
        .crossJoin(decile_prob_label.select(F.sum(y_column).alias('true_sum')))

    cum_window = Window.orderBy(decile_stats.decile.desc()).rangeBetween(
        Window.unboundedPreceding, 0)

    # Calculate decile scores
    scores = decile_stats.select(
        F.sum('true_count').over(cum_window).alias('cum_count'),
        F.sum('size').over(cum_window).alias('cum_size'),
        F.col('true_sum'))
    scores = scores.select(F.col('cum_count') / F.col('cum_size'),
                           F.col('cum_count') / F.col('true_sum')) \
        .toPandas().values
    cum_decile_precision = scores[:, 0]
    cum_decile_recall = scores[:, 1]

    # Create decile plot
    with plt.style.context(style_path):
        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 4))
        xticks = np.arange(0, 11) / 10
        xtick_labels = list(map(lambda x: "%d%%" % x, xticks * 100))
        xs = np.arange(num_deciles) / num_deciles

        # Draw bar plot
        bar_plot(ax1, xs, cum_decile_precision, width=1 / num_deciles, align='edge',
                 ylim=(0, np.max(cum_decile_precision) * 1.2),
                 ylabel="True Label Fraction", edge_color='w', bar_label=False)

        # Create cumulative decile plot
        ax2 = plt.subplot(2, 1, 2, sharex=ax1)

        # Draw bar plot
        bar_plot(ax2, xs, cum_decile_recall, width=1 / num_deciles, align='edge',
                 xticks=xticks, xticklabels=xtick_labels, xlim=(0, 1), xlabel="Deciles",
                 ylim=(0, np.max(cum_decile_recall) * 1.2),
                 ylabel="Cumulative True Label Fraction", bar_color=clr.main[0],
                 edge_color='w', bar_label=False)

        plt.show()

    return PlotWrapper(fig, (ax1, ax2),
                       {"shared_x": xs,
                        "cum_precision_score": cum_decile_precision,
                        "cum_recall_score": cum_decile_recall})
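# Hedged usage sketch: only the call is shown here, since decile_plot depends on
# pytalite_spark helpers (preprocess, p1_proba, bar_plot, PlotWrapper). The DataFrame
# and model names below are hypothetical placeholders; the parameters follow the
# docstring above.
plot_wrapper = decile_plot(train_df, y_column="label", model=fitted_model,
                           columns_to_exclude=("id",), num_deciles=10)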
# ## Examples of various window functions

# Create a simple DataFrame:
data = [(100, ), (95, ), (95, ), (88, ), (73, ), (73, )]
df = spark.createDataFrame(data, ["score"])
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
ws = Window.orderBy(desc("score"))

from pyspark.sql.functions import row_number, cume_dist, ntile
df.select("score", row_number().over(ws).alias("row_number")).show()
df.select("score", cume_dist().over(ws).alias("cume_dist")).show()
df.select("score", ntile(2).over(ws).alias("ntile(2)")).show()

from pyspark.sql.functions import rank, dense_rank, percent_rank
df.select("score", rank().over(ws).alias("rank"),
          dense_rank().over(ws).alias("dense_rank")).show()
df.select("score", percent_rank().over(ws).alias("percent_rank")).show()

from pyspark.sql.functions import lag, lead
df.select("score", lag("score", 1).over(ws).alias("lag"),
          lead("score", 2).over(ws).alias("lead")).show()

# ## Compute mean star rating over last five rides
ws = Window.partitionBy("driver_id").orderBy("date_time").rowsBetween(-4, 0)
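# A hedged sketch of the "mean star rating over last five rides" computation the
# window spec above is built for. The rides DataFrame and its star_rating column are
# assumptions for illustration; only driver_id and date_time appear in the spec above.
from pyspark.sql.functions import avg
rides = spark.createDataFrame(
    [("d1", "2024-01-01 10:00", 5.0), ("d1", "2024-01-01 11:00", 4.0),
     ("d1", "2024-01-01 12:00", 3.0)],
    ["driver_id", "date_time", "star_rating"])
rides.select("driver_id", "date_time", "star_rating",
             avg("star_rating").over(ws).alias("mean_rating_last_5_rides")).show()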
# |  1|       a|    1|
# |  1|       a|    1|
# |  1|       b|    2|
# |  2|       a|    1|
# |  2|       b|    2|
# |  3|       b|    1|
# |  3|       c|    1|
# |  3|       d|    2|
# |  3|       e|    2|
# +---+--------+-----+

# Answer
window = Window.partitionBy("id").orderBy("id")
df.withColumn("ntile", F.ntile(2).over(window)).sort("id", "category").show()

# COMMAND ----------

# CUBE, ROLLUP functions

# COMMAND ----------

# Given the following DataFrame, build a cube with a count aggregation on both columns:
data = [("item1", 2), ("item2", 5), ("item3", 20), ("item2", 20), ("item1", 10), ("item1", 5)]
df = spark.createDataFrame(data, ["Item_Name", "Quantity"])

# Expected
# +---------+--------+-----+
# |Item_Name|Quantity|count|
# +---------+--------+-----+
# |     null|    null|    6|
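# A possible answer for the CUBE exercise above (hedged sketch): cube() aggregates
# over every combination of the listed columns, including the (null, null) grand
# total row shown in the expected output.
df.cube("Item_Name", "Quantity").count().orderBy("Item_Name", "Quantity").show()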
# dense_rank Window Function
'''
dense_rank() returns the rank of rows within a window partition without any gaps.
It is similar to rank(), the difference being that rank() leaves gaps in the
ranking when there are ties.
'''
df.withColumn("dense_rank", dense_rank().over(window_spec)) \
    .show()

# percent_rank Window Function
df.withColumn("percent_rank", percent_rank().over(window_spec)) \
    .show()

# ntile Window Function
'''
ntile() divides the rows of a window partition into n roughly equal buckets and
returns the bucket number. In the example below, 2 is passed as the argument, so
each row gets a bucket number of 1 or 2.
'''
df.withColumn("ntile", ntile(2).over(window_spec)) \
    .show()

# PySpark Window Analytic functions

# cume_dist Window Function
'''
cume_dist() returns the cumulative distribution of values within a window partition.
'''
df.withColumn("cume_dist", cume_dist().over(window_spec)) \
    .show()
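# The snippet above assumes that df and window_spec already exist. A minimal,
# hypothetical setup that makes it runnable might look like this (the column
# names and sample rows are assumptions, not from the original):
from pyspark.sql import SparkSession
from pyspark.sql.functions import dense_rank, percent_rank, ntile, cume_dist
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("Sales", 3000), ("Sales", 4600), ("Sales", 4100),
     ("Finance", 3000), ("Finance", 3900)],
    ["department", "salary"])
window_spec = Window.partitionBy("department").orderBy("salary")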
def calculate_metrics(predictions, y, data_type):
    start_time4 = time.time()

    # Calculate ROC
    evaluator = BinaryClassificationEvaluator(labelCol=y, rawPredictionCol='probability')
    auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    print('AUC calculated', auroc)

    selectedCols = predictions.select(
        F.col("probability"), F.col('prediction'), F.col(y)).rdd.map(
            lambda row: (float(row['probability'][1]), float(row['prediction']),
                         float(row[y]))).collect()
    y_score, y_pred, y_true = zip(*selectedCols)

    # Calculate Accuracy
    accuracydf = predictions.withColumn(
        'acc', F.when(predictions.prediction == predictions[y], 1).otherwise(0))
    accuracydf.createOrReplaceTempView("accuracyTable")
    RFaccuracy = spark.sql(
        "select sum(acc)/count(1) as accuracy from accuracyTable").collect()[0][0]
    print('Accuracy calculated', RFaccuracy)

    # Build KS Table
    split1_udf = udf(lambda value: value[1].item(), DoubleType())

    if data_type in ['train', 'valid', 'test', 'oot1', 'oot2']:
        decileDF = predictions.select(y, split1_udf('probability').alias('probability'))
    else:
        decileDF = predictions.select(y, 'probability')

    decileDF = decileDF.withColumn('non_target', 1 - decileDF[y])
    window = Window.orderBy(desc("probability"))
    decileDF = decileDF.withColumn("rownum", F.row_number().over(window))
    decileDF.cache()
    decileDF = decileDF.withColumn("rownum", decileDF["rownum"].cast("double"))
    window2 = Window.orderBy("rownum")
    RFbucketedData = decileDF.withColumn("deciles", F.ntile(10).over(window2))
    RFbucketedData = RFbucketedData.withColumn('deciles', RFbucketedData['deciles'].cast("int"))
    RFbucketedData.cache()
    #a = RFbucketedData.count()
    #print(RFbucketedData.show())

    ## to pandas from here
    print('KS calculation starting')
    target_cnt = RFbucketedData.groupBy('deciles').agg(F.sum(y).alias('target')).toPandas()
    non_target_cnt = RFbucketedData.groupBy('deciles').agg(
        F.sum("non_target").alias('non_target')).toPandas()
    overall_cnt = RFbucketedData.groupBy('deciles').count().alias('Total').toPandas()
    overall_cnt = overall_cnt.merge(target_cnt, on='deciles', how='inner').merge(
        non_target_cnt, on='deciles', how='inner')
    overall_cnt = overall_cnt.sort_values(by='deciles', ascending=True)
    overall_cnt['Pct_target'] = (overall_cnt['target'] / overall_cnt['count']) * 100
    overall_cnt['cum_target'] = overall_cnt.target.cumsum()
    overall_cnt['cum_non_target'] = overall_cnt.non_target.cumsum()
    overall_cnt['%Dist_Target'] = (overall_cnt['cum_target'] / overall_cnt.target.sum()) * 100
    overall_cnt['%Dist_non_Target'] = (overall_cnt['cum_non_target'] / overall_cnt.non_target.sum()) * 100
    overall_cnt['spread'] = builtins.abs(overall_cnt['%Dist_Target'] - overall_cnt['%Dist_non_Target'])
    decile_table = overall_cnt.round(2)
    print("KS_Value =", builtins.round(overall_cnt.spread.max(), 2))
    #print "Test Error =", builtin.round((1.0 - RFaccuracy),3)
    #print "Accuracy =", builtin.round(RFaccuracy,3)
    #print "AUC=", builtin.round(auroc,3)
    decileDF.unpersist()
    RFbucketedData.unpersist()
    print("Metrics calculation process Completed in : " + " %s seconds" % (time.time() - start_time4))
    return auroc, RFaccuracy, builtins.round(overall_cnt.spread.max(), 2), y_score, y_pred, y_true, overall_cnt