Example #1
def get_grouped_distance(grouped_sub_chi, partition_col, threshold):
    """
    Calculation of chi square percentage and distance from chi measures table
    """
    grouped_sub_chi_t1 = grouped_sub_chi.withColumnRenamed(
        "chiperc", "t1_chiperc")
    grouped_sub_chi_t2 = grouped_sub_chi.select(
        F.col("category_a").alias("category_b"),
        F.col("category_b").alias("category_a"),
        F.col("chiperc").alias("t2_chiperc"))
    return grouped_sub_chi_t1.join(grouped_sub_chi_t2, [
        "category_a", "category_b"
    ], "outer").withColumn(
        "distance",
        F.when(
            (F.col("t1_chiperc") > 0) & (F.col("t2_chiperc") > 0),
            1 / ((F.col("t1_chiperc") + F.col("t2_chiperc")) / 2)).otherwise(
                F.when(F.col("t1_chiperc") > 0,
                       1 / F.col("t1_chiperc")).otherwise(
                           F.when(
                               F.col("t2_chiperc") > 0,
                               1 / F.col("t2_chiperc")).otherwise(0)))
    ).withColumn(
        "rank",
        F.dense_rank().over(
            Window.partitionBy(partition_col).orderBy("distance"))).withColumn(
                "n_tile",
                F.ntile(100).over(
                    Window.partitionBy(partition_col).orderBy("distance"))
            ).filter((F.col("n_tile") <= threshold)
                     & (F.col("distance") <= 1000)).orderBy(
                         "category_a", "distance")
Example #2
    def test_window_functions_without_partitionBy(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"),
                                         (1, "2")], ["key", "value"])
        w = Window.orderBy("key", df.value)
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.row_number().over(w),
            F.rank().over(w),
            F.dense_rank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 4, 1, 1, 1, 1),
            ("2", 1, 1, 1, 4, 2, 2, 2, 1),
            ("2", 1, 2, 1, 4, 3, 2, 2, 2),
            ("2", 2, 2, 2, 4, 4, 4, 3, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
Example #3
    def test_window_functions(self):
        df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        w = Window.partitionBy("value").orderBy("key")
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.rowNumber().over(w),   # Spark 1.x name; F.row_number() in Spark 2.0+
            F.rank().over(w),
            F.denseRank().over(w),   # Spark 1.x name; F.dense_rank() in Spark 2.0+
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 1, 1, 1, 1, 1),
            ("2", 1, 1, 1, 3, 1, 1, 1, 1),
            ("2", 1, 2, 1, 3, 2, 1, 1, 1),
            ("2", 2, 2, 2, 3, 3, 3, 2, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[: len(r)])
Example #4
def split_by_row_index(df, num_partitions=2):
    # Let's assume you don't have a row_id column that has the row order
    t = df.withColumn('_row_id', monotonically_increasing_id())
    # Using ntile() because monotonically_increasing_id is discontinuous across partitions
    t = t.withColumn('_partition',
                     ntile(num_partitions).over(Window.orderBy(t._row_id)))
    return [
        t.filter(t._partition == i + 1).drop('_row_id', '_partition')
        for i in range(num_partitions)
    ]
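
# Usage sketch (assumes an active SparkSession `spark`; the function above relies
# on these imports):
from pyspark.sql.functions import monotonically_increasing_id, ntile
from pyspark.sql.window import Window

df = spark.createDataFrame([(i,) for i in range(10)], ["x"])
first_half, second_half = split_by_row_index(df, num_partitions=2)
first_half.count()   # 5 rows
second_half.count()  # 5 rows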
Example #5
    def train(self, df, featureCols):

        # Compute per-feature ntile boundaries (max value within each ntile bucket).
        ntiles = []
        for col in featureCols:
            w = Window.partitionBy().orderBy(col)
            aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
            ntiles.append(list(aux.groupby('ntile').max(col).collect()))

        self.ntiles_ = np.array(ntiles)
        self.columns_ = list(map(str, featureCols))  # list() so the names survive Python 3's lazy map
        self._is_trained = True
Example #6
def get_median_index(grouped_chi, median_col, partition_col):
    """
    Calculation of median index per category from partial index
    """
    median_udf = F.udf(median, T.FloatType())

    return grouped_chi.select(partition_col, median_col).withColumn(
        'ntile',
        F.ntile(2).over(Window().partitionBy(partition_col).orderBy(
            F.col(median_col).desc()))).groupBy(partition_col, 'ntile').agg(
                F.count(F.col(median_col)).alias('ntile_count'),
                F.max(F.col(median_col)).alias('ntile_max'),
                F.min(F.col(median_col)).alias('ntile_min')).groupBy(
                    partition_col).agg(
                        F.min('ntile_max').alias('1st_tile'),
                        F.max('ntile_min').alias('2nd_tile'),
                        F.sum('ntile_count').alias('count')).select(
                            partition_col,
                            median_udf(F.col('1st_tile'), F.col('2nd_tile'),
                                       F.col('count')).alias('median_index'))
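
# The snippet above wraps a `median` helper that is not shown. A plausible sketch,
# given the (1st_tile, 2nd_tile, count) arguments it receives (this is an
# assumption, not the original implementation):
def median(first_tile, second_tile, count):
    if not count:
        return None
    if count % 2 == 0:
        # Even count: the median is the mean of the two values on either side
        # of the ntile(2) boundary.
        return float(first_tile + second_tile) / 2.0
    # Odd count: ntile(2) puts the extra row in the first (higher-valued) bucket,
    # so the middle value is that bucket's minimum, i.e. 2nd_tile.
    return float(second_tile)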
Example #7
def ks_func(predictions, y, probability):
    decileDF = predictions.select(y, probability)
    decileDF = decileDF.withColumn('non_target', 1 - decileDF[y])
    window = Window.orderBy(desc(probability))
    decileDF = decileDF.withColumn("rownum", F.row_number().over(window))
    decileDF.cache()
    decileDF = decileDF.withColumn("rownum", decileDF["rownum"].cast("double"))
    window2 = Window.orderBy("rownum")
    RFbucketedData = decileDF.withColumn("deciles", F.ntile(10).over(window2))
    RFbucketedData = RFbucketedData.withColumn(
        'deciles', RFbucketedData['deciles'].cast("int"))
    RFbucketedData.cache()
    ## to pandas from here
    print('KS calculation starting')
    target_cnt = RFbucketedData.groupBy('deciles').agg(
        F.sum(y).alias('target')).toPandas()
    non_target_cnt = RFbucketedData.groupBy('deciles').agg(
        F.sum("non_target").alias('non_target')).toPandas()
    overall_cnt = RFbucketedData.groupBy('deciles').count().alias(
        'Total').toPandas()
    overall_cnt = overall_cnt.merge(target_cnt, on='deciles',
                                    how='inner').merge(non_target_cnt,
                                                       on='deciles',
                                                       how='inner')
    overall_cnt = overall_cnt.sort_values(by='deciles', ascending=True)
    overall_cnt['Pct_target'] = (overall_cnt['target'] /
                                 overall_cnt['count']) * 100
    overall_cnt['cum_target'] = overall_cnt.target.cumsum()
    overall_cnt['cum_non_target'] = overall_cnt.non_target.cumsum()
    overall_cnt['%Dist_Target'] = (overall_cnt['cum_target'] /
                                   overall_cnt.target.sum()) * 100
    overall_cnt['%Dist_non_Target'] = (overall_cnt['cum_non_target'] /
                                       overall_cnt.non_target.sum()) * 100
    overall_cnt['spread'] = builtins.abs(overall_cnt['%Dist_Target'] -
                                         overall_cnt['%Dist_non_Target'])
    decile_table = overall_cnt.round(2)
    print("KS_Value =", builtins.round(overall_cnt.spread.max(), 2))
    decileDF.unpersist()
    RFbucketedData.unpersist()
    return builtins.round(overall_cnt.spread.max(), 2), overall_cnt
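
# Usage sketch (hypothetical names: `predictions` is the scored output of a fitted
# binary classifier, with a 0/1 label column and a plain numeric probability column):
ks_value, ks_table = ks_func(predictions, 'label', 'p1')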
Example #8
def compile_ntile(t, expr, scope, *, window, **kwargs):
    op = expr.op()

    buckets = op.buckets.op().value
    return F.ntile(buckets).over(window)
Example #9
interactions = interactions.withColumn(
    'rating', interactions['rating'].cast(IntegerType()))

user_splits = users.randomSplit([0.6, 0.2, 0.2], seed=seed)
user_splits[0].createOrReplaceTempView('train_users')
user_splits[1].createOrReplaceTempView('val_users')
user_splits[2].createOrReplaceTempView('test_users')

interactions.createOrReplaceTempView('interactions')

byUser = Window.partitionBy('user').orderBy('book')
val_interactions_raw = spark.sql(
    'SELECT * FROM interactions WHERE user IN (SELECT _c0 FROM val_users)')
val_interactions_raw = val_interactions_raw.select(
    'user', 'book', 'rating',
    ntile(2).over(byUser).alias('half'))

test_interactions_raw = spark.sql(
    'SELECT * FROM interactions WHERE user IN (SELECT _c0 FROM test_users)')
test_interactions_raw = test_interactions_raw.select(
    'user', 'book', 'rating',
    ntile(2).over(byUser).alias('half'))

val_interactions_raw.createOrReplaceTempView('val_raw')
test_interactions_raw.createOrReplaceTempView('test_raw')

val = spark.sql('SELECT user, book, rating FROM val_raw where half = 1')
val_train = spark.sql('SELECT user, book, rating FROM val_raw where half = 2')

test = spark.sql('SELECT user, book, rating FROM test_raw where half = 1')
test_train = spark.sql('SELECT user, book, rating FROM test_raw where half = 2')
Example #10
def decile_plot(df, y_column, model, model_input_col='features', columns_to_exclude=(), num_deciles=10):
    """The function sorts the data points by the predicted positive class probability and divide them into bins.
     It plots bins based on the cumulative precision and recall in two plots.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : pyspark.ml model
        The model object to be evaluated

    model_input_col : str, optional (default='features')
        The name of the input column of the model, this is also the name of the output column of the VectorAssembler
        that creates the feature vector

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    num_deciles : int, optional (default=10)
        Number of bars to be plotted, each bar represents about 1/num_deciles of the data

    Returns
    -------
    plot_wrapper : pytalite_spark.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot

    Raises
    ------
    ValueError
        If the number of deciles exceeds 50
    """

    # Validation check
    if num_deciles > 50:
        raise ValueError("The number of deciles cannot exceed 50")

    # Get preprocessed df and create vector assembler
    df, assembler = preprocess(df, y_column, model_input_col, columns_to_exclude)

    # Predict probability
    pred = model.transform(assembler.transform(df))
    prob_label = pred.select(p1_proba('probability').alias('p1'), F.col(y_column))

    # Label each row with appropriate deciles
    decile_window = Window.partitionBy().orderBy(prob_label.p1)
    decile_prob_label = prob_label.select(F.ntile(num_deciles).over(decile_window).alias("decile"), 'p1', y_column)
    decile_prob_label.cache()

    # Calculate decile size and true counts
    decile_stats = decile_prob_label.groupBy('decile') \
                                    .agg(F.count(y_column).alias('size'), F.sum(y_column).alias('true_count')) \
                                    .crossJoin(decile_prob_label.select(F.sum(y_column).alias('true_sum')))

    cum_window = Window.orderBy(decile_stats.decile.desc()).rangeBetween(Window.unboundedPreceding, 0)

    # Calculate decile scores
    scores = decile_stats.select(F.sum('true_count').over(cum_window).alias('cum_count'),
                                 F.sum('size').over(cum_window).alias('cum_size'), F.col('true_sum'))

    scores = scores.select(F.col('cum_count') / F.col('cum_size'), F.col('cum_count') / F.col('true_sum'))\
                   .toPandas().values

    cum_decile_precision = scores[:, 0]
    cum_decile_recall = scores[:, 1]

    # Create decile plot
    with plt.style.context(style_path):
        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,4))
        xticks = np.arange(0, 11) / 10
        xtick_labels = list(map(lambda x: "%d%%" % x, xticks * 100))
        xs = np.arange(num_deciles) / num_deciles

        # Draw bar plot
        bar_plot(ax1, xs, cum_decile_precision, width=1 / num_deciles, align='edge',
                 ylim=(0, np.max(cum_decile_precision) * 1.2), ylabel="True Label Fraction",
                 edge_color='w', bar_label=False)

        # Create cumulative decile plot
        ax2 = plt.subplot(2, 1, 2, sharex=ax1)

        # Draw bar plot
        bar_plot(ax2, xs, cum_decile_recall, width=1 / num_deciles, align='edge',
                 xticks=xticks, xticklabels=xtick_labels, xlim=(0, 1), xlabel="Deciles",
                 ylim=(0, np.max(cum_decile_recall) * 1.2), ylabel="Cumulative True Label Fraction",
                 bar_color=clr.main[0], edge_color='w', bar_label=False)

    plt.show()

    return PlotWrapper(fig, (ax1, ax2), {"shared_x": xs, "cum_precision_score": cum_decile_precision,
                                         "cum_recall_score": cum_decile_recall})
# ## Examples of various window functions

# Create a simple DataFrame:
data = [(100, ), (95, ), (95, ), (88, ), (73, ), (73, )]
df = spark.createDataFrame(data, ["score"])
df.show()

# Create a simple window specification:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
ws = Window.orderBy(desc("score"))

from pyspark.sql.functions import row_number, cume_dist, ntile
df.select("score", row_number().over(ws).alias("row_number")).show()
df.select("score", cume_dist().over(ws).alias("cume_dist")).show()
df.select("score", ntile(2).over(ws).alias("ntile(2)")).show()

from pyspark.sql.functions import rank, dense_rank, percent_rank
df.select("score",
          rank().over(ws).alias("rank"),
          dense_rank().over(ws).alias("dense_rank")).show()
df.select("score", percent_rank().over(ws).alias("percent_rank")).show()

from pyspark.sql.functions import lag, lead
df.select("score",
          lag("score", count=1).over(ws).alias("lag"),
          lead("score", count=2).over(ws).alias("lead")).show()

# ## Compute mean star rating over last five rides

ws = Window.partitionBy("driver_id").orderBy("date_time").rowsBetween(-4, 0)
# Expected (for the ntile exercise answered below):
# +---+--------+-----+
# | id|category|ntile|
# +---+--------+-----+
# |  1|       a|    1|
# |  1|       a|    1|
# |  1|       b|    2|
# |  2|       a|    1|
# |  2|       b|    2|
# |  3|       b|    1|
# |  3|       c|    1|
# |  3|       d|    2|
# |  3|       e|    2|
# +---+--------+-----+



# Answer
window = Window.partitionBy("id").orderBy("id")
df.withColumn("ntile", F.ntile(2).over(window)).sort("id", "category").show()

# COMMAND ----------

# CUBE, ROLLUP functions

# COMMAND ----------

# You have the following DataFrame; build a cube with a count aggregation over both columns:
data = [("item1", 2), ("item2", 5), ("item3", 20), ("item2", 20), ("item1", 10), ("item1", 5)]
df = spark.createDataFrame(data, ["Item_Name", "Quantity"])
# Expected
# +---------+--------+-----+
# |Item_Name|Quantity|count|
# +---------+--------+-----+
# |     null|    null|    6|
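
# A possible answer, following the "# Answer" convention above (a sketch, not
# necessarily the notebook's original solution):
df.cube("Item_Name", "Quantity").count().show()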
Example #13
#dense_rank Window Function
'''
dense_rank() window function is used to get the rank of rows within a window partition without any gaps.
It is similar to rank(), the difference being that rank() leaves gaps in the ranking when there are ties.
'''
df.withColumn("dense_rank",dense_rank().over(window_spec)) \
    .show()
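
# For contrast, rank() leaves gaps in the ranking after ties (a sketch assuming
# rank is imported alongside the other window functions used here):
df.withColumn("rank", rank().over(window_spec)) \
    .show()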

#percent_rank Window Function
df.withColumn("percent_rank",percent_rank().over(window_spec)) \
    .show()


#ntile Window Function
'''
ntile() window function distributes the rows of a window partition into n roughly equal buckets and returns the bucket number for each row.
In the example below we pass 2 as the argument, so every row is assigned a bucket id of 1 or 2.
'''
df.withColumn("ntile",ntile(2).over(window_spec)) \
    .show()


#PySpark Window Analytic functions
#cume_dist Window Function
'''
cume_dist() window function is used to get the cumulative distribution of values within a window partition.
'''
df.withColumn("cume_dist",cume_dist().over(window_spec)) \
    .show()
def calculate_metrics(predictions,y,data_type):
    start_time4 = time.time()

    # Calculate ROC
    evaluator = BinaryClassificationEvaluator(labelCol=y,rawPredictionCol='probability')
    auroc = evaluator.evaluate(predictions,{evaluator.metricName: "areaUnderROC"})
    print('AUC calculated',auroc)

    selectedCols = predictions.select(F.col("probability"), F.col('prediction'), F.col(y)).rdd.map(lambda row: (float(row['probability'][1]), float(row['prediction']), float(row[y]))).collect()
    y_score, y_pred, y_true = zip(*selectedCols)

    # Calculate Accuracy
    accuracydf = predictions.withColumn('acc', F.when(predictions.prediction == predictions[y], 1).otherwise(0))
    accuracydf.createOrReplaceTempView("accuracyTable")
    RFaccuracy = spark.sql("select sum(acc)/count(1) as accuracy from accuracyTable").collect()[0][0]
    print('Accuracy calculated', RFaccuracy)

#     # Build KS Table
    split1_udf = udf(lambda value: value[1].item(), DoubleType())

    if data_type in ['train','valid','test','oot1','oot2']:
        decileDF = predictions.select(y, split1_udf('probability').alias('probability'))
    else:
        decileDF = predictions.select(y, 'probability')

    decileDF = decileDF.withColumn('non_target', 1 - decileDF[y])

    window = Window.orderBy(desc("probability"))
    decileDF = decileDF.withColumn("rownum", F.row_number().over(window))
    decileDF.cache()
    decileDF = decileDF.withColumn("rownum", decileDF["rownum"].cast("double"))

    window2 = Window.orderBy("rownum")
    RFbucketedData = decileDF.withColumn("deciles", F.ntile(10).over(window2))
    RFbucketedData = RFbucketedData.withColumn('deciles', RFbucketedData['deciles'].cast("int"))
    RFbucketedData.cache()
    #a = RFbucketedData.count()
    #print(RFbucketedData.show())

    ## to pandas from here
    print('KS calculation starting')
    target_cnt = RFbucketedData.groupBy('deciles').agg(F.sum(y).alias('target')).toPandas()
    non_target_cnt = RFbucketedData.groupBy('deciles').agg(F.sum("non_target").alias('non_target')).toPandas()
    overall_cnt = RFbucketedData.groupBy('deciles').count().alias('Total').toPandas()
    overall_cnt = overall_cnt.merge(target_cnt, on='deciles', how='inner').merge(non_target_cnt, on='deciles', how='inner')
    overall_cnt = overall_cnt.sort_values(by='deciles', ascending=True)
    overall_cnt['Pct_target'] = (overall_cnt['target'] / overall_cnt['count']) * 100
    overall_cnt['cum_target'] = overall_cnt.target.cumsum()
    overall_cnt['cum_non_target'] = overall_cnt.non_target.cumsum()
    overall_cnt['%Dist_Target'] = (overall_cnt['cum_target'] / overall_cnt.target.sum()) * 100
    overall_cnt['%Dist_non_Target'] = (overall_cnt['cum_non_target'] / overall_cnt.non_target.sum()) * 100
    overall_cnt['spread'] = builtins.abs(overall_cnt['%Dist_Target'] - overall_cnt['%Dist_non_Target'])
    decile_table = overall_cnt.round(2)
    print("KS_Value =", builtins.round(overall_cnt.spread.max(), 2))
    #print "Test Error =", builtin.round((1.0 - RFaccuracy),3)
    #print "Accuracy =", builtin.round(RFaccuracy,3)
    #print "AUC=", builtin.round(auroc,3)
    decileDF.unpersist()
    RFbucketedData.unpersist()
    print("Metrics calculation process Completed in : "+ " %s seconds" % (time.time() - start_time4))
    return auroc,RFaccuracy,builtins.round(overall_cnt.spread.max(),2), y_score, y_pred, y_true, overall_cnt
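
# Usage sketch (hypothetical names: `predictions` comes from a fitted binary
# classifier and keeps the standard `probability` vector column; 'label' is the
# 0/1 target column):
auroc, accuracy, ks, y_score, y_pred, y_true, ks_table = calculate_metrics(
    predictions, 'label', 'valid')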