def cond_fluent_window(pyData):
    dfData = spark.createDataFrame(pyData)
    dfData = dfData \
        .withColumn("cond", func.when(dfData.E < 0, -1).otherwise(+1))
    dfData = dfData \
        .orderBy(dfData.grp, dfData.subgrp, dfData.cond, dfData.id)
    window = Window \
        .partitionBy(dfData.grp, dfData.subgrp, dfData.cond) \
        .orderBy(dfData.id) \
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    dfData = dfData \
        .withColumn("cond_var_of_E_2_pre1",
                    func.when(dfData.cond < 0,
                              func.variance(dfData.E).over(window)))
    dfData = dfData \
        .groupBy(dfData.grp, dfData.subgrp, dfData.cond) \
        .agg(func.sum(dfData.C).alias("sum_of_C_pre"),
             func.count(dfData.C).alias("count_of_C_pre"),
             func.max(dfData.D).alias("max_of_D_pre"),
             func.variance(func.when(dfData.E < 0, dfData.E)).alias("cond_var_of_E_1_pre"),
             func.last(dfData.cond_var_of_E_2_pre1).alias("cond_var_of_E_2_pre2"))
    dfData = dfData \
        .groupBy(dfData.grp, dfData.subgrp) \
        .agg((func.sum(dfData.sum_of_C_pre)
              / func.sum(dfData.count_of_C_pre)).alias("mean_of_C"),
             func.max(dfData.max_of_D_pre).alias("max_of_D"),
             func.max(dfData.cond_var_of_E_1_pre).alias("cond_var_of_E_1"),
             func.max(dfData.cond_var_of_E_2_pre2).alias("cond_var_of_E_2")) \
        .orderBy(dfData.grp, dfData.subgrp) \
        .collect()
def bi_fluent_window(pyData):
    df = spark.createDataFrame(pyData)
    window = Window \
        .partitionBy(df.grp, df.subgrp) \
        .orderBy(df.id)
    df = df \
        .orderBy(df.grp, df.subgrp, df.id) \
        .withColumn("sub_var_of_E", func.variance(df.E).over(window))
    df = df \
        .groupBy(df.grp, df.subgrp) \
        .agg(func.sum(df.C).alias("sub_sum_of_C"),
             func.count(df.C).alias("sub_count"),
             func.max(df.D).alias("sub_max_of_D"),
             func.last(df.sub_var_of_E).alias("sub_var_of_E1"),
             func.variance(df.E).alias("sub_var_of_E2"))
    df \
        .groupBy(df.grp) \
        .agg((func.sum(df.sub_sum_of_C) / func.sum(df.sub_count)).alias("mean_of_C"),
             func.max(df.sub_max_of_D).alias("max_of_D"),
             func.avg(df.sub_var_of_E1).alias("avg_var_of_E1"),
             func.avg(df.sub_var_of_E2).alias("avg_var_of_E2")) \
        .orderBy(df.grp) \
        .collect()
def task_1(data_io, review_data, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    overall_column = 'overall'
    # Outputs:
    mean_rating_column = 'meanRating'
    count_rating_column = 'countRating'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    data = review_data.groupBy(F.col(asin_column)).agg(
        F.avg(F.col(overall_column)).alias(mean_rating_column),
        F.count("*").alias(count_rating_column))
    merged = product_data.join(data, on=asin_column, how='left')
    aggregate_func = merged.agg(
        F.count("*"),
        F.avg(F.col(mean_rating_column)),
        F.variance(F.col(mean_rating_column)),
        F.sum(F.isnull(F.col(mean_rating_column)).astype("int")),
        F.avg(F.col(count_rating_column)),
        F.variance(F.col(count_rating_column)),
        F.sum(F.isnull(F.col(count_rating_column)).astype("int"))).collect()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    # Calculate the values programmatically. Do not change the keys and do not
    # hard-code values in the dict. Your submission will be evaluated with
    # different inputs.
    # Modify the values of the following dictionary accordingly.
    res = {
        'count_total': None,
        'mean_meanRating': None,
        'variance_meanRating': None,
        'numNulls_meanRating': None,
        'mean_countRating': None,
        'variance_countRating': None,
        'numNulls_countRating': None
    }
    # Modify res:
    res['count_total'] = aggregate_func[0]
    res['mean_meanRating'] = aggregate_func[1]
    res['variance_meanRating'] = aggregate_func[2]
    res['numNulls_meanRating'] = aggregate_func[3]
    res['mean_countRating'] = aggregate_func[4]
    res['variance_countRating'] = aggregate_func[5]
    res['numNulls_countRating'] = aggregate_func[6]
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_1')
    return res
def bi_fluent_join(pyData):
    df = spark.createDataFrame(pyData)
    level1 = df \
        .groupBy(df.grp) \
        .agg(
            func.mean(df.C).alias("mean_of_C"),
            func.max(df.D).alias("max_of_D"))
    level2 = df \
        .groupBy(df.grp, df.subgrp) \
        .agg(
            func.variance(df.E).alias("var_of_E"),
            ((func.sum(df.E * df.E)
              - func.sum(df.E) * func.avg(df.E))
             / (func.count(df.E) - 1)).alias("var_of_E2")
        )
    level3 = level2 \
        .join(level1, "grp") \
        .groupBy(level1.grp) \
        .agg(
            func.last(level1.mean_of_C).alias("mean_of_C"),
            func.last(level1.max_of_D).alias("max_of_D"),
            func.avg(level2.var_of_E).alias("avg_var_of_E"),
            func.avg(level2.var_of_E2).alias("avg_var_of_E2")
        ) \
        .orderBy(level1.grp)
    # .collect()
    return level3, None
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()
    # calculate average movie rating score and rating count
    # Aggregate by movieId:
    #   count(1)          as ratingCount  -- number of ratings per movie
    #   avg(rating)       as avgRating
    #   variance(rating)  as ratingVar    -- rating variance
    movieFeatures = ratingSamples.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('ratingCount'),
        F.avg("rating").alias("avgRating"),
        F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec',
                    udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    # Store the average rating as a single-element vector, because the scaler
    # used later requires vector input.
    movieFeatures.show(10)

    # ######## Feature processing via pipeline ########
    # bucketing
    # Discretize the continuous ratingCount into 100 equal-frequency buckets
    # based on its distribution.
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    # Scale the average-rating vector.
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")
    # Build the pipeline.
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)
    # Cast the bucket index to an integer and extract the scaled value from the
    # one-element vector back to a scalar.
    movieProcessedFeatures = movieProcessedFeatures \
        .withColumn('ratingCountBucket',
                    F.col('ratingCountBucket').cast(IntegerType())) \
        .withColumn('scaleAvgRating',
                    udf(lambda v: float(v[0]), FloatType())(F.col('scaleAvgRating'))) \
        .drop(F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)
def bi_fluent_nested(pyData):
    df = spark.createDataFrame(pyData)
    df = df.groupBy(df.grp, df.subgrp) \
        .agg(func.mean(df.C).alias("sub_mean_of_C"),
             func.count(df.C).alias("sub_count"),
             func.sum(df.C).alias("sub_sum_of_C"),
             func.max(df.D).alias("sub_max_of_D"),
             func.variance(df.E).alias("sub_var_of_E"),
             func.sum(df.E * df.E).alias("sub_sum_of_E_squared"),
             func.sum(df.E).alias("sub_sum_of_E"))
    df = df.groupBy(df.grp) \
        .agg(
            (func.sum(df.sub_mean_of_C * df.sub_count)
             / func.sum(df.sub_count)).alias("mean_of_C"),
            func.max(df.sub_max_of_D).alias("max_of_D"),
            func.avg(df.sub_var_of_E).alias("cond_var_of_E1"),
            func.avg(
                (df.sub_sum_of_E_squared
                 - df.sub_sum_of_E * df.sub_sum_of_E / df.sub_count)
            ).alias("cond_var_of_E2"))
    df.select('grp', 'mean_of_C', 'max_of_D', 'cond_var_of_E1', 'cond_var_of_E2') \
        .orderBy(df.grp) \
        .collect()
def cond_fluent_nested(pyData):
    dfData = spark.createDataFrame(pyData)
    dfInter = dfData \
        .withColumn('cond', func.when(dfData.E < 0, -1).otherwise(1))
    dfInter = dfInter.groupBy(dfInter.grp, dfInter.subgrp, dfInter.cond) \
        .agg(func.mean(dfData.C).alias("sub_mean_of_C"),
             func.count(dfData.C).alias("sub_count"),
             func.sum(dfData.C).alias("sub_sum_of_C"),
             func.max(dfData.D).alias("sub_max_of_D"),
             func.variance(dfData.E).alias("sub_var_of_E"),
             func.sum(dfData.E * dfData.E).alias("sub_sum_of_E_squared"),
             func.sum(dfData.E).alias("sub_sum_of_E"))
    dfInter = dfInter.groupBy(dfInter.grp, dfInter.subgrp) \
        .agg(func.mean(dfInter.sub_mean_of_C).alias("wrong_mean_of_C"),
             (func.sum(dfInter.sub_mean_of_C * dfInter.sub_count)
              / func.sum(dfInter.sub_count)).alias("mean_of_C2"),
             func.sum(dfInter.sub_count).alias("uncond_count"),
             func.sum(func.when(dfInter.cond < 0, dfInter.sub_count)
                      .otherwise(0)).alias("cond_count"),
             func.sum(dfInter.sub_sum_of_C).alias("sum_of_C"),
             func.max(dfInter.sub_max_of_D).alias("max_of_D"),
             func.sum(func.when(dfInter.cond < 0, dfInter.sub_var_of_E)
                      .otherwise(0)).alias("cond_var_of_E"))
    dfInter = dfInter \
        .withColumn('mean_of_C', dfInter.sum_of_C / dfInter.uncond_count)
    dfInter.select('grp', 'subgrp', 'mean_of_C', 'mean_of_C2', 'wrong_mean_of_C',
                   'max_of_D', 'cond_var_of_E') \
        .orderBy(dfInter.grp, dfInter.subgrp) \
        .collect()
def _transform(self, df):
    input = self.getInputCol()
    prefix = self.getPrefix()
    outputs = self.getOutputCols()
    stats = self.getStatistics()
    groupByCol = self.getGroupByCol()
    aggs = []
    for stat in stats:
        name = "{}_{}".format(prefix, stat)
        if stat == 'var':
            agg = F.variance(input).alias(name)
        elif stat == 'mean':
            agg = F.mean(input).alias(name)
        elif stat == 'count':
            agg = F.count(input).alias(name)
        elif stat == 'sum':
            agg = F.sum(input).alias(name)
        elif stat == 'nunique' or stat == 'distinct':
            agg = F.countDistinct(input).alias(name)
        aggs.append(agg)
    temp = df.groupBy(groupByCol).agg(*aggs)
    temp = temp.select(*groupByCol, *outputs)
    temp = temp.na.fill(0.0)
    df = df.join(temp, groupByCol, how='left')
    df = df.coalesce(1)
    return df
def __init__(self):
    super(FeatureRequestIntervalVariance, self).__init__()
    self.w = Window.partitionBy(
        F.col('client_request_host'), F.col('client_ip')
    ).orderBy(F.col("@timestamp"))
    self.group_by_aggs = {
        'request_interval_var': F.variance(
            F.col('request_interval').cast('float') / 60.
        ),
    }
    self.pre_group_by_calcs = {
        'row_num_per_group': F.row_number().over(self.w),
        'prev_ts': F.lag(F.col('@timestamp')).over(self.w),
        'request_interval': F.when(
            F.col('row_num_per_group') > 1,
            F.when(
                F.isnull(
                    F.col('@timestamp').cast('long') -
                    F.col('prev_ts').cast('long')
                ),
                0
            ).otherwise(
                F.col('@timestamp').cast('long') -
                F.col('prev_ts').cast('long')
            )).otherwise(None),
    }
def describe_float_1d(df, column, current_result, nrows):
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias("mean"),
        df_min(col(column)).alias("min"),
        df_max(col(column)).alias("max"),
        variance(col(column)).alias("variance"),
        kurtosis(col(column)).alias("kurtosis"),
        stddev(col(column)).alias("std"),
        skewness(col(column)).alias("skewness"),
        df_sum(col(column)).alias("sum")).toPandas()

    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
            "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column).na.drop().select(
        df_abs(col(column) - stats["mean"]).alias("delta")).agg(
            df_sum(col("delta"))).toPandas().iloc[0, 0] /
        float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)

    return stats
def dataStatistics(self, categoricalFeatures, numericalFeatures):
    # self.dataTranform()
    self.categoricalFeatures = None if categoricalFeatures == None else categoricalFeatures
    self.numericalFeatures = None if numericalFeatures == None else numericalFeatures
    summaryList = ['mean', 'stddev', 'min', 'max']
    summaryDict = {}
    dataset = self.dataset
    import pyspark.sql.functions as F
    import builtins
    round = getattr(builtins, 'round')
    for colm in self.numericalFeatures:
        summaryListTemp = []
        for value in summaryList:
            summ = list(
                dataset.select(colm).summary(value).toPandas()[colm])
            summaryListSubTemp = []
            for val in summ:
                summaryListSubTemp.append(round(float(val), 4))
            summaryListTemp.append(summaryListSubTemp)
        summaryDict[colm] = summaryListTemp
    summaryList.extend(['skewness', 'kurtosis', 'variance'])
    summaryDict['summaryName'] = summaryList
    summaryDict['categoricalColumn'] = self.categoricalFeatures
    skewnessList = []
    kurtosisList = []
    varianceList = []
    skewKurtVarDict = {}
    for colm in self.numericalFeatures:
        skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
        for i, row in skewness.iterrows():
            for j, column in row.iteritems():
                skewnessList.append(round(column, 4))
        kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
        for i, row in kurtosis.iterrows():
            for j, column in row.iteritems():
                kurtosisList.append(round(column, 4))
        variance = (dataset.select(F.variance(dataset[colm])).toPandas())
        for i, row in variance.iterrows():
            for j, column in row.iteritems():
                varianceList.append(round(column, 4))
    for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                     varianceList, self.numericalFeatures):
        print(skew, kurt, var)
        skewKurtVarList = []
        skewKurtVarList.append(skew)
        skewKurtVarList.append(kurt)
        skewKurtVarList.append(var)
        skewKurtVarDict[colm] = skewKurtVarList
    for (keyOne, valueOne), (keyTwo, valueTwo) in zip(summaryDict.items(),
                                                      skewKurtVarDict.items()):
        print(keyOne, valueOne, keyTwo, valueTwo)
        if keyOne == keyTwo:
            valueOne.extend(valueTwo)
            summaryDict[keyOne] = valueOne
    return summaryDict
def __init__(self):
    super(FeaturePathDepthVariance, self).__init__()
    self.group_by_aggs = {
        'client_url_slash_count_variance': F.variance(F.col('client_url_slash_count'))
    }
    self.pre_group_by_calcs = {
        'client_url_slash_count': (F.size(F.split(F.col('client_url'), '/')) - 1)
    }
def __calc_stats(self, df, resolution):
    """
    Calculates statistics for every column in the Spark DF and returns a
    separate DF with the results.
    Statistics: sum, min, max, count, mean, kurtosis, skewness, stddev, variance.

    :param df: DF containing the columns that you want to run your statistics
               calculations on
    :param resolution: int resolution in milli or microseconds OR string '5m'/'1h'/'1d'
    :return: aggregation dataframe containing statistics
    """
    if type(resolution) is str:
        # resolution to microseconds
        res_dict = {'5m': 300000000, '1h': 3600000000, '1d': 86400000000}
        agg_interval = res_dict[resolution]
    elif type(resolution) is int:
        if len(str(resolution)) < 16:
            resolution = int(str(resolution).ljust(16, '0'))
        agg_interval = resolution

    ts_col = F.col('timestamp')
    df_ori_cols = list(set(df.columns) - set(['timestamp']))
    df = df.withColumn('interval_start',
                       (F.floor(ts_col / agg_interval) * agg_interval))
    # .withColumn('interval_stop', F.ceil(ts_col/agg_interval) * agg_interval)\
    # .orderBy(F.col('interval_start'))
    agg_df = df.groupBy('interval_start').agg(
        F.max(ts_col).alias('max_ts'))

    # TODO Column type checking: string columns are automatically ignored and parse as NaN, so
    # TODO drop NaN columns?
    # TODO: interval_stop ignore, as well as drop max_ts
    # TODO: filter out NaN columns
    # TODO: question: run the statistics job as a separate job without having to make a udf script
    stat_cols = df_ori_cols
    # [c for c in df_ori_cols if c not in ['interval_start', 'interval_stop', 'timestamp', 'max_ts']]
    for column in stat_cols:
        grouped_df = df.groupBy('interval_start') \
            .agg(F.sum(column).alias('sum_%s' % column),
                 F.min(column).alias('min_%s' % column),
                 F.max(column).alias('max_%s' % column),
                 F.count(column).alias('count_%s' % column),
                 F.kurtosis(column).alias('kurtosis_%s' % column),
                 F.mean(column).alias('mean_%s' % column),
                 F.skewness(column).alias('skewness_%s' % column),
                 F.stddev(column).alias('stddev_%s' % column),
                 F.variance(column).alias('var_%s' % column))
        agg_df = grouped_df.join(agg_df, on='interval_start')
    # agg_df = agg_df.drop('max_ts').drop(F.when(F.col('*').isna())).dropna(how='all').drop_duplicates()
    return agg_df
def vanilla_fluent(pyData):
    df = spark.createDataFrame(pyData, schema=DataPointSchema)
    df = df \
        .groupBy(df.grp, df.subgrp) \
        .agg(
            func.mean(df.C).alias("mean_of_C"),
            func.max(df.D).alias("max_of_D"),
            func.variance(df.E).alias("var_of_E"),
            ((func.sum(df.E * df.E)
              - func.pow(func.sum(df.E), 2) / func.count(df.E))
             / (func.count(df.E) - 1)).alias("var_of_E2")
        ) \
        .orderBy(df.grp, df.subgrp)
    return None, df
def describe_numeric_1d(df, bins, column, current_result, nrows, k=2, dtype='int'):
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias('mean'),
        min(col(column)).alias('min'),
        max(col(column)).alias('max'),
        variance(col(column)).alias('variance'),
        kurtosis(col(column)).alias('kurtosis'),
        stddev(col(column)).alias('std'),
        skewness(col(column)).alias('skewness'),
        sum(col(column)).alias('sum')).toPandas()

    if dtype.lower() == 'int':
        select_expr = 'percentile({c},CAST({p} AS DOUBLE))'
    else:
        select_expr = 'percentile_approx({c},CAST({p} AS DOUBLE))'
    for p in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(p)] = (df.select(column).na.drop().selectExpr(
            select_expr.format(c=column, p=p)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats['range'] = stats['max'] - stats['min']
    q3, q1 = stats[pretty_name(0.75)], stats[pretty_name(0.25)]
    stats['iqr'] = q3 - q1
    stats['cv'] = stats['std'] / float(stats['mean'])
    stats['mad'] = (df.select(column).na.drop().select(
        abs(col(column) - stats['mean']).alias('delta')).agg(
            sum(col('delta'))).toPandas().iloc[0, 0] /
        float(current_result['count']))
    stats['type'] = 'NUM'
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)
    stats['high_idx'] = df.select(column).where(
        col(column) > q3 + k * (q3 - q1)).count()
    stats['low_idx'] = df.select(column).where(
        col(column) < q1 - k * (q3 - q1)).count()

    # generate histograms
    hist_data = generate_hist_data(df, column, stats['min'], stats['max'], bins)
    stats['histogram'] = complete_histogram(hist_data)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def numerical_example(data: DataFrame):
    movie_features = data.groupBy("movieId").agg(
        F.count(F.lit(1)).alias("ratingCount"),
        F.avg("rating").alias("avgRating"),
        F.variance("rating").alias("ratingVar")).withColumn(
            "avgRatingVec", udf_avg_rating_to_vec(F.col("avgRating")))
    print_info(movie_features)

    # bucketing
    rating_count_discretizer = QuantileDiscretizer(
        numBuckets=100, inputCol="ratingCount", outputCol="ratingCountBucket")
    # normalization
    rating_scaler = MinMaxScaler(inputCol="avgRatingVec",
                                 outputCol="scaleAvgRating")
    pipeline_stage = [rating_count_discretizer, rating_scaler]
    feature_pipeline = Pipeline(stages=pipeline_stage)
    movie_processed_features = feature_pipeline.fit(movie_features).transform(
        movie_features)
    print_info(movie_processed_features)
def cond_fluent_join(pyData):
    dfData = spark.createDataFrame(pyData)
    uncond = dfData \
        .groupBy(dfData.grp, dfData.subgrp) \
        .agg(
            func.mean(dfData.C).alias("mean_of_C"),
            func.max(dfData.D).alias("max_of_D"))
    cond = dfData \
        .filter(dfData.E < 0) \
        .groupBy(dfData.grp, dfData.subgrp) \
        .agg(
            func.variance(dfData.E).alias("cond_var_of_E"))
    uncond \
        .join(cond, (uncond.grp == cond.grp) & (uncond.subgrp == cond.subgrp)) \
        .drop(cond.grp) \
        .drop(cond.subgrp) \
        .orderBy(uncond.grp, uncond.subgrp) \
        .collect()
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()
    # calculate average movie rating score and rating count
    movieFeatures = ratingSamples.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('ratingCount'),
        F.avg("rating").alias("avgRating"),
        F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec',
                    udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))
    movieFeatures.show(10)

    # bucketing
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)
    movieProcessedFeatures.show(10)
def learn3():
    df1 = ss.read.csv('F:/Research/data/ccFraud.csv',
                      header=True, inferSchema=True)
    df1.show()
    # Group df1 by the gender column and count the rows in each group.
    df2 = df1.groupby('gender').count()
    df2.show()
    df3 = df1.describe(['balance', 'numTrans', 'numIntlTrans'])
    df3.show()
    # Check the skewness.
    df1.agg({'balance': 'skewness'}).show()
    df1.agg(
        functions.max('balance').alias('max'),
        functions.avg('balance').alias('avg'),
        functions.mean('balance').alias('mean'),
        functions.stddev('balance').alias('stddev'),
        functions.sum('balance').alias('sum'),
        functions.skewness('balance').alias('skewness'),
        functions.variance('balance').alias('variance'),
        functions.sumDistinct('balance').alias('sumDistinct')).show()
    corr1 = df1.corr('balance', 'numTrans')
    print(corr1)
def summaryCustomized(raw_df: DataFrame):
    param_name = "countDistinct"
    mySchemaTemp = list(filter(lambda x: (x[1] != 'timestamp'), raw_df.dtypes))
    mySchema = list(map(lambda z: (z[0]), mySchemaTemp))
    ColumnListWithDistinct_count = [param_name] + mySchema
    WithDistinctCntSummaryDF = raw_df.select([
        F.countDistinct(c).alias(c) for c in mySchema
    ]).withColumn(param_name, F.lit(param_name)).selectExpr(ColumnListWithDistinct_count)

    param_name = "NullValueCount"
    ColumnListWithNullValueCount = [param_name] + mySchema
    ColumnListWithNullValueCountDF = raw_df.select(
        [F.sum(F.when(F.isnull(F.col(c)), 1).otherwise(0)).name(c) for c in mySchema]). \
        withColumn(param_name, F.lit(param_name)).selectExpr(ColumnListWithNullValueCount)

    param_name = "variance"
    ColumnListWithVariance = [param_name] + mySchema
    WithVarianceSummaryDF = raw_df.select([F.variance(c).alias(c) for c in mySchema]). \
        withColumn(param_name, F.lit(param_name)).selectExpr(ColumnListWithVariance)

    return raw_df.summary().union(WithDistinctCntSummaryDF).union(
        WithVarianceSummaryDF).union(ColumnListWithNullValueCountDF)
        return False
    else:
        return False

# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

toneDataAll = toneData.select('ActionGeo_FullName', 'wTRA_1d', 'wTRA_60d', 'nArticles') \
    .groupBy('ActionGeo_FullName') \
    .agg(
        F.skewness('wTRA_1d'),
        F.kurtosis('wTRA_1d'),
        F.stddev('wTRA_1d'),
        F.variance('wTRA_1d'),
        F.collect_list('wTRA_1d').alias('list_wTRA_1d'),
        F.skewness('wTRA_60d'),
        F.kurtosis('wTRA_60d'),
        F.stddev('wTRA_60d'),
        F.variance('wTRA_60d'),
        F.collect_list('wTRA_60d').alias('list_wTRA_60d'),
        F.sum('nArticles').alias('nArticles'),
        F.count(F.lit(1)).alias('n_observations')
    )

# get p-value and define normalcy
toneDataAll = toneDataAll.withColumn('p_value_1d', get_pval_udf(toneDataAll.list_wTRA_1d))
toneDataAll = toneDataAll.withColumn('if_normal_1d', if_norm_udf(toneDataAll.p_value_1d))
toneDataAll = toneDataAll.withColumn('p_value_60d', get_pval_udf(toneDataAll.list_wTRA_60d))
toneDataAll = toneDataAll.withColumn('if_normal_60d', if_norm_udf(toneDataAll.p_value_60d))
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))

print("count: " + str(df.select(count("salary")).collect()[0]))

df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)

df.select(stddev("salary"), stddev_samp("salary"), \
          stddev_pop("salary")).show(truncate=False)

df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)

df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
    else:
        return False

# Create UDF funcs
get_pval_udf = F.udf(lambda vars: get_normal_pval(vars), FloatType())
if_norm_udf = F.udf(lambda p: if_norm(p), BooleanType())

# COMMAND ----------

eventsDataAll = eventsData.select('ActionGeo_FullName', 'wERA_3d', 'wERA_60d', 'nArticles') \
    .groupBy('ActionGeo_FullName') \
    .agg(
        F.skewness('wERA_3d'),
        F.kurtosis('wERA_3d'),
        F.stddev('wERA_3d'),
        F.variance('wERA_3d'),
        F.collect_list('wERA_3d').alias('list_wERA_3d'),
        F.skewness('wERA_60d'),
        F.kurtosis('wERA_60d'),
        F.stddev('wERA_60d'),
        F.variance('wERA_60d'),
        F.collect_list('wERA_60d').alias('list_wERA_60d'),
        F.sum('nArticles').alias('nArticles'),
        F.count(F.lit(1)).alias('n_observations')
    )

# get p-value and define normalcy
eventsDataAll = eventsDataAll.withColumn(
    'p_value_3d', get_pval_udf(eventsDataAll.list_wERA_3d))
eventsDataAll = eventsDataAll.withColumn('if_normal_3d',
                                         if_norm_udf(eventsDataAll.p_value_3d))
sum("Quantity").alias("total_purchases"), avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases")) \ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show() # ---------------------------------------------------------- # Example 4 - varience and standard deviation #---------------------------------------------------------- from pyspark.sql.functions import var_pop, stddev_pop, variance, stddev from pyspark.sql.functions import var_samp, stddev_samp df.select(variance("Quantity"), stddev("Quantity"), var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() spark.sql("""SELECT var_pop(Quantity), var_samp(Quantity), stddev_pop(Quantity), stddev_samp(Quantity) FROM dfTable""").show() #---------------------------------------------------------- # Example 5 - skewness & kurtosis #---------------------------------------------------------- from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show()
def describe_float_1d(df, column, current_result, nrows):
    if spark_version == "1.6+":
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")
        ).toPandas()
    else:
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum")
        ).toPandas()
        stats_df["variance"] = df.select(column).na.drop().agg(
            variance_custom(col(column),
                            stats_df["mean"].ix[0],
                            current_result["count"])).toPandas().ix[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = df.select(column).na.drop().agg(
            skewness_custom(col(column),
                            stats_df["mean"].ix[0],
                            current_result["count"])).toPandas().ix[0][0]
        stats_df["kurtosis"] = df.select(column).na.drop().agg(
            kurtosis_custom(col(column),
                            stats_df["mean"].ix[0],
                            current_result["count"])).toPandas().ix[0][0]

    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (df.select(column)
                                    .na.drop()
                                    .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                .format(col=column, n=x)).toPandas().ix[:, 0])

    stats = stats_df.ix[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column)
                    .na.drop()
                    .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                    .agg(df_sum(col("delta"))).toPandas().ix[0, 0]
                    / float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"],
            hist_data["count"],
            width=hist_data["width"],
            facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1,
                                wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def describe_float_1d(df, column, current_result, nrows):
    if spark_version == "1.6+":
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum"),
            # count only rows whose value is exactly zero
            df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
        ).toPandas()
    else:
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum"),
            # count only rows whose value is exactly zero
            df_sum((col(column) == 0.0).cast("int")).alias('n_zeros')
        ).toPandas()
        stats_df["variance"] = df.select(column).na.drop().agg(
            variance_custom(col(column),
                            stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = df.select(column).na.drop().agg(
            skewness_custom(col(column),
                            stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["kurtosis"] = df.select(column).na.drop().agg(
            kurtosis_custom(col(column),
                            stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]

    for x in [0.05, 0.25, 0.5, 0.75, 0.95]:
        stats_df[pretty_name(x)] = (df.select(column)
                                    .na.drop()
                                    .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                .format(col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column)
                    .na.drop()
                    .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                    .agg(df_sum(col("delta"))).toPandas().iloc[0, 0]
                    / float(current_result["count"]))
    stats["type"] = "NUM"
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"],
            hist_data["count"],
            width=hist_data["width"],
            facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9, bottom=0.1,
                                wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO Think about writing this to disk instead of caching them in strings
    plt.close(plot.figure)
    stats['mini_histogram'] = mini_histogram(hist_data)
    return stats
def randomClassifier(dataset_add, feature_colm, label_colm, relation_list, relation):
    try:
        dataset = spark.read.parquet(dataset_add)
        label = ''
        for y in label_colm:
            label = y
        Schema = dataset.schema
        stringFeatures = []
        numericalFeatures = []
        for x in Schema:
            if (str(x.dataType) == "StringType"):
                for y in feature_colm:
                    if x.name == y:
                        stringFeatures.append(x.name)
            else:
                for y in feature_colm:
                    if x.name == y:
                        numericalFeatures.append(x.name)

        summaryList = ['mean', 'stddev', 'min', 'max']
        summaryDict = {}
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        for colm in numericalFeatures:
            summaryListTemp = []
            for value in summaryList:
                summ = list(
                    dataset.select(colm).summary(value).toPandas()[colm])
                summaryListSubTemp = []
                for val in summ:
                    summaryListSubTemp.append(round(float(val), 4))
                # print(summaryListSubTemp)
                summaryListTemp.append(summaryListSubTemp)
            # varianceListTemp = list(dataset.select(variance(col(colm)).alias(colm)).toPandas()[colm])
            # summaryListTemp.append(varianceListTemp)
            summaryDict[colm] = summaryListTemp
        # summaryList.append('variance')
        summaryDict['summaryName'] = summaryList
        summaryDict['categoricalColumn'] = stringFeatures
        skewnessList = []
        kurtosisList = []
        varianceList = []
        skewKurtVarDict = {}
        for colm in numericalFeatures:
            skewness = (dataset.select(F.skewness(dataset[colm])).toPandas())
            for i, row in skewness.iterrows():
                for j, column in row.iteritems():
                    skewnessList.append(round(column, 4))
            kurtosis = (dataset.select(F.kurtosis(dataset[colm])).toPandas())
            for i, row in kurtosis.iterrows():
                for j, column in row.iteritems():
                    kurtosisList.append(round(column, 4))
            variance = (dataset.select(F.variance(dataset[colm])).toPandas())
            for i, row in variance.iterrows():
                for j, column in row.iteritems():
                    varianceList.append(round(column, 4))

        for skew, kurt, var, colm in zip(skewnessList, kurtosisList,
                                         varianceList, numericalFeatures):
            print(skew, kurt, var)
            skewKurtVarList = []
            skewKurtVarList.append(skew)
            skewKurtVarList.append(kurt)
            skewKurtVarList.append(var)
            skewKurtVarDict[colm] = skewKurtVarList

        for (keyOne, valueOne), (keyTwo, valueTwo) in zip(summaryDict.items(),
                                                          skewKurtVarDict.items()):
            print(keyOne, valueOne, keyTwo, valueTwo)
            if keyOne == keyTwo:
                valueOne.extend(valueTwo)
                summaryDict[keyOne] = valueOne
        print(summaryDict)
        print(summaryList.extend(['skewness', 'kurtosis', 'variance']))
        print(summaryDict)

        # for colm in numericalFeatures:
        #     skewness = (dataset.select(F.skewness(dataset[colm])).alias('skewness_' + colm))
        #     kurtosis = (dataset.select(F.kurtosis(dataset[colm])).alias('kurtosis_' + colm))
        #     variance = (dataset.select(F.variance(dataset[colm]).alias('kurtosis_' + colm)))

        if relation == 'linear':
            dataset = dataset
        if relation == 'non_linear':
            dataset = Relationship(dataset, relation_list)
        dataset.show()

        for x in Schema:
            if (str(x.dataType) == "StringType" and x.name == label):
                for labelkey in label_colm:
                    label_indexer = StringIndexer(inputCol=label,
                                                  outputCol='indexed_' + label).fit(dataset)
                    dataset = label_indexer.transform(dataset)
                    label = 'indexed_' + label
            else:
                label = label
        indexed_features = []
        for colm in stringFeatures:
            indexer = StringIndexer(inputCol=colm,
                                    outputCol='indexed_' + colm).fit(dataset)
            indexed_features.append('indexed_' + colm)
            dataset = indexer.transform(dataset)
        final_features = numericalFeatures + indexed_features
        response_chi_test = chi_square_test(dataset=dataset,
                                            features=indexed_features,
                                            label_col=label,
                                            stringFeatures=stringFeatures)

        featureassembler = VectorAssembler(inputCols=final_features,
                                           outputCol="features")
        dataset = featureassembler.transform(dataset)
        dataset.show()
        vec_indexer = VectorIndexer(inputCol='features',
                                    outputCol='vec_indexed_features',
                                    maxCategories=4).fit(dataset)
        categorical_features = vec_indexer.categoryMaps
        print("Choose %d categorical features: %s" %
              (len(categorical_features),
               ", ".join(str(k) for k in categorical_features.keys())))
        vec_indexed = vec_indexer.transform(dataset)
        vec_indexed.show()
        finalized_data = vec_indexed.select(label, 'vec_indexed_features')
        train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=40)
        rf = RandomForestClassifier(labelCol=label,
                                    featuresCol='vec_indexed_features',
                                    numTrees=10)
        model = rf.fit(train_data)
        predictions = model.transform(test_data)
        print(model.featureImportances)
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        import pyspark.sql.functions as F
        import builtins
        round = getattr(builtins, 'round')
        feature_importance = model.featureImportances.toArray().tolist()
        print(feature_importance)
        # feature_importance = [round(x,4) for x in feature_importance]
        featureImportance = []
        for x in feature_importance:
            featureImportance.append(round(x, 4))
        print(featureImportance)
        features_column_for_user = numericalFeatures + stringFeatures
        feature_imp = {
            'feature_importance': featureImportance,
            "feature_column": features_column_for_user
        }
        response_dict = {
            'feature_importance': feature_imp,
            'ChiSquareTestData': response_chi_test,
            'summaryDict': summaryDict
        }
        return response_dict
    except Exception as e:
        print("exception is = " + str(e))
def task_3(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    price_column = 'price'
    attribute = 'also_viewed'
    related_column = 'related'
    # Outputs:
    meanPriceAlsoViewed_column = 'meanPriceAlsoViewed'
    countAlsoViewed_column = 'countAlsoViewed'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    data = product_data.select(
        F.col(asin_column),
        F.explode(F.col(related_column))).filter(F.col('key') == attribute)
    data = data.select(F.col(asin_column), F.explode_outer(F.col('value')))
    first_join = product_data.select(
        F.col(asin_column).alias("col"),
        F.col(price_column).alias("prices"))
    merged = data.join(first_join, on="col", how='left')
    merged = merged.groupby(F.col(asin_column)).agg(
        F.avg('prices').alias(meanPriceAlsoViewed_column),
        F.count('*').alias(countAlsoViewed_column))
    merged = merged.withColumn(
        countAlsoViewed_column,
        F.when(F.col(countAlsoViewed_column) == 0,
               None).otherwise(F.col(countAlsoViewed_column)))
    count_total = product_data.count()
    numNulls_meanPriceAlsoViewed = count_total - merged.where(
        (merged[meanPriceAlsoViewed_column].isNotNull())).count()
    numNulls_countAlsoViewed = count_total - merged.where(
        (merged[countAlsoViewed_column].isNotNull())).count()
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_meanPriceAlsoViewed': None,
        'variance_meanPriceAlsoViewed': None,
        'numNulls_meanPriceAlsoViewed': None,
        'mean_countAlsoViewed': None,
        'variance_countAlsoViewed': None,
        'numNulls_countAlsoViewed': None
    }
    # Modify res:
    res['count_total'] = count_total
    res['mean_meanPriceAlsoViewed'] = merged.select(
        F.avg(F.col(meanPriceAlsoViewed_column))).head()[0]
    res['variance_meanPriceAlsoViewed'] = merged.select(
        F.variance(F.col(meanPriceAlsoViewed_column))).head()[0]
    res['numNulls_meanPriceAlsoViewed'] = numNulls_meanPriceAlsoViewed
    res['mean_countAlsoViewed'] = merged.select(
        F.avg(F.col(countAlsoViewed_column))).head()[0]
    res['variance_countAlsoViewed'] = merged.select(
        F.variance(F.col(countAlsoViewed_column))).head()[0]
    res['numNulls_countAlsoViewed'] = numNulls_countAlsoViewed
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_3')
    return res
def task_4(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    price_column = 'price'
    title_column = 'title'
    # Outputs:
    meanImputedPrice_column = 'meanImputedPrice'
    medianImputedPrice_column = 'medianImputedPrice'
    unknownImputedTitle_column = 'unknownImputedTitle'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    casted_data = product_data.withColumn(
        price_column, F.col(price_column).cast(T.FloatType()))

    mean_imputer = M.feature.Imputer(strategy='mean',
                                     inputCols=[price_column],
                                     outputCols=[meanImputedPrice_column])
    mean_model = mean_imputer.fit(casted_data)
    meanImputedPrice = mean_model.transform(
        casted_data.select('asin', price_column))
    mean_meanImputedPrice = meanImputedPrice.select(
        F.avg(F.col(meanImputedPrice_column))).head()[0]
    variance_meanImputedPrice = meanImputedPrice.select(
        F.variance(F.col(meanImputedPrice_column))).head()[0]
    numNulls_meanImputedPrice = meanImputedPrice.filter(
        F.col(meanImputedPrice_column).isNull()).count()

    median_imputer = M.feature.Imputer(strategy='median',
                                       inputCols=[price_column],
                                       outputCols=[medianImputedPrice_column])
    median_model = median_imputer.fit(casted_data)
    medianImputedPrice = median_model.transform(meanImputedPrice)
    mean_medianImputedPrice = medianImputedPrice.select(
        F.avg(F.col(medianImputedPrice_column))).head()[0]
    variance_medianImputedPrice = medianImputedPrice.select(
        F.variance(F.col(medianImputedPrice_column))).head()[0]
    numNulls_medianImputedPrice = medianImputedPrice.filter(
        F.col(medianImputedPrice_column).isNull()).count()

    unknownImputedTitle = product_data.select(
        F.col(title_column).alias(unknownImputedTitle_column)).fillna(
            'unknown').replace('', 'unknown')
    numUnknowns_unknownImputedTitle = unknownImputedTitle.filter(
        F.col(unknownImputedTitle_column) == 'unknown').count()
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_meanImputedPrice': None,
        'variance_meanImputedPrice': None,
        'numNulls_meanImputedPrice': None,
        'mean_medianImputedPrice': None,
        'variance_medianImputedPrice': None,
        'numNulls_medianImputedPrice': None,
        'numUnknowns_unknownImputedTitle': None
    }
    # Modify res:
    res['count_total'] = medianImputedPrice.count()
    res['mean_meanImputedPrice'] = mean_meanImputedPrice
    res['variance_meanImputedPrice'] = variance_meanImputedPrice
    res['numNulls_meanImputedPrice'] = numNulls_meanImputedPrice
    res['mean_medianImputedPrice'] = mean_medianImputedPrice
    res['variance_medianImputedPrice'] = variance_medianImputedPrice
    res['numNulls_medianImputedPrice'] = numNulls_medianImputedPrice
    res['numUnknowns_unknownImputedTitle'] = numUnknowns_unknownImputedTitle
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_4')
    return res
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    ### category
    data = product_data.withColumn(
        category_column,
        F.when(F.col(categories_column)[0][0] == '',
               None).otherwise(F.col(categories_column)[0][0]))
    category_nulls = data.filter(F.col(category_column).isNull()).count()
    category_distinct = data.agg(F.countDistinct(
        F.col(category_column))).head()[0]

    ### salesRank and salesCategory
    key_and_values = data.select(
        asin_column, category_column,
        F.map_keys(salesRank_column)[0].alias(bestSalesCategory_column),
        F.map_values(salesRank_column)[0].alias(bestSalesRank_column))
    mean_of_salesRank = key_and_values.select(
        F.avg(F.col(bestSalesRank_column))).head()[0]
    variance_of_salesRank = key_and_values.select(
        F.variance(F.col(bestSalesRank_column))).head()[0]
    salesCategory_nulls = key_and_values.filter(
        F.col(bestSalesCategory_column).isNull()).count()
    salesCategory_distinct = key_and_values.agg(
        F.countDistinct(F.col(bestSalesCategory_column))).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = data.count()
    res['mean_bestSalesRank'] = mean_of_salesRank
    res['variance_bestSalesRank'] = variance_of_salesRank
    res['numNulls_category'] = category_nulls
    res['countDistinct_category'] = category_distinct
    res['numNulls_bestSalesCategory'] = salesCategory_nulls
    res['countDistinct_bestSalesCategory'] = salesCategory_distinct
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
        )
        # check collect_list and collect_set
        # collect_set() function returns all values from an input column with duplicate values eliminated.
        # collect_list() function returns all values from an input column with duplicates.
        logging.info(
            'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
        )
        simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000), ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)

        print("approx_count_distinct: " + \
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)
        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
        print("count: " + str(df.select(count("salary")).collect()[0]))
        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)
        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)
        df.select(stddev("salary"), stddev_samp("salary"), \
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)

        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occurred while running the pipeline > " + str(exp))
        # send email notification
        # log error to database
        sys.exit(1)
    return