def getTargetStatsByGroups_1(df, factors, target):
    stats_names = ['mean', 'variance']
    output_cols = ['group', *stats_names]
    result_schema = StructType([
        StructField('group', StringType(), True),
        StructField('mean', DoubleType(), True),
        StructField('variance', DoubleType(), True)
    ])
    cube_df = df.cube(*factors).agg(F.avg(target).alias('mean'),
                                    F.var_pop(target).alias('variance')) \
        .fillna('All', factors)
    levels_of_factors = getLevelsOfFactors(df, factors)
    groups = [(f, l) for f in levels_of_factors for l in levels_of_factors[f]]
    default_condi = {f: 'All' for f in factors}
    result_df = spark.createDataFrame(spark.sparkContext.emptyRDD(), result_schema)
    for g in groups:
        condi = {**default_condi, g[0]: g[1]}
        stats_of_g = cube_df.filter(' and '.join([f'`{f}` == "{condi[f]}"' for f in condi])) \
            .withColumn('group', F.lit(g[1])) \
            .orderBy('group') \
            .select(*output_cols)
        result_df = result_df.union(stats_of_g)
    return result_df
def SST(self):
    '''Return the Sum of Squared Total, mainly for calculation verification.'''
    SST_query = [
        F.var_pop(varname).alias(varname + '_var_pop')
        for varname in self.varnames
    ]
    # one-row vector of per-variable population variances
    SST_components = self.df.select(self.varnames).agg(*SST_query).toPandas()
    SST = SST_components.to_numpy().sum() * self.N
    return SST, SST_components
def SSE(self, clusterlab=None):
    '''Return the Sum of Squared within-cluster Error (treating the clustering as the model).'''
    if clusterlab is None:
        clusterlab = self.clusterLabelCol
    SSE_query = [F.var_pop(varname).alias(varname + '_var_cluster')
                 for varname in self.varnames] + \
                [F.count(F.lit(1)).alias('count')]
    SSE_components = self.df.groupBy(clusterlab).agg(*SSE_query).toPandas()
    SSE_components1 = SSE_components[[
        varname + '_var_cluster' for varname in self.varnames
    ]].multiply(SSE_components["count"], axis="index")
    SSE = SSE_components1.to_numpy().sum()
    return SSE, SSE_components
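# The SST/SSE pair above is typically combined into a single goodness-of-fit
# number: the fraction of total variance explained by the clustering,
# 1 - SSE/SST. A minimal sketch, assuming `evaluator` is an instance of the
# class defining SST() and SSE() above (the name is hypothetical):
def explained_variance_ratio(evaluator):
    SST, _ = evaluator.SST()
    SSE, _ = evaluator.SSE()
    # 1.0 means every point sits on its cluster mean; 0.0 means the clustering
    # explains none of the total variance.
    return 1.0 - SSE / SST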
def getTargetStatsByFactors(df, factors, target):
    stats_names = ['mean', 'median', 'variance']
    output_cols = [*factors, *stats_names]
    w = Window.partitionBy(*factors).orderBy(target)
    rank_df = df.withColumn('rank', F.row_number().over(w))
    result_df = df.groupBy(*factors).agg(F.avg(target).alias('mean'),
                                         F.var_pop(target).alias('variance'),
                                         F.count(F.lit(1)).alias('count')) \
        .join(rank_df, [*factors]) \
        .filter(F.col('rank') == F.expr('(count + 1) / 2').cast(IntegerType())) \
        .withColumn('median', F.col(target)) \
        .orderBy(*factors) \
        .select(*output_cols)
    return result_df
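# If an approximate median is acceptable, the window/rank self-join above can
# be avoided entirely. A sketch using Spark SQL's built-in percentile_approx
# (assumes the same df/factors/target arguments and the F/spark names used
# above; from Spark 3.1 the function is also exposed as F.percentile_approx):
def getTargetStatsByFactorsApprox(df, factors, target):
    return df.groupBy(*factors).agg(
        F.avg(target).alias('mean'),
        F.expr(f'percentile_approx(`{target}`, 0.5)').alias('median'),
        F.var_pop(target).alias('variance')) \
        .orderBy(*factors)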
def getTargetStatsByGroups_2(df, factors, target):
    stats_names = ['mean', 'variance']
    output_cols = ['group', *stats_names]
    cube_df = df.cube(*factors).agg(F.avg(target).alias('mean'),
                                    F.var_pop(target).alias('variance')) \
        .fillna('', factors) \
        .withColumn('group', F.concat(*factors))
    levels_of_factors = getLevelsOfFactors(df, factors)
    groups = [[l] for f in levels_of_factors for l in levels_of_factors[f]]
    group_df = spark.createDataFrame(groups, ["group"])
    result_df = group_df.join(cube_df, 'group', 'left') \
        .orderBy('group') \
        .select(*output_cols)
    return result_df
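# Both group-stats variants call a getLevelsOfFactors helper that is not shown
# here. Judging from how it is used (a dict keyed by factor name, holding that
# factor's distinct levels), a minimal sketch could look like this (an assumed
# implementation, not the original one):
def getLevelsOfFactors(df, factors):
    return {
        f: [row[f] for row in df.select(f).distinct().collect()]
        for f in factors
    }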
# get the min and max amount of calories burned
dailyActivitiesDF.select(min("CaloriesBurned"), max("CaloriesBurned")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can do some basic statistical functions as well using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"), stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"), kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()
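# COMMAND ----------

# As a shortcut, DataFrame.describe()/summary() bundle several of these
# statistics into one call (note: the stddev they report is the sample value).
# A small sketch against the same column, assuming the DataFrame above:
dailyActivitiesDF.select("CaloriesBurned").summary(
    "count", "mean", "stddev", "min", "max").show()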
#df.filter(df.SYMBOL.startswith("KOTAK")).show()
#df.filter(col("SYMBOL").startswith("KOTAKBANK")).show()
#df.columns.str.replace(' ','')
#df.select(regexp_replace("OPEN_PRICE"," ","")).show(5)
#df.where(col("SERIES").like("%EQ")).orderBy(desc(" OPEN_PRICE")).describe().show()
#df_strip_spaces.where(col(" SERIES").like("%EQ")).orderBy(to_date(col("DATE1")))

# count aggregations
df_strip_spaces.count()
df_strip_spaces.select(count("SERIES")).show()
df_strip_spaces.select(countDistinct("SERIES")).show()
df_strip_spaces.select(approx_count_distinct("SERIES", .1)).show()
df_strip_spaces.select(first("SERIES"), last("SERIES"), min("SERIES"), max("SERIES"),
                       sum("OPEN_PRICE"), sumDistinct("OPEN_PRICE")).show()
df_strip_spaces.select(mean("OPEN_PRICE")).show()
df_strip_spaces.select(avg("OPEN_PRICE")).show()
df_strip_spaces.select(("SERIES"))
df_strip_spaces.groupBy("SERIES", "SYMBOL").count().show()
df_strip_spaces.where("SYMBOL like '%BANK%'").groupBy("SERIES").avg().show()
df_strip_spaces.select(avg("OPEN_PRICE"))
df_strip_spaces.select(var_pop("OPEN_PRICE"), stddev_pop("OPEN_PRICE")).show()
#df.select(covar_pop("OPEN PRICE","CLOSE PRICE"),corr("OPEN PRICE","CLOSE PRICE")).show()

# window aggregations
windowSpec = Window.partitionBy("SYMBOL", to_date("DATE1")).orderBy(
    to_date("DATE1")).rowsBetween(Window.unboundedPreceding, Window.currentRow)
win = sum(col("OPEN PRICE")).over(windowSpec)
df.select("DATE", "OPEN PRICE", win.alias("d")).orderBy(to_date("DATE")).show()
win1 = dense_rank().over(windowSpec)
df.select(rank().over(windowSpec)).show()
avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases")) \ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show() # ---------------------------------------------------------- # Example 4 - varience and standard deviation #---------------------------------------------------------- from pyspark.sql.functions import var_pop, stddev_pop, variance, stddev from pyspark.sql.functions import var_samp, stddev_samp df.select(variance("Quantity"), stddev("Quantity"), var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() spark.sql("""SELECT var_pop(Quantity), var_samp(Quantity), stddev_pop(Quantity), stddev_samp(Quantity) FROM dfTable""").show() #---------------------------------------------------------- # Example 5 - skewness & kurtosis #---------------------------------------------------------- from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show() spark.sql("SELECT skewness(Quantity), kurtosis(Quantity) FROM dfTable").show()
def compute_variance(df):
    df_res = df.groupBy("title").agg(
        f.round(f.var_pop("count"), 2).alias("variance"))
    return df_res
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"], aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {"collect_set_int", "collect_set_float", "collect_set_string"}:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
pMean_df.registerTempTable("pMean_df")

#print('max')
#sqlCtx.sql("SELECT ticker, MAX(close) AS pMax from price GROUP BY ticker").show()
pMax_df = sqlCtx.sql("SELECT ticker, MAX(close) AS pMax from price GROUP BY ticker")
pMax_df.registerTempTable("pMax_df")

#print('min')
#sqlCtx.sql("SELECT ticker, MIN(close) AS pMin from price GROUP BY ticker").show()
pMin_df = sqlCtx.sql("SELECT ticker, MIN(close) AS pMin from price GROUP BY ticker")
pMin_df.registerTempTable("pMin_df")

#print('pop variance')
import pyspark.sql.functions as F
#(price_df.groupBy("ticker").agg(F.var_pop("close").alias("pVar"))).show()
pVar_df = (price_df.groupBy("ticker").agg(F.var_pop("close").alias("pVar")))
pVar_df.registerTempTable("pVar_df")

#################
####TASK 3#######
valuation_df = sqlCtx.sql("SELECT p.ticker, p.close, e.avg_ESO, (p.close*e.avg_ESO) as valuation \
    FROM ESO e, price p WHERE e.ticker=p.ticker")
valuation_df.registerTempTable("val")

#print('mean valuation')
#sqlCtx.sql("SELECT ticker, AVG(valuation) AS vMean from val GROUP BY ticker").show()
vMean_df = sqlCtx.sql("SELECT ticker, AVG(valuation) AS vMean from val GROUP BY ticker")
vMean_df.registerTempTable("vMean_df")
from pyspark.sql.functions import first, last, min, max

df.select(
    first("StockCode").alias("First_stock"),
    last("StockCode"),
    min("StockCode"),
    max("StockCode")).show(2)

# sum, sumDistinct, avg
from pyspark.sql.functions import sum, sumDistinct, avg

df.select(sum("Quantity"), sumDistinct("Quantity"), avg("Quantity")).show(2)

# sample variance, sample standard deviation
from pyspark.sql.functions import var_samp, stddev_samp

df.select(var_samp("Quantity"), stddev_samp("Quantity")).show(2)

# population variance, population standard deviation
from pyspark.sql.functions import var_pop, stddev_pop

df.select(var_pop("Quantity"), stddev_pop("Quantity")).show(2)

# skewness, kurtosis
from pyspark.sql.functions import skewness, kurtosis

df.select(skewness("Quantity"), kurtosis("Quantity")).show(2)

# covariance and correlation
from pyspark.sql.functions import corr, covar_pop, covar_samp

df.select(corr("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity"),
          covar_samp("InvoiceNo", "Quantity")).show(2)

# aggregating complex data types
from pyspark.sql.functions import collect_set, collect_list

df.agg(collect_set("Country"), collect_list("Country")).show(2)

# COMMAND ----------
# Group by ip (and related keys); compute count, distinct-count, and other
# statistics of the remaining features
gp = train_df.groupby(['ip', 'app']).agg(
    fn.count('channel').cast(IntegerType()).alias('ip_app_cha_count'))
train_df = train_df.join(gp, on=['ip', 'app'], how='left')

gp = train_df.groupby(['ip', 'app']).agg(
    fn.countDistinct('channel').cast(IntegerType()).alias('ip_app_cha_unique'))
train_df = train_df.join(gp, on=['ip', 'app'], how='left')

gp = train_df.groupby(['ip', 'app', 'os']).agg(
    fn.count('channel').cast(IntegerType()).alias('ip_app_os_count'))
train_df = train_df.join(gp, on=['ip', 'app', 'os'], how='left')

gp = train_df.groupby(['ip', 'app', 'os']).agg(
    fn.countDistinct('channel').cast(IntegerType()).alias('ip_app_os_unique'))
train_df = train_df.join(gp, on=['ip', 'app', 'os'], how='left')

gp = train_df.groupby(['ip', 'app', 'channel']).agg(
    fn.var_pop('day').cast(IntegerType()).alias('ip_app_channel_var_day'))
train_df = train_df.join(gp, on=['ip', 'app', 'channel'], how='left')

gp = train_df.groupby(['ip', 'app', 'channel']).agg(
    fn.mean('day').cast(IntegerType()).alias('ip_app_channel_mean_day'))
train_df = train_df.join(gp, on=['ip', 'app', 'channel'], how='left')

gp = train_df.groupby(['ip', 'app', 'channel']).agg(
    fn.mean('hour').cast(IntegerType()).alias('ip_app_channel_mean_hour'))
train_df = train_df.join(gp, on=['ip', 'app', 'channel'], how='left')

gp = train_df.groupby(['ip', 'app', 'channel']).agg(
    fn.var_pop('hour').cast(IntegerType()).alias('ip_app_channel_var_hour'))
train_df = train_df.join(gp, on=['ip', 'app', 'channel'], how='left')

gp = train_df.groupby(['ip', 'app', 'os']).agg(
    fn.mean('hour').cast(IntegerType()).alias('ip_app_os_mean_hour'))
train_df = train_df.join(gp, on=['ip', 'app', 'os'], how='left')

gp = train_df.groupby(['ip', 'app', 'os']).agg(
    fn.var_pop('hour').cast(IntegerType()).alias('ip_app_os_var_hour'))
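# Each feature above costs its own shuffle (one groupBy plus one join).
# Features sharing the same grouping keys could be computed in a single pass;
# a possible consolidation sketch for the ('ip', 'app', 'channel') features,
# reusing the fn/train_df names from above:
gp = train_df.groupby(['ip', 'app', 'channel']).agg(
    fn.var_pop('day').cast(IntegerType()).alias('ip_app_channel_var_day'),
    fn.mean('day').cast(IntegerType()).alias('ip_app_channel_mean_day'),
    fn.mean('hour').cast(IntegerType()).alias('ip_app_channel_mean_hour'),
    fn.var_pop('hour').cast(IntegerType()).alias('ip_app_channel_var_hour'))
train_df = train_df.join(gp, on=['ip', 'app', 'channel'], how='left')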
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/")
        # collect_set() returns all values from an input column with duplicates eliminated;
        # collect_list() returns all values from an input column, duplicates included.
        logging.info(
            'run_pipeline method started --> '
            'https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')

        simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000), ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)

        print("approx_count_distinct: " +
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)
        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
        print("count: " + str(df.select(count("salary")).collect()[0]))
        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)
        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)
        df.select(stddev("salary"), stddev_samp("salary"),
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)

        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occurred while running the pipeline > " + str(exp))
        # send email notification
        # log error to database
        sys.exit(1)
    return
maxRatingMultiLangRDD = maxRatingRDD.filter(lambda data: data[15] > 1)
TotalHRatedMultiLang = maxRatingMultiLangRDD.count()
percentageMaxRatedMultiLang = (TotalHRatedMultiLang / TotalHRated) * 100
print(percentageMaxRatedMultiLang)

# Does length of app description contribute to the ratings?
sqlContext = SQLContext(sc)
app_desc = pd.read_csv("datasets/appleStore_description.csv")
appCSVDescDF = sqlContext.createDataFrame(app_desc)
appCSVDescRDD = appCSVDescDF.rdd
appIDDescRDD = appCSVDescRDD.map(lambda data: (data[0], len(data[3])))
appIDRatingRDD = appCSVRDD.map(lambda data: (data[1], data[8]))
appRatingDescRDD = appIDRatingRDD.join(appIDDescRDD)
axisX = appRatingDescRDD.map(lambda data: data[1][0])
axisY = appRatingDescRDD.map(lambda data: data[1][1])
Statistics.corr(axisX, axisY, method="pearson")

# Compare the statistics of different app groups/genres
# genres, user_rating, sup_devices.num, ipadSc_urls.num, lang.num
CSVRDDByGenre = appCSVRDD.map(
    lambda data: (data[12], data[8], data[13], data[14], data[15]))
CSVDF = CSVRDDByGenre.toDF(
    ["Genre", "Rating", "SupportDevices", "ScreenShots", "NumberLanguages"])
CSVDF.groupBy("Genre").agg(sqlf.var_pop("NumberLanguages"),
                           sqlf.corr("Rating", "SupportDevices"),
                           sqlf.avg("ScreenShots"),
                           sqlf.mean("NumberLanguages")).show()
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()

# COMMAND ----------
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))
print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
df.select(stddev("salary"), stddev_samp("salary"), \
          stddev_pop("salary")).show(truncate=False)
df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
df.select(collect_list("salary")).show(truncate=False)
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
df.select(first("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(variance("salary"), var_samp("salary"),
          var_pop("salary")).show(truncate=False)

##########################################################
# ## 12. Drop records containing null
filePath = "fill_na_example.csv"
df = spark.read.options(header='true', inferSchema='true').csv(filePath)
df.printSchema()
df.show(truncate=False)
df.na.drop().show(truncate=False)
df.na.drop(how="any").show(truncate=False)
df.na.drop(subset=["population", "type"]).show(truncate=False)