from pyspark.sql import functions as f
from pyspark.sql.functions import col


def is_except(df, cols: dict, groupBy: list):
    """
    Flag rows whose value falls outside mean +/- 3 standard deviations within each group.

    :param df: DataFrame with columns: cust_id com_id cust_seg value
    :param cols: column-name mapping with keys "value", "abnormal",
                 "mean_plus_3std", "mean_minus_3std", "mean"
    :param groupBy: list of grouping columns
    """
    value = cols["value"]
    abnormal = cols["abnormal"]
    mean_plus_3std = cols["mean_plus_3std"]
    mean_minus_3std = cols["mean_minus_3std"]
    mean = cols["mean"]

    # Group by city and grade, then compute the mean and the +/- 3 standard deviation bounds of `value`.
    mean_3std = df.groupBy(groupBy) \
        .agg(f.mean(value).alias("mean"),
             (f.stddev_pop(value) * 3).alias("+3std"),
             (f.stddev_pop(value) * (-3)).alias("-3std")) \
        .withColumn("mean+3std", col("mean") + col("+3std")) \
        .withColumn("mean-3std", col("mean") + col("-3std"))

    # Mark values above mean+3std as 1 and below mean-3std as 0; everything else is dropped.
    result = df.join(mean_3std, groupBy) \
        .withColumn(abnormal, f.when(col(value) > col("mean+3std"), 1)
                               .when(col(value) < col("mean-3std"), 0)) \
        .withColumnRenamed("mean+3std", mean_plus_3std) \
        .withColumnRenamed("mean-3std", mean_minus_3std) \
        .withColumnRenamed("mean", mean) \
        .dropna(subset=[abnormal])
    return result
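# --- Usage sketch (added for illustration; not part of the original source) ---
# Assumes is_except() above is in scope; the toy data and the cols mapping values are invented.
# With ten values near 10 and one at 100, only the 100 row exceeds mean + 3 * stddev_pop and survives dropna.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").appName("is_except_demo").getOrCreate()
    rows = [(f"c{i}", "com1", 10.0) for i in range(10)] + [("c10", "com1", 100.0)]
    demo = spark.createDataFrame(rows, ["cust_id", "com_id", "value"])
    cols = {"value": "value", "abnormal": "abnormal", "mean": "value_mean",
            "mean_plus_3std": "mean_plus_3std", "mean_minus_3std": "mean_minus_3std"}
    is_except(demo, cols, ["com_id"]).show()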
def non_lambda_standardize(df):
    # get a list of columns to standardize
    standardize_cols = [colName for colName in df.columns if colName != "ehhn"]

    # create a list of expressions to average/stddev each column
    avg_columns = [
        avg(col(colName)).alias(colName) for colName in standardize_cols
    ]
    stddev_columns = [
        stddev_pop(col(col_name)).alias(col_name) for col_name in standardize_cols
    ]

    # create a dataframe of the averages/stddevs
    df_avgs = df[avg_columns].na.fill(0)
    df_stddevs = df[stddev_columns].na.fill(0)
    # df_avgs.select('population_density', 'cafeteria_count', 'casual_dining_count', 'family_restaurant_count').show()

    # each aggregate dataframe has a single row; pull the scalar values out as dicts
    avgs = df_avgs.first().asDict()
    stddevs = df_stddevs.first().asDict()

    exprs = [
        zscore(colName, avgs[colName], stddevs[colName])
        for colName in standardize_cols
    ]
    exprs.insert(0, col("ehhn"))
    return df[exprs]
from datetime import datetime as dt

from pyspark.sql import functions as f
from pyspark.sql.functions import col


def poi_index(spark, cust_lng_lat, coordinate, regex, is_fill=False):
    """
    :param cust_lng_lat: DataFrame of retailer coordinates, columns: city cust_id lng lat lng_l lng_r lat_d lat_u
    :param coordinate: POI DataFrame, columns: cityname lng lat types
    :param regex: regex used to filter POI types
    :param is_fill: whether to fill retailers that have no surrounding POIs
    :return: DataFrame with columns city cust_id poi_index
    """
    print(f"{str(dt.now())} poi index {regex}")

    # Filter POIs matching the regex, join them to retailers whose bounding box contains the POI,
    # then count POIs per (city, cust_id).
    count_df = coordinate.where(col("types").rlike(regex)) \
        .join(cust_lng_lat.drop("lng", "lat"),
              (col("cityname") == col("city")) &
              (col("lng") >= col("lng_l")) & (col("lng") <= col("lng_r")) &
              (col("lat") >= col("lat_d")) & (col("lat") <= col("lat_u"))) \
        .groupBy("city", "cust_id").count()

    if is_fill:
        # Retailers that do have matching POIs nearby
        exist_df = count_df.join(cust_lng_lat, ["city", "cust_id"]) \
            .select("city", "cust_id", "lng", "lat", "count")
        # Retailers with no matching POIs nearby
        not_df = cust_lng_lat.select("city", "cust_id") \
            .exceptAll(count_df.select("city", "cust_id")) \
            .join(cust_lng_lat, ["city", "cust_id"]) \
            .select("city", "cust_id", "lng", "lat")

        if not_df.count() > 0:
            # Fill the missing counts with KNN
            fill_pd_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "count")
            fill_df = spark.createDataFrame(fill_pd_df)
            all_df = exist_df.unionByName(fill_df)
        else:
            all_df = exist_df

        # Threshold per city: mean(count) + 3 * stddev(count)
        threshold = all_df.groupBy("city").agg(
            (f.mean("count") + 3 * f.stddev_pop("count")).alias("threshold"),
            f.max("count"))
        # Replace counts above the threshold with the threshold itself
        replace_df = all_df.join(threshold, "city") \
            .withColumn("count",
                        f.when(col("count") > col("threshold"), col("threshold"))
                         .otherwise(col("count")))
        result = replace_df
    else:
        result = count_df

    # City-wide maximum of the (possibly truncated) counts
    count_max = result.groupBy("city").agg(f.max("count").alias("max"))
    # Scale each retailer's count to a 0-5 index
    return result.join(count_max, "city").withColumn("poi_index", col("count") / col("max") * 5)
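# --- Usage sketch (added for illustration; not from the original repo) ---
# Assumes poi_index() above is in scope. The bounding-box and POI values below are made up,
# and is_fill=False avoids the external fillWithKNN dependency.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").appName("poi_index_demo").getOrCreate()
    # One retailer with a small bounding box around its location.
    cust_lng_lat = spark.createDataFrame(
        [("cityA", "c1", 120.00, 30.00, 119.99, 120.01, 29.99, 30.01)],
        ["city", "cust_id", "lng", "lat", "lng_l", "lng_r", "lat_d", "lat_u"])
    # Two POIs inside the box; only the first matches the regex.
    coordinate = spark.createDataFrame(
        [("cityA", 120.005, 30.005, "restaurant"), ("cityA", 120.002, 30.002, "hotel")],
        ["cityname", "lng", "lat", "types"])
    # Expect poi_index == 5.0 for c1, since its count equals the city-wide maximum.
    poi_index(spark, cust_lng_lat, coordinate, regex="restaurant", is_fill=False).show()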
def standardize(df):
    # get a list of columns to standardize
    standardize_cols = list(filter(lambda c: c != "ehhn", df.columns))

    # create a list of expressions to average/stddev each column
    avg_columns = map(lambda c: avg(col(c).cast(DoubleType())).alias(c),
                      standardize_cols)
    stddev_columns = map(
        lambda c: stddev_pop(col(c).cast(DoubleType())).alias(c),
        standardize_cols)

    # create a dataframe of the averages/stddevs
    df_avgs = df.groupBy().agg(*avg_columns).na.fill(0)
    df_stddevs = df.groupBy().agg(*stddev_columns).na.fill(0)
    # df_avgs.select('population_density', 'cafeteria_count', 'casual_dining_count', 'family_restaurant_count').show()
    avgs = df_avgs.first().asDict()
    stddevs = df_stddevs.first().asDict()

    # z-score each column; columns with zero stddev become a constant 0
    exprs = [col("ehhn")] + list(
        map(
            lambda c: lit(0).alias(c)
            if (stddevs[c] == 0) else ((col(c) - avgs[c]) / stddevs[c]).alias(c),
            standardize_cols))
    return df[exprs]
def add_additional_fields(self, table):
    days = lambda i: i * 86400
    windowSpec = Window.partitionBy("ticker").orderBy(
        col("Date").cast('long')).rangeBetween(-days(7), 0)
    windowSpec20 = Window.partitionBy("ticker").orderBy(
        col("Date").cast('long')).rangeBetween(-days(20), 0)

    union_table = table.withColumn('rolling_seven_day_average',
                                   avg("Close").over(windowSpec))
    union_table = union_table.withColumn('rolling_20_day_average',
                                         avg("Close").over(windowSpec20))
    union_table = union_table.withColumn(
        'stdev', stddev_pop("Close").over(windowSpec20))
    # Upper/Lower bands: 20-day rolling average +/- 2 standard deviations
    union_table = union_table.withColumn(
        'Upper',
        union_table['rolling_20_day_average'] + (union_table['stdev'] * 2))
    union_table = union_table.withColumn(
        'Lower',
        union_table['rolling_20_day_average'] - (union_table['stdev'] * 2))
    return union_table
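# --- Standalone sketch of the range-based rolling window used above (illustrative only,
# not the original class's code). Column names mirror the method; the toy prices are invented.
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import avg, col, stddev_pop

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").appName("rolling_window_demo").getOrCreate()
    prices = spark.createDataFrame(
        [("AAPL", "2021-01-01", 100.0), ("AAPL", "2021-01-05", 110.0), ("AAPL", "2021-01-20", 120.0)],
        ["ticker", "Date", "Close"]).withColumn("Date", col("Date").cast("timestamp"))
    days = lambda i: i * 86400
    # Per-ticker window over the last 20 days, ordered by Date cast to epoch seconds.
    w20 = Window.partitionBy("ticker").orderBy(col("Date").cast("long")).rangeBetween(-days(20), 0)
    prices.withColumn("rolling_20_day_average", avg("Close").over(w20)) \
          .withColumn("stdev", stddev_pop("Close").over(w20)).show()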
from pyspark.sql import Window
from pyspark.sql import functions as F


def _add_outliers(dataframe, **kwargs):
    """
    Calculate a boundary for which a data point will be considered an outlier [bool].
    The boundary is the mean plus "no_stddev" (number of standard deviations) * the standard deviation.
    Uses pyspark's Window function to partition over the cluster predictions and thereby count the number
    of data points in each cluster, their number of outliers and the outlier percentage.

    :param dataframe:
    :param kwargs:
        prediction_col can be set in the function call, else it defaults to the column name 'prediction'
        distance_col can be set in the function call, else it defaults to the column name 'distance'
        no_stddev (number of standard deviations) can be set in the function call, else it defaults to 2
    :return: dataframe with added 'is_outlier' bool column
    """
    assert kwargs.get('distance_col', 'distance') in dataframe.columns, 'Distances have not been computed!'

    prediction_col = F.col(kwargs.get('prediction_col', 'prediction'))
    distance_col = F.col(kwargs.get('distance_col', 'distance'))
    no_stddev = kwargs.get('no_stddev', 2.0)

    window_outlier = Window().partitionBy(prediction_col)

    computed_boundary = (
        F.mean(distance_col).over(window_outlier)
        + no_stddev * F.stddev_pop(distance_col).over(window_outlier)
    )

    return (dataframe
            .withColumn(colName='computed_boundary', col=computed_boundary)
            .withColumn(colName='is_outlier',
                        col=F.when(distance_col > computed_boundary, 1).otherwise(0)))
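# --- Usage sketch (mine, not from the source project) ---
# Assumes _add_outliers() above is in scope; no_stddev is lowered to 1.0 so the artificial
# far point (distance 5.00) in cluster 0 gets flagged on this tiny sample.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").appName("outlier_demo").getOrCreate()
    clustered = spark.createDataFrame(
        [(0, 0.10), (0, 0.20), (0, 0.15), (0, 5.00), (1, 0.30), (1, 0.35)],
        ["prediction", "distance"])
    _add_outliers(clustered, no_stddev=1.0) \
        .select("prediction", "distance", "computed_boundary", "is_outlier").show()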
def _apply_on_cols(self) -> List[Callable]:
    input_col = self.input_col

    return [
        f.mean(f.col(input_col)).alias(
            StandardScalarScalerConfig.mean_token),
        f.stddev_pop(f.col(input_col)).alias(
            StandardScalarScalerConfig.std_token)
    ]
def filter_using_constraints(consistency_df, constraint_list):
    # applying the constraint list
    consistency_df = consistency_df.where(func.col("winner")
                                          .isin(constraint_list))
    # calculating the performance consistency
    consistency_df = consistency_df.groupBy("winner")\
        .agg(func.stddev_pop("count").alias("stddev"),
             func.sum("count").alias("total_wins"))\
        .orderBy("stddev", "total_wins")
    return consistency_df
def get_vfr_index():
    try:
        print(f"{str(dt.now())} retailer surrounding foot-traffic index")
        # Retailers that have foot-traffic data
        vfr = get_around_vfr(spark)
        vfr.cache()

        # Retailers
        co_cust = get_co_cust(spark).select("cust_id")
        # Retailers with coordinates
        cust_lng_lat = get_cust_lng_lat(spark) \
            .select("city", "cust_id", "lng", "lat") \
            .join(co_cust, "cust_id")
        cust_lng_lat.cache()

        # Retailers with no surrounding foot traffic
        not_df = cust_lng_lat.select("cust_id") \
            .exceptAll(vfr.select("cust_id")) \
            .join(cust_lng_lat, "cust_id")
        exist_df = vfr.join(cust_lng_lat, ["city", "cust_id"])

        if not_df.count() > 0:
            # Fill the missing values with KNN
            fill_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "avg_vfr")
            all_df = spark.createDataFrame(fill_df) \
                .unionByName(exist_df)
        else:
            all_df = exist_df

        # Threshold per city: mean + 3 * stddev
        threshold = all_df.groupBy("city") \
            .agg((f.mean("avg_vfr") + 3 * f.stddev_pop("avg_vfr")).alias("threshold"))
        truncate_df = all_df.join(threshold, "city") \
            .withColumn("avg_vfr",
                        f.when(col("avg_vfr") > col("threshold"), col("threshold"))
                         .otherwise(col("avg_vfr")))

        # Log-scale the truncated values and normalize to a 0-5 index per city
        log_df = truncate_df.withColumn("log", f.log(col("avg_vfr") + 1))
        log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

        colName = "people_count"
        log_df.join(log_max, "city") \
            .withColumn(colName, col("log") / col("log_max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        vfr.unpersist()
        cust_lng_lat.unpersist()
    except Exception:
        tb.print_exc()
from pyspark.sql.functions import avg, broadcast, stddev_pop


def normalizeFeatures(df, cols):
    """Normalize each feature passed in the list cols (z-score standardization)."""
    allCols = df.columns
    # remove the cols to normalize so that allCols holds the pass-through columns of the returned dataframe
    _ = [allCols.remove(x) for x in cols]
    # calculate the avg and stddev of the features to normalize
    stats = (df.groupBy().agg(
        *([stddev_pop(x).alias(x + '_stddev') for x in cols] +
          [avg(x).alias(x + '_avg') for x in cols])))
    # broadcast and join into current DF
    df = df.join(broadcast(stats))
    # normalize the columns and select the required columns for the final DF
    exprs = [x for x in allCols] + [((df[x] - df[x + '_avg']) / df[x + '_stddev']).alias(x)
                                    for x in cols]
    return df.select(*exprs)
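# --- Usage sketch (added for illustration) ---
# Assumes normalizeFeatures() above is in scope; the toy columns f1/f2 are invented.
# 'id' passes through unchanged while f1 and f2 come back as (value - mean) / stddev_pop.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").appName("normalize_demo").getOrCreate()
    data = spark.createDataFrame([(1, 10.0, 3.0), (2, 20.0, 6.0), (3, 30.0, 9.0)],
                                 ["id", "f1", "f2"])
    normalizeFeatures(data, ["f1", "f2"]).show()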
def create_stats(
        df,
        tenant_col: str,
        value_col: str = AccessAnomalyConfig.default_output_col) -> StatsMap:
    stat_rows = df.groupBy(tenant_col).agg(
        f.count('*').alias('__count__'),
        f.min(f.col(value_col)).alias('__min__'),
        f.max(f.col(value_col)).alias('__max__'),
        f.mean(f.col(value_col)).alias('__mean__'),
        f.stddev_pop(f.col(value_col)).alias('__std__')).collect()

    stats_map = {
        row[tenant_col]: BasicStats(row['__count__'], row['__min__'],
                                    row['__max__'], row['__mean__'],
                                    row['__std__'])
        for row in stat_rows
    }

    return StatsMap(stats_map)
def transform(self, dataframe):
    """Applies standardization to the specified columns.

    # Arguments
        dataframe: dataframe. Spark Dataframe.
    """
    # Compute the means of the specified columns.
    means = [mean(x) for x in self.columns]
    means = dataframe.select(means).collect()[0].asDict()
    self.means = self.clean_mean_keys(means)
    # Compute the standard deviation of the specified columns.
    stddevs = [stddev_pop(x) for x in self.columns]
    stddevs = dataframe.select(stddevs).collect()[0].asDict()
    self.stddevs = self.clean_stddev_keys(stddevs)
    # For every feature, add a new column to the dataframe.
    for column in self.columns:
        self.current_column = column
        dataframe = dataframe.rdd.map(self._transform).toDF()
    return dataframe
from pyspark.sql import functions as f
from pyspark.sql.functions import col


def except_grade(df, cols: dict, groupBy: list, grade: list):
    """
    Compare a retailer's indicator against the retailers in the same {groupBy} group.

    :param df: DataFrame with columns: cust_id com_id cust_seg value
    :param cols: columns: cust_id, cols["value"]
                 cols["value"]: the value used for the mean/stddev, e.g. average carton price / order amount
    :param groupBy: list, [city] / [city, cluster_index] - group by city, or by city and cluster result
    :param grade: [3, 4, 5] - grade by 3/4/5 standard deviations
    """
    value = cols["value"]
    abnormal = "warning_level_code"
    plus_one_grade = "avg_orders_plus3"
    minus_one_grade = "avg_orders_minu3"
    plus_two_grade = "avg_orders_plus4"
    minus_two_grade = "avg_orders_minu4"
    plus_three_grade = "avg_orders_plus5"
    minus_three_grade = "avg_orders_minu5"

    # Group by [city] / [city, cust_seg], compute the mean and stddev of `value`,
    # then derive the +/- 3/4/5 stddev bounds.
    mean_std = df.groupBy(groupBy) \
        .agg(f.mean(value).alias("mean"), f.stddev_pop(value).alias("stddev")) \
        .withColumn(plus_one_grade, col("mean") + col("stddev") * grade[0]) \
        .withColumn(minus_one_grade, col("mean") - col("stddev") * grade[0]) \
        .withColumn(plus_two_grade, col("mean") + col("stddev") * grade[1]) \
        .withColumn(minus_two_grade, col("mean") - col("stddev") * grade[1]) \
        .withColumn(plus_three_grade, col("mean") + col("stddev") * grade[2]) \
        .withColumn(minus_three_grade, col("mean") - col("stddev") * grade[2])

    result = df.join(mean_std, groupBy) \
        .withColumn(abnormal,
                    f.when((col(value) > col(plus_one_grade)) & (col(value) < col(plus_two_grade)), "C1")
                     .when((col(value) > col(minus_two_grade)) & (col(value) < col(minus_one_grade)), "C0")
                     .when((col(value) > col(plus_two_grade)) & (col(value) < col(plus_three_grade)), "B1")
                     .when((col(value) > col(minus_three_grade)) & (col(value) < col(minus_two_grade)), "B0")
                     .when((col(value) > col(plus_three_grade)), "A1")
                     .when((col(value) < col(minus_three_grade)), "A0")
                    ).dropna(subset=[abnormal])
    return result
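# --- Usage sketch (added for illustration; column names and numbers are invented) ---
# Assumes except_grade() above is in scope. With ten values near 100 and one at 1000, the 1000
# falls between +3 and +4 population standard deviations of its city, so it is graded "C1".
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.master("local[1]").appName("except_grade_demo").getOrCreate()
    rows = [(f"c{i}", "cityA", 100.0) for i in range(10)] + [("c10", "cityA", 1000.0)]
    orders = spark.createDataFrame(rows, ["cust_id", "city", "avg_price"])
    except_grade(orders, {"value": "avg_price"}, ["city"], [3, 4, 5]) \
        .select("cust_id", "city", "avg_price", "warning_level_code").show()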
# _read csv file
flightdata = spark.read.option('inferSchema', 'true').option('header', 'true').csv('2015-summary.csv')
flightdata.show(5)
flightdata.printSchema()

# _add new column using withColumn; we are just printing the updated dataframe with show(),
# but it should be assigned to a new variable as a new dataframe.
flightdata.withColumn("newCol", col("count") + 10).show(4)

# _using select, we can also mention column names explicitly in place of *
flightdata_mod = flightdata.select("*", (col("count") + 20).alias("newCol2"))
flightdata_mod.show(5)

# _basic statistical functions
flightdata.select(mean("count")).show()
flightdata.select(min("count")).show()
flightdata.select(max("count")).show()
flightdata.select(stddev_pop("count")).show()
flightdata.select(stddev_samp("count")).show()

# _group by and aggregations
flightdata.groupBy("DEST_COUNTRY_NAME").agg(sum('count')).show(5)
dest_count_data = flightdata.groupBy("DEST_COUNTRY_NAME").agg({'count': 'sum'})

# _write the data to csv after coalesce
dest_count_data_merged = dest_count_data.coalesce(1)
dest_count_data_merged.write.format('csv').option('header', 'true').save('dest_country')
def around_except_grade(around_cust, value_df, cust_cluster, cols: dict, grade=[3, 4, 5]):
    """
    Compare a retailer's indicator against the surrounding retailers.

    :param around_cust: columns: cust_id1, cust_id0 - retailer cust_id1 has the retailers cust_id0 around it
    :param value_df: columns: cust_id, cols["value"]
                     cols["value"]: the value used for the mean/stddev, e.g. average carton price / order amount
    :param cust_cluster: columns: cust_id, cluster_index - retailer clustering result
    :param cols: {"value": value, }
    :param grade: [3, 4, 5] - grade by 3/4/5 standard deviations
    :return:
    """
    value = cols["value"]
    abnormal = "warning_level_code"
    plus_one_grade = "avg_orders_plus3"
    minus_one_grade = "avg_orders_minu3"
    plus_two_grade = "avg_orders_plus4"
    minus_two_grade = "avg_orders_minu4"
    plus_three_grade = "avg_orders_plus5"
    minus_three_grade = "avg_orders_minu5"

    # 1. Get the cluster of cust_id1
    # 2. Get the cluster of cust_id0
    # Keep only the surrounding retailers that belong to the same cluster.
    around_cust = around_cust.join(cust_cluster, col("cust_id") == col("cust_id1")) \
        .drop("cust_id") \
        .withColumnRenamed("cluster_index", "cluster_index1") \
        .join(cust_cluster, col("cust_id") == col("cust_id0")) \
        .withColumnRenamed("cluster_index", "cluster_index0") \
        .where(col("cluster_index0") == col("cluster_index1")) \
        .select("cust_id1", "cust_id0")

    # Mean and stddev over each retailer's surrounding retailers
    mean_std = around_cust.join(value_df, col("cust_id0") == col("cust_id")) \
        .select("cust_id1", value) \
        .groupBy("cust_id1") \
        .agg(f.mean(value).alias("mean"), f.stddev_pop(col(value)).alias("stddev"))

    """
    Warning C   mean + 3*std < avg carton price < mean + 4*std   too high  11
                mean - 4*std < avg carton price < mean - 3*std   too low   10
    Warning B   mean + 4*std < avg carton price < mean + 5*std   too high  21
                mean - 5*std < avg carton price < mean - 4*std   too low   20
    Warning A   mean + 5*std < avg carton price                  too high  31
                avg carton price < mean - 5*std                  too low   30
    """
    except_cust = mean_std.join(value_df, col("cust_id1") == col("cust_id")) \
        .withColumn(plus_one_grade, col("mean") + col("stddev") * grade[0]) \
        .withColumn(minus_one_grade, col("mean") - col("stddev") * grade[0]) \
        .withColumn(plus_two_grade, col("mean") + col("stddev") * grade[1]) \
        .withColumn(minus_two_grade, col("mean") - col("stddev") * grade[1]) \
        .withColumn(plus_three_grade, col("mean") + col("stddev") * grade[2]) \
        .withColumn(minus_three_grade, col("mean") - col("stddev") * grade[2]) \
        .withColumn(abnormal,
                    f.when((col(value) > col(plus_one_grade)) & (col(value) < col(plus_two_grade)), "C1")
                     .when((col(value) > col(minus_two_grade)) & (col(value) < col(minus_one_grade)), "C0")
                     .when((col(value) > col(plus_two_grade)) & (col(value) < col(plus_three_grade)), "B1")
                     .when((col(value) > col(minus_three_grade)) & (col(value) < col(minus_two_grade)), "B0")
                     .when((col(value) > col(plus_three_grade)), "A1")
                     .when((col(value) < col(minus_three_grade)), "A0")
                    ).dropna(subset=[abnormal])
    return except_cust
def fill_dp(spark, cust_lng_lat, dp, colName, coordinate, regex):
    """
    Fill in missing prices.

    :param cust_lng_lat: retailer coordinates DataFrame, columns: city cust_id lng lat lng_l lng_r lat_d lat_u
    :param dp: retailer average rent/restaurant/hotel price DataFrame, columns: city cust_id {colName}
    :param colName: price column
    :param coordinate: POI DataFrame, columns: lng lat types
    :param regex: regex used to filter POIs
    """
    # columns: city, cust_id, poi_index
    poi_index_df = poi_index(spark, cust_lng_lat, coordinate, regex)

    # DataFrame containing every retailer
    df0 = cust_lng_lat.join(poi_index_df, ["city", "cust_id"], "left") \
        .join(dp, ["city", "cust_id"], "left") \
        .select("city", "cust_id", colName, "poi_index", "lng", "lat")

    summary = df0.select(colName, "poi_index").summary().cache()
    # City-wide min, max and quantiles of the price
    price_min = summary.where(col("summary") == "min").collect()[0][colName]
    price_25 = summary.where(col("summary") == "25%").collect()[0][colName]
    price_50 = summary.where(col("summary") == "50%").collect()[0][colName]
    price_75 = summary.where(col("summary") == "75%").collect()[0][colName]
    price_max = summary.where(col("summary") == "max").collect()[0][colName]
    # City-wide quantiles and max of the corresponding POI index
    poi_25 = summary.where(col("summary") == "25%").collect()[0]["poi_index"]
    poi_50 = summary.where(col("summary") == "50%").collect()[0]["poi_index"]
    poi_75 = summary.where(col("summary") == "75%").collect()[0]["poi_index"]
    poi_max = summary.where(col("summary") == "max").collect()[0]["poi_index"]

    # Retailers with neither a POI index nor a price: fill with 0 for now
    df1 = df0.withColumn(
        colName,
        f.when(col(colName).isNull() & col("poi_index").isNull(), 0).otherwise(col(colName)))

    # Retailers with a POI index but no price: fill the price by binning on the POI index
    df2 = df1.withColumn(
        colName,
        f.when((col(colName).isNull()) & (col("poi_index") < poi_25), price_min)
         .when((col(colName).isNull()) & (col("poi_index") >= poi_25) & (col("poi_index") < poi_50), price_25)
         .when((col(colName).isNull()) & (col("poi_index") >= poi_50) & (col("poi_index") < poi_75), price_50)
         .when((col(colName).isNull()) & (col("poi_index") >= poi_75) & (col("poi_index") < poi_max), price_75)
         # remaining nulls (poi_index >= poi_max) get the max price; existing prices are kept
         .when(col(colName).isNull(), price_max)
         .otherwise(col(colName)))

    # ----- Retailers whose price is 0: fill with KNN
    # price not 0
    exist_df = df2.where(col(colName) > 0).drop("poi_index")
    # price is 0
    not_df = df2.where(col(colName) == 0).drop("poi_index")
    if not_df.count() > 0:
        # KNN fill
        fill_pd_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), colName)
        # merge
        all_df = spark.createDataFrame(fill_pd_df).unionByName(exist_df)
    else:
        all_df = exist_df

    # ----- Truncation
    # compute the threshold per city
    threshold = all_df.groupBy("city")\
        .agg((f.mean(col(colName)) + 3 * f.stddev_pop(col(colName))).alias("threshold"))
    truncate_df = all_df.join(threshold, "city") \
        .withColumn(colName,
                    f.when(col(colName) > col("threshold"), col("threshold"))
                     .otherwise(col(colName)))

    # log of (price + 1)
    log_df = truncate_df.withColumn("log", f.log(col(colName) + 1))
    log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))
    result = log_df.join(log_max, "city") \
        .withColumn(colName + "_index", col("log") / col("log_max") * 5)
    # 'city','cust_id','food_avg','poi_index','lng','lat','threshold','log','log_max','{colName}_index'
    return result
def runModel(regressionMethodName,
             stationID,
             stationDataFrame,
             featureInputCols,
             normalize,
             splitMethod='random'):
    print("=" * 80)
    print('Station:{0}'.format(stationID))
    print(
        'Model:{0}, Normalize:{1}, LinkFunction:{2}, train/test splitMethod:{3}'
        .format(regressionMethodName, normalize, labelLinkFunction, splitMethod))
    print(featureInputCols)

    oneHot = OneHotEncoderEstimator(
        inputCols=["hourOfDay", "dayOfWeek"],
        outputCols=["hourOfDayVector", "dayOfWeekVector"])

    # Per-station summary statistics of the label
    stationSummaryAll = stationDataFrame.groupBy('station_id').agg(
        count('label'), sum('label'), avg("label"), stddev_pop("label"))
    stationAvg = stationSummaryAll.select('avg(label)').where(
        col('station_id') == stationID).collect()
    stationSum = stationSummaryAll.select('sum(label)').where(
        col('station_id') == stationID).collect()
    stationStd = stationSummaryAll.select('stddev_pop(label)').where(
        col('station_id') == stationID).collect()
    stationNonZeroCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == stationID).collect()
    stationCount = stationSummaryAll.select('count(label)').where(
        col('station_id') == "None").collect()

    featureInputCols.extend(["hourOfDayVector", "dayOfWeekVector"])
    assembler = VectorAssembler(inputCols=featureInputCols, outputCol='features')

    # Note: selectRegressionMethod is called with the hard-coded key 'rf' here, not with regressionMethodName.
    if normalize == True:
        normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
        featureName = "normFeatures"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            'rf', featureName)
        pipeline = Pipeline(
            stages=[oneHot, assembler, normalizer, regressionMethod])
    else:
        featureName = "features"
        regressionMethod, regressionModelParameters = selectRegressionMethod(
            'rf', featureName)
        pipeline = Pipeline(stages=[oneHot, assembler, regressionMethod])

    trainingDates = ['2016-10-01 00:00:00', '2017-9-30 23:59:59']
    testDates = ['2017-10-01 00:00:00', '2017-10-31 23:59:59']
    dates = {'train': trainingDates, 'test': testDates}

    if splitMethod == 'random':
        # Split the data into training and test sets (40% held out for testing)
        (trainingData, testData) = stationDataFrame.randomSplit([0.6, 0.4])
    else:
        (trainingData, testData) = timeSeriesTestTrain(stationDataFrame, dates)

    # fit model and make predictions
    model = pipeline.fit(trainingData)
    predictedData = model.transform(testData)
    # predictedData.select("prediction", "label", featureName).show(5)

    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    evaluator2 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="r2")
    evaluator3 = RegressionEvaluator(labelCol="label",
                                     predictionCol="prediction",
                                     metricName="explainedVariance")
    rmse = evaluator.evaluate(predictedData)
    rSquared = evaluator2.evaluate(predictedData)
    varianceExplained = evaluator3.evaluate(predictedData)  # evaluator3 computes explainedVariance
    print(
        "RMSE, R2, and variance explained on test data = {0:6.3f}, {1:6.3f}, {2:6.3f}"
        .format(rmse, rSquared, varianceExplained))
    print()

    basetime = 1541216769
    experimentTimeStamp = int((time.time() - basetime) / 6)
    experiment = {
        experimentTimeStamp: {
            "station": stationID,
            'stationNonZeroCount': stationNonZeroCount,
            'stationCount': stationCount,
            'stationSum': stationSum,
            'stationAvg': stationAvg,
            'stationStd': stationStd,
            'regressionMethodName': regressionMethodName,
            'normalize': normalize,
            'linkFunctionLabel': labelLinkFunction,
            'featureInputCols': featureInputCols,
            'rmse': rmse,
            'rSquared': rSquared,
            'varianceExplained': varianceExplained,
            'version': "Added OneHotEncode for hOD, dOW",
            'trainSplitMethod': splitMethod
        }
    }
    experiments.update(experiment)
    with open(pathFigure + "experiments.json", "w") as f:
        json.dump(experiments, f)
    return ()
def get_cpt_index():
    # ----- Retailer shop competitiveness index
    print(f"{str(dt.now())} cust competitive index")
    try:
        # Retailer grade; replace 'zz'/'ZZ' grades with "31"
        co_cust = get_co_cust(spark).select("cust_id", "cust_seg") \
            .withColumn("cust_seg", f.regexp_replace(col("cust_seg"), "zz|ZZ", "31"))
        co_cust.cache()

        # 1. cust_id1 has the retailers cust_id0 around it
        # 2. distance between the retailer and every retailer within range
        around_cust = get_around_cust(spark, 0.5) \
            .withColumn("length", haversine(col("lng1"), col("lat1"), col("lng0"), col("lat0"))) \
            .select("city1", "cust_id1", "cust_id0", "length")
        around_cust.cache()

        # max and min distance within the range
        max_min = around_cust.groupBy("city1", "cust_id1") \
            .agg(f.max("length").alias("max"), f.min("length").alias("min"))

        """
        distance coefficient = 1 / normalized distance to the other retailer
        normalized distance = (distance - min(distance)) / (max(distance) - min(distance))
        """
        # distance coefficient
        dis_coe = around_cust.join(max_min, ["city1", "cust_id1"]) \
            .withColumn("dis_coe", 1 * (col("max") - col("min")) / (col("length") - col("min"))) \
            .select("city1", "cust_id1", "cust_id0", "dis_coe")

        # 1. get the grade of cust_id1 -> cust_seg1
        # 2. get the grade of cust_id0 -> cust_seg0
        # 3. grade difference * distance coefficient
        # 4. weighted average grade difference = sum((other retailer's grade - retailer's grade) * distance coefficient)
        avg_seg_diff = dis_coe.join(co_cust, col("cust_id") == col("cust_id1")) \
            .drop("cust_id") \
            .withColumnRenamed("cust_seg", "cust_seg1") \
            .join(co_cust, col("cust_id") == col("cust_id0")) \
            .withColumnRenamed("cust_seg", "cust_seg0") \
            .groupBy("city1", "cust_id1") \
            .agg(f.sum((col("cust_seg0") - col("cust_seg1")) * col("dis_coe")).alias("avg_seg_diff")) \
            .select("city1", "cust_id1", "avg_seg_diff")

        # competition intensity = weighted average grade difference within 500m * number of retailers within range
        cust_cpt = around_cust.groupBy("city1", "cust_id1").count() \
            .join(avg_seg_diff, ["city1", "cust_id1"]) \
            .withColumn("cust_cpt", col("avg_seg_diff") * col("count")) \
            .select("city1", "cust_id1", "cust_cpt")
        cust_cpt.cache()

        # truncate at mean +/- 3 stddev per city
        threshold = cust_cpt.groupBy("city1").agg(
            (f.mean("cust_cpt") + 3 * f.stddev_pop(col("cust_cpt"))).alias("threshold_plus"),
            (f.mean("cust_cpt") - 3 * f.stddev_pop(col("cust_cpt"))).alias("threshold_minus"))
        truncate_df = cust_cpt.join(threshold, "city1") \
            .withColumn("cust_cpt",
                        f.when(col("cust_cpt") > col("threshold_plus"), col("threshold_plus"))
                         .when(col("cust_cpt") < col("threshold_minus"), col("threshold_minus"))
                         .otherwise(col("cust_cpt")))

        # city-wide max/min competition intensity
        max_cpt = truncate_df.groupBy("city1").agg(f.max(col("cust_cpt")).alias("max_cpt"),
                                                   f.min(col("cust_cpt")).alias("min_cpt"))

        # shop competitiveness index
        colName = "order_competitive_index"
        result = truncate_df.join(max_cpt, "city1") \
            .withColumnRenamed("cust_id1", "cust_id") \
            .withColumn(colName, 5 - (col("cust_cpt") - col("min_cpt")) / (col("max_cpt") - col("min_cpt")) * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        co_cust.unpersist()
        around_cust.unpersist()
        cust_cpt.unpersist()
    except Exception:
        tb.print_exc()
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"], input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {"collect_set_int", "collect_set_float", "collect_set_string"}:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected
    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
format="parquet", path="/data/intermediate_data/cdr_step1/") df = sqlContext.read.parquet("/data/intermediate_data/cdr_step1/") df1 = df.sort('number').groupBy('number', 'type', 'week').agg( f.sum('callduration').alias('call_sum'), f.count('callduration').alias('call_count'), f.sum('iscompethot').alias('call_count_competitor')) df1.write.mode("overwrite").saveAsTable( "cdr_step2", format="parquet", path="/data/intermediate_data/cdr_step2/") df = sqlContext.read.parquet('/data/intermediate_data/cdr_step2/') df1 = df.groupBy('number', 'type').agg( (f.stddev_pop('call_sum') / f.mean('call_sum')).alias('coefficiant_of_variance'), f.sum('call_count').alias('call_count'), f.sum('call_count_competitor').alias('call_count_competitor')) df1.write.mode("overwrite").saveAsTable( "cdr_step3", format="parquet", path="/data/intermediate_data/cdr_step3/") df = sqlContext.read.parquet('/data/intermediate_data/cdr_step3/') df_out = df.where(df.type == "out").select(df.number, df.coefficiant_of_variance, df.call_count, df.call_count_competitor) df_in = df.where(df.type == "in").select(df.number, df.coefficiant_of_variance,
def run_pipeline(self):
    try:
        logging.info(
            "https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/"
        )
        # check collect_list and collect_set
        # collect_set() function returns all values from an input column with duplicate values eliminated.
        # collect_list() function returns all values from an input column with duplicates
        logging.info(
            'run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/'
        )

        simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                      ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
                      ("James", "Sales", 3000), ("Scott", "Finance", 3300),
                      ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
                      ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
        schema = ["employee_name", "department", "salary"]
        df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
        df.show(truncate=False)

        from pyspark.sql.functions import approx_count_distinct, collect_list
        from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
        from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
        from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
        from pyspark.sql.functions import variance, var_samp, var_pop

        df.printSchema()
        df.show(truncate=False)

        print("approx_count_distinct: " + \
              str(df.select(approx_count_distinct("salary")).collect()[0][0]))
        print("avg: " + str(df.select(avg("salary")).collect()[0][0]))
        df.select(collect_list("salary")).show(truncate=False)
        df.select(collect_set("salary")).show(truncate=False)

        df2 = df.select(countDistinct("department", "salary"))
        df2.show(truncate=False)
        print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
        print("count: " + str(df.select(count("salary")).collect()[0]))

        dffirst = df.select(first("salary"))
        dffirst.show(truncate=False)
        df.select(last("salary")).show(truncate=False)
        df.select(kurtosis("salary")).show(truncate=False)
        df.select(max("salary")).show(truncate=False)
        df.select(min("salary")).show(truncate=False)
        df.select(mean("salary")).show(truncate=False)
        df.select(skewness("salary")).show(truncate=False)
        df.select(stddev("salary"), stddev_samp("salary"), \
                  stddev_pop("salary")).show(truncate=False)
        df.select(sum("salary")).show(truncate=False)
        df.select(sumDistinct("salary")).show(truncate=False)
        df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
            .show(truncate=False)

        logging.info('run_pipeline method ended')
    except Exception as exp:
        logging.error("An error occured while running the pipeline > " + str(exp))
        # send email notification
        # log error to database
        sys.exit(1)

    return
count("Quantity").alias("total_transactions"), sum("Quantity").alias("total_purchases"), avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases"))\ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show() # COMMAND ---------- from pyspark.sql.functions import var_pop, stddev_pop from pyspark.sql.functions import var_samp, stddev_samp df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import corr, covar_pop, covar_samp df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity")).show()
def num_iqr_filter(self,
                   data_frame,
                   inputCol="VALUE",
                   labelCol="ITEMID",
                   REPARTITION_CONST=None,
                   sc=None):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import abs, percent_rank, row_number, collect_list, udf, struct, count, avg, stddev_pop
    from pyspark.sql.types import MapType, StringType, DoubleType

    self.logger.debug("[NUM_IQR_FILTER] IN")
    self.logger.debug("[NUM_IQR_FILTER] BEFORE QUANTILE TAGGED")

    if REPARTITION_CONST is None:
        value_order = Window.partitionBy(labelCol).orderBy(
            col(inputCol).cast("float"))
        Q1_percentile = Window.partitionBy(labelCol).orderBy(
            abs(0.25 - col("percentile")))
        Q2_percentile = Window.partitionBy(labelCol).orderBy(
            abs(0.5 - col("percentile")))
        Q3_percentile = Window.partitionBy(labelCol).orderBy(
            abs(0.75 - col("percentile")))

        percent_data_frame = data_frame.select(
            labelCol, inputCol,
            percent_rank().over(value_order).alias("percentile"))
        Q1_data_frame = percent_data_frame.withColumn("Q1_rn", row_number().over(Q1_percentile)).where("Q1_rn == 1")\
            .select(labelCol, inputCol, lit("Q1").alias("quantile"))
        Q2_data_frame = percent_data_frame.withColumn("Q2_rn", row_number().over(Q2_percentile)).where("Q2_rn == 1")\
            .select(labelCol, inputCol, lit("Q2").alias("quantile"))
        Q3_data_frame = percent_data_frame.withColumn("Q3_rn", row_number().over(Q3_percentile)).where("Q3_rn == 1")\
            .select(labelCol, inputCol, lit("Q3").alias("quantile"))
        self.logger.debug("[NUM_IQR_FILTER] REPARTITION_CONST Not Asserted")

        merge_all = Q1_data_frame.unionAll(Q2_data_frame).unionAll(Q3_data_frame).persist()
        self.logger.debug("[NUM_IQR_FILTER] Qs Merged")

        udf_parse_list_to_map = udf(
            lambda maps: dict(list(tuple(x) for x in maps)),
            MapType(StringType(), StringType()))
        self.logger.debug("[NUM_IQR_FILTER] Before merge quantiles")

        aggregate_quantiles = merge_all.groupBy(labelCol).agg(
            collect_list(struct("quantile", inputCol)).alias("quant_info"))
        self.logger.debug("AGG ONLY")
        aggregate_quantiles = aggregate_quantiles.select(
            labelCol,
            udf_parse_list_to_map("quant_info").alias("quant_info"))
        self.logger.debug("TRANSFORM")

        iqr_data_frame = aggregate_quantiles.withColumn("Q1", col("quant_info").getItem("Q1").cast("float"))\
            .withColumn("Q2", col("quant_info").getItem("Q2").cast("float"))\
            .withColumn("Q3", col("quant_info").getItem("Q3").cast("float"))
        self.logger.debug("QUANTILE_EXTRACTION")
    else:
        cur_label_list = data_frame.select(labelCol).distinct().rdd.flatMap(list).collect()
        cur_iqr_list = list()
        cnt = -1
        for cur_item in cur_label_list:
            cnt = cnt + 1
            data_frame.where(col(labelCol) == cur_item).registerTempTable("cur_table")
            self.logger.debug("{0}/{1},::{2}".format(
                cnt, len(cur_label_list),
                sc.sql("select {0} from cur_table".format(labelCol)).count()))
            cur_iqr = sc.sql(
                "select {0}, percentile_approx({1},0.25) as Q1, percentile_approx({2},0.5) as Q2, percentile_approx({3},0.75) as Q3 from cur_table group by {4}"
                .format(labelCol, inputCol, inputCol, inputCol, labelCol)).first().asDict()
            cur_iqr_list.append(cur_iqr)
            sc.catalog.dropTempView("cur_table")
        # percent_data_frame = data_frame.select(labelCol, inputCol, percent_rank().over(value_order).alias("percentile")).repartition(REPARTITION_CONST).cache().checkpoint()
        iqr_data_frame = sc.createDataFrame(cur_iqr_list).repartition(REPARTITION_CONST)

    if REPARTITION_CONST is None:
        iqr_data_frame = iqr_data_frame.withColumn("IQR", col("Q3")-col("Q1"))\
            .withColumn("LB", col("Q1")-1.5*col("IQR"))\
            .withColumn("UB", col("Q3")+1.5*col("IQR"))\
            .select(labelCol, "LB", "UB")
    else:
        iqr_data_frame = \
iqr_data_frame.withColumn("IQR",col("Q3")-col("Q1"))\ .withColumn("LB",col("Q1")-1.5*col("IQR"))\ .withColumn("UB",col("Q3")+1.5*col("IQR"))\ .select(labelCol,"LB","UB").repartition(REPARTITION_CONST).persist() self.logger.debug("CUR_ITEMID_ALL_COUNT:{0}".format( iqr_data_frame.count())) self.logger.debug("[NUM_IQR_FILTER] iqr_data_frame merged") if REPARTITION_CONST is None: self.logger.debug( "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST NOT ASSERTED") ret_data_frame = data_frame.repartition(labelCol).join(iqr_data_frame,labelCol).where((col("LB").cast("float") <= col(inputCol).cast("float")) & (col("UB").cast("float")>=col(inputCol).cast("float")))\ .drop("LB").drop("UB").persist() ref_df = ret_data_frame.repartition(labelCol).groupBy(labelCol)\ .agg(count(inputCol).alias("ref_count"),avg(inputCol).alias("ref_avg"),stddev_pop(inputCol).alias("ref_std")).persist() self.logger.debug("CHECK DF") self.logger.debug(ref_df.count()) self.logger.debug(ret_data_frame.count()) return (ret_data_frame, ref_df) else: self.logger.debug( "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST ASSERTED: {0}" .format(REPARTITION_CONST)) ret_data_frame = data_frame.join(iqr_data_frame,labelCol).where((col("LB").cast("float") <= col(inputCol).cast("float")) & (col("UB").cast("float")>=col(inputCol).cast("float")))\ .drop("LB").drop("UB").repartition(REPARTITION_CONST) ref_df = ret_data_frame.groupBy(labelCol)\ .agg(count(inputCol).alias("ref_count"),avg(inputCol).alias("ref_avg"),stddev_pop(inputCol).alias("ref_std")).repartition(REPARTITION_CONST) return (ret_data_frame, ref_df)
spark = SparkSession.builder.appName("learning").master("local").getOrCreate()

df = spark.read.format('csv')\
    .option('sep', ';')\
    .option('header', 'true')\
    .load('user.csv')

df.repartition(5).select(max('id')).show()
df.repartition(5).select(avg('age')).show()

df.select(round(lit(2.5)), bround(lit(2.5))).show()
df.select(round(lit("2.5")), bround(lit("2.5"))).show()

spark.range(0, 10, 2)\
    .select(monotonically_increasing_id().alias('id_v2'),
            round((rand() * 5 + 5)).alias('rand')).show()
spark.range(0, 10, 2)\
    .select(monotonically_increasing_id().alias('id'),
            pow(col('id'), 2).alias('pow')).show()

df.describe().show()
df.select(count('age'), mean('age'), min('age'), max('age'), stddev_pop('age')).show()

df.select(corr(col('age'), col('name'))).show()
# DataFrame.stat.corr takes column names and returns a float, so print it rather than calling .show()
print(df.stat.corr('age', 'name'))
count("Quantity").alias("total_transactions"), sum("Quantity").alias("total_purchases"), avg("Quantity").alias("avg_purchases"), expr("mean(Quantity)").alias("mean_purchases"))\ .selectExpr( "total_purchases/total_transactions", "avg_purchases", "mean_purchases").show() # COMMAND ---------- from pyspark.sql.functions import var_pop, stddev_pop from pyspark.sql.functions import var_samp, stddev_samp df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"), stddev_samp("Quantity")).show() # by default, if we use function `variance` or `stddev`, # Spark performs the formula for sample variance or standard deviation # COMMAND ---------- from pyspark.sql.functions import skewness, kurtosis df.select(skewness("Quantity"), kurtosis("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import corr, covar_pop, covar_samp df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"), covar_pop("InvoiceNo", "Quantity")).show()
df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
          stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------
# df.filter(df.SYMBOL.startswith("KOTAK")).show()
# df.filter(col("SYMBOL").startswith("KOTAKBANK")).show()
# df.columns.str.replace(' ','')
# df.select(regexp_replace("OPEN_PRICE"," ","")).show(5)
# df.where(col("SERIES").like("%EQ")).orderBy(desc(" OPEN_PRICE")).describe().show()
# df_strip_spaces.where(col(" SERIES").like("%EQ")).orderBy(to_date(col("DATE1")))

# count aggregations
df_strip_spaces.count()
df_strip_spaces.select(count("SERIES")).show()
df_strip_spaces.select(countDistinct("SERIES")).show()
df_strip_spaces.select(approx_count_distinct("SERIES", .1)).show()
df_strip_spaces.select(first("SERIES"), last("SERIES"), min("SERIES"),
                       max("SERIES"), sum("OPEN_PRICE"),
                       sumDistinct("OPEN_PRICE")).show()
df_strip_spaces.select(mean("OPEN_PRICE")).show()
df_strip_spaces.select(avg("OPEN_PRICE")).show()
df_strip_spaces.select(("SERIES"))
df_strip_spaces.groupBy("SERIES", "SYMBOL").count().show()
df_strip_spaces.where("SYMBOL like '%BANK%'").groupBy("SERIES").avg().show()
df_strip_spaces.select(avg("OPEN_PRICE"))
df_strip_spaces.select(var_pop("OPEN_PRICE"), stddev_pop("OPEN_PRICE")).show()
# df.select(covar_pop("OPEN PRICE","CLOSE PRICE"),corr("OPEN PRICE","CLOSE PRICE")).show()

# window aggregations
windowSpec = Window.partitionBy("SYMBOL", to_date("DATE1")).orderBy(
    to_date("DATE1")).rowsBetween(Window.unboundedPreceding, Window.currentRow)
win = sum(col("OPEN PRICE")).over(windowSpec)
df.select("DATE", "OPEN PRICE", win.alias("d")).orderBy(to_date("DATE")).show()
win1 = dense_rank().over(windowSpec)
df.select(rank().over(windowSpec)).show()
pw_df = pw_df.localCheckpoint(eager=True)
print("Merged to:", pw_df.count())
print()

# Okay now we have a list of outages, restore_times, locations, core_ids
# First let's calculate some high level metrics

# size of outages
pw_finalized_outages = pw_finalized_outages.withColumn(
    "cluster_size", F.size(F.array_distinct("core_id")))

# standard deviation of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))
# this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

# range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_range", F.array_max("outage_times") - F.array_min("outage_times"))

# standard deviation and range of restore times
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))
# this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns
         if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.avg("restore_time").alias("restore_times_mean"), *exprs)
pw_finalized_outages = pw_finalized_outages.withColumn(
    "restore_times_stddev", F.explode("restore_times"))
# this expression essentially takes the first value of each column (which should all be the same after the explode)
def main():
    """
    Works on the data from the original database to perform ETL and load the data we need into the DWH.
    """
    spark = SparkSession \
        .builder \
        .appName('DBAnalysis') \
        .config('spark.driver.extraClassPath', 'postgresql-42.2.10.jar') \
        .getOrCreate()

    properties = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres:5432/postgres',
        'user': '******',
        'password': '******',
        'dbtable': ' spotify_details',
    }
    properties_dwh = {
        'driver': 'org.postgresql.Driver',
        'url': 'jdbc:postgresql://postgres_dwh:5432/postgres',
        'user': '******',
        'password': '******'
    }

    df2 = spark.read \
        .format('jdbc') \
        .option('driver', properties['driver']) \
        .option('url', properties['url']) \
        .option('user', properties['user']) \
        .option('password', properties['password']) \
        .option('dbtable', properties['dbtable']) \
        .load()

    count_tracks_distribution = df2.groupby('id_playlist') \
        .count()

    # Create the year_month column
    df3 = df2.withColumn(
        'year_month',
        date_format(to_timestamp(df2.timestamp, "yyyy-MM-dd'T'HH:mm:ssXXX"),
                    "yyyy-MM").alias('year_month'))

    # Aggregate on the playlist and count the occurrences
    df4 = df3.groupby('id_playlist', 'year_month').count()

    # To assign a month to a playlist, the month with the most song "additions" was chosen.
    # For each playlist keep only the row with the highest per-month count.
    w = Window.partitionBy('id_playlist')
    df5 = df4.withColumn('max', f.max('count').over(w)) \
        .where(f.col('count') == f.col('max')) \
        .drop('max', 'count')

    month_distribution = df5 \
        .where(f.col('year_month') >= "2020-01") \
        .groupby('year_month') \
        .count()

    # The complete df has the reference-month information attached to every playlist
    spotify_complete = df2.join(df5, on=['id_playlist'], how='left')

    df_complete2 = spotify_complete.groupBy("id_playlist", 'name_playlist', 'year_month') \
        .agg(f.mean('danceability'), f.stddev_pop('danceability'),
             f.mean('energy'), f.stddev_pop('energy'),
             f.mean('valence'), f.stddev_pop('valence')) \
        .sort('year_month', ascending=True)
    newColumns = [
        "id_playlist", "name_playlist", "year_month", "avgdanceability",
        "stdddanceability", "avgenergy", "stddenergy", "avgvalence",
        "stddvalence"
    ]
    df_complete2 = df_complete2.toDF(*newColumns)

    df_complete3 = df_complete2.groupBy('year_month') \
        .agg(f.mean('avgdanceability'), f.mean('stdddanceability'),
             f.mean('avgenergy'), f.mean('stddenergy'),
             f.mean('avgvalence'), f.mean('stddvalence')) \
        .sort('year_month', ascending=True)
    newColumns = [
        "timestamp", "mean_danceability", "stdev_danceability", "mean_energy",
        "stdev_energy", "mean_valence", "stdev_valence"
    ]
    audiofeatures_stat = df_complete3.toDF(*newColumns)

    audiofeatures_stat.write.jdbc(url=properties_dwh['url'],
                                  table='audiofeatures_stat',
                                  mode='overwrite',
                                  properties=properties_dwh)
    spotify_complete.write.jdbc(url=properties_dwh['url'],
                                table='spotify_complete',
                                mode='overwrite',
                                properties=properties_dwh)
    month_distribution.write.jdbc(url=properties_dwh['url'],
                                  table='month_distribution',
                                  mode='overwrite',
                                  properties=properties_dwh)
    count_tracks_distribution.write.jdbc(url=properties_dwh['url'],
                                         table='count_tracks_distribution',
                                         mode='overwrite',
                                         properties=properties_dwh)
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)
df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))

print("count: " + str(df.select(count("salary")).collect()[0]))

df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)

df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)

df.select(stddev("salary"), stddev_samp("salary"), \
          stddev_pop("salary")).show(truncate=False)

df.select(sum("salary")).show(truncate=False)
df.select(sumDistinct("salary")).show(truncate=False)

df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
    .show(truncate=False)
# get the min and max amount of calories burned
dailyActivitiesDF.select(min("CaloriesBurned"), max("CaloriesBurned")).show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC ## Statistical functions
# MAGIC
# MAGIC - We can do some basic statistical functions as well using the Spark API

# COMMAND ----------

# standard deviation and variance
dailyActivitiesDF.select(var_pop("CaloriesBurned"), var_samp("CaloriesBurned"),
                         stddev_pop("CaloriesBurned"),
                         stddev_samp("CaloriesBurned")).show()

# COMMAND ----------

# Any extreme points in our data?
dailyActivitiesDF.select(skewness("CaloriesBurned"),
                         kurtosis("CaloriesBurned")).show()

# COMMAND ----------

# Covariance and Correlation
dailyActivitiesDF.select(corr("CaloriesBurned", "Steps"),
                         covar_samp("CaloriesBurned", "Steps"),
                         covar_pop("CaloriesBurned", "Steps")).show()