def get_last_amount_recurrency(recurrents_filter_worktable, group_array):
    """
    Compute statistics (average, standard deviation and coefficient of variation)
    on the transaction_amount field, grouped by customer and commerce.
    :param recurrents_filter_worktable: recurrents DataFrame
    :param group_array: list of grouping fields, e.g. ['customer_id', 'commerce_affiliation_id']
    :return: recurrents DataFrame with transaction_amount statistics grouped by the group_array fields
    """
    return recurrents_filter_worktable.groupBy(group_array).agg(
        # mean
        avg(
            when(c('order2') <= 4, c(fields.transaction_amount).cast('Float'))
        ).cast('Float').alias("MEAN_transaction_amount"),
        # standard deviation
        stddev(
            when(c('order2') <= 4, c(fields.transaction_amount).cast('Float'))
        ).cast('Float').alias("SD_transaction_amount"),
        # coefficient of variation = SD / mean
        (stddev(
            when(c('order2') <= 4, c(fields.transaction_amount).cast('Float'))
        ).cast('Float') /
         avg(
             when(c('order2') <= 4, c(fields.transaction_amount).cast('Float'))
         ).cast('Float')
         ).cast('Float').alias("CV_transaction_amount"))

def bikes_summary(bikes, stations):
    # print statistics for members, bikes and trip duration
    member_summary = bikes.groupBy('member_type').count().sort(F.desc("count"))
    member_summary.show()

    bikes_summary = bikes.groupBy('bike_number').count().sort(F.desc("count"))
    bikes_summary.show()

    duration_summary = bikes.select([
        F.min("duration"), F.max("duration"),
        F.mean("duration"), F.stddev("duration")
    ])
    duration_summary.show()

    # print statistics for stations check-ins and check-outs
    check_in_summary = bikes.groupBy('start_station_id').count().sort(
        F.desc("count"))
    check_in_summary.show()

    check_out_summary = bikes.groupBy('end_station_id').count().sort(
        F.desc("count"))
    check_out_summary.show()

    # print statistics for distance
    distance_summary = bikes.select([
        F.min("abs_distance"), F.max("abs_distance"),
        F.mean("abs_distance"), F.stddev("abs_distance")
    ])
    distance_summary.show()

def addUserFeatures(df: DataFrame) -> DataFrame:
    '''
    Extract user features.
    :param df:
    :return:
    '''
    extractGenresUdf = udf(extractGenres, returnType=ArrayType(IntegerType()))
    print('start user feature')
    samplesWithUserFeatures = df \
        .withColumn('userPositiveHistory',
                    collect_list(when(col('label') == 1, col('movieId')).otherwise(lit(None)))
                    .over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userPositiveHistory', reverse(col('userPositiveHistory'))) \
        .withColumn("userRatedMovie1", col("userPositiveHistory").getItem(0)) \
        .withColumn("userRatedMovie2", col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", col("userPositiveHistory").getItem(4)) \
        .withColumn('userRatingCount',
                    count(lit(1)).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userAvgReleaseYear',
                    avg(col('releaseYear')).over(Window.partitionBy("userId").orderBy(col('timestamp')).rowsBetween(-100, -1)).cast('int')) \
        .withColumn('userReleaseYearStddev',
                    stddev(col('releaseYear')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userAvgRating',
                    format_number(avg(col('rating')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1)), 2)) \
        .withColumn('userRatingStddev',
                    stddev(col("rating")).over(Window.partitionBy("userId").orderBy(col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn('userGenres',
                    extractGenresUdf(collect_list(when(col('label') == 1, col('genres')).otherwise(lit(None)))
                                     .over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1)))) \
        .na.fill(0) \
        .withColumn("userRatingStddev", format_number(col("userRatingStddev"), 2)) \
        .withColumn("userReleaseYearStddev", format_number(col("userReleaseYearStddev"), 2)) \
        .withColumn("userGenre1", col("userGenres").getItem(0)) \
        .withColumn("userGenre2", col("userGenres").getItem(1)) \
        .withColumn("userGenre3", col("userGenres").getItem(2)) \
        .withColumn("userGenre4", col("userGenres").getItem(3)) \
        .withColumn("userGenre5", col("userGenres").getItem(4)) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(col('userRatingCount') > 1)

    # samplesWithUserFeatures.printSchema()
    samplesWithUserFeatures.show(10, truncate=True)
    return samplesWithUserFeatures

def get_metrics_by_source(df: SparkDataFrame) -> SparkDataFrame:
    """
    Parameters
    ----------
    df: SparkDataFrame

    Returns
    -------
    return: SparkDataFrame
    """
    frame_len = col("frame_len")
    time_diff = col("Source_time_diff")
    metrics = [
        F.count("*").alias("Spc"),
        F.sum(frame_len).alias("Tss"),
        F.min(frame_len).alias("Smin"),
        F.max(frame_len).alias("Smax"),
        F.mean(frame_len).alias("Savg"),
        F.stddev(frame_len).alias("Svar"),
        F.min(time_diff).alias("SITmin"),
        F.max(time_diff).alias("SITmax"),
        F.mean(time_diff).alias("SITavg"),
        F.stddev(time_diff).alias("SITvar"),
    ]
    grouped_by_source = group_by_source(df)
    return grouped_by_source.agg(*metrics)

def get_metrics_by_destination(df: SparkDataFrame) -> SparkDataFrame:
    """
    Parameters
    ----------
    df: SparkDataFrame

    Returns
    -------
    return: SparkDataFrame
    """
    frame_len = col("frame_len")
    time_diff = col("Destination_time_diff")
    metrics = [
        F.count("*").alias("Rpc"),
        F.sum(frame_len).alias("Tsr"),
        F.mean(frame_len).alias("Ravg"),
        F.stddev(frame_len).alias("Rvar"),
        F.min(time_diff).alias("RITmin"),
        F.max(time_diff).alias("RITmax"),
        F.mean(time_diff).alias("RITavg"),
        F.stddev(time_diff).alias("RITvar"),
    ]
    grouped_by_destination = group_by_destination(df)
    return grouped_by_destination.agg(*metrics)

def groupby_statistics(df):
    ret = df.groupby('passenger_count').agg(
        f.mean('fare_amount'),
        f.stddev('fare_amount'),
        f.mean('tip_amount'),
        f.stddev('tip_amount'))
    ret.take(3)
    return ret

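# --- Usage sketch (not part of the original snippet) ---------------------------
# A minimal, hypothetical example of how groupby_statistics above could be
# exercised on a toy DataFrame; the column names mirror the NYC-taxi-style
# schema the function assumes, and the SparkSession setup is illustrative only.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

if __name__ == '__main__':
    spark = SparkSession.builder.appName("groupby-statistics-demo").getOrCreate()
    toy = spark.createDataFrame(
        [(1, 10.0, 2.0), (1, 12.0, 1.5), (2, 30.0, 5.0), (2, 28.0, 4.0)],
        ['passenger_count', 'fare_amount', 'tip_amount'])
    toy.groupby('passenger_count').agg(
        f.mean('fare_amount'), f.stddev('fare_amount'),
        f.mean('tip_amount'), f.stddev('tip_amount')).show()
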
def print_avg_predictions(predictions_df, label):
    result_row = (predictions_df.agg(
        F.avg('rating').alias('avg_rating'),
        F.stddev('rating').alias('stddev_rating'),
        F.avg('prediction').alias('avg_prediction'),
        F.stddev('prediction').alias('stddev_prediction')).head())
    print('[{} Prediction] Rating Avg: {} Stddev: {}'.format(
        label, result_row[0], result_row[1]))
    print('[{} Prediction] Prediction Avg: {} Stddev: {}'.format(
        label, result_row[2], result_row[3]))

def stats():
    stats1DF = joinDF.orderBy('fhours').groupBy('fhours').agg(
        count('fhours').alias('count'),
        mean('tempD').alias('tempM'),
        stddev('tempD').alias('tempSD'),
        mean('humidityD').alias('humidityM'),
        stddev('humidityD').alias('humiditySD'),
        mean('wind_mphD').alias('wind_mphM'),
        stddev('wind_mphD').alias('wind_mphSD'),
        mean('feelslikeD').alias('feelslikeM'),
        stddev('feelslikeD').alias('feelslikeSD'))
    return stats1DF

def addUserFeatures(samplesWithMovieFeatures):
    extractGenresUdf = udf(extractGenres, ArrayType(StringType()))
    '''
    userPositiveHistory: movie IDs the user liked before this sample, ordered from most recent to oldest
    userRatedMovie1~5: derived from userPositiveHistory, the 5 most recently liked movies
    userRatingCount: number of recent ratings
    userAvgReleaseYear: average release year of recently rated movies
    userReleaseYearStddev: standard deviation of the release years of recently rated movies
    userAvgRating: average of recent ratings
    userRatingStddev: standard deviation of recent ratings
    userGenres: genre frequency counts over recently liked movies, ordered from most to least frequent
    userGenre1~5: top 5 genres taken from userGenres
    Keep only samples with userRatingCount greater than 1.
    '''
    samplesWithUserFeatures = samplesWithMovieFeatures \
        .withColumn('userPositiveHistory',
                    F.collect_list(when(F.col('label') == 1, F.col('movieId')).otherwise(F.lit(None))).over(
                        sql.Window.partitionBy("userId").orderBy(F.col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn("userPositiveHistory", reverse(F.col("userPositiveHistory"))) \
        .withColumn('userRatedMovie1', F.col('userPositiveHistory')[0]) \
        .withColumn('userRatedMovie2', F.col('userPositiveHistory')[1]) \
        .withColumn('userRatedMovie3', F.col('userPositiveHistory')[2]) \
        .withColumn('userRatedMovie4', F.col('userPositiveHistory')[3]) \
        .withColumn('userRatedMovie5', F.col('userPositiveHistory')[4]) \
        .withColumn('userRatingCount',
                    F.count(F.lit(1)).over(sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn('userAvgReleaseYear',
                    F.avg(F.col('releaseYear')).over(
                        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)).cast(IntegerType())) \
        .withColumn('userReleaseYearStddev',
                    F.stddev(F.col("releaseYear")).over(
                        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn("userReleaseYearStddev", format_number(F.col("userReleaseYearStddev"), NUMBER_PRECISION)) \
        .withColumn("userAvgRating",
                    format_number(F.avg(F.col("rating")).over(
                        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)), NUMBER_PRECISION)) \
        .withColumn("userRatingStddev",
                    F.stddev(F.col("rating")).over(
                        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn("userRatingStddev", format_number(F.col("userRatingStddev"), NUMBER_PRECISION)) \
        .withColumn("userGenres",
                    extractGenresUdf(F.collect_list(when(F.col('label') == 1, F.col('genres')).otherwise(F.lit(None))).over(
                        sql.Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)))) \
        .withColumn("userGenre1", F.col("userGenres")[0]) \
        .withColumn("userGenre2", F.col("userGenres")[1]) \
        .withColumn("userGenre3", F.col("userGenres")[2]) \
        .withColumn("userGenre4", F.col("userGenres")[3]) \
        .withColumn("userGenre5", F.col("userGenres")[4]) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    samplesWithUserFeatures.printSchema()
    samplesWithUserFeatures.show(10)

    # inspect a single user's samples
    samplesWithUserFeatures.filter(
        samplesWithMovieFeatures['userId'] == 1).orderBy(
            F.col('timestamp').asc()).show(truncate=False)
    return samplesWithUserFeatures

def featurize(df, col_name):
    df_stats = df.select(
        F.mean(F.col(col_name)).alias('mean'),
        F.stddev(F.col(col_name)).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    # z-score standardization: (x - mean) / std
    data = df.withColumn(col_name, (df[col_name] - mean) / std)
    # recompute mean and std after scaling; these values serve only as a sanity
    # check and are not used further
    data_stats = data.select(
        F.mean(F.col(col_name)).alias('mean'),
        F.stddev(F.col(col_name)).alias('std')).collect()
    new_mean = data_stats[0]['mean']
    new_std = data_stats[0]['std']
    return data

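# --- Usage sketch (not part of the original snippet) ---------------------------
# A minimal, hypothetical check of featurize above: standardize a single numeric
# column of a toy DataFrame and confirm the result has mean ~0 and stddev ~1.
# The SparkSession setup and the column name 'x' are assumptions for illustration.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

if __name__ == '__main__':
    spark = SparkSession.builder.appName("featurize-demo").getOrCreate()
    toy = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ['x'])
    scaled = featurize(toy, 'x')
    scaled.select(F.mean('x'), F.stddev('x')).show()  # expect ~0.0 and ~1.0
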
def testAgg_1() -> None:
    spark = SparkSession.builder.appName("aggregation").getOrCreate()
    df = spark.read.csv("./datasets/sales_info.csv", inferSchema=True, header=True)
    df.printSchema()
    df.show()

    print("Sales average")
    df.groupBy("Company").mean().show()

    print("Count")
    df.groupBy("Company").count().show()

    print("Aggregation")
    df.agg({"Sales": "sum"}).show()
    df.agg({"Sales": "max"}).show()

    print("Group and aggregation")
    group_data = df.groupBy("Company")
    group_data.agg({"Sales": "max"}).show()

    print("Functions")
    df.select(countDistinct("Sales")).show()
    df.select(avg("Sales")).show()
    df.select(avg("Sales").alias("Average Sales")).show()

    print("Format")
    sales_std = df.select(stddev("Sales").alias("std"))
    sales_std.select(format_number("std", 2).alias("Standard Deviation")).show()

    print("Order By")
    df.orderBy("Sales").show()  # Ascending order.
    df.orderBy(df["Sales"].desc()).show()  # Descending order.

def election_donations_analysis(input, output1, output2):
    from pyspark.sql import SparkSession
    import pyspark.sql.functions as F

    spark = SparkSession.builder.getOrCreate()
    reader = spark.read
    reader.option("header", True).option("inferSchema", True)
    df = reader.csv(input)

    cand_info = ['cand_id', 'cand_nm']
    grouped_df = df.groupBy(cand_info)\
        .agg(
            F.count(F.when(F.col('contb_receipt_amt') >= 0, 1)).alias("NumDon"),
            F.sum('contb_receipt_amt').alias("Total_cntrb"),
            F.countDistinct(F.when(F.col('contb_receipt_amt') >= 0,
                                   F.array('contbr_nm', 'contbr_zip',
                                           'contbr_employer', 'contbr_occupation'))).alias('UnqCtb'),
            F.mean("contb_receipt_amt").alias("Mean"),
            F.stddev("contb_receipt_amt").alias("SD"),
            F.count(F.when((F.col('contb_receipt_amt') < 50) &
                           (F.col('contb_receipt_amt') >= 0), 1)).alias("smCtb")
        ).withColumn("%", (F.col("smCtb") / F.col("NumDon")) * 100)
    grouped_df.sort(F.desc('cand_id')).show()
    grouped_df.write.format('csv').mode('overwrite').save(output1)

    required_cand_ids = ["P80001571", "P80000722"]
    df_6 = df.filter(df.cand_id.isin(required_cand_ids)).filter(
        "contb_receipt_amt >= 0").select('cand_nm', 'contb_receipt_amt')
    df_6.sort(F.desc('contb_receipt_amt')).show()
    df_6.coalesce(1).write.format('csv').mode('overwrite').save(output2)

def _calculate_poi_stats(df):
    """
    Calculate all the relevant statistics for a given position of interest (POI)
    :param df: A Spark dataframe mapping a request ID to a POI together with the
        distance between the request and the POI
    :return: A Spark dataframe containing the mean and standard deviation of the
        request distances, the maximum distance (radius), the request count, and
        the density associated with each POI
    """
    poi_stats_df = df \
        .groupBy('POIID') \
        .agg(
            count('*').alias('req_count'),
            avg('min_distance').alias('mean_distance'),
            stddev('min_distance').alias('stddev_distance'),
            max('min_distance').alias('radius')
        ) \
        .select(
            'POIID',
            'req_count',
            'mean_distance',
            'stddev_distance',
            'radius',
            round(col('req_count') / (pi * col('radius') ** 2), 6).alias('density')
        )
    return poi_stats_df

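# --- Sanity check (not part of the original snippet) ---------------------------
# Quick numeric illustration of the density formula used above,
# density = req_count / (pi * radius ** 2); the numbers are made up.
from math import pi as _pi

_req_count, _radius = 100, 10.0
print(round(_req_count / (_pi * _radius ** 2), 6))  # 0.31831
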
class AverageEstimator(Estimator, HasInputCol, HasOutputCol):
    def fit(self, X):
        '''
        Creates a Spark dataframe containing the average and standard deviation on a
        per-game basis for all games in the input dataframe X.

        Parameters: self and a dataframe X that contains steamid, steamid_int, appid,
        and playtime_forever.

        Outputs: a self object with a Spark dataframe (self.table) containing the appid,
        average playtime for each appid, and standard deviation of the playtime for each appid.
        '''
        grp_window = Window.partitionBy('appid')
        avg_func = func.mean('playtime_forever')
        stddev_func = func.stddev('playtime_forever')

        X = X.withColumn('avg', avg_func.over(grp_window))
        X = X.withColumn('std_dev', stddev_func.over(grp_window))

        self.table = X
        for col in ['steamid', 'steamid_int', 'playtime_forever']:
            self.table = self.table.drop(col)
        self.table = self.table.dropDuplicates(subset=['appid'])
        return self

def standardize(df):
    '''
    Standardize all columns
    formula = [(x - mean) / std]

    CREDITS: https://gist.github.com/morganmcg1/15a9de711b9c5e8e1bd142b4be80252d
    Based on the solution on stackoverflow:
    https://stackoverflow.com/questions/44580644/subtract-mean-from-pyspark-dataframe

    Parameters:
        - dataframe
    Returns:
        - dataframe with standardized variables
    '''
    # create lists to aggregate all means and stds
    aggAvg = []
    aggStd = []

    # aggregate all means and stds
    for c in df.columns:
        aggAvg.append(mean(df[c]).alias(c))
        aggStd.append(stddev(df[c]).alias(c + '_stddev'))

    averages = df.agg(*aggAvg).collect()[0]
    std_devs = df.agg(*aggStd).collect()[0]

    for c in df.columns:
        df = df.withColumn(c + '_norm',
                           ((df[c] - averages[c]) / std_devs[c + '_stddev']))
    return df

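# --- Alternative sketch (not part of the original snippet) ---------------------
# The same standardization can be done with Spark ML's built-in StandardScaler
# instead of the manual aggregation above: assemble the numeric columns into a
# single vector column and scale it with withMean/withStd. The function name and
# column names here are placeholders.
from pyspark.ml.feature import StandardScaler, VectorAssembler

def standardize_with_ml(df, input_cols):
    assembler = VectorAssembler(inputCols=input_cols, outputCol='features')
    scaler = StandardScaler(inputCol='features', outputCol='features_norm',
                            withMean=True, withStd=True)
    assembled = assembler.transform(df)
    return scaler.fit(assembled).transform(assembled)
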
def occCalc(self, channelID, testing=False):
    """
    Calculates occupancy for the user defined month
    """
    if type(channelID) != list:
        raise TypeError('ChannelID is required to be a list')

    conf = SparkConf()\
        .setAppName("Occupancy Calc")\
        .set("spark.master", "local[*]")\
        .set("spark.driver.maxResultSize", "15G")
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    path = 'AZURE PATH' + self.month +\
        '/*/*/' + self.sensor + '*'
    data = sql.read.parquet(path)

    timeCount = data.select('scan_time').distinct().count()
    timeCount = sc.broadcast(timeCount)

    subData = data.select('scan_time', 'channel_id', 'power_dbm').filter(
        data.channel_id.isin(channelID))
    subData = subData.groupBy('channel_id').agg(
        (count(column('power_dbm')) / timeCount.value).alias('freq'),
        stddev(column('power_dbm')).alias('sd')).sort(
            asc('freq'), desc('sd'))

    if testing:
        subData.toPandas().to_csv('C:/path/freq.csv', sep='\t')
        sc.stop()
    else:
        sc.stop()
        return (subData.toPandas())

def matching_set(self, scores, thresh=1.5):
    """Best-guess method with eccentricity threshold.

    Scores is a spark dataframe that may contain multiple custId.

    Parameters:
        - scores: DF(custId_1, custId_2, value)
    """
    # Sigma (custId_1, std): Standard deviation for each attacked customer.
    sigma = scores.groupBy('custId_1').agg(
        F.stddev(scores.value).alias('std'))

    window = Window.partitionBy(scores.custId_1).orderBy(
        scores.value.desc())
    # Top scores (custId_1, custId_2, value, rank):
    # For each customer, the two closest customers in the dataset.
    top_scores = scores.select(
        '*', F.row_number().over(window).alias('rank')).filter(
            F.col('rank') <= 2)

    # First match: (custId_1, custId_2, value_1)
    top_1 = top_scores.filter('rank == 1').drop('rank').withColumnRenamed(
        'value', 'value_1')
    # Second match: (custId_1, custId_3, value_2)
    top_2 = top_scores.filter('rank == 2').drop('rank').withColumnRenamed(
        'value', 'value_2').withColumnRenamed('custId_2', 'custId_3')

    # (custId_1, custId_2, custId_3, value_1, value_2, std)
    scores = top_1.join(top_2, ['custId_1']).join(sigma, 'custId_1')

    # (..., eccentricity)
    # For each customer, the two closest customers in the dataset, along with the
    # eccentricity measure.
    scores_w_eccentricity = scores.withColumn(
        'eccentricity',
        (F.col('value_1') - F.col('value_2')) / F.col('std'))
    return scores_w_eccentricity.filter(
        'eccentricity >= {}'.format(thresh))

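# --- Numeric illustration (not part of the original snippet) -------------------
# How the eccentricity threshold above behaves on made-up numbers: with a best
# score of 0.9, a runner-up of 0.5 and a per-customer std of 0.2, the
# eccentricity is (0.9 - 0.5) / 0.2 = 2.0, which passes the default 1.5 threshold.
value_1, value_2, std = 0.9, 0.5, 0.2
eccentricity = (value_1 - value_2) / std
print(eccentricity >= 1.5)  # True
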
def get_session_stats(data_in, name):
    # Limit to song plays
    session_vars = data_in.filter(data_in['page'] == 'NextSong')

    # Get play time for each song
    session_window = ssw.Window.partitionBy(
        session_vars['userId'], session_vars['sessionId']).orderBy('ts')
    session_vars = session_vars.withColumn(
        'nextPlayStart', ssf.lead(session_vars['ts']).over(session_window))
    session_vars = session_vars.withColumn(
        'playTime', session_vars['nextPlayStart'] - session_vars['ts'])

    # Get play time for each session
    session_vars = session_vars.groupby('userId', 'sessionId').agg(  # pylint: disable=no-member
        ssf.sum('playTime').alias('sessionLength'))

    # Get play time stats for each user
    session_vars = session_vars.groupby('userId').agg(  # pylint: disable=no-member
        ssf.mean('sessionLength').alias(f'lengthMean{name}'),
        ssf.stddev('sessionLength').alias(f'lengthStd{name}'),
        ssf.sum('sessionLength').alias(f'lengthSum{name}'))
    return session_vars

def baseline_model(self, df_processed):
    # cap deal views over the past 30 days at 400
    cond_dvs = F.when(col("dvs_p30days") > 400, 400).otherwise(col("dvs_p30days"))

    # per-category statistics of capped deal views over recent events
    stats = df_processed\
        .withColumn("dvs_p30days", cond_dvs)\
        .filter(col("recent_event_date") == 1)\
        .filter(col("event_date") >= F.date_add(F.current_date(), -15))\
        .groupby(['grt_l2_cat_name'])\
        .agg(F.avg(col("dvs_p30days")).alias("avg_dvs_p30days"),
             F.stddev(col("dvs_p30days")).alias("std_deal_view"),
             F.round(F.avg(col("dvs_p30days")).cast('integer')).alias("avg_deal_view"),
             F.max("dvs_p30days").alias("max_deal_view"))

    w = Window.partitionBy(F.col('consumer_id')).orderBy(
        F.col('normalized_dvs_p30days').desc())

    df_final = df_processed\
        .filter(col("recent_event_date") == 1)\
        .filter(col("event_date") >= F.date_add(F.current_date(), -15))\
        .join(stats, on='grt_l2_cat_name')\
        .withColumn('normalized_dvs_p30days',
                    (col('dvs_p30days') - col('avg_deal_view')) / col('std_deal_view'))\
        .withColumn('normalized_dvs_p30days_rank', F.row_number().over(w))

    df_micro = df_final\
        .filter(col('normalized_dvs_p30days') >= 0)\
        .filter(col('normalized_dvs_p30days_rank') == 1)\
        .filter(col('grt_l2_purchaser_14d') == 0)
    return df_micro

def addMovieFeatures(movieSamples, ratingSamplesWithLabel):
    # add movie basic features
    samplesWithMovies1 = ratingSamplesWithLabel.join(movieSamples, on=['movieId'], how='left')
    # add releaseYear, title
    samplesWithMovies2 = samplesWithMovies1 \
        .withColumn('releaseYear', udf(extractReleaseYearUdf, IntegerType())('title')) \
        .withColumn('title', udf(lambda x: x.strip()[:-6].strip(), StringType())('title')) \
        .drop('title')
    # split genres
    samplesWithMovies3 = samplesWithMovies2 \
        .withColumn('movieGenre1', split(F.col('genres'), "\\|")[0]) \
        .withColumn('movieGenre2', split(F.col('genres'), "\\|")[1]) \
        .withColumn('movieGenre3', split(F.col('genres'), "\\|")[2])
    # add rating features
    movieRatingFeatures = samplesWithMovies3.groupBy('movieId') \
        .agg(F.count(F.lit(1)).alias('movieRatingCount'),
             format_number(F.avg(F.col('rating')), NUMBER_PRECISION).alias('movieAvgRating'),
             F.stddev(F.col('rating')).alias('movieRatingStddev')) \
        .fillna(0) \
        .withColumn('movieRatingStddev', format_number(F.col('movieRatingStddev'), NUMBER_PRECISION))
    # join movie rating features
    samplesWithMovies4 = samplesWithMovies3.join(movieRatingFeatures, on=['movieId'], how='left')
    samplesWithMovies4.printSchema()
    samplesWithMovies4.show(5, truncate=False)
    return samplesWithMovies4

def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    # reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df.cache()

    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()) \
        .filter(reviews_df.total_votes > 1) \
        .filter(reviews_df.helpful_votes > 0).cache()
    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    # take the most helpful reviews
    helpful_df = helpful_df.filter(helpful_df.helpfulness_percentage > 90).select('review_body')
    helpful_df = helpful_df.withColumn('word_count', word_count(helpful_df.review_body)).cache()

    # median word count of the most helpful reviews
    median = helpful_df.approxQuantile('word_count', [0.5], 0.25)
    sc.parallelize(median).repartition(1).saveAsTextFile('median_words_in_helpfulreviews.csv')

    # average and standard deviation of word count of the most helpful reviews
    aggregations = [functions.avg('word_count').alias('average_wordcount'),
                    functions.stddev('word_count').alias('std_wordcount')]
    stats_df = helpful_df.agg(*aggregations)
    stats_df.repartition(1).write.mode('overwrite').csv('average_stddev_wordcount_mosthelpful')

    count = reviews_df.filter(reviews_df.customer_id.isNotNull()).select('customer_id') \
        .agg(functions.countDistinct('customer_id'))
    count.repartition(1).write.mode('overwrite').csv('total_customersin_dataset')

    product_count = reviews_df.filter(reviews_df.product_id.isNotNull()).select('product_id') \
        .agg(functions.countDistinct('product_id'))
    product_count.repartition(1).write.mode('overwrite').csv('total_productsin_dataset')

def standard():
    self.logger.info("conducting Standard Scaling")
    if (self.parameterDF is None) and (self.columns is not None):
        # no parameter table provided: fit on the original table and transform it
        mmParamList = []
        self.transformedDF = self.originalDF
        for col in self.columns:
            mmRow = self.originalDF.select(
                functions.avg(col), functions.stddev(col)).first()
            mmAvg = mmRow["avg(" + col + ")"]
            mmStd = mmRow["stddev_samp(" + col + ")"]
            mmParamList.append([col, float(mmAvg), float(mmStd)])
            self.transformedDF = self.transformedDF\
                .withColumn(col + "ss", (functions.col(col) - mmAvg) / mmStd) \
                .drop(col)\
                .withColumnRenamed(col + "ss", col)
        self.parameterDF = self.spark.createDataFrame(
            mmParamList, ['col', 'avg', 'std'])
    elif self.parameterDF is not None:
        # read the parameters from the parameter table and use them to transform the original table
        self.transformedDF = self.originalDF
        self.parameterDFP = self.parameterDF.toPandas()
        for col in self.parameterDFP["col"]:
            mmParamList = [
                self.parameterDFP.loc[self.parameterDFP["col"] == col, "avg"].tolist()[0],
                self.parameterDFP.loc[self.parameterDFP["col"] == col, "std"].tolist()[0]
            ]
            # apply the same (x - avg) / std transformation as in the fit branch
            self.transformedDF = self.transformedDF\
                .withColumn(col + "ss",
                            (functions.col(col) - mmParamList[0]) / mmParamList[1]) \
                .drop(col)\
                .withColumnRenamed(col + "ss", col)

def describe_float_1d(df, column, current_result, nrows):
    stats_df = df.select(column).na.drop().agg(
        mean(col(column)).alias("mean"),
        df_min(col(column)).alias("min"),
        df_max(col(column)).alias("max"),
        variance(col(column)).alias("variance"),
        kurtosis(col(column)).alias("kurtosis"),
        stddev(col(column)).alias("std"),
        skewness(col(column)).alias("skewness"),
        df_sum(col(column)).alias("sum")).toPandas()

    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (df.select(column).na.drop().selectExpr(
            "percentile_approx(`{col}`,CAST({n} AS DOUBLE))".format(
                col=column, n=x)).toPandas().iloc[:, 0])

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column).na.drop().select(
        df_abs(col(column) - stats["mean"]).alias("delta")).agg(
            df_sum(col("delta"))).toPandas().iloc[0, 0] /
        float(current_result["count"]))
    stats["type"] = "NUM"
    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)

    return stats

def replaceByStandardDeviation(self, feature, df, stddev_=-1):
    stddevValue = df.select(stddev(col(feature.name)).alias(
        'stddev')).collect()[0]["stddev"]
    # fillna returns a new DataFrame, so the result must be reassigned
    df = df.fillna(stddevValue, subset=[feature.name])
    df = df.withColumn(feature.name,
                       when(col(feature.name) == " ", stddevValue)
                       .otherwise(col(feature.name)))
    return df

def _partial_dependence_num(vals, df, model, feature_column, assembler, model_type, n_samples):
    """Compute PD values for numerical feature"""
    vals_df = spark.createDataFrame(
        list(map(lambda x: (float(x),), vals)),
        schema=StructType([StructField(feature_column, DoubleType())]))
    vals_df.cache()

    result_df = spark.createDataFrame(
        sc.emptyRDD(),
        schema=StructType([StructField(feature_column, DoubleType()),
                           StructField('pd', DoubleType())]))

    df = df.drop(feature_column)
    for i in range(n_samples):
        cross = df.sample(withReplacement=False, fraction=1 / n_samples).crossJoin(vals_df)
        if model_type == 'classification':
            pred = model.transform(assembler.transform(cross))\
                .select(feature_column, p1_proba('probability').alias('pred'))
        else:
            pred = model.transform(assembler.transform(cross))\
                .select(feature_column, F.col('prediction').alias('pred'))
        pd = pred.groupBy(feature_column).agg(F.avg('pred').alias('pd'))
        result_df = result_df.union(pd)

    stats = result_df.groupBy(feature_column).agg(
        F.avg('pd'), F.stddev('pd')).orderBy(feature_column).toPandas().values
    means = stats[:, 1]
    cis = np.apply_along_axis(
        lambda arr: st.norm.interval(0.95, loc=arr[0], scale=arr[1] / np.sqrt(n_samples)),
        1, stats[:, 1:])
    return means, cis

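# --- Sanity check (not part of the original snippet) ---------------------------
# The confidence intervals above rely on scipy's st.norm.interval; for a 95%
# interval with loc=0 and scale=1 it returns roughly (-1.96, 1.96), i.e. the
# familiar normal z-value.
import scipy.stats as st

lo, hi = st.norm.interval(0.95, loc=0, scale=1)
print(round(lo, 2), round(hi, 2))  # -1.96 1.96
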
def addMovieFeatures(movieSamples: DataFrame, ratingSamplesWithLabel: DataFrame) -> DataFrame:
    # equi-join on movieId, with ratingSamplesWithLabel as the left table and movieSamples as the right table
    samplesWithMovie = ratingSamplesWithLabel.join(movieSamples, on=['movieId'], how='left')
    # extract the releaseYear field from the parentheses at the end of the title column
    samplesWithMovie = samplesWithMovie \
        .withColumn('releaseYear', F.udf(extractReleaseYearUDF, IntegerType())('title')) \
        .withColumn('title', F.udf(lambda x: x.strip()[:-6].strip(), StringType())('title')) \
        .drop('title')  # drop the title column, which is no longer needed (movieId is sufficient)
    # split the genres column on the '|' delimiter into individual genre fields
    samplesWithMovie = samplesWithMovie \
        .withColumn('genre0', F.split(F.col('genres'), "\\|")[0]) \
        .withColumn('genre1', F.split(F.col('genres'), "\\|")[1]) \
        .withColumn('genre2', F.split(F.col('genres'), "\\|")[2])
    # add rating features
    movieRatingFeatures = samplesWithMovie.groupBy('movieId').agg(
        F.count(F.lit(1)).alias('movieRatingCount'),
        F.format_number(F.avg(F.col('rating')), NUMBER_PRECISION).alias('movieAvgRating'),
        F.stddev(F.col('rating')).alias('movieRatingStddev')) \
        .fillna(0) \
        .withColumn('movieRatingStddev', F.format_number(F.col('movieRatingStddev'), NUMBER_PRECISION))
    # equi-join samplesWithMovie (left) with movieRatingFeatures (right) on movieId
    return samplesWithMovie.join(movieRatingFeatures, on=['movieId'], how='left')

def preprocessing(spark_df):
    smart_feature_columns = [column for column in spark_df.columns if 'smart' in column]
    window_spec_7 = Window.partitionBy('model', 'serial_number').orderBy(
        F.datediff(F.col('dt'), F.lit('2017-07-01'))).rangeBetween(-7, 0)
    prefix_window7 = 'window_7_'

    for smart_col in smart_feature_columns:
        spark_df = spark_df.withColumn(smart_col, F.col(smart_col).cast(DoubleType()))
        if smart_col in ['smart_1_normalized', 'smart_5raw', 'smart_7_normalized', 'smart_194raw', 'smart_199raw',
                         'smart_190raw', 'smart_191raw', 'smart_193raw', 'smart_195_normalized', 'smart_195raw']:
            spark_df = spark_df.withColumn(prefix_window7 + 'range_' + smart_col,
                                           F.max(F.col(smart_col)).over(window_spec_7) -
                                           F.min(F.col(smart_col)).over(window_spec_7))
            spark_df = spark_df.withColumn(prefix_window7 + 'std_' + smart_col,
                                           F.stddev(F.col(smart_col)).over(window_spec_7))
        # if smart_col in ['smart_187raw', 'smart_188raw', 'smart_197raw', 'smart_198raw']:
        #     spark_df = spark_df.withColumn(smart_col, F.when(F.col(smart_col) > 0, 1).otherwise(0))
        # if smart_col in ['smart_187_normalized', 'smart_188_normalized', 'smart_197_normalized', 'smart_198_normalized']:
        #     spark_df = spark_df.withColumn(smart_col, F.when(F.col(smart_col) < 100, 1).otherwise(0))
        if smart_col in ['smart_4raw', 'smart_5raw', 'smart_191raw',
                         'smart_187raw', 'smart_197raw', 'smart_198raw',
                         'smart_199raw', 'window_7_range_smart_199raw']:
            spark_df = spark_df.withColumn(smart_col, F.log2(F.col(smart_col) + F.lit(1.)))

    spark_df = spark_df.withColumn('smart_199raw',
                                   F.col('smart_199raw') * F.col('window_7_range_smart_199raw'))
    spark_df = spark_df.withColumn('anomaly_sum',
                                   F.col('smart_4raw') / 12 + F.col('smart_5raw') / 16 +
                                   F.col('smart_191raw') / 18 + F.col('smart_198raw') / 18 +
                                   F.col('smart_197raw') / 18 + F.col('smart_187raw') / 15)
    return spark_df

def output(self, scores, thresh=1.5, mode="best-guess"):
    """Standard output of the algorithm

    De-anonymisation has two modes: entropic (keeps the full distribution)
    or best-guess (matching with threshold).
    """
    if mode == "best-guess":
        return self.matching_set(scores, thresh)
    elif mode == "entropic":
        # (custId_1, std)
        sigma = scores.groupBy('custId_1').agg(
            F.stddev(scores.value).alias('std'))
        # (custId_1, custId_2, probas_raw)
        probas_raw = scores\
            .join(sigma, ['custId_1'])\
            .withColumn("probas_raw", F.exp(F.col('value') / F.col('std')))\
            .select(['custId_1', 'custId_2', 'probas_raw', 'std'])
        # (custId_1, probas_z)
        probas_z = probas_raw.groupBy('custId_1').agg(
            F.sum(probas_raw.probas_raw).alias('probas_z'))
        # (custId_1, custId_2, probas)
        return scores\
            .join(probas_raw, ['custId_1', 'custId_2'])\
            .join(probas_z, ['custId_1'])\
            .withColumn("probas", F.col('probas_raw') / F.col('probas_z'))\
            .select(['custId_1', 'custId_2', 'probas', 'value', 'std'])
    else:
        # raising a bare string is invalid in Python 3; raise a proper exception
        raise ValueError("Mode '{}' is invalid.".format(mode))

def get_standardized_residuals(res):
    resstd = res.select(
        fns.stddev('residuals').alias('std')).collect()[0]['std']
    resstd = res.withColumn('std', fns.lit(resstd))
    resstd = resstd.withColumn('resstd',
                               fns.col('residuals') / fns.col('std'))
    resstd = resstd.drop('residuals')
    return resstd

def threeSTD(df, column):
    df_non_nan = df.filter(df[column].isNotNull())
    avg = df_non_nan.select(mean(column).alias(column)).collect()
    avg = round(avg[0][column], 2)
    std = df_non_nan.select(stddev(column).alias(column)).collect()
    std = round(std[0][column], 2)
    down_limit = avg - (3 * std)
    up_limit = avg + (3 * std)

    def udf_XiGeMa(s):
        if s is None or s == 'NULL':
            return s
        else:
            s = float(s)
            if (s >= down_limit) & (s <= up_limit):
                return float(s)
            else:
                return None

    udf_transf = udf(udf_XiGeMa, FloatType())
    df = df_non_nan.select('*', udf_transf(df_non_nan[column]).alias('temp_name'))
    df = df.drop(column)
    df = df.withColumnRenamed('temp_name', column)
    df = df.filter(df[column].isNotNull())
    return df

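# --- Alternative sketch (not part of the original snippet) ---------------------
# The same 3-sigma rule as threeSTD above, but filtering with a native Spark
# predicate instead of a Python UDF; the function name is a placeholder, and
# `df` and `column` are assumed to be the same kind of inputs the function
# above takes.
from pyspark.sql.functions import mean, stddev

def three_std_native(df, column):
    non_null = df.filter(df[column].isNotNull())
    stats = non_null.select(mean(column).alias('avg'),
                            stddev(column).alias('std')).collect()[0]
    down_limit = stats['avg'] - 3 * stats['std']
    up_limit = stats['avg'] + 3 * stats['std']
    return non_null.filter((non_null[column] >= down_limit) &
                           (non_null[column] <= up_limit))
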
def describe_float_1d(df, column, current_result, nrows):
    if spark_version == "1.6+":
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            variance(col(column)).alias("variance"),
            kurtosis(col(column)).alias("kurtosis"),
            stddev(col(column)).alias("std"),
            skewness(col(column)).alias("skewness"),
            df_sum(col(column)).alias("sum")
        ).toPandas()
    else:
        stats_df = df.select(column).na.drop().agg(
            mean(col(column)).alias("mean"),
            df_min(col(column)).alias("min"),
            df_max(col(column)).alias("max"),
            df_sum(col(column)).alias("sum")
        ).toPandas()
        stats_df["variance"] = df.select(column).na.drop().agg(
            variance_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["std"] = np.sqrt(stats_df["variance"])
        stats_df["skewness"] = df.select(column).na.drop().agg(
            skewness_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]
        stats_df["kurtosis"] = df.select(column).na.drop().agg(
            kurtosis_custom(col(column), stats_df["mean"].iloc[0],
                            current_result["count"])).toPandas().iloc[0][0]

    for x in np.array([0.05, 0.25, 0.5, 0.75, 0.95]):
        stats_df[pretty_name(x)] = (df.select(column)
                                    .na.drop()
                                    .selectExpr("percentile_approx(`{col}`,CAST({n} AS DOUBLE))"
                                                .format(col=column, n=x)).toPandas().iloc[:, 0]
                                    )

    stats = stats_df.iloc[0].copy()
    stats.name = column
    stats["range"] = stats["max"] - stats["min"]
    stats["iqr"] = stats[pretty_name(0.75)] - stats[pretty_name(0.25)]
    stats["cv"] = stats["std"] / float(stats["mean"])
    stats["mad"] = (df.select(column)
                    .na.drop()
                    .select(df_abs(col(column) - stats["mean"]).alias("delta"))
                    .agg(df_sum(col("delta"))).toPandas().iloc[0, 0]
                    / float(current_result["count"]))
    stats["type"] = "NUM"

    stats['n_zeros'] = df.select(column).where(col(column) == 0.0).count()
    stats['p_zeros'] = stats['n_zeros'] / float(nrows)

    # Large histogram
    imgdata = BytesIO()
    hist_data = create_hist_data(df, column, stats["min"], stats["max"], bins)
    figure = plt.figure(figsize=(6, 4))
    plot = plt.subplot()
    plt.bar(hist_data["left_edge"],
            hist_data["count"],
            width=hist_data["width"],
            facecolor='#337ab7')
    plot.set_ylabel("Frequency")
    plot.figure.subplots_adjust(left=0.15, right=0.95, top=0.9,
                                bottom=0.1, wspace=0, hspace=0)
    plot.figure.savefig(imgdata)
    imgdata.seek(0)
    stats['histogram'] = 'data:image/png;base64,' + quote(base64.b64encode(imgdata.getvalue()))
    # TODO: think about writing this to disk instead of caching it in strings
    plt.close(plot.figure)

    stats['mini_histogram'] = mini_histogram(hist_data)

    return stats

        weather_df.humidity,
        weather_df.pressure,
        weather_df.visibility,
        weather_df.precipIntensity,
        weather_df.windSpeed))

status_joined_df.show()

stats_df = (status_joined_df
            .agg(
                F.mean(status_joined_df.temperature).alias("avgTemp"),
                F.mean(status_joined_df.humidity).alias("avgHumidity"),
                F.mean(status_joined_df.pressure).alias("avgPressure"),
                F.stddev(status_joined_df.temperature).alias("stddevTemp"),
                F.stddev(status_joined_df.humidity).alias("stddevHumidity"),
                F.stddev(status_joined_df.pressure).alias("stddevPressure"),
                F.stddev(status_joined_df.visibility).alias("stddevVisibility"),
                F.stddev(status_joined_df.precipIntensity).alias("stddevPrecipitation"),
                F.stddev(status_joined_df.windSpeed).alias("stddevWindSpeed")))

stats_df.write.mode('overwrite').parquet("hdfs://hadoop:9000/models/weather-stats")

stats = stats_df.collect()[0]
print("Statistics: %s" % (stats,))

day_of_week = F.udf(
    lambda d: datetime.datetime.strptime(d, "%Y-%m-%d").weekday(),
    IntegerType())